From 4f7b1f0e63c1ba28a6a6feec75d2570ccd1dcc2b Mon Sep 17 00:00:00 2001 From: nhanlon2 Date: Wed, 17 Sep 2025 16:00:09 +0100 Subject: [PATCH 01/34] RSDEV-782-Jupyter-Notebooks: first cut at scripts for jupyterLite and Hub. Password secret works in Hub and save of notebook works for both --- jupyter_notebooks/provenance_jupyter_hub | 168 ++++++++++++++++++++++ jupyter_notebooks/provenance_jupyter_lite | 155 ++++++++++++++++++++ 2 files changed, 323 insertions(+) create mode 100644 jupyter_notebooks/provenance_jupyter_hub create mode 100644 jupyter_notebooks/provenance_jupyter_lite diff --git a/jupyter_notebooks/provenance_jupyter_hub b/jupyter_notebooks/provenance_jupyter_hub new file mode 100644 index 0000000..1f5ebc8 --- /dev/null +++ b/jupyter_notebooks/provenance_jupyter_hub @@ -0,0 +1,168 @@ +import json +%pip install -q rspace-client==2.6.1 +%pip install -q pickleshare +%conda install -q notebook +%pip install -q keyring +from rspace_client.eln import eln +import os +import hashlib +import json + +rspace_client = None + +def get_rspace_client(): + """ + Returns rspace ELN API client + """ + try: + import getpass + import keyring + + # Define the service name (e.g., the notebook name the secret is for) + service_id = "RSpaceJupyterDemoApp" + # Define the username associated with the secret + username = "myuser" # use your own username + + retrieved_password = keyring.get_password(service_id, username) + if retrieved_password is None: + retrieved_password = getpass.getpass("Please enter your RSpace Api key: ") + keyring.set_password(service_id, username, retrieved_password) + URL='https://researchspace2.eu.ngrok.io/' + # API_KEY='GWdGGT7evHd4Wjh7X1B38gYeabdVAg54' + global rspace_client + if rspace_client is None: + rspace_client = eln.ELNClient(URL, retrieved_password) + print(rspace_client.get_status()) + return rspace_client + except Exception as e: + print(f"Error connecting to RSpace: {e}") + return None + + +def get_notebook_as_dict(): + """ + Saves notebook using ipylab and then writes notebook to Rspace document as + an attachment + """ + %pip install -q ipylab + from ipylab import JupyterFrontEnd + from ipywidgets import Output + app = JupyterFrontEnd() + app.commands.execute('docmanager:save') + # %history + # print(%dirs) + # print(app) + # print(app.sessions) + # print(app.sessions.running()) + # print(app.sessions.current_session) + # print(globals()['__session__']) + # print(os.environ) + # %pip install -q ipynbname + # import ipynbname + # nb_fname = ipynbname.name() + # nb_path = ipynbname.path() + # print(f"{nb_fname=}") + # print(f"{nb_path=}") + try: + import glob + + notebook_files = glob.glob("*.ipynb") + if notebook_files: + raw_notebook_file_id = 477 + # gallery_file_id = None + attachment_file_id = 476 + # raw_notebook_file_id = 444 + gallery_file_id = 443 + # FIXME - Uses the most recently modified notebook which might not be this notebook + # latest_notebook = max(notebook_files, key=os.path.getmtime) + latest_notebook = 'RSpaceJupyterLab.ipynb' + attachedData = "spectroscopy_data.csv" + attachments = None + updateDocAttachments = False + if raw_notebook_file_id: + print(f"A document with attachement to this notebook saved previously with RSpaceID {raw_notebook_file_id}" ) + else: + print("NO document with attachement to this notebook saved previously in RSpace") + if gallery_file_id: + print(f"This notebook saved previously to Gallery with RSpaceID {gallery_file_id}" ) + else: + print("Notebook not previously saved to RSpace Gallery") + # with 
open(latest_notebook) as f: + # d = json.load(f) + # print(d) + with open(attachedData, 'r', encoding='utf-8') as attch: + client = get_rspace_client() + if attachment_file_id is None: + print('start upload attachments') + attachment_file = client.upload_file(attch)['id'] + print(f"Attachment file ID is: {attachment_file}") + print('done upload attachments') + else: + print('start update attachments') + attachment_file = client.update_file(attch,attachment_file_id)['id'] + print(f"Attachment file ID is: {attachment_file}") + print('done update attachments') + attachment_file_id = attachment_file + with open(latest_notebook, 'r', encoding='utf-8') as f: + client = get_rspace_client() + if gallery_file_id is None: + print('start upload to gallery') + gallery_file_id = client.upload_file(f)['id'] + print(f"Gallery file ID is: {gallery_file_id}") + print('done upload to gallery') + else: + print('start update to gallery') + gallery_file_id = client.update_file(f,gallery_file_id)['id'] + print('end update to gallery') + print(f"Gallery file ID is: {gallery_file_id}") + location = os.getcwd() + if raw_notebook_file_id is None: + new_doc = client.create_document(name="DocumentFor_"+latest_notebook) + content = f""" +

+                A link to jupyter notebook inserted into gallery.
+                Notebook located at :{location} on server
+                data:
+
+                A link to data used by this notebook.
+                data:
+ """ + + updated_doc = client.append_content(new_doc['id'], content) + print(f"Document with this notebook as attachement has ID: {new_doc['id']}") + elif updateDocAttachments: + updated_doc = client.append_content(raw_notebook_file_id, newContent) + print(f"Updated document with this notebook as attachement has ID: {new_doc['id']}") + # with open(latest_notebook, 'r', encoding='utf-8') as f: + # notebook_dict = json.load(f) + # docName = f.name + # client = get_rspace_client() + # if(raw_notebook_file_id): + # print('start doc update') + # raw_data_file = client.update_document( + # raw_notebook_file_id, + # name = docName, + # tags = ["Python", "API", "example"], + # fields = [{"content": json.dumps(notebook_dict)}], + # ) + # print(f"Updated notebook: {latest_notebook}") + # else: + # raw_data_file = client.create_document( + # name = docName, + # tags = ["Python", "API", "example"], + # fields = [{"content": json.dumps(notebook_dict)}], + # ) + # print(json.dumps(notebook_dict)) + # print(f"Created notebook: {latest_notebook}") + # raw_notebook_file_id = raw_data_file['id'] + # print(raw_notebook_file_id) + else: + print("No .ipynb files found in current directory") + return None + + except Exception as e: + print(f"Error reading notebook file: {e}") + return None + +notebook_dict = get_notebook_as_dict() + diff --git a/jupyter_notebooks/provenance_jupyter_lite b/jupyter_notebooks/provenance_jupyter_lite new file mode 100644 index 0000000..ffccfe3 --- /dev/null +++ b/jupyter_notebooks/provenance_jupyter_lite @@ -0,0 +1,155 @@ +import json +%pip install -q rspace-client==2.6.1 +%pip install -q pickleshare +from rspace_client.eln import eln +import os +import hashlib +import json + +rspace_client = None + +def get_rspace_client(): + """ + Returns rspace ELN API client + """ + try: + import getpass + retrieved_password = getpass.getpass("Please enter your RSpace Api key: ") + URL='https://researchspace2.eu.ngrok.io/' + global rspace_client + if rspace_client is None: + rspace_client = eln.ELNClient(URL, retrieved_password) + print(rspace_client.get_status()) + return rspace_client + except Exception as e: + print(f"Error connecting to RSpace: {e}") + return None + + +def get_notebook_as_dict(): + """ + Saves notebook using ipylab and then writes notebook to Rspace document as + an attachment + """ + # %pip install -q ipylab + # from ipylab import JupyterFrontEnd + # from ipywidgets import Output + # app = JupyterFrontEnd() + # app.commands.execute('docmanager:save') + # %history + # print(%dirs) + # print(app) + # print(app.sessions) + # print(app.sessions.running()) + # print(app.sessions.current_session) + # print(globals()['__session__']) + # print(os.environ) + # %pip install -q ipynbname + # import ipynbname + # nb_fname = ipynbname.name() + # nb_path = ipynbname.path() + # print(f"{nb_fname=}") + # print(f"{nb_path=}") + try: + import glob + + notebook_files = glob.glob("*.ipynb") + if notebook_files: + raw_notebook_file_id = 477 + # gallery_file_id = None + attachment_file_id = 476 + # raw_notebook_file_id = 444 + gallery_file_id = 443 + # FIXME - Uses the most recently modified notebook which might not be this notebook + # latest_notebook = max(notebook_files, key=os.path.getmtime) + latest_notebook = 'RSpaceDemoCopyData.ipynb' + attachedData = "spectroscopy_data.csv" + attachments = None + updateDocAttachments = False + if raw_notebook_file_id: + print(f"A document with attachement to this notebook saved previously with RSpaceID {raw_notebook_file_id}" ) + else: + print("NO 
document with attachement to this notebook saved previously in RSpace") + if gallery_file_id: + print(f"This notebook saved previously to Gallery with RSpaceID {gallery_file_id}" ) + else: + print("Notebook not previously saved to RSpace Gallery") + # with open(latest_notebook) as f: + # d = json.load(f) + # print(d) + with open(attachedData, 'r', encoding='utf-8') as attch: + client = get_rspace_client() + if attachment_file_id is None: + print('start upload attachments') + attachment_file = client.upload_file(attch)['id'] + print(f"Attachment file ID is: {attachment_file}") + print('done upload attachments') + else: + print('start update attachments') + attachment_file = client.update_file(attch,attachment_file_id)['id'] + print(f"Attachment file ID is: {attachment_file}") + print('done update attachments') + attachment_file_id = attachment_file + with open(latest_notebook, 'r', encoding='utf-8') as f: + client = get_rspace_client() + if gallery_file_id is None: + print('start upload to gallery') + gallery_file_id = client.upload_file(f)['id'] + print(f"Gallery file ID is: {gallery_file_id}") + print('done upload to gallery') + else: + print('start update to gallery') + gallery_file_id = client.update_file(f,gallery_file_id)['id'] + print('end update to gallery') + print(f"Gallery file ID is: {gallery_file_id}") + location = os.getcwd() + if raw_notebook_file_id is None: + new_doc = client.create_document(name="DocumentFor_"+latest_notebook) + content = f""" +

+                A link to jupyter notebook inserted into gallery.
+                Notebook located at :{location} on server
+                data:
+
+                A link to data used by this notebook.
+                data:
+ """ + + updated_doc = client.append_content(new_doc['id'], content) + print(f"Document with this notebook as attachement has ID: {new_doc['id']}") + elif updateDocAttachments: + updated_doc = client.append_content(raw_notebook_file_id, newContent) + print(f"Updated document with this notebook as attachement has ID: {new_doc['id']}") + # with open(latest_notebook, 'r', encoding='utf-8') as f: + # notebook_dict = json.load(f) + # docName = f.name + # client = get_rspace_client() + # if(raw_notebook_file_id): + # print('start doc update') + # raw_data_file = client.update_document( + # raw_notebook_file_id, + # name = docName, + # tags = ["Python", "API", "example"], + # fields = [{"content": json.dumps(notebook_dict)}], + # ) + # print(f"Updated notebook: {latest_notebook}") + # else: + # raw_data_file = client.create_document( + # name = docName, + # tags = ["Python", "API", "example"], + # fields = [{"content": json.dumps(notebook_dict)}], + # ) + # print(json.dumps(notebook_dict)) + # print(f"Created notebook: {latest_notebook}") + # raw_notebook_file_id = raw_data_file['id'] + # print(raw_notebook_file_id) + else: + print("No .ipynb files found in current directory") + return None + + except Exception as e: + print(f"Error reading notebook file: {e}") + return None + +notebook_dict = get_notebook_as_dict() + From aba302d8b812d083e7b74e4473b8f031ef148660 Mon Sep 17 00:00:00 2001 From: nhanlon2 Date: Thu, 18 Sep 2025 08:37:23 +0100 Subject: [PATCH 02/34] RSDEV-782-Jupyter-Notebooks: password secrets and starts to use dill to store variables --- jupyter_notebooks/provenance_jupyter_hub | 189 +++++++++++++--------- jupyter_notebooks/provenance_jupyter_lite | 38 ++++- 2 files changed, 153 insertions(+), 74 deletions(-) diff --git a/jupyter_notebooks/provenance_jupyter_hub b/jupyter_notebooks/provenance_jupyter_hub index 1f5ebc8..e00aeff 100644 --- a/jupyter_notebooks/provenance_jupyter_hub +++ b/jupyter_notebooks/provenance_jupyter_hub @@ -7,30 +7,43 @@ from rspace_client.eln import eln import os import hashlib import json +%pip install -q dill +import dill rspace_client = None -def get_rspace_client(): +def get_password(): """ - Returns rspace ELN API client + Retrieves password from (or saves a new password to) keyring """ try: import getpass import keyring - # Define the service name (e.g., the notebook name the secret is for) + # TODO - Define the service name (e.g., the notebook name the secret is for) service_id = "RSpaceJupyterDemoApp" - # Define the username associated with the secret + # TODO - Define the username associated with the secret username = "myuser" # use your own username retrieved_password = keyring.get_password(service_id, username) if retrieved_password is None: retrieved_password = getpass.getpass("Please enter your RSpace Api key: ") keyring.set_password(service_id, username, retrieved_password) + return retrieved_password + except Exception as e: + print(f"Error getting password: {e}") + return None + +def get_rspace_client(): + """ + Returns rspace ELN API client + """ + try: + # TODO - Your RSpace instance goes here URL='https://researchspace2.eu.ngrok.io/' - # API_KEY='GWdGGT7evHd4Wjh7X1B38gYeabdVAg54' global rspace_client if rspace_client is None: + retrieved_password = get_password() rspace_client = eln.ELNClient(URL, retrieved_password) print(rspace_client.get_status()) return rspace_client @@ -38,6 +51,38 @@ def get_rspace_client(): print(f"Error connecting to RSpace: {e}") return None +def save_data(rspace_doc, attachments, gallery_file): + # Define the 
filename to save the state + state_filename = "notebook_state.pkl" + + # Save the variables to the file using dill + with open(state_filename, 'wb') as f: + dill.dump({'rspace_doc_for_notebook': rspace_doc, 'attachments_for_notebook': attachments, 'gallery_file_for_notebook': gallery_file}, f) + print(f"Variables saved to {state_filename}") + +def load_data(): + # Define the filename where the state was saved + state_filename = "notebook_state.pkl" + + # Check if the state file exists before attempting to load + if os.path.exists(state_filename): + # Load the variables from the file using dill + with open(state_filename, 'rb') as f: + try: + loaded_state = dill.load(f) + except Exception as e: + loaded_state = {} + rspace_doc = loaded_state.get('rspace_doc_for_notebook') + attachments = loaded_state.get('attachments_for_notebook') + gallery_file = loaded_state.get('gallery_file_for_notebook') + + print(f"Variables loaded from {state_filename}") + print(f"rspace_doc: {rspace_doc}") + print(f"attachments: {attachments}") + print(f"gallery_file: {gallery_file}") + + else: + print(f"State file '{state_filename}' not found. No variables loaded.") def get_notebook_as_dict(): """ @@ -49,6 +94,7 @@ def get_notebook_as_dict(): from ipywidgets import Output app = JupyterFrontEnd() app.commands.execute('docmanager:save') + # print(locals()) # %history # print(%dirs) # print(app) @@ -65,74 +111,74 @@ def get_notebook_as_dict(): # print(f"{nb_path=}") try: import glob + load_data() + raw_notebook_file_id = 477 + # gallery_file_id = None + attachment_file_id = 476 + # raw_notebook_file_id = 444 + gallery_file_id = 443 - notebook_files = glob.glob("*.ipynb") - if notebook_files: - raw_notebook_file_id = 477 - # gallery_file_id = None - attachment_file_id = 476 - # raw_notebook_file_id = 444 - gallery_file_id = 443 - # FIXME - Uses the most recently modified notebook which might not be this notebook - # latest_notebook = max(notebook_files, key=os.path.getmtime) - latest_notebook = 'RSpaceJupyterLab.ipynb' - attachedData = "spectroscopy_data.csv" - attachments = None - updateDocAttachments = False - if raw_notebook_file_id: - print(f"A document with attachement to this notebook saved previously with RSpaceID {raw_notebook_file_id}" ) + save_data(raw_notebook_file_id,attachment_file_id,gallery_file_id) + # FIXME - Uses the most recently modified notebook which might not be this notebook + # latest_notebook = max(notebook_files, key=os.path.getmtime) + latest_notebook = 'RSpaceJupyterLab.ipynb' + attachedData = "spectroscopy_data.csv" + attachments = None + updateDocAttachments = False + if raw_notebook_file_id: + print(f"A document with attachement to this notebook saved previously with RSpaceID {raw_notebook_file_id}" ) + else: + print("NO document with attachement to this notebook saved previously in RSpace") + if gallery_file_id: + print(f"This notebook saved previously to Gallery with RSpaceID {gallery_file_id}" ) + else: + print("Notebook not previously saved to RSpace Gallery") + # with open(latest_notebook) as f: + # d = json.load(f) + # print(d) + with open(attachedData, 'r', encoding='utf-8') as attch: + client = get_rspace_client() + if attachment_file_id is None: + print('start upload attachments') + attachment_file = client.upload_file(attch)['id'] + print(f"Attachment file ID is: {attachment_file}") + print('done upload attachments') else: - print("NO document with attachement to this notebook saved previously in RSpace") - if gallery_file_id: - print(f"This notebook saved previously to 
Gallery with RSpaceID {gallery_file_id}" ) + print('start update attachments') + attachment_file = client.update_file(attch,attachment_file_id)['id'] + print(f"Attachment file ID is: {attachment_file}") + print('done update attachments') + attachment_file_id = attachment_file + with open(latest_notebook, 'r', encoding='utf-8') as f: + client = get_rspace_client() + if gallery_file_id is None: + print('start upload to gallery') + gallery_file_id = client.upload_file(f)['id'] + print(f"Gallery file ID is: {gallery_file_id}") + print('done upload to gallery') else: - print("Notebook not previously saved to RSpace Gallery") - # with open(latest_notebook) as f: - # d = json.load(f) - # print(d) - with open(attachedData, 'r', encoding='utf-8') as attch: - client = get_rspace_client() - if attachment_file_id is None: - print('start upload attachments') - attachment_file = client.upload_file(attch)['id'] - print(f"Attachment file ID is: {attachment_file}") - print('done upload attachments') - else: - print('start update attachments') - attachment_file = client.update_file(attch,attachment_file_id)['id'] - print(f"Attachment file ID is: {attachment_file}") - print('done update attachments') - attachment_file_id = attachment_file - with open(latest_notebook, 'r', encoding='utf-8') as f: - client = get_rspace_client() - if gallery_file_id is None: - print('start upload to gallery') - gallery_file_id = client.upload_file(f)['id'] - print(f"Gallery file ID is: {gallery_file_id}") - print('done upload to gallery') - else: - print('start update to gallery') - gallery_file_id = client.update_file(f,gallery_file_id)['id'] - print('end update to gallery') - print(f"Gallery file ID is: {gallery_file_id}") - location = os.getcwd() - if raw_notebook_file_id is None: - new_doc = client.create_document(name="DocumentFor_"+latest_notebook) - content = f""" -

-                A link to jupyter notebook inserted into gallery.
-                Notebook located at :{location} on server
-                data:
-
-                A link to data used by this notebook.
-                data:
- """ - - updated_doc = client.append_content(new_doc['id'], content) - print(f"Document with this notebook as attachement has ID: {new_doc['id']}") - elif updateDocAttachments: - updated_doc = client.append_content(raw_notebook_file_id, newContent) - print(f"Updated document with this notebook as attachement has ID: {new_doc['id']}") + print('start update to gallery') + gallery_file_id = client.update_file(f,gallery_file_id)['id'] + print('end update to gallery') + print(f"Gallery file ID is: {gallery_file_id}") + location = os.getcwd() + if raw_notebook_file_id is None: + new_doc = client.create_document(name="DocumentFor_"+latest_notebook) + content = f""" +

+            A link to jupyter notebook inserted into gallery.
+            Notebook located at :{location} on server
+            data:
+
+            A link to data used by this notebook.
+            data:
+ """ + + updated_doc = client.append_content(new_doc['id'], content) + print(f"Document with this notebook as attachement has ID: {new_doc['id']}") + elif updateDocAttachments: + updated_doc = client.append_content(raw_notebook_file_id, newContent) + print(f"Updated document with this notebook as attachement has ID: {new_doc['id']}") # with open(latest_notebook, 'r', encoding='utf-8') as f: # notebook_dict = json.load(f) # docName = f.name @@ -156,9 +202,6 @@ def get_notebook_as_dict(): # print(f"Created notebook: {latest_notebook}") # raw_notebook_file_id = raw_data_file['id'] # print(raw_notebook_file_id) - else: - print("No .ipynb files found in current directory") - return None except Exception as e: print(f"Error reading notebook file: {e}") diff --git a/jupyter_notebooks/provenance_jupyter_lite b/jupyter_notebooks/provenance_jupyter_lite index ffccfe3..dda01d6 100644 --- a/jupyter_notebooks/provenance_jupyter_lite +++ b/jupyter_notebooks/provenance_jupyter_lite @@ -5,6 +5,8 @@ from rspace_client.eln import eln import os import hashlib import json +%pip install -q dill +import dill rspace_client = None @@ -25,6 +27,38 @@ def get_rspace_client(): print(f"Error connecting to RSpace: {e}") return None +def save_data(rspace_doc, attachments, gallery_file): + # Define the filename to save the state + state_filename = "notebook_state.pkl" + + # Save the variables to the file using dill + with open(state_filename, 'wb') as f: + dill.dump({'rspace_doc_for_notebook': rspace_doc, 'attachments_for_notebook': attachments, 'gallery_file_for_notebook': gallery_file}, f) + print(f"Variables saved to {state_filename}") + +def load_data(): + # Define the filename where the state was saved + state_filename = "notebook_state.pkl" + + # Check if the state file exists before attempting to load + if os.path.exists(state_filename): + # Load the variables from the file using dill + with open(state_filename, 'rb') as f: + try: + loaded_state = dill.load(f) + except Exception as e: + loaded_state = {} + rspace_doc = loaded_state.get('rspace_doc_for_notebook') + attachments = loaded_state.get('attachments_for_notebook') + gallery_file = loaded_state.get('gallery_file_for_notebook') + + print(f"Variables loaded from {state_filename}") + print(f"rspace_doc: {rspace_doc}") + print(f"attachments: {attachments}") + print(f"gallery_file: {gallery_file}") + + else: + print(f"State file '{state_filename}' not found. 
No variables loaded.") def get_notebook_as_dict(): """ @@ -52,7 +86,7 @@ def get_notebook_as_dict(): # print(f"{nb_path=}") try: import glob - + load_data() notebook_files = glob.glob("*.ipynb") if notebook_files: raw_notebook_file_id = 477 @@ -60,6 +94,8 @@ def get_notebook_as_dict(): attachment_file_id = 476 # raw_notebook_file_id = 444 gallery_file_id = 443 + + save_data(raw_notebook_file_id,attachment_file_id,gallery_file_id) # FIXME - Uses the most recently modified notebook which might not be this notebook # latest_notebook = max(notebook_files, key=os.path.getmtime) latest_notebook = 'RSpaceDemoCopyData.ipynb' From e4fd183ba6fdb71860bac6eaea9ff6b916d30d08 Mon Sep 17 00:00:00 2001 From: nhanlon2 Date: Thu, 18 Sep 2025 12:13:14 +0100 Subject: [PATCH 03/34] RSDEV-782-Jupyter-Notebooks: notebook filename for JupyterHub but no solution for JupyterLite --- jupyter_notebooks/provenance_jupyter_hub | 97 +++++++++++------------- 1 file changed, 45 insertions(+), 52 deletions(-) diff --git a/jupyter_notebooks/provenance_jupyter_hub b/jupyter_notebooks/provenance_jupyter_hub index e00aeff..fc70035 100644 --- a/jupyter_notebooks/provenance_jupyter_hub +++ b/jupyter_notebooks/provenance_jupyter_hub @@ -9,8 +9,27 @@ import hashlib import json %pip install -q dill import dill +%pip install -q ipynbname +import ipynbname +%pip install -q ipylab +from ipylab import JupyterFrontEnd +import traceback rspace_client = None +RSPACE_DOC_FOR_NOTEBOOK = 'rspace_doc_for_notebook' +ATTACHMENTS_FOR_NOTEBOOK = 'attachments_for_notebook' +GALLERY_FILE_FOR_NOTEBOOK = 'gallery_file_for_notebook' + + +def get_notebook_name(): + nb_fname = ipynbname.name() + nb_path = str(ipynbname.path()) + ext_pos=(''+nb_path).rfind('.') + ext=nb_path[ext_pos:] + print(f"{ext=}") + print(f"{nb_fname=}") + print(f"{nb_path=}") + return {'name':nb_fname+ext, 'path':nb_path} def get_password(): """ @@ -48,6 +67,7 @@ def get_rspace_client(): print(rspace_client.get_status()) return rspace_client except Exception as e: + print(traceback.format_exc()) print(f"Error connecting to RSpace: {e}") return None @@ -57,7 +77,7 @@ def save_data(rspace_doc, attachments, gallery_file): # Save the variables to the file using dill with open(state_filename, 'wb') as f: - dill.dump({'rspace_doc_for_notebook': rspace_doc, 'attachments_for_notebook': attachments, 'gallery_file_for_notebook': gallery_file}, f) + dill.dump({RSPACE_DOC_FOR_NOTEBOOK: rspace_doc, ATTACHMENTS_FOR_NOTEBOOK: attachments, GALLERY_FILE_FOR_NOTEBOOK: gallery_file}, f) print(f"Variables saved to {state_filename}") def load_data(): @@ -72,56 +92,28 @@ def load_data(): loaded_state = dill.load(f) except Exception as e: loaded_state = {} - rspace_doc = loaded_state.get('rspace_doc_for_notebook') - attachments = loaded_state.get('attachments_for_notebook') - gallery_file = loaded_state.get('gallery_file_for_notebook') - - print(f"Variables loaded from {state_filename}") - print(f"rspace_doc: {rspace_doc}") - print(f"attachments: {attachments}") - print(f"gallery_file: {gallery_file}") - else: print(f"State file '{state_filename}' not found. 
No variables loaded.") + return loaded_state + +def save_notebook(): + app = JupyterFrontEnd() + app.commands.execute('docmanager:save') def get_notebook_as_dict(): """ Saves notebook using ipylab and then writes notebook to Rspace document as an attachment """ - %pip install -q ipylab - from ipylab import JupyterFrontEnd - from ipywidgets import Output - app = JupyterFrontEnd() - app.commands.execute('docmanager:save') - # print(locals()) - # %history - # print(%dirs) - # print(app) - # print(app.sessions) - # print(app.sessions.running()) - # print(app.sessions.current_session) - # print(globals()['__session__']) - # print(os.environ) - # %pip install -q ipynbname - # import ipynbname - # nb_fname = ipynbname.name() - # nb_path = ipynbname.path() - # print(f"{nb_fname=}") - # print(f"{nb_path=}") + save_notebook() try: import glob - load_data() - raw_notebook_file_id = 477 - # gallery_file_id = None - attachment_file_id = 476 - # raw_notebook_file_id = 444 - gallery_file_id = 443 - - save_data(raw_notebook_file_id,attachment_file_id,gallery_file_id) - # FIXME - Uses the most recently modified notebook which might not be this notebook - # latest_notebook = max(notebook_files, key=os.path.getmtime) - latest_notebook = 'RSpaceJupyterLab.ipynb' + loaded_state = load_data() + raw_notebook_file_id = loaded_state.get(RSPACE_DOC_FOR_NOTEBOOK) + attachment_file_id = loaded_state.get(ATTACHMENTS_FOR_NOTEBOOK) + gallery_file_id = loaded_state.get(GALLERY_FILE_FOR_NOTEBOOK) + + current_notebook = get_notebook_name()['name'] attachedData = "spectroscopy_data.csv" attachments = None updateDocAttachments = False @@ -133,9 +125,6 @@ def get_notebook_as_dict(): print(f"This notebook saved previously to Gallery with RSpaceID {gallery_file_id}" ) else: print("Notebook not previously saved to RSpace Gallery") - # with open(latest_notebook) as f: - # d = json.load(f) - # print(d) with open(attachedData, 'r', encoding='utf-8') as attch: client = get_rspace_client() if attachment_file_id is None: @@ -149,21 +138,23 @@ def get_notebook_as_dict(): print(f"Attachment file ID is: {attachment_file}") print('done update attachments') attachment_file_id = attachment_file - with open(latest_notebook, 'r', encoding='utf-8') as f: + with open(current_notebook, 'r', encoding='utf-8') as f: client = get_rspace_client() if gallery_file_id is None: print('start upload to gallery') - gallery_file_id = client.upload_file(f)['id'] - print(f"Gallery file ID is: {gallery_file_id}") + gallery_file = client.upload_file(f)['id'] + print(f"Gallery file ID is: {gallery_file}") print('done upload to gallery') else: print('start update to gallery') - gallery_file_id = client.update_file(f,gallery_file_id)['id'] + gallery_file = client.update_file(f,gallery_file_id)['id'] print('end update to gallery') print(f"Gallery file ID is: {gallery_file_id}") + gallery_file_id = gallery_file location = os.getcwd() if raw_notebook_file_id is None: - new_doc = client.create_document(name="DocumentFor_"+latest_notebook) + new_doc = client.create_document(name="DocumentFor_"+current_notebook) + raw_notebook_file_id = new_doc['id'] content = f"""

A link to jupyter notebook inserted into gallery. Notebook located at :{location} on server @@ -179,7 +170,8 @@ def get_notebook_as_dict(): elif updateDocAttachments: updated_doc = client.append_content(raw_notebook_file_id, newContent) print(f"Updated document with this notebook as attachement has ID: {new_doc['id']}") - # with open(latest_notebook, 'r', encoding='utf-8') as f: + + # with open(current_notebook, 'r', encoding='utf-8') as f: # notebook_dict = json.load(f) # docName = f.name # client = get_rspace_client() @@ -191,7 +183,7 @@ def get_notebook_as_dict(): # tags = ["Python", "API", "example"], # fields = [{"content": json.dumps(notebook_dict)}], # ) - # print(f"Updated notebook: {latest_notebook}") + # print(f"Updated notebook: {current_notebook}") # else: # raw_data_file = client.create_document( # name = docName, @@ -199,11 +191,12 @@ def get_notebook_as_dict(): # fields = [{"content": json.dumps(notebook_dict)}], # ) # print(json.dumps(notebook_dict)) - # print(f"Created notebook: {latest_notebook}") + # print(f"Created notebook: {current_notebook}") # raw_notebook_file_id = raw_data_file['id'] # print(raw_notebook_file_id) except Exception as e: + print(traceback.format_exc()) print(f"Error reading notebook file: {e}") return None From b03df8bc11db21fb3c9b45b0ecac54e9a352e89a Mon Sep 17 00:00:00 2001 From: nhanlon2 Date: Tue, 23 Sep 2025 14:33:33 +0100 Subject: [PATCH 04/34] RSDEV-782-Jupyter-Notebooks: removes attachments on update of rspace doc if they match saved ids for jupyter_notebook or its data_files --- jupyter_notebooks/provenance_jupyter_hub | 89 +++++++++++++++--------- 1 file changed, 58 insertions(+), 31 deletions(-) diff --git a/jupyter_notebooks/provenance_jupyter_hub b/jupyter_notebooks/provenance_jupyter_hub index fc70035..e746170 100644 --- a/jupyter_notebooks/provenance_jupyter_hub +++ b/jupyter_notebooks/provenance_jupyter_hub @@ -1,7 +1,10 @@ import json %pip install -q rspace-client==2.6.1 %pip install -q pickleshare -%conda install -q notebook +try: + from notebook import app +except: + %conda install -q notebook %pip install -q keyring from rspace_client.eln import eln import os @@ -14,6 +17,8 @@ import ipynbname %pip install -q ipylab from ipylab import JupyterFrontEnd import traceback +%pip install -q lxml +from bs4 import BeautifulSoup rspace_client = None RSPACE_DOC_FOR_NOTEBOOK = 'rspace_doc_for_notebook' @@ -26,10 +31,10 @@ def get_notebook_name(): nb_path = str(ipynbname.path()) ext_pos=(''+nb_path).rfind('.') ext=nb_path[ext_pos:] - print(f"{ext=}") - print(f"{nb_fname=}") - print(f"{nb_path=}") - return {'name':nb_fname+ext, 'path':nb_path} + # print(f"{ext=}") + # print(f"{nb_fname=}") + # print(f"{nb_path=}") + return {'name':nb_fname+ext, 'part_name':nb_fname,'path':nb_path} def get_password(): """ @@ -71,9 +76,9 @@ def get_rspace_client(): print(f"Error connecting to RSpace: {e}") return None -def save_data(rspace_doc, attachments, gallery_file): +def save_rspace_data_ids(rspace_doc, attachments, gallery_file): # Define the filename to save the state - state_filename = "notebook_state.pkl" + state_filename = get_notebook_name()['part_name']+"_state.pkl" # Save the variables to the file using dill with open(state_filename, 'wb') as f: @@ -82,7 +87,7 @@ def save_data(rspace_doc, attachments, gallery_file): def load_data(): # Define the filename where the state was saved - state_filename = "notebook_state.pkl" + state_filename = get_notebook_name()['part_name']+"_state.pkl" # Check if the state file exists before attempting to load if 
os.path.exists(state_filename): @@ -100,14 +105,38 @@ def save_notebook(): app = JupyterFrontEnd() app.commands.execute('docmanager:save') -def get_notebook_as_dict(): +def make_content(gallery_file_id,attachment_file_id, location): + content = f""" + + + """ + return content + +def remove_jupyter_attachment_divs(content, gallery_file_id, attachment_file_id): + soup = BeautifulSoup(content, 'html.parser') + attachment_divs = soup.find_all("div", {"class": "attachmentDiv"}) + for attachment_div in attachment_divs: + href_tag = attachment_div.find('a') + attachment_link = '/Streamfile/' + str(attachment_file_id) + gallery_link = '/Streamfile/' + str(gallery_file_id) + # print(f"attachment_link content: {attachment_link}") + # print(f"href_tag content: {href_tag['href']}") + # print(f"541 in this div:{'541' in href_tag['href']}") + # print(attachment_link in href_tag['href'] or gallery_link in href_tag['href']) + if attachment_link in href_tag['href'] or gallery_link in href_tag['href']: + attachment_div.decompose() + return soup.prettify() + +def get_notebook(): + raw_notebook_file_id = None + attachment_file_id = None + gallery_file_id = None """ Saves notebook using ipylab and then writes notebook to Rspace document as an attachment """ save_notebook() try: - import glob loaded_state = load_data() raw_notebook_file_id = loaded_state.get(RSPACE_DOC_FOR_NOTEBOOK) attachment_file_id = loaded_state.get(ATTACHMENTS_FOR_NOTEBOOK) @@ -117,11 +146,11 @@ def get_notebook_as_dict(): attachedData = "spectroscopy_data.csv" attachments = None updateDocAttachments = False - if raw_notebook_file_id: + if raw_notebook_file_id is not None: print(f"A document with attachement to this notebook saved previously with RSpaceID {raw_notebook_file_id}" ) else: print("NO document with attachement to this notebook saved previously in RSpace") - if gallery_file_id: + if gallery_file_id is not None: print(f"This notebook saved previously to Gallery with RSpaceID {gallery_file_id}" ) else: print("Notebook not previously saved to RSpace Gallery") @@ -149,28 +178,26 @@ def get_notebook_as_dict(): print('start update to gallery') gallery_file = client.update_file(f,gallery_file_id)['id'] print('end update to gallery') - print(f"Gallery file ID is: {gallery_file_id}") + print(f"Gallery file ID is: {gallery_file}") gallery_file_id = gallery_file location = os.getcwd() if raw_notebook_file_id is None: - new_doc = client.create_document(name="DocumentFor_"+current_notebook) + new_doc = client.create_document(name="DocumentFor_"+current_notebook,tags = ["Python", "API", "Jupyter"] ) raw_notebook_file_id = new_doc['id'] - content = f""" -

-            A link to jupyter notebook inserted into gallery.
-            Notebook located at :{location} on server
-            data:
-
-            A link to data used by this notebook.
-            data:
- """ - - updated_doc = client.append_content(new_doc['id'], content) - print(f"Document with this notebook as attachement has ID: {new_doc['id']}") - elif updateDocAttachments: - updated_doc = client.append_content(raw_notebook_file_id, newContent) - print(f"Updated document with this notebook as attachement has ID: {new_doc['id']}") - + content = make_content(gallery_file_id,attachment_file_id, location) + client.append_content(new_doc['id'], content) + print(f"New document with this notebook as attachement has ID: {new_doc['id']}") + else: + previous_content = client.get_document(raw_notebook_file_id)['fields'][0]['content'] + print(f"previous content BEFORE metadata removed : {previous_content}") + previous_content = remove_jupyter_attachment_divs(previous_content, gallery_file_id, attachment_file_id) + print(f"previous content after metadata removed : {previous_content}") + new_content = previous_content + make_content(gallery_file_id,attachment_file_id, location) + updated_doc = client.update_document(raw_notebook_file_id,tags = ['Python', 'API', 'Jupyter'], fields = [{"content": new_content}]) + # client.append_content(updated_doc['id'], content) + raw_notebook_file_id = updated_doc['id'] + print(f"Updated document with this notebook as attachement has ID: {updated_doc['id']}") + save_rspace_data_ids(raw_notebook_file_id, attachment_file_id, gallery_file_id) # with open(current_notebook, 'r', encoding='utf-8') as f: # notebook_dict = json.load(f) # docName = f.name @@ -200,5 +227,5 @@ def get_notebook_as_dict(): print(f"Error reading notebook file: {e}") return None -notebook_dict = get_notebook_as_dict() +notebook_dict = get_notebook() From f7e7c9cd7501965e1703f1247737d75d499eea5b Mon Sep 17 00:00:00 2001 From: nhanlon2 Date: Wed, 24 Sep 2025 18:35:49 +0100 Subject: [PATCH 05/34] RSDEV-782-Jupyter-Notebooks: async calls to save and reload notebook --- jupyter_notebooks/provenance_jupyter_hub | 58 ++++++++++++++++++------ 1 file changed, 43 insertions(+), 15 deletions(-) diff --git a/jupyter_notebooks/provenance_jupyter_hub b/jupyter_notebooks/provenance_jupyter_hub index e746170..69a8d71 100644 --- a/jupyter_notebooks/provenance_jupyter_hub +++ b/jupyter_notebooks/provenance_jupyter_hub @@ -19,12 +19,16 @@ from ipylab import JupyterFrontEnd import traceback %pip install -q lxml from bs4 import BeautifulSoup +import nbformat +import asyncio -rspace_client = None RSPACE_DOC_FOR_NOTEBOOK = 'rspace_doc_for_notebook' ATTACHMENTS_FOR_NOTEBOOK = 'attachments_for_notebook' GALLERY_FILE_FOR_NOTEBOOK = 'gallery_file_for_notebook' +rspace_client = None +app = JupyterFrontEnd() + def get_notebook_name(): nb_fname = ipynbname.name() @@ -101,11 +105,35 @@ def load_data(): print(f"State file '{state_filename}' not found. 
No variables loaded.") return loaded_state -def save_notebook(): - app = JupyterFrontEnd() +async def save_notebook(): app.commands.execute('docmanager:save') + await asyncio.sleep(1) + +async def reload_notebook(): + app.commands.execute('docmanager:reload') + await asyncio.sleep(1) + +async def add_to_notebook_metadata(fname, text): + await save_notebook() + with open(fname, 'r') as original: + nb = nbformat.read(original, nbformat.NO_CONVERT) + meta_data_cell = nbformat.v4.new_raw_cell(text) + meta_data_cell['metadata'] = {"rspace_metadata": {"documentFor": "docid","notebook_file":"docid", "attachments": []} } + nb["cells"].extend([meta_data_cell]) + # for i, cell in enumerate(nb['cells']): + # print(f"Cell {i+1}:") + # print(cell['metadata']) + # data = json.load(original) + # print(data['metadata']) + # data['metadata']['rspace'] = text; + # print(data['metadata']) + # with open(fname, 'w') as modified: + # modified.write(data); + with open(fname, 'w', encoding='utf-8') as modified: + nbformat.write(nb, modified) + await reload_notebook() -def make_content(gallery_file_id,attachment_file_id, location): +def make_content(gallery_file_id,attachment_file_id): content = f""" @@ -121,13 +149,12 @@ def remove_jupyter_attachment_divs(content, gallery_file_id, attachment_file_id) gallery_link = '/Streamfile/' + str(gallery_file_id) # print(f"attachment_link content: {attachment_link}") # print(f"href_tag content: {href_tag['href']}") - # print(f"541 in this div:{'541' in href_tag['href']}") # print(attachment_link in href_tag['href'] or gallery_link in href_tag['href']) - if attachment_link in href_tag['href'] or gallery_link in href_tag['href']: + if attachment_link == href_tag['href'] or gallery_link == href_tag['href']: attachment_div.decompose() return soup.prettify() -def get_notebook(): +async def get_notebook(): raw_notebook_file_id = None attachment_file_id = None gallery_file_id = None @@ -135,7 +162,7 @@ def get_notebook(): Saves notebook using ipylab and then writes notebook to Rspace document as an attachment """ - save_notebook() + await save_notebook() try: loaded_state = load_data() raw_notebook_file_id = loaded_state.get(RSPACE_DOC_FOR_NOTEBOOK) @@ -143,7 +170,9 @@ def get_notebook(): gallery_file_id = loaded_state.get(GALLERY_FILE_FOR_NOTEBOOK) current_notebook = get_notebook_name()['name'] - attachedData = "spectroscopy_data.csv" + location = os.getcwd() + await add_to_notebook_metadata(current_notebook, location) + attachedData = "spectroscopy_data.csv" # FIXME attachments = None updateDocAttachments = False if raw_notebook_file_id is not None: @@ -180,19 +209,18 @@ def get_notebook(): print('end update to gallery') print(f"Gallery file ID is: {gallery_file}") gallery_file_id = gallery_file - location = os.getcwd() if raw_notebook_file_id is None: new_doc = client.create_document(name="DocumentFor_"+current_notebook,tags = ["Python", "API", "Jupyter"] ) raw_notebook_file_id = new_doc['id'] - content = make_content(gallery_file_id,attachment_file_id, location) + content = make_content(gallery_file_id,attachment_file_id) client.append_content(new_doc['id'], content) print(f"New document with this notebook as attachement has ID: {new_doc['id']}") else: previous_content = client.get_document(raw_notebook_file_id)['fields'][0]['content'] - print(f"previous content BEFORE metadata removed : {previous_content}") + # print(f"previous content BEFORE metadata removed : {previous_content}") previous_content = remove_jupyter_attachment_divs(previous_content, gallery_file_id, 
attachment_file_id) - print(f"previous content after metadata removed : {previous_content}") - new_content = previous_content + make_content(gallery_file_id,attachment_file_id, location) + # print(f"previous content after metadata removed : {previous_content}") + new_content = previous_content + make_content(gallery_file_id,attachment_file_id) updated_doc = client.update_document(raw_notebook_file_id,tags = ['Python', 'API', 'Jupyter'], fields = [{"content": new_content}]) # client.append_content(updated_doc['id'], content) raw_notebook_file_id = updated_doc['id'] @@ -227,5 +255,5 @@ def get_notebook(): print(f"Error reading notebook file: {e}") return None -notebook_dict = get_notebook() +await get_notebook() From 0f47a8f6e2630838716c168a7bd0177fc272e0a9 Mon Sep 17 00:00:00 2001 From: nhanlon2 Date: Thu, 25 Sep 2025 11:06:43 +0100 Subject: [PATCH 06/34] RSDEV-782-Jupyter-Notebooks: meta_data --- jupyter_notebooks/provenance_jupyter_hub | 48 ++++++++++++++---------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/jupyter_notebooks/provenance_jupyter_hub b/jupyter_notebooks/provenance_jupyter_hub index 69a8d71..e51235f 100644 --- a/jupyter_notebooks/provenance_jupyter_hub +++ b/jupyter_notebooks/provenance_jupyter_hub @@ -26,6 +26,9 @@ RSPACE_DOC_FOR_NOTEBOOK = 'rspace_doc_for_notebook' ATTACHMENTS_FOR_NOTEBOOK = 'attachments_for_notebook' GALLERY_FILE_FOR_NOTEBOOK = 'gallery_file_for_notebook' +# TODO - Your RSpace instance goes here +RSPACE_URL='https://researchspace2.eu.ngrok.io/' + rspace_client = None app = JupyterFrontEnd() @@ -67,12 +70,10 @@ def get_rspace_client(): Returns rspace ELN API client """ try: - # TODO - Your RSpace instance goes here - URL='https://researchspace2.eu.ngrok.io/' global rspace_client if rspace_client is None: retrieved_password = get_password() - rspace_client = eln.ELNClient(URL, retrieved_password) + rspace_client = eln.ELNClient(RSPACE_URL, retrieved_password) print(rspace_client.get_status()) return rspace_client except Exception as e: @@ -107,28 +108,40 @@ def load_data(): async def save_notebook(): app.commands.execute('docmanager:save') + # 'docmanager:save' does not hook into any callback when the document is actually saved await asyncio.sleep(1) async def reload_notebook(): app.commands.execute('docmanager:reload') + # 'docmanager:reload' does not hook into any callback when the document is actually reloaded await asyncio.sleep(1) +def make_metadata_cell(text): + loaded_state = load_data() + raw_notebook_file_id = loaded_state.get(RSPACE_DOC_FOR_NOTEBOOK) + attachment_file_id = loaded_state.get(ATTACHMENTS_FOR_NOTEBOOK) + gallery_file_id = loaded_state.get(GALLERY_FILE_FOR_NOTEBOOK) + + meta_data_cell = nbformat.v4.new_raw_cell(text) + galery_doc_link = f'"This Notebook in RSpace' + meta_data_cell['source'] = text + ' ' + galery_doc_link + meta_data_cell['metadata'] = {"rspace_metadata": {"documentFor": "docid","notebook_file":"docid", "attachments": ["attch1","attach2"]} } + return meta_data_cell + async def add_to_notebook_metadata(fname, text): await save_notebook() with open(fname, 'r') as original: nb = nbformat.read(original, nbformat.NO_CONVERT) - meta_data_cell = nbformat.v4.new_raw_cell(text) - meta_data_cell['metadata'] = {"rspace_metadata": {"documentFor": "docid","notebook_file":"docid", "attachments": []} } - nb["cells"].extend([meta_data_cell]) - # for i, cell in enumerate(nb['cells']): - # print(f"Cell {i+1}:") - # print(cell['metadata']) - # data = json.load(original) - # print(data['metadata']) - # 
data['metadata']['rspace'] = text; - # print(data['metadata']) - # with open(fname, 'w') as modified: - # modified.write(data); + meta_data_cell = make_metadata_cell(text) + replaced = False + for i, cell in enumerate(nb['cells']): + # print(f"Cell {i+1}:") + # print(cell['metadata']) + if 'rspace_metadata' in cell['metadata']: + nb["cells"][i] = meta_data_cell + replaced = True + if replaced is False: + nb["cells"].extend([meta_data_cell]) with open(fname, 'w', encoding='utf-8') as modified: nbformat.write(nb, modified) await reload_notebook() @@ -147,9 +160,6 @@ def remove_jupyter_attachment_divs(content, gallery_file_id, attachment_file_id) href_tag = attachment_div.find('a') attachment_link = '/Streamfile/' + str(attachment_file_id) gallery_link = '/Streamfile/' + str(gallery_file_id) - # print(f"attachment_link content: {attachment_link}") - # print(f"href_tag content: {href_tag['href']}") - # print(attachment_link in href_tag['href'] or gallery_link in href_tag['href']) if attachment_link == href_tag['href'] or gallery_link == href_tag['href']: attachment_div.decompose() return soup.prettify() @@ -171,7 +181,6 @@ async def get_notebook(): current_notebook = get_notebook_name()['name'] location = os.getcwd() - await add_to_notebook_metadata(current_notebook, location) attachedData = "spectroscopy_data.csv" # FIXME attachments = None updateDocAttachments = False @@ -226,6 +235,7 @@ async def get_notebook(): raw_notebook_file_id = updated_doc['id'] print(f"Updated document with this notebook as attachement has ID: {updated_doc['id']}") save_rspace_data_ids(raw_notebook_file_id, attachment_file_id, gallery_file_id) + await add_to_notebook_metadata(current_notebook, location) # with open(current_notebook, 'r', encoding='utf-8') as f: # notebook_dict = json.load(f) # docName = f.name From 33fc5147206f757a6e581551bbc974f491aaa24b Mon Sep 17 00:00:00 2001 From: nhanlon2 Date: Thu, 25 Sep 2025 11:08:28 +0100 Subject: [PATCH 07/34] RSDEV-782-Jupyter-Notebooks: markdown link to rspace gallery doc in meta_data --- jupyter_notebooks/provenance_jupyter_hub | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/jupyter_notebooks/provenance_jupyter_hub b/jupyter_notebooks/provenance_jupyter_hub index e51235f..504b8cb 100644 --- a/jupyter_notebooks/provenance_jupyter_hub +++ b/jupyter_notebooks/provenance_jupyter_hub @@ -122,13 +122,14 @@ def make_metadata_cell(text): attachment_file_id = loaded_state.get(ATTACHMENTS_FOR_NOTEBOOK) gallery_file_id = loaded_state.get(GALLERY_FILE_FOR_NOTEBOOK) - meta_data_cell = nbformat.v4.new_raw_cell(text) + meta_data_cell = nbformat.v4.new_markdown_cell() galery_doc_link = f'"This Notebook in RSpace' - meta_data_cell['source'] = text + ' ' + galery_doc_link + galery_doc_markdown = f'[This Notebook in RSpace]({RSPACE_URL}gallery/item/{gallery_file_id})' + meta_data_cell['source'] = galery_doc_markdown meta_data_cell['metadata'] = {"rspace_metadata": {"documentFor": "docid","notebook_file":"docid", "attachments": ["attch1","attach2"]} } return meta_data_cell -async def add_to_notebook_metadata(fname, text): +async def add_rspace_details_to_notebook_metadata(fname, text): await save_notebook() with open(fname, 'r') as original: nb = nbformat.read(original, nbformat.NO_CONVERT) @@ -181,7 +182,8 @@ async def get_notebook(): current_notebook = get_notebook_name()['name'] location = os.getcwd() - attachedData = "spectroscopy_data.csv" # FIXME + # attachedData = "spectroscopy_data.csv,data/spectroscopy_data2.csv,data/spectroscopy_data3.csv" + 
attachedData = "spectroscopy_data.csv" attachments = None updateDocAttachments = False if raw_notebook_file_id is not None: @@ -235,7 +237,7 @@ async def get_notebook(): raw_notebook_file_id = updated_doc['id'] print(f"Updated document with this notebook as attachement has ID: {updated_doc['id']}") save_rspace_data_ids(raw_notebook_file_id, attachment_file_id, gallery_file_id) - await add_to_notebook_metadata(current_notebook, location) + await add_rspace_details_to_notebook_metadata(current_notebook, location) # with open(current_notebook, 'r', encoding='utf-8') as f: # notebook_dict = json.load(f) # docName = f.name From 76841000863928522526efd8c4ab6533beec0bb3 Mon Sep 17 00:00:00 2001 From: nhanlon2 Date: Thu, 25 Sep 2025 16:48:16 +0100 Subject: [PATCH 08/34] RSDEV-782-Jupyter-Notebooks: multiple data files saved to RSpace --- jupyter_notebooks/provenance_jupyter_hub | 111 ++++++++++++++++------- 1 file changed, 76 insertions(+), 35 deletions(-) diff --git a/jupyter_notebooks/provenance_jupyter_hub b/jupyter_notebooks/provenance_jupyter_hub index 504b8cb..93b77e1 100644 --- a/jupyter_notebooks/provenance_jupyter_hub +++ b/jupyter_notebooks/provenance_jupyter_hub @@ -23,11 +23,25 @@ import nbformat import asyncio RSPACE_DOC_FOR_NOTEBOOK = 'rspace_doc_for_notebook' -ATTACHMENTS_FOR_NOTEBOOK = 'attachments_for_notebook' +ATTACHMENTS_FOR_NOTEBOOK = 'ids_of_attached_data_for_notebook' GALLERY_FILE_FOR_NOTEBOOK = 'gallery_file_for_notebook' -# TODO - Your RSpace instance goes here -RSPACE_URL='https://researchspace2.eu.ngrok.io/' +# Your RSpace instance goes here +RSPACE_URL="https://researchspace2.eu.ngrok.io/" + +""" + All data that will be saved to RSpace along with this notebook: select the data in the file browser and choose 'copy path' + then paste here using a ',' comma to separate files if there is more than one. 
+ + Example: + attached_data_files = "spectroscopy_data.csv, data/spectroscopy_data2.csv, data/spectroscopy_data3.csv" + + If you wish to have no attached data, set this value to be "" (a pair of double quotes) + + Example: + attached_data_files = "" +""" +attached_data_files = "spectroscopy_data.csv,data/spectroscopy_data2.csv,data/spectroscopy_data3.csv" rspace_client = None app = JupyterFrontEnd() @@ -108,18 +122,27 @@ def load_data(): async def save_notebook(): app.commands.execute('docmanager:save') - # 'docmanager:save' does not hook into any callback when the document is actually saved + # 'docmanager:save' does not appear to hook into any callback invoked when the document is actually saved await asyncio.sleep(1) async def reload_notebook(): app.commands.execute('docmanager:reload') - # 'docmanager:reload' does not hook into any callback when the document is actually reloaded + # 'docmanager:reload' does not appear to hook into any callback invoked when the document is actually reloaded await asyncio.sleep(1) -def make_metadata_cell(text): +def get_saved_data_ids(): loaded_state = load_data() raw_notebook_file_id = loaded_state.get(RSPACE_DOC_FOR_NOTEBOOK) - attachment_file_id = loaded_state.get(ATTACHMENTS_FOR_NOTEBOOK) + attachment_file_ids = loaded_state.get(ATTACHMENTS_FOR_NOTEBOOK) + gallery_file_id = loaded_state.get(GALLERY_FILE_FOR_NOTEBOOK) + if attachment_file_ids is None: + attachment_file_ids = {} + return {RSPACE_DOC_FOR_NOTEBOOK:raw_notebook_file_id, ATTACHMENTS_FOR_NOTEBOOK:attachment_file_ids,GALLERY_FILE_FOR_NOTEBOOK:gallery_file_id } + +def make_metadata_cell(text): + loaded_state = get_saved_data_ids() + raw_notebook_file_id = loaded_state.get(RSPACE_DOC_FOR_NOTEBOOK) + attachment_file_ids = loaded_state.get(ATTACHMENTS_FOR_NOTEBOOK) gallery_file_id = loaded_state.get(GALLERY_FILE_FOR_NOTEBOOK) meta_data_cell = nbformat.v4.new_markdown_cell() @@ -147,27 +170,59 @@ async def add_rspace_details_to_notebook_metadata(fname, text): nbformat.write(nb, modified) await reload_notebook() -def make_content(gallery_file_id,attachment_file_id): +def make_content(gallery_file_id,attachment_file_ids): content = f""" - """ + for attachment_file in attached_data_files.split(","): + content += f""" + + """ + print(f"content is {content}") return content -def remove_jupyter_attachment_divs(content, gallery_file_id, attachment_file_id): +def remove_jupyter_attachment_divs(content, gallery_file_id, attachment_file_ids): soup = BeautifulSoup(content, 'html.parser') attachment_divs = soup.find_all("div", {"class": "attachmentDiv"}) for attachment_div in attachment_divs: href_tag = attachment_div.find('a') - attachment_link = '/Streamfile/' + str(attachment_file_id) + print(f"href_tag{href_tag}") gallery_link = '/Streamfile/' + str(gallery_file_id) - if attachment_link == href_tag['href'] or gallery_link == href_tag['href']: - attachment_div.decompose() + for attachment_file in attached_data_files.split(","): + attachment_file_id = attachment_file_ids.get(attachment_file) + # print(f"removing attachment file id: {attachment_file_id}") + attachment_link = '/Streamfile/' + str(attachment_file_id) + print(f"attachment link: {attachment_link}") + print(f"href: {href_tag['href']}") + if attachment_link == href_tag['href'] or gallery_link == href_tag['href']: + attachment_div.decompose() + break return soup.prettify() +def upload_attached_data(attachment_file_ids): + client = get_rspace_client() + attached_data_files_list = attached_data_files.split(",") + for attached_data in 
attached_data_files_list: + if attached_data: + with open(attached_data, 'r', encoding='utf-8') as attch: + attachment_file_id = attachment_file_ids.get(attached_data) + print(f"attached file id: {attachment_file_id}") + if attachment_file_id is None: + print('start upload attachments') + attachment_file = client.upload_file(attch)['id'] + print(f"Attachment file ID is: {attachment_file}") + attachment_file_ids[attached_data] = attachment_file + print('done upload attachments') + else: + print('start update attachments') + attachment_file = client.update_file(attch,attachment_file_id)['id'] + print(f"Attachment file ID is: {attachment_file}") + print('done update attachments') + print(f"attached file ids: {attachment_file_ids}") + async def get_notebook(): raw_notebook_file_id = None - attachment_file_id = None + attachment_file_idss = None gallery_file_id = None """ Saves notebook using ipylab and then writes notebook to Rspace document as @@ -175,15 +230,13 @@ async def get_notebook(): """ await save_notebook() try: - loaded_state = load_data() + loaded_state = get_saved_data_ids() raw_notebook_file_id = loaded_state.get(RSPACE_DOC_FOR_NOTEBOOK) - attachment_file_id = loaded_state.get(ATTACHMENTS_FOR_NOTEBOOK) + attachment_file_ids = loaded_state.get(ATTACHMENTS_FOR_NOTEBOOK) gallery_file_id = loaded_state.get(GALLERY_FILE_FOR_NOTEBOOK) current_notebook = get_notebook_name()['name'] location = os.getcwd() - # attachedData = "spectroscopy_data.csv,data/spectroscopy_data2.csv,data/spectroscopy_data3.csv" - attachedData = "spectroscopy_data.csv" attachments = None updateDocAttachments = False if raw_notebook_file_id is not None: @@ -194,19 +247,7 @@ async def get_notebook(): print(f"This notebook saved previously to Gallery with RSpaceID {gallery_file_id}" ) else: print("Notebook not previously saved to RSpace Gallery") - with open(attachedData, 'r', encoding='utf-8') as attch: - client = get_rspace_client() - if attachment_file_id is None: - print('start upload attachments') - attachment_file = client.upload_file(attch)['id'] - print(f"Attachment file ID is: {attachment_file}") - print('done upload attachments') - else: - print('start update attachments') - attachment_file = client.update_file(attch,attachment_file_id)['id'] - print(f"Attachment file ID is: {attachment_file}") - print('done update attachments') - attachment_file_id = attachment_file + upload_attached_data(attachment_file_ids) with open(current_notebook, 'r', encoding='utf-8') as f: client = get_rspace_client() if gallery_file_id is None: @@ -223,20 +264,20 @@ async def get_notebook(): if raw_notebook_file_id is None: new_doc = client.create_document(name="DocumentFor_"+current_notebook,tags = ["Python", "API", "Jupyter"] ) raw_notebook_file_id = new_doc['id'] - content = make_content(gallery_file_id,attachment_file_id) + content = make_content(gallery_file_id,attachment_file_ids) client.append_content(new_doc['id'], content) print(f"New document with this notebook as attachement has ID: {new_doc['id']}") else: previous_content = client.get_document(raw_notebook_file_id)['fields'][0]['content'] # print(f"previous content BEFORE metadata removed : {previous_content}") - previous_content = remove_jupyter_attachment_divs(previous_content, gallery_file_id, attachment_file_id) + previous_content = remove_jupyter_attachment_divs(previous_content, gallery_file_id, attachment_file_ids) # print(f"previous content after metadata removed : {previous_content}") - new_content = previous_content + 
make_content(gallery_file_id,attachment_file_id) + new_content = previous_content + make_content(gallery_file_id,attachment_file_ids) updated_doc = client.update_document(raw_notebook_file_id,tags = ['Python', 'API', 'Jupyter'], fields = [{"content": new_content}]) # client.append_content(updated_doc['id'], content) raw_notebook_file_id = updated_doc['id'] print(f"Updated document with this notebook as attachement has ID: {updated_doc['id']}") - save_rspace_data_ids(raw_notebook_file_id, attachment_file_id, gallery_file_id) + save_rspace_data_ids(raw_notebook_file_id, attachment_file_ids, gallery_file_id) await add_rspace_details_to_notebook_metadata(current_notebook, location) # with open(current_notebook, 'r', encoding='utf-8') as f: # notebook_dict = json.load(f) From 773111eabc99bf975b34d4e5bf17af3a813ce9a7 Mon Sep 17 00:00:00 2001 From: nhanlon2 Date: Mon, 29 Sep 2025 16:10:24 +0100 Subject: [PATCH 09/34] RSDEV-782-Jupyter-Notebooks: gallery file versions saved and displayed in notebook cell --- jupyter_notebooks/provenance_jupyter_hub | 192 ++++++++++------------- 1 file changed, 80 insertions(+), 112 deletions(-) diff --git a/jupyter_notebooks/provenance_jupyter_hub b/jupyter_notebooks/provenance_jupyter_hub index 93b77e1..bd17736 100644 --- a/jupyter_notebooks/provenance_jupyter_hub +++ b/jupyter_notebooks/provenance_jupyter_hub @@ -23,8 +23,8 @@ import nbformat import asyncio RSPACE_DOC_FOR_NOTEBOOK = 'rspace_doc_for_notebook' -ATTACHMENTS_FOR_NOTEBOOK = 'ids_of_attached_data_for_notebook' -GALLERY_FILE_FOR_NOTEBOOK = 'gallery_file_for_notebook' +ATTACHMENTS_FOR_NOTEBOOK = 'data_attached_to_notebook' +GALLERY_FILE_FOR_NOTEBOOK = 'file_for_notebook' # Your RSpace instance goes here RSPACE_URL="https://researchspace2.eu.ngrok.io/" @@ -95,20 +95,16 @@ def get_rspace_client(): print(f"Error connecting to RSpace: {e}") return None -def save_rspace_data_ids(rspace_doc, attachments, gallery_file): +def save_rspace_data(rspace_doc, attachments, gallery_file): # Define the filename to save the state state_filename = get_notebook_name()['part_name']+"_state.pkl" - # Save the variables to the file using dill with open(state_filename, 'wb') as f: dill.dump({RSPACE_DOC_FOR_NOTEBOOK: rspace_doc, ATTACHMENTS_FOR_NOTEBOOK: attachments, GALLERY_FILE_FOR_NOTEBOOK: gallery_file}, f) - print(f"Variables saved to {state_filename}") def load_data(): - # Define the filename where the state was saved state_filename = get_notebook_name()['part_name']+"_state.pkl" - # Check if the state file exists before attempting to load if os.path.exists(state_filename): # Load the variables from the file using dill with open(state_filename, 'rb') as f: @@ -130,25 +126,24 @@ async def reload_notebook(): # 'docmanager:reload' does not appear to hook into any callback invoked when the document is actually reloaded await asyncio.sleep(1) -def get_saved_data_ids(): - loaded_state = load_data() - raw_notebook_file_id = loaded_state.get(RSPACE_DOC_FOR_NOTEBOOK) - attachment_file_ids = loaded_state.get(ATTACHMENTS_FOR_NOTEBOOK) - gallery_file_id = loaded_state.get(GALLERY_FILE_FOR_NOTEBOOK) - if attachment_file_ids is None: - attachment_file_ids = {} - return {RSPACE_DOC_FOR_NOTEBOOK:raw_notebook_file_id, ATTACHMENTS_FOR_NOTEBOOK:attachment_file_ids,GALLERY_FILE_FOR_NOTEBOOK:gallery_file_id } - def make_metadata_cell(text): - loaded_state = get_saved_data_ids() - raw_notebook_file_id = loaded_state.get(RSPACE_DOC_FOR_NOTEBOOK) - attachment_file_ids = loaded_state.get(ATTACHMENTS_FOR_NOTEBOOK) - gallery_file_id = 
loaded_state.get(GALLERY_FILE_FOR_NOTEBOOK) - + loaded_state = load_data() + rspace_document_file_id = loaded_state.get(RSPACE_DOC_FOR_NOTEBOOK) + attachment_files = loaded_state.get(ATTACHMENTS_FOR_NOTEBOOK, {}) + nb_gallery_file = loaded_state.get(GALLERY_FILE_FOR_NOTEBOOK, {}) + nb_gallery_file_id = nb_gallery_file['id'] + nb_gallery_file_version = nb_gallery_file['version'] + nb_gallery_file_name = nb_gallery_file['name'] meta_data_cell = nbformat.v4.new_markdown_cell() - galery_doc_link = f'"This Notebook in RSpace' - galery_doc_markdown = f'[This Notebook in RSpace]({RSPACE_URL}gallery/item/{gallery_file_id})' - meta_data_cell['source'] = galery_doc_markdown + galery_doc_link = f'"This Notebook in RSpace' + rspace_doc_for_markdown = f'[The RSpace Document describing this notebook]({RSPACE_URL}workspace/editor/structuredDocument/{rspace_document_file_id})' + gallery_doc_markdown = f'[This Notebook in RSpace Gallery: {nb_gallery_file_name} version: {nb_gallery_file_version}]({RSPACE_URL}gallery/item/{nb_gallery_file_id})' + meta_data_cell['source'] = rspace_doc_for_markdown + "
" + gallery_doc_markdown + attached_data_files_list = attached_data_files.split(",") + for attached_data in attached_data_files_list: + attachment_file_id = attachment_files.get(attached_data, {}).get('id') + attachment_version = attachment_files.get(attached_data, {}).get('version') + meta_data_cell['source'] +=f'
[Attached Data {attached_data} version: {attachment_version} ]({RSPACE_URL}gallery/item/{attachment_file_id})' meta_data_cell['metadata'] = {"rspace_metadata": {"documentFor": "docid","notebook_file":"docid", "attachments": ["attch1","attach2"]} } return meta_data_cell @@ -159,8 +154,6 @@ async def add_rspace_details_to_notebook_metadata(fname, text): meta_data_cell = make_metadata_cell(text) replaced = False for i, cell in enumerate(nb['cells']): - # print(f"Cell {i+1}:") - # print(cell['metadata']) if 'rspace_metadata' in cell['metadata']: nb["cells"][i] = meta_data_cell replaced = True @@ -170,143 +163,118 @@ async def add_rspace_details_to_notebook_metadata(fname, text): nbformat.write(nb, modified) await reload_notebook() -def make_content(gallery_file_id,attachment_file_ids): +def make_content(nb_gallery_file_id,attachment_files): content = f""" - + """ for attachment_file in attached_data_files.split(","): content += f""" - + """ print(f"content is {content}") return content -def remove_jupyter_attachment_divs(content, gallery_file_id, attachment_file_ids): +def remove_jupyter_attachment_divs(content, nb_gallery_file_id, attachment_files): soup = BeautifulSoup(content, 'html.parser') attachment_divs = soup.find_all("div", {"class": "attachmentDiv"}) for attachment_div in attachment_divs: href_tag = attachment_div.find('a') print(f"href_tag{href_tag}") - gallery_link = '/Streamfile/' + str(gallery_file_id) + gallery_link = '/Streamfile/' + str(nb_gallery_file_id) for attachment_file in attached_data_files.split(","): - attachment_file_id = attachment_file_ids.get(attachment_file) - # print(f"removing attachment file id: {attachment_file_id}") + attachment_file_id = attachment_files.get(attachment_file, {}).get('id') attachment_link = '/Streamfile/' + str(attachment_file_id) - print(f"attachment link: {attachment_link}") - print(f"href: {href_tag['href']}") if attachment_link == href_tag['href'] or gallery_link == href_tag['href']: attachment_div.decompose() break return soup.prettify() -def upload_attached_data(attachment_file_ids): +def upload_file_to_gallery(rspaceid, file, client): + if rspaceid is None: + print('start upload file') + uploaded = client.upload_file(file) + rspace_id = uploaded['id'] + rspace_version = uploaded['version'] + else: + print('start update file') + updated = client.update_file(file,rspaceid) + rspace_id = updated['id'] + rspace_version = updated['version'] + print(f"RSpace ID is: {rspace_id} and version is {rspace_version} for file {file}") + rspace_file_data = {"id": rspace_id, "version": rspace_version} + return rspace_file_data + +def upload_attached_data(attachment_files): client = get_rspace_client() attached_data_files_list = attached_data_files.split(",") for attached_data in attached_data_files_list: if attached_data: with open(attached_data, 'r', encoding='utf-8') as attch: - attachment_file_id = attachment_file_ids.get(attached_data) - print(f"attached file id: {attachment_file_id}") - if attachment_file_id is None: - print('start upload attachments') - attachment_file = client.upload_file(attch)['id'] - print(f"Attachment file ID is: {attachment_file}") - attachment_file_ids[attached_data] = attachment_file - print('done upload attachments') - else: - print('start update attachments') - attachment_file = client.update_file(attch,attachment_file_id)['id'] - print(f"Attachment file ID is: {attachment_file}") - print('done update attachments') - print(f"attached file ids: {attachment_file_ids}") + attachment_file_id = 
attachment_files.get(attached_data,{}).get('id') + attachment_file_data = upload_file_to_gallery(attachment_file_id, attch, client) + attachment_files[attached_data] = attachment_file_data + print(f"attached files: {attachment_files}") -async def get_notebook(): - raw_notebook_file_id = None - attachment_file_idss = None - gallery_file_id = None +async def sync_notebook(): """ Saves notebook using ipylab and then writes notebook to Rspace document as an attachment """ + rspace_document_file_id = None + attachment_filess = None + gallery_file = None await save_notebook() try: - loaded_state = get_saved_data_ids() - raw_notebook_file_id = loaded_state.get(RSPACE_DOC_FOR_NOTEBOOK) - attachment_file_ids = loaded_state.get(ATTACHMENTS_FOR_NOTEBOOK) - gallery_file_id = loaded_state.get(GALLERY_FILE_FOR_NOTEBOOK) - + loaded_state = load_data() + rspace_document_file_id = loaded_state.get(RSPACE_DOC_FOR_NOTEBOOK) + attachment_files = loaded_state.get(ATTACHMENTS_FOR_NOTEBOOK, {}) + nb_gallery_file = loaded_state.get(GALLERY_FILE_FOR_NOTEBOOK, {}) + nb_gallery_file_id = nb_gallery_file.get('id') current_notebook = get_notebook_name()['name'] location = os.getcwd() attachments = None - updateDocAttachments = False - if raw_notebook_file_id is not None: - print(f"A document with attachement to this notebook saved previously with RSpaceID {raw_notebook_file_id}" ) + if rspace_document_file_id is not None: + print(f"An RSpace document with this notebook as an attachment saved previously with RSpaceID {rspace_document_file_id}" ) else: - print("NO document with attachement to this notebook saved previously in RSpace") - if gallery_file_id is not None: - print(f"This notebook saved previously to Gallery with RSpaceID {gallery_file_id}" ) + print("No RSpace document with this notebook as an attachment saved previously in RSpace") + if nb_gallery_file_id is not None: + print(f"This notebook saved previously to Gallery with RSpaceID {nb_gallery_file_id}" ) else: print("Notebook not previously saved to RSpace Gallery") - upload_attached_data(attachment_file_ids) + upload_attached_data(attachment_files) with open(current_notebook, 'r', encoding='utf-8') as f: client = get_rspace_client() - if gallery_file_id is None: - print('start upload to gallery') - gallery_file = client.upload_file(f)['id'] - print(f"Gallery file ID is: {gallery_file}") - print('done upload to gallery') + + if nb_gallery_file_id is None: + print('start upload notebook to gallery') + nb_gallery_file = client.upload_file(f) + nb_gallery_file_id = nb_gallery_file['id'] + print(f"Notebook Gallery file is: {nb_gallery_file}") + print('done upload notebook to gallery') else: - print('start update to gallery') - gallery_file = client.update_file(f,gallery_file_id)['id'] + print('start update notebook in gallery') + nb_gallery_file = client.update_file(f, nb_gallery_file_id) print('end update to gallery') - print(f"Gallery file ID is: {gallery_file}") - gallery_file_id = gallery_file - if raw_notebook_file_id is None: + print(f"Notebook Gallery file ID is: {gallery_file}") + + if rspace_document_file_id is None: new_doc = client.create_document(name="DocumentFor_"+current_notebook,tags = ["Python", "API", "Jupyter"] ) - raw_notebook_file_id = new_doc['id'] - content = make_content(gallery_file_id,attachment_file_ids) + rspace_document_file_id = new_doc['id'] + content = make_content(nb_gallery_file_id,attachment_files) client.append_content(new_doc['id'], content) print(f"New document with this notebook as attachement has ID: {new_doc['id']}") 
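+            # If an RSpace document already exists for this notebook, refresh it instead:
+            # fetch its current field content, strip the attachment divs written by a
+            # previous sync, then append freshly generated links to the Gallery copy of
+            # the notebook and to each attached data file before updating the document.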
else: - previous_content = client.get_document(raw_notebook_file_id)['fields'][0]['content'] - # print(f"previous content BEFORE metadata removed : {previous_content}") - previous_content = remove_jupyter_attachment_divs(previous_content, gallery_file_id, attachment_file_ids) - # print(f"previous content after metadata removed : {previous_content}") - new_content = previous_content + make_content(gallery_file_id,attachment_file_ids) - updated_doc = client.update_document(raw_notebook_file_id,tags = ['Python', 'API', 'Jupyter'], fields = [{"content": new_content}]) - # client.append_content(updated_doc['id'], content) - raw_notebook_file_id = updated_doc['id'] - print(f"Updated document with this notebook as attachement has ID: {updated_doc['id']}") - save_rspace_data_ids(raw_notebook_file_id, attachment_file_ids, gallery_file_id) + previous_content = client.get_document(rspace_document_file_id)['fields'][0]['content'] + previous_content = remove_jupyter_attachment_divs(previous_content, nb_gallery_file_id, attachment_files) + new_content = previous_content + make_content(nb_gallery_file_id,attachment_files) + updated_doc = client.update_document(rspace_document_file_id,tags = ['Python', 'API', 'Jupyter'], fields = [{"content": new_content}]) + save_rspace_data(rspace_document_file_id, attachment_files, nb_gallery_file) await add_rspace_details_to_notebook_metadata(current_notebook, location) - # with open(current_notebook, 'r', encoding='utf-8') as f: - # notebook_dict = json.load(f) - # docName = f.name - # client = get_rspace_client() - # if(raw_notebook_file_id): - # print('start doc update') - # raw_data_file = client.update_document( - # raw_notebook_file_id, - # name = docName, - # tags = ["Python", "API", "example"], - # fields = [{"content": json.dumps(notebook_dict)}], - # ) - # print(f"Updated notebook: {current_notebook}") - # else: - # raw_data_file = client.create_document( - # name = docName, - # tags = ["Python", "API", "example"], - # fields = [{"content": json.dumps(notebook_dict)}], - # ) - # print(json.dumps(notebook_dict)) - # print(f"Created notebook: {current_notebook}") - # raw_notebook_file_id = raw_data_file['id'] - # print(raw_notebook_file_id) except Exception as e: print(traceback.format_exc()) print(f"Error reading notebook file: {e}") return None -await get_notebook() +await sync_notebook() From ed650458fb7f356c3391eae8fac8e12c65270e4f Mon Sep 17 00:00:00 2001 From: nhanlon2 Date: Wed, 1 Oct 2025 13:31:29 +0100 Subject: [PATCH 10/34] RSDEV-782-Jupyter-Notebooks: handles correct version for new gallery file by sending twice to rspace --- jupyter_notebooks/provenance_jupyter_hub | 129 ++++++++++++----------- 1 file changed, 70 insertions(+), 59 deletions(-) diff --git a/jupyter_notebooks/provenance_jupyter_hub b/jupyter_notebooks/provenance_jupyter_hub index bd17736..ae020b0 100644 --- a/jupyter_notebooks/provenance_jupyter_hub +++ b/jupyter_notebooks/provenance_jupyter_hub @@ -21,6 +21,8 @@ import traceback from bs4 import BeautifulSoup import nbformat import asyncio +import getpass +import keyring RSPACE_DOC_FOR_NOTEBOOK = 'rspace_doc_for_notebook' ATTACHMENTS_FOR_NOTEBOOK = 'data_attached_to_notebook' @@ -42,6 +44,7 @@ RSPACE_URL="https://researchspace2.eu.ngrok.io/" attached_data_files = "" """ attached_data_files = "spectroscopy_data.csv,data/spectroscopy_data2.csv,data/spectroscopy_data3.csv" +get_new_password = False rspace_client = None app = JupyterFrontEnd() @@ -52,9 +55,6 @@ def get_notebook_name(): nb_path = str(ipynbname.path()) 
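+    # Derive the extension from the full path so 'name' keeps it (e.g. .ipynb),
+    # while the bare stem in 'part_name' is reused elsewhere to build the
+    # "<notebook>_state.pkl" file that caches the RSpace ids for this notebook.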
ext_pos=(''+nb_path).rfind('.') ext=nb_path[ext_pos:] - # print(f"{ext=}") - # print(f"{nb_fname=}") - # print(f"{nb_path=}") return {'name':nb_fname+ext, 'part_name':nb_fname,'path':nb_path} def get_password(): @@ -62,8 +62,6 @@ def get_password(): Retrieves password from (or saves a new password to) keyring """ try: - import getpass - import keyring # TODO - Define the service name (e.g., the notebook name the secret is for) service_id = "RSpaceJupyterDemoApp" @@ -71,7 +69,7 @@ def get_password(): username = "myuser" # use your own username retrieved_password = keyring.get_password(service_id, username) - if retrieved_password is None: + if retrieved_password is None or get_new_password: retrieved_password = getpass.getpass("Please enter your RSpace Api key: ") keyring.set_password(service_id, username, retrieved_password) return retrieved_password @@ -98,7 +96,7 @@ def get_rspace_client(): def save_rspace_data(rspace_doc, attachments, gallery_file): # Define the filename to save the state state_filename = get_notebook_name()['part_name']+"_state.pkl" - + print(f"writing to file: {state_filename}") with open(state_filename, 'wb') as f: dill.dump({RSPACE_DOC_FOR_NOTEBOOK: rspace_doc, ATTACHMENTS_FOR_NOTEBOOK: attachments, GALLERY_FILE_FOR_NOTEBOOK: gallery_file}, f) @@ -113,6 +111,7 @@ def load_data(): except Exception as e: loaded_state = {} else: + loaded_state = {} print(f"State file '{state_filename}' not found. No variables loaded.") return loaded_state @@ -126,13 +125,9 @@ async def reload_notebook(): # 'docmanager:reload' does not appear to hook into any callback invoked when the document is actually reloaded await asyncio.sleep(1) -def make_metadata_cell(text): - loaded_state = load_data() - rspace_document_file_id = loaded_state.get(RSPACE_DOC_FOR_NOTEBOOK) - attachment_files = loaded_state.get(ATTACHMENTS_FOR_NOTEBOOK, {}) - nb_gallery_file = loaded_state.get(GALLERY_FILE_FOR_NOTEBOOK, {}) +def make_metadata_cell(nb_gallery_file, attachment_files,rspace_document_file_id ): nb_gallery_file_id = nb_gallery_file['id'] - nb_gallery_file_version = nb_gallery_file['version'] + nb_gallery_file_version = int(nb_gallery_file['version']) +1 nb_gallery_file_name = nb_gallery_file['name'] meta_data_cell = nbformat.v4.new_markdown_cell() galery_doc_link = f'"This Notebook in RSpace' @@ -144,14 +139,21 @@ def make_metadata_cell(text): attachment_file_id = attachment_files.get(attached_data, {}).get('id') attachment_version = attachment_files.get(attached_data, {}).get('version') meta_data_cell['source'] +=f'
[Attached Data {attached_data} version: {attachment_version} ]({RSPACE_URL}gallery/item/{attachment_file_id})' - meta_data_cell['metadata'] = {"rspace_metadata": {"documentFor": "docid","notebook_file":"docid", "attachments": ["attch1","attach2"]} } + meta_data_cell['metadata'] = {"rspace_metadata": {"documentFor": "docid","notebook_file":"docid", "attachments": [""]} } return meta_data_cell -async def add_rspace_details_to_notebook_metadata(fname, text): +async def add_rspace_details_to_notebook_metadata(fname, nb_gallery_file, attachment_files,rspace_document_file_id ): + """ + We have to save meta data about a notebook before its been uploaded to the gallery. + Therefore increment version by 1 when writing the metadata. If nb_gallery_file[id] is None + its the initial upload to the Gallery and so do not write any meta data + """ + if nb_gallery_file.get('id') is None: + return await save_notebook() with open(fname, 'r') as original: nb = nbformat.read(original, nbformat.NO_CONVERT) - meta_data_cell = make_metadata_cell(text) + meta_data_cell = make_metadata_cell(nb_gallery_file, attachment_files, rspace_document_file_id) replaced = False for i, cell in enumerate(nb['cells']): if 'rspace_metadata' in cell['metadata']: @@ -161,7 +163,6 @@ async def add_rspace_details_to_notebook_metadata(fname, text): nb["cells"].extend([meta_data_cell]) with open(fname, 'w', encoding='utf-8') as modified: nbformat.write(nb, modified) - await reload_notebook() def make_content(nb_gallery_file_id,attachment_files): content = f""" @@ -191,18 +192,20 @@ def remove_jupyter_attachment_divs(content, nb_gallery_file_id, attachment_files def upload_file_to_gallery(rspaceid, file, client): if rspaceid is None: - print('start upload file') - uploaded = client.upload_file(file) - rspace_id = uploaded['id'] - rspace_version = uploaded['version'] + print(f'start upload file {file} using {client}') + data = client.upload_file(file) else: print('start update file') - updated = client.update_file(file,rspaceid) - rspace_id = updated['id'] - rspace_version = updated['version'] - print(f"RSpace ID is: {rspace_id} and version is {rspace_version} for file {file}") - rspace_file_data = {"id": rspace_id, "version": rspace_version} - return rspace_file_data + data = client.update_file(file,rspaceid) + return data + +def calc_hash(filename): + sha256_hash = hashlib.sha256() + with open(filename,"rb") as f: + # Read and update hash string value in blocks of 4K + for byte_block in iter(lambda: f.read(4096),b""): + sha256_hash.update(byte_block) + return sha256_hash.hexdigest() def upload_attached_data(attachment_files): client = get_rspace_client() @@ -211,10 +214,35 @@ def upload_attached_data(attachment_files): if attached_data: with open(attached_data, 'r', encoding='utf-8') as attch: attachment_file_id = attachment_files.get(attached_data,{}).get('id') - attachment_file_data = upload_file_to_gallery(attachment_file_id, attch, client) - attachment_files[attached_data] = attachment_file_data + attachment_file_hash = attachment_files.get(attached_data,{}).get('hash') + calc_latest_hash = calc_hash(attached_data) + if calc_latest_hash != attachment_file_hash: + attachment_file_data = upload_file_to_gallery(attachment_file_id, attch, client) + attachment_file_data['hash'] = calc_latest_hash + attachment_files[attached_data] = attachment_file_data + else: + print(f"File {attached_data} not changed so no update") print(f"attached files: {attachment_files}") +async def upload_notebook_to_gallery(current_notebook, nb_gallery_file, 
attachment_files, rspace_document_file_id): + """ + Metadata about the notebook is written to the notebook before it us uploaded to the Gallery (and the version incremented predictively by one). + If the notebook has never been uploaded to the Gallery we have no stored rspace-id to write to the meta data and so we do not write any meta data. + We do the initial upload (which creates a Gallery file with version = '1'. Then we write meta data (incrementing the version to '2' and upload the notebook + a second time. + """ + await add_rspace_details_to_notebook_metadata(current_notebook, nb_gallery_file, attachment_files, rspace_document_file_id) + with open(current_notebook, 'r', encoding='utf-8') as nb_file: + client = get_rspace_client() + nb_gallery_file = upload_file_to_gallery(nb_gallery_file.get('id'), nb_file, client) + print(f"gallery file for nb: {nb_gallery_file}") + if nb_gallery_file.get('version') == 1: + await add_rspace_details_to_notebook_metadata(current_notebook, nb_gallery_file, attachment_files, rspace_document_file_id) + nb_gallery_file = upload_file_to_gallery(nb_gallery_file.get('id'), nb_file, client) + print(f"nb_gallery_file was uploaded a second time and was : {nb_gallery_file}") + await reload_notebook() + return nb_gallery_file + async def sync_notebook(): """ Saves notebook using ipylab and then writes notebook to Rspace document as @@ -226,50 +254,34 @@ async def sync_notebook(): await save_notebook() try: loaded_state = load_data() + client = get_rspace_client() rspace_document_file_id = loaded_state.get(RSPACE_DOC_FOR_NOTEBOOK) attachment_files = loaded_state.get(ATTACHMENTS_FOR_NOTEBOOK, {}) nb_gallery_file = loaded_state.get(GALLERY_FILE_FOR_NOTEBOOK, {}) nb_gallery_file_id = nb_gallery_file.get('id') current_notebook = get_notebook_name()['name'] - location = os.getcwd() attachments = None if rspace_document_file_id is not None: print(f"An RSpace document with this notebook as an attachment saved previously with RSpaceID {rspace_document_file_id}" ) else: print("No RSpace document with this notebook as an attachment saved previously in RSpace") + upload_attached_data(attachment_files) if nb_gallery_file_id is not None: print(f"This notebook saved previously to Gallery with RSpaceID {nb_gallery_file_id}" ) else: print("Notebook not previously saved to RSpace Gallery") - upload_attached_data(attachment_files) - with open(current_notebook, 'r', encoding='utf-8') as f: - client = get_rspace_client() - - if nb_gallery_file_id is None: - print('start upload notebook to gallery') - nb_gallery_file = client.upload_file(f) - nb_gallery_file_id = nb_gallery_file['id'] - print(f"Notebook Gallery file is: {nb_gallery_file}") - print('done upload notebook to gallery') - else: - print('start update notebook in gallery') - nb_gallery_file = client.update_file(f, nb_gallery_file_id) - print('end update to gallery') - print(f"Notebook Gallery file ID is: {gallery_file}") + if rspace_document_file_id is None: + new_doc = client.create_document(name="DocumentFor_"+current_notebook,tags = ["Python", "API", "Jupyter"] ) + rspace_document_file_id = new_doc['id'] + nb_gallery_file = await upload_notebook_to_gallery(current_notebook, nb_gallery_file,attachment_files, rspace_document_file_id) + print(f"nb_gallery_file was finally: {nb_gallery_file}") + nb_gallery_file_id = nb_gallery_file.get('id') - if rspace_document_file_id is None: - new_doc = client.create_document(name="DocumentFor_"+current_notebook,tags = ["Python", "API", "Jupyter"] ) - rspace_document_file_id = 
new_doc['id'] - content = make_content(nb_gallery_file_id,attachment_files) - client.append_content(new_doc['id'], content) - print(f"New document with this notebook as attachement has ID: {new_doc['id']}") - else: - previous_content = client.get_document(rspace_document_file_id)['fields'][0]['content'] - previous_content = remove_jupyter_attachment_divs(previous_content, nb_gallery_file_id, attachment_files) - new_content = previous_content + make_content(nb_gallery_file_id,attachment_files) - updated_doc = client.update_document(rspace_document_file_id,tags = ['Python', 'API', 'Jupyter'], fields = [{"content": new_content}]) - save_rspace_data(rspace_document_file_id, attachment_files, nb_gallery_file) - await add_rspace_details_to_notebook_metadata(current_notebook, location) + previous_content = client.get_document(rspace_document_file_id)['fields'][0]['content'] + previous_content = remove_jupyter_attachment_divs(previous_content, nb_gallery_file_id, attachment_files) + new_content = previous_content + make_content(nb_gallery_file_id,attachment_files) + updated_doc = client.update_document(rspace_document_file_id,tags = ['Python', 'API', 'Jupyter'], fields = [{"content": new_content}]) + save_rspace_data(rspace_document_file_id, attachment_files, nb_gallery_file) except Exception as e: print(traceback.format_exc()) @@ -277,4 +289,3 @@ async def sync_notebook(): return None await sync_notebook() - From 5b27dc5bce481618599ee3aac70496496c499c12 Mon Sep 17 00:00:00 2001 From: nhanlon2 Date: Wed, 1 Oct 2025 13:32:39 +0100 Subject: [PATCH 11/34] RSDEV-782-Jupyter-Notebooks: rename --- .../{provenance_jupyter_hub => provenance_jupyter_hub.ipynb} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename jupyter_notebooks/{provenance_jupyter_hub => provenance_jupyter_hub.ipynb} (100%) diff --git a/jupyter_notebooks/provenance_jupyter_hub b/jupyter_notebooks/provenance_jupyter_hub.ipynb similarity index 100% rename from jupyter_notebooks/provenance_jupyter_hub rename to jupyter_notebooks/provenance_jupyter_hub.ipynb From b5a070de1cd2210a144de64adadf90464151639f Mon Sep 17 00:00:00 2001 From: nhanlon2 Date: Wed, 1 Oct 2025 13:41:24 +0100 Subject: [PATCH 12/34] RSDEV-782-Jupyter-Notebooks: revert rename as doc uneditable in Pycharm --- ...pyter_hub.ipynb => provenance_jupyter_hub} | 0 .../rspace-demo-kaggle-v11.ipynb | 252 +++++++++++++++++- 2 files changed, 251 insertions(+), 1 deletion(-) rename jupyter_notebooks/{provenance_jupyter_hub.ipynb => provenance_jupyter_hub} (100%) diff --git a/jupyter_notebooks/provenance_jupyter_hub.ipynb b/jupyter_notebooks/provenance_jupyter_hub similarity index 100% rename from jupyter_notebooks/provenance_jupyter_hub.ipynb rename to jupyter_notebooks/provenance_jupyter_hub diff --git a/jupyter_notebooks/rspace-demo-kaggle-v11.ipynb b/jupyter_notebooks/rspace-demo-kaggle-v11.ipynb index 3b6687c..887835f 100644 --- a/jupyter_notebooks/rspace-demo-kaggle-v11.ipynb +++ b/jupyter_notebooks/rspace-demo-kaggle-v11.ipynb @@ -1 +1,251 @@ -{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.7.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"### Working with RSpace data.\n\nThis notebook illustrates a workflow to get data from RSpace, analyse it, and send back results to RSpace.\nTo work with this 
tutorial, you'll need an account on RSpace, an RSpace API key and Python 3.6 or later.\n\nThis project requires modules `rspace_client` (Version 2) , `pandas` and `matplotlib`.\n\nTo install rspace client `pip install rspace-client==2.0.1`\n\nThe top-level README.md has more information on getting set up. \n\nThe notebook is split into 3 sections:\n\n1. Adding some data to RSpace to analyse. In reality, this might be done manually by a wet-lab scientist or be delivered programmatically by an instrument. \n2. Getting the datasets to analyse\n3. Sending back the analysis linked to an experimental record","metadata":{}},{"cell_type":"markdown","source":"#### Setup Step 1 - configuring the RSpace Client. `rspace_client` is available from pip.\n\nIt's good practice to store API keys as environment variables rather than hard-coding it.","metadata":{}},{"cell_type":"code","source":"!pip install rspace-client==2.0.1\nprint(\"Kernel running OK\")","metadata":{"execution":{"iopub.status.busy":"2021-12-02T09:34:56.205067Z","iopub.execute_input":"2021-12-02T09:34:56.20544Z","iopub.status.idle":"2021-12-02T09:35:05.672998Z","shell.execute_reply.started":"2021-12-02T09:34:56.205406Z","shell.execute_reply":"2021-12-02T09:35:05.671894Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"## get your API key and set your RSpace URL. Change this code as needed for your own environment\nfrom kaggle_secrets import UserSecretsClient\napi_key_label = \"mp_demos_key\"\nAPI_KEY = UserSecretsClient().get_secret(api_key_label)\nprint (f\"Retrieved API key {API_KEY[0:4]}...\")\nURL=\"https://demos.researchspace.com\"","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from rspace_client.eln import eln\nimport os\n\napi = eln.ELNClient(URL, API_KEY)\n\n## sanity check that that the client is configured correctly\nprint(api.get_status())","metadata":{"execution":{"iopub.status.busy":"2021-12-02T09:11:50.086056Z","iopub.execute_input":"2021-12-02T09:11:50.086431Z","iopub.status.idle":"2021-12-02T09:11:51.361265Z","shell.execute_reply.started":"2021-12-02T09:11:50.086396Z","shell.execute_reply":"2021-12-02T09:11:51.360371Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"#### Setup Step 2 - adding some test data.\n\nHere we'll add a CSV file to RSpace, containing some synthetic weather-related data.","metadata":{}},{"cell_type":"code","source":"import os\ndata_input_dir='/kaggle/input/rspacedemofiles'\ntemp_data_path=os.path.join(data_input_dir, 'temp_data.csv')\n\nwith open (temp_data_path) as f:\n raw_data_file = api.upload_file(f)['id']\nraw_data_file_id= raw_data_file\nprint(f\"Temperature data uploaded to RSpace with ID {raw_data_file_id}\")","metadata":{"execution":{"iopub.status.busy":"2021-12-02T09:29:29.73737Z","iopub.execute_input":"2021-12-02T09:29:29.737804Z","iopub.status.idle":"2021-12-02T09:29:30.742869Z","shell.execute_reply.started":"2021-12-02T09:29:29.737762Z","shell.execute_reply":"2021-12-02T09:29:30.741818Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"#### Analysis Step 1 - retrieving dataset\n\nOK, now we can start working with this dataset. 
If this dataset had been uploaded by a colleague, we could have been notified by Slack, Teams, email or within RSpace itself that this file was available for analysis.","metadata":{}},{"cell_type":"code","source":"file_name = \"downloaded_\"+(api.get_file_info(raw_data_file_id)['name'])\nprint(file_name)\n\n## retrieve from RSpace - here we are downloading the file\nraw_temp_data = api.download_file(raw_data_file_id, file_name)","metadata":{"execution":{"iopub.status.busy":"2021-12-02T09:29:43.868586Z","iopub.execute_input":"2021-12-02T09:29:43.86891Z","iopub.status.idle":"2021-12-02T09:29:45.244744Z","shell.execute_reply.started":"2021-12-02T09:29:43.868874Z","shell.execute_reply":"2021-12-02T09:29:45.243908Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"#### Analysis Step 2 - the analysis\n\nHere is where you do your actual analytis of the data... here we'll just plot the data and generate a summary, saving both to file.","metadata":{}},{"cell_type":"code","source":"import pandas as pd;\ndf = pd.read_csv(file_name)\nsummary_stats = df.describe()\n\ndf = df.set_index('city_id')\nplot = df.plot(ylabel='Celsius', title=f'Temperature plots from dataset {raw_data_file_id}')\nimg_f= f'Temperature_per_city-{raw_data_file_id}'\nplot.get_figure().savefig(img_f)\n\nsummary_stats_csv = f'{file_name[:file_name.rindex(\".\")]}-summarystats.csv'\nsummary_stats.to_csv(summary_stats_csv)","metadata":{"execution":{"iopub.status.busy":"2021-12-02T09:30:05.805607Z","iopub.execute_input":"2021-12-02T09:30:05.805916Z","iopub.status.idle":"2021-12-02T09:30:06.227801Z","shell.execute_reply.started":"2021-12-02T09:30:05.805882Z","shell.execute_reply":"2021-12-02T09:30:06.226521Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"#### Analysis Step 3 - uploading back to RSpace\n\nYou can add captions to the file to help describe your analysis","metadata":{}},{"cell_type":"code","source":"with open(summary_stats_csv, 'rb') as f:\n summary_file = api.upload_file(f, caption=f\"Summary data for {raw_data_file_id}\")\n print(f\"uploaded id = {summary_file['id']}\")\nwith open(img_f+\".png\", 'rb') as f:\n uploaded_image = api.upload_file(f, caption=f\"City vs temperature for {raw_data_file_id}\")\n print(f\"uploaded id = {uploaded_image['id']}\") ","metadata":{"execution":{"iopub.status.busy":"2021-12-02T09:30:13.165456Z","iopub.execute_input":"2021-12-02T09:30:13.165815Z","iopub.status.idle":"2021-12-02T09:30:15.074355Z","shell.execute_reply.started":"2021-12-02T09:30:13.165782Z","shell.execute_reply":"2021-12-02T09:30:15.073388Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"There are several options now:\n\n* You can create an RSpace document, and insert these files, and share the document with your group or colleage. \n* Your colleagues may have already created and shared document describing an experiment that generated these files, in which case you would already have access to a document.\n\nHere we'll go with a simple flow where we create a new RSpace document to share with the rest of our research group.\n\nThe content we'll insert will be HTML. However you don't need to figure out how to display the linked files. Just include file links as `` syntax and RSpace will turn these into formatted links\n","metadata":{}},{"cell_type":"code","source":"new_doc = api.create_document(name=f\"Analysis of dataset {raw_data_file_id}\")\ncontent = f\"\"\"\n

Analysis of temperature dataset from our standard locations.\n

No variation between locations:\nRaw data: \n

\nStatistical summary: \n

\nLocation vs temperature: \n\"\"\"\n\nupdated_doc = api.append_content(new_doc['id'], content)\n\n## a simple utility function so you can get a link to view the updated contents in a browser.\ndef api_to_browser(link):\n return '/globalId/SD'.join(link.split('/api/v1/documents/'))\n\nprint(f\"You can view this in a browser at {api_to_browser(updated_doc['_links'][0]['link'])}\")\n","metadata":{"execution":{"iopub.status.busy":"2021-12-02T09:30:33.035126Z","iopub.execute_input":"2021-12-02T09:30:33.036313Z","iopub.status.idle":"2021-12-02T09:30:38.342404Z","shell.execute_reply.started":"2021-12-02T09:30:33.036246Z","shell.execute_reply":"2021-12-02T09:30:38.341291Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"If you're in a group, you can now share this with your group. You can get your groups' IDs: ","metadata":{}},{"cell_type":"code","source":"groups = api.get_groups()\nfor gp in groups:\n print(f\"{gp['name']:30}{gp['id']}\")\nchosen_group = None\n#chosen_group = input(\"please enter a group ID to share with\")\nchosen_group = chosen_group or groups[0]['id'] ## if not running interactively, choose 1st group","metadata":{"execution":{"iopub.status.busy":"2021-12-02T09:48:52.764733Z","iopub.execute_input":"2021-12-02T09:48:52.765029Z","iopub.status.idle":"2021-12-02T09:48:53.294518Z","shell.execute_reply.started":"2021-12-02T09:48:52.764997Z","shell.execute_reply":"2021-12-02T09:48:53.293245Z"},"trusted":true},"execution_count":53,"outputs":[]},{"cell_type":"code","source":"api.shareDocuments([new_doc['id']], chosen_group, permission=\"EDIT\")","metadata":{"execution":{"iopub.status.busy":"2021-12-02T09:31:28.045633Z","iopub.execute_input":"2021-12-02T09:31:28.045924Z","iopub.status.idle":"2021-12-02T09:31:29.469595Z","shell.execute_reply.started":"2021-12-02T09:31:28.045892Z","shell.execute_reply":"2021-12-02T09:31:29.468475Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"### tidy up - remove output files\noutfile_dir=\"/kaggle/working\"\nfor root,dirs,files in os.walk(outfile_dir):\n for f in files:\n os.remove(f)\nprint (\"output files removed\")","metadata":{"execution":{"iopub.status.busy":"2021-12-02T09:32:58.850987Z","iopub.execute_input":"2021-12-02T09:32:58.851459Z","iopub.status.idle":"2021-12-02T09:32:58.859278Z","shell.execute_reply.started":"2021-12-02T09:32:58.851405Z","shell.execute_reply":"2021-12-02T09:32:58.858292Z"},"trusted":true},"execution_count":null,"outputs":[]}]} \ No newline at end of file +{ + "metadata": { + "kernelspec": { + "language": "python", + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.7.12", + "mimetype": "text/x-python", + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "pygments_lexer": "ipython3", + "nbconvert_exporter": "python", + "file_extension": ".py" + } + }, + "nbformat_minor": 4, + "nbformat": 4, + "cells": [ + { + "cell_type": "raw", + "source": [ + "### Working with RSpace data.\n", + "\n", + "This notebook illustrates a workflow to get data from RSpace, analyse it, and send back results to RSpace.\n", + "To work with this tutorial, you'll need an account on RSpace, an RSpace API key and Python 3.6 or later.\n", + "\n", + "This project requires modules `rspace_client` (Version 2) , `pandas` and `matplotlib`.\n", + "\n", + "To install rspace client `pip install rspace-client==2.0.1`\n", + "\n", + "The top-level README.md has more information on getting set up. 
\n", + "\n", + "The notebook is split into 3 sections:\n", + "\n", + "1. Adding some data to RSpace to analyse. In reality, this might be done manually by a wet-lab scientist or be delivered programmatically by an instrument. \n", + "2. Getting the datasets to analyse\n", + "3. Sending back the analysis linked to an experimental record" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": "#### Setup Step 1 - configuring the RSpace Client. `rspace_client` is available from pip.\n\nIt's good practice to store API keys as environment variables rather than hard-coding it.", + "metadata": {} + }, + { + "cell_type": "code", + "source": "!pip install rspace-client==2.0.1\nprint(\"Kernel running OK\")", + "metadata": { + "execution": { + "iopub.status.busy": "2021-12-02T09:34:56.205067Z", + "iopub.execute_input": "2021-12-02T09:34:56.20544Z", + "iopub.status.idle": "2021-12-02T09:35:05.672998Z", + "shell.execute_reply.started": "2021-12-02T09:34:56.205406Z", + "shell.execute_reply": "2021-12-02T09:35:05.671894Z" + }, + "trusted": true + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": "## get your API key and set your RSpace URL. Change this code as needed for your own environment\nfrom kaggle_secrets import UserSecretsClient\napi_key_label = \"mp_demos_key\"\nAPI_KEY = UserSecretsClient().get_secret(api_key_label)\nprint (f\"Retrieved API key {API_KEY[0:4]}...\")\nURL=\"https://demos.researchspace.com\"", + "metadata": {}, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": "from rspace_client.eln import eln\nimport os\n\napi = eln.ELNClient(URL, API_KEY)\n\n## sanity check that that the client is configured correctly\nprint(api.get_status())", + "metadata": { + "execution": { + "iopub.status.busy": "2021-12-02T09:11:50.086056Z", + "iopub.execute_input": "2021-12-02T09:11:50.086431Z", + "iopub.status.idle": "2021-12-02T09:11:51.361265Z", + "shell.execute_reply.started": "2021-12-02T09:11:50.086396Z", + "shell.execute_reply": "2021-12-02T09:11:51.360371Z" + }, + "trusted": true + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": "#### Setup Step 2 - adding some test data.\n\nHere we'll add a CSV file to RSpace, containing some synthetic weather-related data.", + "metadata": {} + }, + { + "cell_type": "code", + "source": "import os\ndata_input_dir='/kaggle/input/rspacedemofiles'\ntemp_data_path=os.path.join(data_input_dir, 'temp_data.csv')\n\nwith open (temp_data_path) as f:\n raw_data_file = api.upload_file(f)['id']\nraw_data_file_id= raw_data_file\nprint(f\"Temperature data uploaded to RSpace with ID {raw_data_file_id}\")", + "metadata": { + "execution": { + "iopub.status.busy": "2021-12-02T09:29:29.73737Z", + "iopub.execute_input": "2021-12-02T09:29:29.737804Z", + "iopub.status.idle": "2021-12-02T09:29:30.742869Z", + "shell.execute_reply.started": "2021-12-02T09:29:29.737762Z", + "shell.execute_reply": "2021-12-02T09:29:30.741818Z" + }, + "trusted": true + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": "#### Analysis Step 1 - retrieving dataset\n\nOK, now we can start working with this dataset. 
If this dataset had been uploaded by a colleague, we could have been notified by Slack, Teams, email or within RSpace itself that this file was available for analysis.", + "metadata": {} + }, + { + "cell_type": "code", + "source": "file_name = \"downloaded_\"+(api.get_file_info(raw_data_file_id)['name'])\nprint(file_name)\n\n## retrieve from RSpace - here we are downloading the file\nraw_temp_data = api.download_file(raw_data_file_id, file_name)", + "metadata": { + "execution": { + "iopub.status.busy": "2021-12-02T09:29:43.868586Z", + "iopub.execute_input": "2021-12-02T09:29:43.86891Z", + "iopub.status.idle": "2021-12-02T09:29:45.244744Z", + "shell.execute_reply.started": "2021-12-02T09:29:43.868874Z", + "shell.execute_reply": "2021-12-02T09:29:45.243908Z" + }, + "trusted": true + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": "#### Analysis Step 2 - the analysis\n\nHere is where you do your actual analytis of the data... here we'll just plot the data and generate a summary, saving both to file.", + "metadata": {} + }, + { + "cell_type": "code", + "source": "import pandas as pd;\ndf = pd.read_csv(file_name)\nsummary_stats = df.describe()\n\ndf = df.set_index('city_id')\nplot = df.plot(ylabel='Celsius', title=f'Temperature plots from dataset {raw_data_file_id}')\nimg_f= f'Temperature_per_city-{raw_data_file_id}'\nplot.get_figure().savefig(img_f)\n\nsummary_stats_csv = f'{file_name[:file_name.rindex(\".\")]}-summarystats.csv'\nsummary_stats.to_csv(summary_stats_csv)", + "metadata": { + "execution": { + "iopub.status.busy": "2021-12-02T09:30:05.805607Z", + "iopub.execute_input": "2021-12-02T09:30:05.805916Z", + "iopub.status.idle": "2021-12-02T09:30:06.227801Z", + "shell.execute_reply.started": "2021-12-02T09:30:05.805882Z", + "shell.execute_reply": "2021-12-02T09:30:06.226521Z" + }, + "trusted": true + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": "#### Analysis Step 3 - uploading back to RSpace\n\nYou can add captions to the file to help describe your analysis", + "metadata": {} + }, + { + "cell_type": "code", + "source": "with open(summary_stats_csv, 'rb') as f:\n summary_file = api.upload_file(f, caption=f\"Summary data for {raw_data_file_id}\")\n print(f\"uploaded id = {summary_file['id']}\")\nwith open(img_f+\".png\", 'rb') as f:\n uploaded_image = api.upload_file(f, caption=f\"City vs temperature for {raw_data_file_id}\")\n print(f\"uploaded id = {uploaded_image['id']}\") ", + "metadata": { + "execution": { + "iopub.status.busy": "2021-12-02T09:30:13.165456Z", + "iopub.execute_input": "2021-12-02T09:30:13.165815Z", + "iopub.status.idle": "2021-12-02T09:30:15.074355Z", + "shell.execute_reply.started": "2021-12-02T09:30:13.165782Z", + "shell.execute_reply": "2021-12-02T09:30:15.073388Z" + }, + "trusted": true + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": "There are several options now:\n\n* You can create an RSpace document, and insert these files, and share the document with your group or colleage. \n* Your colleagues may have already created and shared document describing an experiment that generated these files, in which case you would already have access to a document.\n\nHere we'll go with a simple flow where we create a new RSpace document to share with the rest of our research group.\n\nThe content we'll insert will be HTML. However you don't need to figure out how to display the linked files. 
Just include file links as `` syntax and RSpace will turn these into formatted links\n", + "metadata": {} + }, + { + "cell_type": "code", + "source": "new_doc = api.create_document(name=f\"Analysis of dataset {raw_data_file_id}\")\ncontent = f\"\"\"\n

Analysis of temperature dataset from our standard locations.\n

No variation between locations:\nRaw data: \n

\nStatistical summary: \n

\nLocation vs temperature: \n\"\"\"\n\nupdated_doc = api.append_content(new_doc['id'], content)\n\n## a simple utility function so you can get a link to view the updated contents in a browser.\ndef api_to_browser(link):\n return '/globalId/SD'.join(link.split('/api/v1/documents/'))\n\nprint(f\"You can view this in a browser at {api_to_browser(updated_doc['_links'][0]['link'])}\")\n", + "metadata": { + "execution": { + "iopub.status.busy": "2021-12-02T09:30:33.035126Z", + "iopub.execute_input": "2021-12-02T09:30:33.036313Z", + "iopub.status.idle": "2021-12-02T09:30:38.342404Z", + "shell.execute_reply.started": "2021-12-02T09:30:33.036246Z", + "shell.execute_reply": "2021-12-02T09:30:38.341291Z" + }, + "trusted": true + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": "If you're in a group, you can now share this with your group. You can get your groups' IDs: ", + "metadata": {} + }, + { + "cell_type": "code", + "source": "groups = api.get_groups()\nfor gp in groups:\n print(f\"{gp['name']:30}{gp['id']}\")\nchosen_group = None\n#chosen_group = input(\"please enter a group ID to share with\")\nchosen_group = chosen_group or groups[0]['id'] ## if not running interactively, choose 1st group", + "metadata": { + "execution": { + "iopub.status.busy": "2021-12-02T09:48:52.764733Z", + "iopub.execute_input": "2021-12-02T09:48:52.765029Z", + "iopub.status.idle": "2021-12-02T09:48:53.294518Z", + "shell.execute_reply.started": "2021-12-02T09:48:52.764997Z", + "shell.execute_reply": "2021-12-02T09:48:53.293245Z" + }, + "trusted": true + }, + "execution_count": 53, + "outputs": [] + }, + { + "cell_type": "code", + "source": "api.shareDocuments([new_doc['id']], chosen_group, permission=\"EDIT\")", + "metadata": { + "execution": { + "iopub.status.busy": "2021-12-02T09:31:28.045633Z", + "iopub.execute_input": "2021-12-02T09:31:28.045924Z", + "iopub.status.idle": "2021-12-02T09:31:29.469595Z", + "shell.execute_reply.started": "2021-12-02T09:31:28.045892Z", + "shell.execute_reply": "2021-12-02T09:31:29.468475Z" + }, + "trusted": true + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": "### tidy up - remove output files\noutfile_dir=\"/kaggle/working\"\nfor root,dirs,files in os.walk(outfile_dir):\n for f in files:\n os.remove(f)\nprint (\"output files removed\")", + "metadata": { + "execution": { + "iopub.status.busy": "2021-12-02T09:32:58.850987Z", + "iopub.execute_input": "2021-12-02T09:32:58.851459Z", + "iopub.status.idle": "2021-12-02T09:32:58.859278Z", + "shell.execute_reply.started": "2021-12-02T09:32:58.851405Z", + "shell.execute_reply": "2021-12-02T09:32:58.858292Z" + }, + "trusted": true + }, + "execution_count": null, + "outputs": [] + } + ] +} From 12a6cbb7351ca09f348e8998848fac4ea51c1ff4 Mon Sep 17 00:00:00 2001 From: nhanlon2 Date: Wed, 1 Oct 2025 21:14:12 +0100 Subject: [PATCH 13/34] RSDEV-782-Jupyter-Notebooks: server url links and overrids for notebookname, server url and server port --- jupyter_notebooks/provenance_jupyter_hub | 46 +++++++++++++++++++++--- 1 file changed, 41 insertions(+), 5 deletions(-) diff --git a/jupyter_notebooks/provenance_jupyter_hub b/jupyter_notebooks/provenance_jupyter_hub index ae020b0..9b45a56 100644 --- a/jupyter_notebooks/provenance_jupyter_hub +++ b/jupyter_notebooks/provenance_jupyter_hub @@ -45,17 +45,52 @@ RSPACE_URL="https://researchspace2.eu.ngrok.io/" """ attached_data_files = "spectroscopy_data.csv,data/spectroscopy_data2.csv,data/spectroscopy_data3.csv" get_new_password 
= False +""" +This must be set to a value if exceptions are thrown trying to determine the notebook name +""" +notebook_name = None +""" +This must be set to a value if exceptions are thrown trying to determine the server url +""" +server_url = None + +""" +Set this to a value if server_url is calculated correctly except for the port (which will happen, for example +if the port is being mapped inside a docker container to an external port +""" +server_port = 10000 rspace_client = None app = JupyterFrontEnd() +def get_server_urls(): + all_urls = [] + if(server_url is not None): + all_urls.append(server_url) + else: + for srv in ipynbname._list_maybe_running_servers(): + try: + srv, path = ipynbname._find_nb_path() + if server_port is not None: + srv_url = srv['url'] + print(f"srv_url{srv_url}") + part_url = srv_url[:srv_url.rfind(':')+1] + print(f"part_url{part_url}") + all_urls.append(part_url+str(server_port)+'/lab/tree/'+str(path)) + else: + all_urls.append(srv['url']+'lab/tree/'+str(path)) + except Exception: + pass # Code may fail if server has a password/doesnt use token auth - see ipynbname README + return all_urls def get_notebook_name(): + if notebook_name is not None: + return {'name':notebook_name, 'part_name':notebook_name[:notebook_name.rfind('.')]} nb_fname = ipynbname.name() nb_path = str(ipynbname.path()) ext_pos=(''+nb_path).rfind('.') ext=nb_path[ext_pos:] - return {'name':nb_fname+ext, 'part_name':nb_fname,'path':nb_path} + return {'name':nb_fname+ext, 'part_name':nb_fname} def get_password(): """ @@ -130,7 +165,6 @@ def make_metadata_cell(nb_gallery_file, attachment_files,rspace_document_file_id nb_gallery_file_version = int(nb_gallery_file['version']) +1 nb_gallery_file_name = nb_gallery_file['name'] meta_data_cell = nbformat.v4.new_markdown_cell() - galery_doc_link = f'"This Notebook in RSpace' rspace_doc_for_markdown = f'[The RSpace Document describing this notebook]({RSPACE_URL}workspace/editor/structuredDocument/{rspace_document_file_id})' gallery_doc_markdown = f'[This Notebook in RSpace Gallery: {nb_gallery_file_name} version: {nb_gallery_file_version}]({RSPACE_URL}gallery/item/{nb_gallery_file_id})' meta_data_cell['source'] = rspace_doc_for_markdown + "
" + gallery_doc_markdown @@ -139,6 +173,8 @@ def make_metadata_cell(nb_gallery_file, attachment_files,rspace_document_file_id attachment_file_id = attachment_files.get(attached_data, {}).get('id') attachment_version = attachment_files.get(attached_data, {}).get('version') meta_data_cell['source'] +=f'
[Attached Data {attached_data} version: {attachment_version} ]({RSPACE_URL}gallery/item/{attachment_file_id})' + for url in get_server_urls(): + meta_data_cell['source'] += f'
[This notebook on the jupyter server]({url})' meta_data_cell['metadata'] = {"rspace_metadata": {"documentFor": "docid","notebook_file":"docid", "attachments": [""]} } return meta_data_cell @@ -222,7 +258,7 @@ def upload_attached_data(attachment_files): attachment_files[attached_data] = attachment_file_data else: print(f"File {attached_data} not changed so no update") - print(f"attached files: {attachment_files}") + # print(f"attached files: {attachment_files}") async def upload_notebook_to_gallery(current_notebook, nb_gallery_file, attachment_files, rspace_document_file_id): """ @@ -235,11 +271,11 @@ async def upload_notebook_to_gallery(current_notebook, nb_gallery_file, attachme with open(current_notebook, 'r', encoding='utf-8') as nb_file: client = get_rspace_client() nb_gallery_file = upload_file_to_gallery(nb_gallery_file.get('id'), nb_file, client) - print(f"gallery file for nb: {nb_gallery_file}") + # print(f"gallery file for nb: {nb_gallery_file}") if nb_gallery_file.get('version') == 1: await add_rspace_details_to_notebook_metadata(current_notebook, nb_gallery_file, attachment_files, rspace_document_file_id) nb_gallery_file = upload_file_to_gallery(nb_gallery_file.get('id'), nb_file, client) - print(f"nb_gallery_file was uploaded a second time and was : {nb_gallery_file}") + # print(f"nb_gallery_file was uploaded a second time and was : {nb_gallery_file}") await reload_notebook() return nb_gallery_file From 9d972871d036d9a0af450bc4c7cfe8b608f6b40b Mon Sep 17 00:00:00 2001 From: nhanlon2 Date: Thu, 2 Oct 2025 15:47:42 +0100 Subject: [PATCH 14/34] RSDEV-782-Jupyter-Notebooks: data paths calculated relative to the notebook --- jupyter_notebooks/provenance_jupyter_hub | 509 ++++++++--------------- 1 file changed, 182 insertions(+), 327 deletions(-) diff --git a/jupyter_notebooks/provenance_jupyter_hub b/jupyter_notebooks/provenance_jupyter_hub index 9b45a56..e5a092a 100644 --- a/jupyter_notebooks/provenance_jupyter_hub +++ b/jupyter_notebooks/provenance_jupyter_hub @@ -1,327 +1,182 @@ -import json -%pip install -q rspace-client==2.6.1 -%pip install -q pickleshare -try: - from notebook import app -except: - %conda install -q notebook -%pip install -q keyring -from rspace_client.eln import eln -import os -import hashlib -import json -%pip install -q dill -import dill -%pip install -q ipynbname -import ipynbname -%pip install -q ipylab -from ipylab import JupyterFrontEnd -import traceback -%pip install -q lxml -from bs4 import BeautifulSoup -import nbformat -import asyncio -import getpass -import keyring - -RSPACE_DOC_FOR_NOTEBOOK = 'rspace_doc_for_notebook' -ATTACHMENTS_FOR_NOTEBOOK = 'data_attached_to_notebook' -GALLERY_FILE_FOR_NOTEBOOK = 'file_for_notebook' - -# Your RSpace instance goes here -RSPACE_URL="https://researchspace2.eu.ngrok.io/" - -""" - All data that will be saved to RSpace along with this notebook: select the data in the file browser and choose 'copy path' - then paste here using a ',' comma to separate files if there is more than one. 
- - Example: - attached_data_files = "spectroscopy_data.csv, data/spectroscopy_data2.csv, data/spectroscopy_data3.csv" - - If you wish to have no attached data, set this value to be "" (a pair of double quotes) - - Example: - attached_data_files = "" -""" -attached_data_files = "spectroscopy_data.csv,data/spectroscopy_data2.csv,data/spectroscopy_data3.csv" -get_new_password = False -""" -This must be set to a value if exceptions are thrown trying to determine the notebook name -""" -notebook_name = None -""" -This must be set to a value if exceptions are thrown trying to determine the server url -""" -server_url = None - -""" -Set this to a value if server_url is calculated correctly except for the port (which will happen, for example -if the port is being mapped inside a docker container to an external port -""" -server_port = 10000 - -rspace_client = None -app = JupyterFrontEnd() - -def get_server_urls(): - all_urls = [] - if(server_url is not None): - all_urls.append(server_url) - else: - for srv in ipynbname._list_maybe_running_servers(): - try: - srv, path = ipynbname._find_nb_path() - if server_port is not None: - srv_url = srv['url'] - print(f"srv_url{srv_url}") - part_url = srv_url[:srv_url.rfind(':')+1] - print(f"part_url{part_url}") - all_urls.append(part_url+str(server_port)+'/lab/tree/'+str(path)) - else: - all_urls.append(srv['url']+'lab/tree/'+str(path)) - except Exception: - pass # Code may fail if server has a password/doesnt use token auth - see ipynbname README - return all_urls - -def get_notebook_name(): - if notebook_name is not None: - return {'name':notebook_name, 'part_name':notebook_name[:notebook_name.rfind('.')]} - nb_fname = ipynbname.name() - nb_path = str(ipynbname.path()) - ext_pos=(''+nb_path).rfind('.') - ext=nb_path[ext_pos:] - return {'name':nb_fname+ext, 'part_name':nb_fname} - -def get_password(): - """ - Retrieves password from (or saves a new password to) keyring - """ - try: - - # TODO - Define the service name (e.g., the notebook name the secret is for) - service_id = "RSpaceJupyterDemoApp" - # TODO - Define the username associated with the secret - username = "myuser" # use your own username - - retrieved_password = keyring.get_password(service_id, username) - if retrieved_password is None or get_new_password: - retrieved_password = getpass.getpass("Please enter your RSpace Api key: ") - keyring.set_password(service_id, username, retrieved_password) - return retrieved_password - except Exception as e: - print(f"Error getting password: {e}") - return None - -def get_rspace_client(): - """ - Returns rspace ELN API client - """ - try: - global rspace_client - if rspace_client is None: - retrieved_password = get_password() - rspace_client = eln.ELNClient(RSPACE_URL, retrieved_password) - print(rspace_client.get_status()) - return rspace_client - except Exception as e: - print(traceback.format_exc()) - print(f"Error connecting to RSpace: {e}") - return None - -def save_rspace_data(rspace_doc, attachments, gallery_file): - # Define the filename to save the state - state_filename = get_notebook_name()['part_name']+"_state.pkl" - print(f"writing to file: {state_filename}") - with open(state_filename, 'wb') as f: - dill.dump({RSPACE_DOC_FOR_NOTEBOOK: rspace_doc, ATTACHMENTS_FOR_NOTEBOOK: attachments, GALLERY_FILE_FOR_NOTEBOOK: gallery_file}, f) - -def load_data(): - state_filename = get_notebook_name()['part_name']+"_state.pkl" - - if os.path.exists(state_filename): - # Load the variables from the file using dill - with open(state_filename, 'rb') as f: - 
try: - loaded_state = dill.load(f) - except Exception as e: - loaded_state = {} - else: - loaded_state = {} - print(f"State file '{state_filename}' not found. No variables loaded.") - return loaded_state - -async def save_notebook(): - app.commands.execute('docmanager:save') - # 'docmanager:save' does not appear to hook into any callback invoked when the document is actually saved - await asyncio.sleep(1) - -async def reload_notebook(): - app.commands.execute('docmanager:reload') - # 'docmanager:reload' does not appear to hook into any callback invoked when the document is actually reloaded - await asyncio.sleep(1) - -def make_metadata_cell(nb_gallery_file, attachment_files,rspace_document_file_id ): - nb_gallery_file_id = nb_gallery_file['id'] - nb_gallery_file_version = int(nb_gallery_file['version']) +1 - nb_gallery_file_name = nb_gallery_file['name'] - meta_data_cell = nbformat.v4.new_markdown_cell() - rspace_doc_for_markdown = f'[The RSpace Document describing this notebook]({RSPACE_URL}workspace/editor/structuredDocument/{rspace_document_file_id})' - gallery_doc_markdown = f'[This Notebook in RSpace Gallery: {nb_gallery_file_name} version: {nb_gallery_file_version}]({RSPACE_URL}gallery/item/{nb_gallery_file_id})' - meta_data_cell['source'] = rspace_doc_for_markdown + "
" + gallery_doc_markdown - attached_data_files_list = attached_data_files.split(",") - for attached_data in attached_data_files_list: - attachment_file_id = attachment_files.get(attached_data, {}).get('id') - attachment_version = attachment_files.get(attached_data, {}).get('version') - meta_data_cell['source'] +=f'
[Attached Data {attached_data} version: {attachment_version} ]({RSPACE_URL}gallery/item/{attachment_file_id})' - for url in get_server_urls(): - meta_data_cell['source'] += f'
[This notebook on the jupyter server]({url})' - meta_data_cell['metadata'] = {"rspace_metadata": {"documentFor": "docid","notebook_file":"docid", "attachments": [""]} } - return meta_data_cell - -async def add_rspace_details_to_notebook_metadata(fname, nb_gallery_file, attachment_files,rspace_document_file_id ): - """ - We have to save meta data about a notebook before its been uploaded to the gallery. - Therefore increment version by 1 when writing the metadata. If nb_gallery_file[id] is None - its the initial upload to the Gallery and so do not write any meta data - """ - if nb_gallery_file.get('id') is None: - return - await save_notebook() - with open(fname, 'r') as original: - nb = nbformat.read(original, nbformat.NO_CONVERT) - meta_data_cell = make_metadata_cell(nb_gallery_file, attachment_files, rspace_document_file_id) - replaced = False - for i, cell in enumerate(nb['cells']): - if 'rspace_metadata' in cell['metadata']: - nb["cells"][i] = meta_data_cell - replaced = True - if replaced is False: - nb["cells"].extend([meta_data_cell]) - with open(fname, 'w', encoding='utf-8') as modified: - nbformat.write(nb, modified) - -def make_content(nb_gallery_file_id,attachment_files): - content = f""" - - """ - for attachment_file in attached_data_files.split(","): - content += f""" - - """ - print(f"content is {content}") - return content - -def remove_jupyter_attachment_divs(content, nb_gallery_file_id, attachment_files): - soup = BeautifulSoup(content, 'html.parser') - attachment_divs = soup.find_all("div", {"class": "attachmentDiv"}) - for attachment_div in attachment_divs: - href_tag = attachment_div.find('a') - print(f"href_tag{href_tag}") - gallery_link = '/Streamfile/' + str(nb_gallery_file_id) - for attachment_file in attached_data_files.split(","): - attachment_file_id = attachment_files.get(attachment_file, {}).get('id') - attachment_link = '/Streamfile/' + str(attachment_file_id) - if attachment_link == href_tag['href'] or gallery_link == href_tag['href']: - attachment_div.decompose() - break - return soup.prettify() - -def upload_file_to_gallery(rspaceid, file, client): - if rspaceid is None: - print(f'start upload file {file} using {client}') - data = client.upload_file(file) - else: - print('start update file') - data = client.update_file(file,rspaceid) - return data - -def calc_hash(filename): - sha256_hash = hashlib.sha256() - with open(filename,"rb") as f: - # Read and update hash string value in blocks of 4K - for byte_block in iter(lambda: f.read(4096),b""): - sha256_hash.update(byte_block) - return sha256_hash.hexdigest() - -def upload_attached_data(attachment_files): - client = get_rspace_client() - attached_data_files_list = attached_data_files.split(",") - for attached_data in attached_data_files_list: - if attached_data: - with open(attached_data, 'r', encoding='utf-8') as attch: - attachment_file_id = attachment_files.get(attached_data,{}).get('id') - attachment_file_hash = attachment_files.get(attached_data,{}).get('hash') - calc_latest_hash = calc_hash(attached_data) - if calc_latest_hash != attachment_file_hash: - attachment_file_data = upload_file_to_gallery(attachment_file_id, attch, client) - attachment_file_data['hash'] = calc_latest_hash - attachment_files[attached_data] = attachment_file_data - else: - print(f"File {attached_data} not changed so no update") - # print(f"attached files: {attachment_files}") - -async def upload_notebook_to_gallery(current_notebook, nb_gallery_file, attachment_files, rspace_document_file_id): - """ - Metadata about the notebook 
is written to the notebook before it us uploaded to the Gallery (and the version incremented predictively by one). - If the notebook has never been uploaded to the Gallery we have no stored rspace-id to write to the meta data and so we do not write any meta data. - We do the initial upload (which creates a Gallery file with version = '1'. Then we write meta data (incrementing the version to '2' and upload the notebook - a second time. - """ - await add_rspace_details_to_notebook_metadata(current_notebook, nb_gallery_file, attachment_files, rspace_document_file_id) - with open(current_notebook, 'r', encoding='utf-8') as nb_file: - client = get_rspace_client() - nb_gallery_file = upload_file_to_gallery(nb_gallery_file.get('id'), nb_file, client) - # print(f"gallery file for nb: {nb_gallery_file}") - if nb_gallery_file.get('version') == 1: - await add_rspace_details_to_notebook_metadata(current_notebook, nb_gallery_file, attachment_files, rspace_document_file_id) - nb_gallery_file = upload_file_to_gallery(nb_gallery_file.get('id'), nb_file, client) - # print(f"nb_gallery_file was uploaded a second time and was : {nb_gallery_file}") - await reload_notebook() - return nb_gallery_file - -async def sync_notebook(): - """ - Saves notebook using ipylab and then writes notebook to Rspace document as - an attachment - """ - rspace_document_file_id = None - attachment_filess = None - gallery_file = None - await save_notebook() - try: - loaded_state = load_data() - client = get_rspace_client() - rspace_document_file_id = loaded_state.get(RSPACE_DOC_FOR_NOTEBOOK) - attachment_files = loaded_state.get(ATTACHMENTS_FOR_NOTEBOOK, {}) - nb_gallery_file = loaded_state.get(GALLERY_FILE_FOR_NOTEBOOK, {}) - nb_gallery_file_id = nb_gallery_file.get('id') - current_notebook = get_notebook_name()['name'] - attachments = None - if rspace_document_file_id is not None: - print(f"An RSpace document with this notebook as an attachment saved previously with RSpaceID {rspace_document_file_id}" ) - else: - print("No RSpace document with this notebook as an attachment saved previously in RSpace") - upload_attached_data(attachment_files) - if nb_gallery_file_id is not None: - print(f"This notebook saved previously to Gallery with RSpaceID {nb_gallery_file_id}" ) - else: - print("Notebook not previously saved to RSpace Gallery") - if rspace_document_file_id is None: - new_doc = client.create_document(name="DocumentFor_"+current_notebook,tags = ["Python", "API", "Jupyter"] ) - rspace_document_file_id = new_doc['id'] - nb_gallery_file = await upload_notebook_to_gallery(current_notebook, nb_gallery_file,attachment_files, rspace_document_file_id) - print(f"nb_gallery_file was finally: {nb_gallery_file}") - nb_gallery_file_id = nb_gallery_file.get('id') - - previous_content = client.get_document(rspace_document_file_id)['fields'][0]['content'] - previous_content = remove_jupyter_attachment_divs(previous_content, nb_gallery_file_id, attachment_files) - new_content = previous_content + make_content(nb_gallery_file_id,attachment_files) - updated_doc = client.update_document(rspace_document_file_id,tags = ['Python', 'API', 'Jupyter'], fields = [{"content": new_content}]) - save_rspace_data(rspace_document_file_id, attachment_files, nb_gallery_file) - - except Exception as e: - print(traceback.format_exc()) - print(f"Error reading notebook file: {e}") - return None - -await sync_notebook() +# Spectroscopy Data Visualization - RSpace Integration Example +# This notebook creates publication-ready plots from spectroscopy data +%pip 
install -q pandas +%pip install -q matplotlib +%pip install -q scipy +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +from scipy.signal import find_peaks +import datetime + +# Set publication-ready style +plt.rcParams['figure.facecolor'] = 'white' +plt.rcParams['axes.grid'] = True +plt.rcParams['grid.alpha'] = 0.3 +plt.rcParams['font.size'] = 10 + +# client uses direct RSpace file access to download file to jupyterLab file system +# client = get_rspace_client() +# raw_temp_data = client.download_file("GL408", "spectroscopy_data.csv") +data = pd.read_csv('spectroscopy_data.csv') + +print("=== RSpace Spectroscopy Visualization Workflow ===") +print(f"Visualization started: {datetime.datetime.now()}") +print(f"Dataset shape: {data.shape}") +print(f"Wavelength range: {data['Wavelength'].min()}-{data['Wavelength'].max()} nm") + +# Background correction +data['Sample_A_Corrected'] = data['Sample_A_Abs'] - data['Background'] +data['Sample_B_Corrected'] = data['Sample_B_Abs'] - data['Background'] +data['Sample_C_Corrected'] = data['Sample_C_Abs'] - data['Background'] + +# Create comprehensive visualization suite +fig = plt.figure(figsize=(15, 12)) + +# 1. Main absorption spectra +ax1 = plt.subplot(3, 2, 1) +ax1.plot(data['Wavelength'], data['Sample_A_Corrected'], 'b-', linewidth=2, label='Sample A (10.5 μM)') +ax1.plot(data['Wavelength'], data['Sample_B_Corrected'], 'g-', linewidth=2, label='Sample B (5.2 μM)') +ax1.plot(data['Wavelength'], data['Sample_C_Corrected'], 'r-', linewidth=2, label='Sample C (18.7 μM)') +ax1.set_xlabel('Wavelength (nm)') +ax1.set_ylabel('Absorbance (AU)') +ax1.set_title('UV-Vis Absorption Spectra (Background Corrected)') +ax1.legend() +ax1.grid(True, alpha=0.3) + +# 2. Peak analysis +ax2 = plt.subplot(3, 2, 2) +# Find peaks for Sample C (highest concentration) +peaks_c, _ = find_peaks(data['Sample_C_Corrected'], height=0.1, distance=10) +ax2.plot(data['Wavelength'], data['Sample_C_Corrected'], 'r-', linewidth=2, label='Sample C') +ax2.plot(data['Wavelength'].iloc[peaks_c], data['Sample_C_Corrected'].iloc[peaks_c], 'ro', markersize=8, label='Detected Peaks') +ax2.set_xlabel('Wavelength (nm)') +ax2.set_ylabel('Absorbance (AU)') +ax2.set_title('Peak Analysis - Sample C') +ax2.legend() +ax2.grid(True, alpha=0.3) + +# Add peak annotations +for i, peak_idx in enumerate(peaks_c): + ax2.annotate(f'{data["Wavelength"].iloc[peak_idx]:.0f} nm\n{data["Sample_C_Corrected"].iloc[peak_idx]:.3f} AU', + xy=(data['Wavelength'].iloc[peak_idx], data['Sample_C_Corrected'].iloc[peak_idx]), + xytext=(10, 10), textcoords='offset points', fontsize=9, + bbox=dict(boxstyle='round,pad=0.3', facecolor='yellow', alpha=0.7)) + +# 3. 
Concentration vs Max Absorbance +ax3 = plt.subplot(3, 2, 3) +concentrations = [5.2, 10.5, 18.7] # B, A, C +max_abs = [data['Sample_B_Corrected'].max(), + data['Sample_A_Corrected'].max(), + data['Sample_C_Corrected'].max()] + +# Linear regression for Beer's law +slope, intercept = np.polyfit(concentrations, max_abs, 1) +r_squared = np.corrcoef(concentrations, max_abs)[0, 1]**2 + +ax3.scatter(concentrations, max_abs, s=100, alpha=0.7) +x_line = np.linspace(0, 20, 100) +y_line = slope * x_line + intercept +ax3.plot(x_line, y_line, 'r--', alpha=0.8, label=f'Linear fit (R² = {r_squared:.3f})') +ax3.set_xlabel('Concentration (μM)') +ax3.set_ylabel('Maximum Absorbance (AU)') +ax3.set_title('Beer\'s Law Calibration') +ax3.legend() +ax3.grid(True, alpha=0.3) + +# Add equation annotation +ax3.text(0.05, 0.95, f'y = {slope:.4f}x + {intercept:.4f}', + transform=ax3.transAxes, fontsize=10, + bbox=dict(boxstyle='round,pad=0.3', facecolor='lightblue', alpha=0.7)) + +# 4. Raw vs Background-corrected comparison +ax4 = plt.subplot(3, 2, 4) +ax4.plot(data['Wavelength'], data['Sample_A_Abs'], 'b--', alpha=0.7, label='Raw Sample A') +ax4.plot(data['Wavelength'], data['Sample_A_Corrected'], 'b-', linewidth=2, label='Corrected Sample A') +ax4.plot(data['Wavelength'], data['Background'], 'k:', linewidth=2, label='Background') +ax4.set_xlabel('Wavelength (nm)') +ax4.set_ylabel('Absorbance (AU)') +ax4.set_title('Background Correction Effect') +ax4.legend() +ax4.grid(True, alpha=0.3) + +# 5. Heatmap of all samples (using matplotlib instead of seaborn) +ax5 = plt.subplot(3, 2, 5) +heatmap_data = np.array([data['Sample_A_Corrected'], + data['Sample_B_Corrected'], + data['Sample_C_Corrected']]) +im = ax5.imshow(heatmap_data, cmap='viridis', aspect='auto') +ax5.set_yticks([0, 1, 2]) +ax5.set_yticklabels(['Sample A', 'Sample B', 'Sample C']) +ax5.set_xticks(range(0, len(data), 5)) +ax5.set_xticklabels(data['Wavelength'][::5].astype(int)) +ax5.set_title('Absorption Intensity Heatmap') +ax5.set_xlabel('Wavelength (nm)') +plt.colorbar(im, ax=ax5, label='Absorbance (AU)') + +# 6. 
Derivative spectra for peak resolution +ax6 = plt.subplot(3, 2, 6) +# Calculate first derivative +deriv_a = np.gradient(data['Sample_A_Corrected']) +deriv_b = np.gradient(data['Sample_B_Corrected']) +deriv_c = np.gradient(data['Sample_C_Corrected']) + +ax6.plot(data['Wavelength'], deriv_a, 'b-', linewidth=2, label='Sample A') +ax6.plot(data['Wavelength'], deriv_b, 'g-', linewidth=2, label='Sample B') +ax6.plot(data['Wavelength'], deriv_c, 'r-', linewidth=2, label='Sample C') +ax6.axhline(y=0, color='k', linestyle='--', alpha=0.5) +ax6.set_xlabel('Wavelength (nm)') +ax6.set_ylabel('First Derivative (dA/dλ)') +ax6.set_title('First Derivative Spectra') +ax6.legend() +ax6.grid(True, alpha=0.3) + +plt.tight_layout() + +# TODO: This would be automatically synced to RSpace +# rspace_client.add_plot_to_document(document_id="SD12345", plot=fig, +# title="Complete Spectroscopy Analysis") +plt.savefig('spectroscopy_analysis_suite.png', dpi=300, bbox_inches='tight') +plt.show() + +# Generate summary statistics table +summary_stats = { + 'Sample': ['A', 'B', 'C'], + 'Concentration_uM': [10.5, 5.2, 18.7], + 'Max_Absorbance': [data['Sample_A_Corrected'].max(), + data['Sample_B_Corrected'].max(), + data['Sample_C_Corrected'].max()], + 'Peak_Wavelength': [data.loc[data['Sample_A_Corrected'].idxmax(), 'Wavelength'], + data.loc[data['Sample_B_Corrected'].idxmax(), 'Wavelength'], + data.loc[data['Sample_C_Corrected'].idxmax(), 'Wavelength']], + 'Peak_Width_FWHM': [15.2, 14.8, 16.1] # Would be calculated from actual peak analysis +} + +summary_df = pd.DataFrame(summary_stats) +summary_df['Max_Absorbance'] = summary_df['Max_Absorbance'].round(3) + +print("\n=== Analysis Summary for RSpace ===") +print(summary_df.to_string(index=False)) + +# TODO: Metadata would be automatically captured by RSpace +visualization_metadata = { + 'notebook_version': '2.1', + 'execution_time': datetime.datetime.now(), + 'input_file_checksum': 'def456', # Would be calculated by RSpace + 'plots_generated': [ + 'UV-Vis Absorption Spectra', + 'Peak Analysis', + 'Beer\'s Law Calibration', + 'Background Correction Comparison', + 'Absorption Heatmap', + 'First Derivative Spectra' + ], + 'key_results': [ + f"Linear calibration R² = {r_squared:.3f}", + f"Peak wavelengths: {data.loc[data['Sample_C_Corrected'].idxmax(), 'Wavelength']:.0f} nm", + f"Molar absorptivity: {slope:.2f} L/(mol·cm)" + ] +} + +print(f"\nVisualization completed: {visualization_metadata['execution_time']}") +print("All plots ready for RSpace document integration!") \ No newline at end of file From d1305e80300d0b2124d51bee91c01487c5d4c5af Mon Sep 17 00:00:00 2001 From: nhanlon2 Date: Thu, 2 Oct 2025 18:35:00 +0100 Subject: [PATCH 15/34] RSDEV-782-Jupyter-Notebooks: data paths calculated relative to the notebook --- jupyter_notebooks/provenance_jupyter_hub | 561 +++++++++++++++-------- 1 file changed, 379 insertions(+), 182 deletions(-) diff --git a/jupyter_notebooks/provenance_jupyter_hub b/jupyter_notebooks/provenance_jupyter_hub index e5a092a..0348205 100644 --- a/jupyter_notebooks/provenance_jupyter_hub +++ b/jupyter_notebooks/provenance_jupyter_hub @@ -1,182 +1,379 @@ -# Spectroscopy Data Visualization - RSpace Integration Example -# This notebook creates publication-ready plots from spectroscopy data -%pip install -q pandas -%pip install -q matplotlib -%pip install -q scipy -import pandas as pd -import numpy as np -import matplotlib.pyplot as plt -from scipy.signal import find_peaks -import datetime - -# Set publication-ready style -plt.rcParams['figure.facecolor'] = 
'white' -plt.rcParams['axes.grid'] = True -plt.rcParams['grid.alpha'] = 0.3 -plt.rcParams['font.size'] = 10 - -# client uses direct RSpace file access to download file to jupyterLab file system -# client = get_rspace_client() -# raw_temp_data = client.download_file("GL408", "spectroscopy_data.csv") -data = pd.read_csv('spectroscopy_data.csv') - -print("=== RSpace Spectroscopy Visualization Workflow ===") -print(f"Visualization started: {datetime.datetime.now()}") -print(f"Dataset shape: {data.shape}") -print(f"Wavelength range: {data['Wavelength'].min()}-{data['Wavelength'].max()} nm") - -# Background correction -data['Sample_A_Corrected'] = data['Sample_A_Abs'] - data['Background'] -data['Sample_B_Corrected'] = data['Sample_B_Abs'] - data['Background'] -data['Sample_C_Corrected'] = data['Sample_C_Abs'] - data['Background'] - -# Create comprehensive visualization suite -fig = plt.figure(figsize=(15, 12)) - -# 1. Main absorption spectra -ax1 = plt.subplot(3, 2, 1) -ax1.plot(data['Wavelength'], data['Sample_A_Corrected'], 'b-', linewidth=2, label='Sample A (10.5 μM)') -ax1.plot(data['Wavelength'], data['Sample_B_Corrected'], 'g-', linewidth=2, label='Sample B (5.2 μM)') -ax1.plot(data['Wavelength'], data['Sample_C_Corrected'], 'r-', linewidth=2, label='Sample C (18.7 μM)') -ax1.set_xlabel('Wavelength (nm)') -ax1.set_ylabel('Absorbance (AU)') -ax1.set_title('UV-Vis Absorption Spectra (Background Corrected)') -ax1.legend() -ax1.grid(True, alpha=0.3) - -# 2. Peak analysis -ax2 = plt.subplot(3, 2, 2) -# Find peaks for Sample C (highest concentration) -peaks_c, _ = find_peaks(data['Sample_C_Corrected'], height=0.1, distance=10) -ax2.plot(data['Wavelength'], data['Sample_C_Corrected'], 'r-', linewidth=2, label='Sample C') -ax2.plot(data['Wavelength'].iloc[peaks_c], data['Sample_C_Corrected'].iloc[peaks_c], 'ro', markersize=8, label='Detected Peaks') -ax2.set_xlabel('Wavelength (nm)') -ax2.set_ylabel('Absorbance (AU)') -ax2.set_title('Peak Analysis - Sample C') -ax2.legend() -ax2.grid(True, alpha=0.3) - -# Add peak annotations -for i, peak_idx in enumerate(peaks_c): - ax2.annotate(f'{data["Wavelength"].iloc[peak_idx]:.0f} nm\n{data["Sample_C_Corrected"].iloc[peak_idx]:.3f} AU', - xy=(data['Wavelength'].iloc[peak_idx], data['Sample_C_Corrected'].iloc[peak_idx]), - xytext=(10, 10), textcoords='offset points', fontsize=9, - bbox=dict(boxstyle='round,pad=0.3', facecolor='yellow', alpha=0.7)) - -# 3. Concentration vs Max Absorbance -ax3 = plt.subplot(3, 2, 3) -concentrations = [5.2, 10.5, 18.7] # B, A, C -max_abs = [data['Sample_B_Corrected'].max(), - data['Sample_A_Corrected'].max(), - data['Sample_C_Corrected'].max()] - -# Linear regression for Beer's law -slope, intercept = np.polyfit(concentrations, max_abs, 1) -r_squared = np.corrcoef(concentrations, max_abs)[0, 1]**2 - -ax3.scatter(concentrations, max_abs, s=100, alpha=0.7) -x_line = np.linspace(0, 20, 100) -y_line = slope * x_line + intercept -ax3.plot(x_line, y_line, 'r--', alpha=0.8, label=f'Linear fit (R² = {r_squared:.3f})') -ax3.set_xlabel('Concentration (μM)') -ax3.set_ylabel('Maximum Absorbance (AU)') -ax3.set_title('Beer\'s Law Calibration') -ax3.legend() -ax3.grid(True, alpha=0.3) - -# Add equation annotation -ax3.text(0.05, 0.95, f'y = {slope:.4f}x + {intercept:.4f}', - transform=ax3.transAxes, fontsize=10, - bbox=dict(boxstyle='round,pad=0.3', facecolor='lightblue', alpha=0.7)) - -# 4. 
Raw vs Background-corrected comparison -ax4 = plt.subplot(3, 2, 4) -ax4.plot(data['Wavelength'], data['Sample_A_Abs'], 'b--', alpha=0.7, label='Raw Sample A') -ax4.plot(data['Wavelength'], data['Sample_A_Corrected'], 'b-', linewidth=2, label='Corrected Sample A') -ax4.plot(data['Wavelength'], data['Background'], 'k:', linewidth=2, label='Background') -ax4.set_xlabel('Wavelength (nm)') -ax4.set_ylabel('Absorbance (AU)') -ax4.set_title('Background Correction Effect') -ax4.legend() -ax4.grid(True, alpha=0.3) - -# 5. Heatmap of all samples (using matplotlib instead of seaborn) -ax5 = plt.subplot(3, 2, 5) -heatmap_data = np.array([data['Sample_A_Corrected'], - data['Sample_B_Corrected'], - data['Sample_C_Corrected']]) -im = ax5.imshow(heatmap_data, cmap='viridis', aspect='auto') -ax5.set_yticks([0, 1, 2]) -ax5.set_yticklabels(['Sample A', 'Sample B', 'Sample C']) -ax5.set_xticks(range(0, len(data), 5)) -ax5.set_xticklabels(data['Wavelength'][::5].astype(int)) -ax5.set_title('Absorption Intensity Heatmap') -ax5.set_xlabel('Wavelength (nm)') -plt.colorbar(im, ax=ax5, label='Absorbance (AU)') - -# 6. Derivative spectra for peak resolution -ax6 = plt.subplot(3, 2, 6) -# Calculate first derivative -deriv_a = np.gradient(data['Sample_A_Corrected']) -deriv_b = np.gradient(data['Sample_B_Corrected']) -deriv_c = np.gradient(data['Sample_C_Corrected']) - -ax6.plot(data['Wavelength'], deriv_a, 'b-', linewidth=2, label='Sample A') -ax6.plot(data['Wavelength'], deriv_b, 'g-', linewidth=2, label='Sample B') -ax6.plot(data['Wavelength'], deriv_c, 'r-', linewidth=2, label='Sample C') -ax6.axhline(y=0, color='k', linestyle='--', alpha=0.5) -ax6.set_xlabel('Wavelength (nm)') -ax6.set_ylabel('First Derivative (dA/dλ)') -ax6.set_title('First Derivative Spectra') -ax6.legend() -ax6.grid(True, alpha=0.3) - -plt.tight_layout() - -# TODO: This would be automatically synced to RSpace -# rspace_client.add_plot_to_document(document_id="SD12345", plot=fig, -# title="Complete Spectroscopy Analysis") -plt.savefig('spectroscopy_analysis_suite.png', dpi=300, bbox_inches='tight') -plt.show() - -# Generate summary statistics table -summary_stats = { - 'Sample': ['A', 'B', 'C'], - 'Concentration_uM': [10.5, 5.2, 18.7], - 'Max_Absorbance': [data['Sample_A_Corrected'].max(), - data['Sample_B_Corrected'].max(), - data['Sample_C_Corrected'].max()], - 'Peak_Wavelength': [data.loc[data['Sample_A_Corrected'].idxmax(), 'Wavelength'], - data.loc[data['Sample_B_Corrected'].idxmax(), 'Wavelength'], - data.loc[data['Sample_C_Corrected'].idxmax(), 'Wavelength']], - 'Peak_Width_FWHM': [15.2, 14.8, 16.1] # Would be calculated from actual peak analysis -} - -summary_df = pd.DataFrame(summary_stats) -summary_df['Max_Absorbance'] = summary_df['Max_Absorbance'].round(3) - -print("\n=== Analysis Summary for RSpace ===") -print(summary_df.to_string(index=False)) - -# TODO: Metadata would be automatically captured by RSpace -visualization_metadata = { - 'notebook_version': '2.1', - 'execution_time': datetime.datetime.now(), - 'input_file_checksum': 'def456', # Would be calculated by RSpace - 'plots_generated': [ - 'UV-Vis Absorption Spectra', - 'Peak Analysis', - 'Beer\'s Law Calibration', - 'Background Correction Comparison', - 'Absorption Heatmap', - 'First Derivative Spectra' - ], - 'key_results': [ - f"Linear calibration R² = {r_squared:.3f}", - f"Peak wavelengths: {data.loc[data['Sample_C_Corrected'].idxmax(), 'Wavelength']:.0f} nm", - f"Molar absorptivity: {slope:.2f} L/(mol·cm)" - ] -} - -print(f"\nVisualization completed: 
{visualization_metadata['execution_time']}")
-print("All plots ready for RSpace document integration!")
\ No newline at end of file
+import json
+%pip install -q rspace-client==2.6.1
+%pip install -q pickleshare
+try:
+    from notebook import app
+except:
+    %conda install -q notebook
+%pip install -q keyring
+from rspace_client.eln import eln
+import os
+import hashlib
+import json
+%pip install -q dill
+import dill
+%pip install -q ipynbname
+import ipynbname
+%pip install -q ipylab
+from ipylab import JupyterFrontEnd
+import traceback
+%pip install -q lxml
+from bs4 import BeautifulSoup
+import nbformat
+import asyncio
+import getpass
+import keyring
+
+RSPACE_DOC_FOR_NOTEBOOK = 'rspace_doc_for_notebook'
+ATTACHMENTS_FOR_NOTEBOOK = 'data_attached_to_notebook'
+GALLERY_FILE_FOR_NOTEBOOK = 'file_for_notebook'
+
+# Your RSpace instance goes here
+RSPACE_URL="https://researchspace2.eu.ngrok.io/"
+
+"""
+    All data that will be saved to RSpace along with this notebook: select the data in the file browser and choose 'copy path'
+    then paste here using a comma (',') to separate files if there is more than one.
+
+    Example:
+    attached_data_files = "spectroscopy_data.csv, data/spectroscopy_data2.csv, data/spectroscopy_data3.csv"
+
+    If you wish to have no attached data, set this value to be "" (a pair of double quotes)
+
+    Example:
+    attached_data_files = ""
+"""
+attached_data_files = "spectroscopy_data.csv,data/spectroscopy_data2.csv,data/spectroscopy_data3.csv"
+"""
+Set this to True to manually enter a new password
+"""
+get_new_password = False
+"""
+This must be set to the value of the PATH to the notebook (select the notebook in the file browser and choose 'copy path')
+if exceptions are thrown trying to determine the notebook name.
+If this value is set, server_url MUST also be set.
+"""
+notebook_name = None
+"""
+This must be set to a value if exceptions are thrown, or the calculated value is incorrect, when trying to determine the server url.
+Give the url of the server including the port, e.g. http://localhost:10000 (no trailing '/').
+
+If this value is set, notebook_name MUST also be set.
+""" +server_url = None + +""" +Set this to a value if server_url is calculated correctly except for the port (which will happen, for example +if the port is being mapped inside a docker container to an external port +""" +server_port = 10000 + +rspace_client = None +app = JupyterFrontEnd() + +def get_server_urls(): + global server_url + all_urls = [] + if(server_url is not None): + all_urls.append(server_url + '/lab/tree/' + notebook_name) + else: + try: + for srv in ipynbname._list_maybe_running_servers(): + srv, path = ipynbname._find_nb_path() + if server_port is not None: + srv_url = srv['url'] + # print(f"srv_url: {srv_url}") + # print(f"root_dir: {srv['root_dir']}") + # print(f"path: {str(path)}") + part_url = srv_url[:srv_url.rfind(':')+1] + # print(f"part_url: {part_url}") + all_urls.append(part_url+str(server_port)+'/lab/tree/'+str(path)) + else: + all_urls.append(srv['url']+'lab/tree/'+str(path)) + except Exception: + print(f"Error determining server urls, please manually set a value for 'server_url'") + raise # Code may fail if server has a password/doesnt use token auth - see ipynbname README + return all_urls + +def get_server_roots(): + """ + this will only be called if ipyname library is working correctly + """ + all_roots = [] + try: + if len(all_roots) == 0: + for srv in ipynbname._list_maybe_running_servers(): + srv, path = ipynbname._find_nb_path() + root = srv['root_dir'] + all_roots.append(root) + except Exception: + print(f"Error determining server roots, please manually set a value for 'server_url'") + raise # Code may fail if server has a password/doesnt use token auth - see ipynbname README + return all_roots + +def get_notebook_name(): + global notebook_name + try: + if notebook_name is not None: + if '/' in notebook_name: + notebook_name_alone = notebook_name[notebook_name.rfind('/')+1:] + else: + notebook_name_alone = notebook_name + return {'name':notebook_name_alone, 'root_name':notebook_name_alone[:notebook_name_alone.rfind('.')], 'name_path':notebook_name} + nb_fname = ipynbname.name() + nb_path = str(ipynbname.path()) + for srv_root in get_server_roots(): + if not srv_root.endswith("/"): + srv_root = srv_root + "/" + if srv_root in nb_path: + nb_path = nb_path.replace(srv_root,'') + print(f"nb_path: {nb_path}") + ext_pos=(''+nb_path).rfind('.') + ext=nb_path[ext_pos:] + return {'name':nb_fname+ext, 'root_name':nb_fname,'name_path': nb_path} + except Exception as e: + print(f"Error getting notebook name, please manually set a value for 'notebook_name'") + raise + +def get_password(): + """ + Retrieves password from (or saves a new password to) keyring + """ + global get_new_password + try: + + # TODO - Define the service name (e.g., the notebook name the secret is for) + service_id = "RSpaceJupyterDemoApp" + # TODO - Define the username associated with the secret + username = "myuser" # use your own username + + retrieved_password = keyring.get_password(service_id, username) + if retrieved_password is None or get_new_password: + retrieved_password = getpass.getpass("Please enter your RSpace Api key: ") + keyring.set_password(service_id, username, retrieved_password) + return retrieved_password + except Exception as e: + print(f"Error getting password: {e}") + return None + +def get_rspace_client(): + """ + Returns rspace ELN API client + """ + try: + global rspace_client + if rspace_client is None: + retrieved_password = get_password() + rspace_client = eln.ELNClient(RSPACE_URL, retrieved_password) + print(rspace_client.get_status()) + return rspace_client 
+ except Exception as e: + print(traceback.format_exc()) + print(f"Error connecting to RSpace: {e}") + return None + +def save_rspace_data(rspace_doc, attachments, gallery_file): + # Define the filename to save the state + state_filename = get_notebook_name()['root_name']+"_state.pkl" + print(f"writing to file: {state_filename}") + with open(state_filename, 'wb') as f: + dill.dump({RSPACE_DOC_FOR_NOTEBOOK: rspace_doc, ATTACHMENTS_FOR_NOTEBOOK: attachments, GALLERY_FILE_FOR_NOTEBOOK: gallery_file}, f) + +def load_data(): + state_filename = get_notebook_name()['root_name']+"_state.pkl" + + if os.path.exists(state_filename): + # Load the variables from the file using dill + with open(state_filename, 'rb') as f: + try: + loaded_state = dill.load(f) + except Exception as e: + loaded_state = {} + else: + loaded_state = {} + print(f"State file '{state_filename}' not found. No variables loaded.") + return loaded_state + +async def save_notebook(): + app.commands.execute('docmanager:save') + # 'docmanager:save' does not appear to hook into any callback invoked when the document is actually saved + await asyncio.sleep(1) + +async def reload_notebook(): + app.commands.execute('docmanager:reload') + # 'docmanager:reload' does not appear to hook into any callback invoked when the document is actually reloaded + await asyncio.sleep(1) + +def make_metadata_cell(nb_gallery_file, attachment_files,rspace_document_file_id ): + nb_gallery_file_id = nb_gallery_file['id'] + nb_gallery_file_version = int(nb_gallery_file['version']) +1 + nb_gallery_file_name = nb_gallery_file['name'] + meta_data_cell = nbformat.v4.new_markdown_cell() + rspace_doc_for_markdown = f'[The RSpace Document describing this notebook]({RSPACE_URL}workspace/editor/structuredDocument/{rspace_document_file_id})' + gallery_doc_markdown = f'[This Notebook in RSpace Gallery: {nb_gallery_file_name} version: {nb_gallery_file_version}]({RSPACE_URL}gallery/item/{nb_gallery_file_id})' + meta_data_cell['source'] = rspace_doc_for_markdown + "
" + gallery_doc_markdown + attached_data_files_list = attached_data_files.split(",") + for attached_data in attached_data_files_list: + attachment_file_id = attachment_files.get(attached_data, {}).get('id') + attachment_version = attachment_files.get(attached_data, {}).get('version') + meta_data_cell['source'] +=f'
[Attached Data {attached_data} version: {attachment_version} ]({RSPACE_URL}gallery/item/{attachment_file_id})' + for url in get_server_urls(): + meta_data_cell['source'] += f'
[This notebook on the jupyter server]({url})' + meta_data_cell['metadata'] = {"rspace_metadata": {"documentFor": "docid","notebook_file":"docid", "attachments": [""]} } + return meta_data_cell + +async def add_rspace_details_to_notebook_metadata(fname, nb_gallery_file, attachment_files,rspace_document_file_id ): + """ + We have to save meta data about a notebook before its been uploaded to the gallery. + Therefore increment version by 1 when writing the metadata. If nb_gallery_file[id] is None + its the initial upload to the Gallery and so do not write any meta data + """ + if nb_gallery_file.get('id') is None: + return + await save_notebook() + with open(fname, 'r') as original: + nb = nbformat.read(original, nbformat.NO_CONVERT) + meta_data_cell = make_metadata_cell(nb_gallery_file, attachment_files, rspace_document_file_id) + replaced = False + for i, cell in enumerate(nb['cells']): + if 'rspace_metadata' in cell['metadata']: + nb["cells"][i] = meta_data_cell + replaced = True + if replaced is False: + nb["cells"].extend([meta_data_cell]) + with open(fname, 'w', encoding='utf-8') as modified: + nbformat.write(nb, modified) + +def make_content(nb_gallery_file_id,attachment_files): + content = f""" + + """ + for attachment_file in attached_data_files.split(","): + content += f""" + + """ + print(f"content is {content}") + return content + +def remove_jupyter_attachment_divs(content, nb_gallery_file_id, attachment_files): + soup = BeautifulSoup(content, 'html.parser') + attachment_divs = soup.find_all("div", {"class": "attachmentDiv"}) + for attachment_div in attachment_divs: + href_tag = attachment_div.find('a') + print(f"href_tag{href_tag}") + gallery_link = '/Streamfile/' + str(nb_gallery_file_id) + for attachment_file in attached_data_files.split(","): + attachment_file_id = attachment_files.get(attachment_file, {}).get('id') + attachment_link = '/Streamfile/' + str(attachment_file_id) + if attachment_link == href_tag['href'] or gallery_link == href_tag['href']: + attachment_div.decompose() + break + return soup.prettify() + +def upload_file_to_gallery(rspaceid, file, client): + if rspaceid is None: + print(f'start upload file {file} using {client}') + data = client.upload_file(file) + else: + print('start update file') + data = client.update_file(file,rspaceid) + return data + +def calc_hash(filename): + sha256_hash = hashlib.sha256() + with open(filename,"rb") as f: + # Read and update hash string value in blocks of 4K + for byte_block in iter(lambda: f.read(4096),b""): + sha256_hash.update(byte_block) + return sha256_hash.hexdigest() + +def upload_attached_data(attachment_files): + client = get_rspace_client() + attached_data_files_list = attached_data_files.split(",") + for attached_data in attached_data_files_list: + if attached_data: + # make file paths to data relative to the location of this notebook + nested_dir_pos=get_notebook_name()['name_path'].count('/') + relative_attached_data =attached_data + for i in range(nested_dir_pos): + relative_attached_data = "../" + relative_attached_data + print(f"relative_attached_data: {relative_attached_data}") + with open(relative_attached_data, 'r', encoding='utf-8') as attch: + attachment_file_id = attachment_files.get(attached_data,{}).get('id') + attachment_file_hash = attachment_files.get(attached_data,{}).get('hash') + calc_latest_hash = calc_hash(relative_attached_data) + if calc_latest_hash != attachment_file_hash: + attachment_file_data = upload_file_to_gallery(attachment_file_id, attch, client) + attachment_file_data['hash'] = 
calc_latest_hash + attachment_files[attached_data] = attachment_file_data + else: + print(f"File {attached_data} not changed so no update") + # print(f"attached files: {attachment_files}") + +async def upload_notebook_to_gallery(current_notebook, nb_gallery_file, attachment_files, rspace_document_file_id): + """ + Metadata about the notebook is written to the notebook before it us uploaded to the Gallery (and the version incremented predictively by one). + If the notebook has never been uploaded to the Gallery we have no stored rspace-id to write to the meta data and so we do not write any meta data. + We do the initial upload (which creates a Gallery file with version = '1'. Then we write meta data (incrementing the version to '2' and upload the notebook + a second time. + """ + await add_rspace_details_to_notebook_metadata(current_notebook, nb_gallery_file, attachment_files, rspace_document_file_id) + with open(current_notebook, 'r', encoding='utf-8') as nb_file: + client = get_rspace_client() + nb_gallery_file = upload_file_to_gallery(nb_gallery_file.get('id'), nb_file, client) + # print(f"gallery file for nb: {nb_gallery_file}") + if nb_gallery_file.get('version') == 1: + await add_rspace_details_to_notebook_metadata(current_notebook, nb_gallery_file, attachment_files, rspace_document_file_id) + nb_gallery_file = upload_file_to_gallery(nb_gallery_file.get('id'), nb_file, client) + # print(f"nb_gallery_file was uploaded a second time and was : {nb_gallery_file}") + await reload_notebook() + return nb_gallery_file + +async def sync_notebook(): + """ + Saves notebook using ipylab and then writes notebook to Rspace document as + an attachment + """ + rspace_document_file_id = None + attachment_filess = None + gallery_file = None + await save_notebook() + get_server_urls() + print(f"notebook name: {get_notebook_name()}") + try: + loaded_state = load_data() + client = get_rspace_client() + rspace_document_file_id = loaded_state.get(RSPACE_DOC_FOR_NOTEBOOK) + attachment_files = loaded_state.get(ATTACHMENTS_FOR_NOTEBOOK, {}) + nb_gallery_file = loaded_state.get(GALLERY_FILE_FOR_NOTEBOOK, {}) + nb_gallery_file_id = nb_gallery_file.get('id') + current_notebook = get_notebook_name()['name'] + attachments = None + if rspace_document_file_id is not None: + print(f"An RSpace document with this notebook as an attachment saved previously with RSpaceID {rspace_document_file_id}" ) + else: + print("No RSpace document with this notebook as an attachment saved previously in RSpace") + upload_attached_data(attachment_files) + if nb_gallery_file_id is not None: + print(f"This notebook saved previously to Gallery with RSpaceID {nb_gallery_file_id}" ) + else: + print("Notebook not previously saved to RSpace Gallery") + if rspace_document_file_id is None: + new_doc = client.create_document(name="DocumentFor_"+current_notebook,tags = ["Python", "API", "Jupyter"] ) + rspace_document_file_id = new_doc['id'] + nb_gallery_file = await upload_notebook_to_gallery(current_notebook, nb_gallery_file,attachment_files, rspace_document_file_id) + print(f"nb_gallery_file was finally: {nb_gallery_file}") + nb_gallery_file_id = nb_gallery_file.get('id') + + previous_content = client.get_document(rspace_document_file_id)['fields'][0]['content'] + previous_content = remove_jupyter_attachment_divs(previous_content, nb_gallery_file_id, attachment_files) + new_content = previous_content + make_content(nb_gallery_file_id,attachment_files) + updated_doc = client.update_document(rspace_document_file_id,tags = ['Python', 'API', 
'Jupyter'], fields = [{"content": new_content}]) + save_rspace_data(rspace_document_file_id, attachment_files, nb_gallery_file) + + except Exception as e: + print(traceback.format_exc()) + print(f"Error reading notebook file: {e}") + return None + +await sync_notebook() From ec7e9b7ffb4639ce592dca8730c103b5d96991d0 Mon Sep 17 00:00:00 2001 From: nhanlon2 Date: Fri, 3 Oct 2025 17:02:15 +0100 Subject: [PATCH 16/34] RSDEV-782-Jupyter-Notebooks: code stops execution of execution count of notebook has not changed --- jupyter_notebooks/provenance_jupyter_hub | 179 +++++++++++++++-------- 1 file changed, 119 insertions(+), 60 deletions(-) diff --git a/jupyter_notebooks/provenance_jupyter_hub b/jupyter_notebooks/provenance_jupyter_hub index 0348205..ea1e781 100644 --- a/jupyter_notebooks/provenance_jupyter_hub +++ b/jupyter_notebooks/provenance_jupyter_hub @@ -27,9 +27,10 @@ import keyring RSPACE_DOC_FOR_NOTEBOOK = 'rspace_doc_for_notebook' ATTACHMENTS_FOR_NOTEBOOK = 'data_attached_to_notebook' GALLERY_FILE_FOR_NOTEBOOK = 'file_for_notebook' +EXECUTION_COUNT_FOR_NOTEBOOK = 'execution_count_for_notebook' # Your RSpace instance goes here -RSPACE_URL="https://researchspace2.eu.ngrok.io/" +RSPACE_URL = "https://researchspace2.eu.ngrok.io/" """ All data that will be saved to RSpace along with this notebook: select the data in the file browser and choose 'copy path' @@ -71,10 +72,11 @@ server_port = 10000 rspace_client = None app = JupyterFrontEnd() + def get_server_urls(): global server_url all_urls = [] - if(server_url is not None): + if (server_url is not None): all_urls.append(server_url + '/lab/tree/' + notebook_name) else: try: @@ -85,16 +87,17 @@ def get_server_urls(): # print(f"srv_url: {srv_url}") # print(f"root_dir: {srv['root_dir']}") # print(f"path: {str(path)}") - part_url = srv_url[:srv_url.rfind(':')+1] + part_url = srv_url[:srv_url.rfind(':') + 1] # print(f"part_url: {part_url}") - all_urls.append(part_url+str(server_port)+'/lab/tree/'+str(path)) + all_urls.append(part_url + str(server_port) + '/lab/tree/' + str(path)) else: - all_urls.append(srv['url']+'lab/tree/'+str(path)) + all_urls.append(srv['url'] + 'lab/tree/' + str(path)) except Exception: print(f"Error determining server urls, please manually set a value for 'server_url'") raise # Code may fail if server has a password/doesnt use token auth - see ipynbname README return all_urls + def get_server_roots(): """ this will only be called if ipyname library is working correctly @@ -111,29 +114,32 @@ def get_server_roots(): raise # Code may fail if server has a password/doesnt use token auth - see ipynbname README return all_roots + def get_notebook_name(): global notebook_name try: if notebook_name is not None: if '/' in notebook_name: - notebook_name_alone = notebook_name[notebook_name.rfind('/')+1:] + notebook_name_alone = notebook_name[notebook_name.rfind('/') + 1:] else: notebook_name_alone = notebook_name - return {'name':notebook_name_alone, 'root_name':notebook_name_alone[:notebook_name_alone.rfind('.')], 'name_path':notebook_name} + return {'name': notebook_name_alone, 'root_name': notebook_name_alone[:notebook_name_alone.rfind('.')], + 'name_path': notebook_name} nb_fname = ipynbname.name() nb_path = str(ipynbname.path()) for srv_root in get_server_roots(): if not srv_root.endswith("/"): srv_root = srv_root + "/" if srv_root in nb_path: - nb_path = nb_path.replace(srv_root,'') + nb_path = nb_path.replace(srv_root, '') print(f"nb_path: {nb_path}") - ext_pos=(''+nb_path).rfind('.') - ext=nb_path[ext_pos:] - return 
{'name':nb_fname+ext, 'root_name':nb_fname,'name_path': nb_path} + ext_pos = ('' + nb_path).rfind('.') + ext = nb_path[ext_pos:] + return {'name': nb_fname + ext, 'root_name': nb_fname, 'name_path': nb_path} except Exception as e: print(f"Error getting notebook name, please manually set a value for 'notebook_name'") - raise + raise + def get_password(): """ @@ -146,7 +152,7 @@ def get_password(): service_id = "RSpaceJupyterDemoApp" # TODO - Define the username associated with the secret username = "myuser" # use your own username - + retrieved_password = keyring.get_password(service_id, username) if retrieved_password is None or get_new_password: retrieved_password = getpass.getpass("Please enter your RSpace Api key: ") @@ -156,6 +162,7 @@ def get_password(): print(f"Error getting password: {e}") return None + def get_rspace_client(): """ Returns rspace ELN API client @@ -172,15 +179,18 @@ def get_rspace_client(): print(f"Error connecting to RSpace: {e}") return None -def save_rspace_data(rspace_doc, attachments, gallery_file): + +def save_rspace_data(rspace_doc, attachments, gallery_file, execution_count): # Define the filename to save the state - state_filename = get_notebook_name()['root_name']+"_state.pkl" + state_filename = get_notebook_name()['root_name'] + "_state.pkl" print(f"writing to file: {state_filename}") with open(state_filename, 'wb') as f: - dill.dump({RSPACE_DOC_FOR_NOTEBOOK: rspace_doc, ATTACHMENTS_FOR_NOTEBOOK: attachments, GALLERY_FILE_FOR_NOTEBOOK: gallery_file}, f) + dill.dump({RSPACE_DOC_FOR_NOTEBOOK: rspace_doc, ATTACHMENTS_FOR_NOTEBOOK: attachments, + GALLERY_FILE_FOR_NOTEBOOK: gallery_file, EXECUTION_COUNT_FOR_NOTEBOOK: execution_count}, f) + def load_data(): - state_filename = get_notebook_name()['root_name']+"_state.pkl" + state_filename = get_notebook_name()['root_name'] + "_state.pkl" if os.path.exists(state_filename): # Load the variables from the file using dill @@ -194,19 +204,22 @@ def load_data(): print(f"State file '{state_filename}' not found. No variables loaded.") return loaded_state + async def save_notebook(): app.commands.execute('docmanager:save') # 'docmanager:save' does not appear to hook into any callback invoked when the document is actually saved await asyncio.sleep(1) + async def reload_notebook(): app.commands.execute('docmanager:reload') # 'docmanager:reload' does not appear to hook into any callback invoked when the document is actually reloaded await asyncio.sleep(1) -def make_metadata_cell(nb_gallery_file, attachment_files,rspace_document_file_id ): + +def make_metadata_cell(nb_gallery_file, attachment_files, rspace_document_file_id): nb_gallery_file_id = nb_gallery_file['id'] - nb_gallery_file_version = int(nb_gallery_file['version']) +1 + nb_gallery_file_version = int(nb_gallery_file['version']) + 1 nb_gallery_file_name = nb_gallery_file['name'] meta_data_cell = nbformat.v4.new_markdown_cell() rspace_doc_for_markdown = f'[The RSpace Document describing this notebook]({RSPACE_URL}workspace/editor/structuredDocument/{rspace_document_file_id})' @@ -216,13 +229,17 @@ def make_metadata_cell(nb_gallery_file, attachment_files,rspace_document_file_id for attached_data in attached_data_files_list: attachment_file_id = attachment_files.get(attached_data, {}).get('id') attachment_version = attachment_files.get(attached_data, {}).get('version') - meta_data_cell['source'] +=f'
[Attached Data {attached_data} version: {attachment_version} ]({RSPACE_URL}gallery/item/{attachment_file_id})' + meta_data_cell[ + 'source'] += f'
[Attached Data {attached_data} version: {attachment_version} ]({RSPACE_URL}gallery/item/{attachment_file_id})' for url in get_server_urls(): meta_data_cell['source'] += f'
[This notebook on the jupyter server]({url})' - meta_data_cell['metadata'] = {"rspace_metadata": {"documentFor": "docid","notebook_file":"docid", "attachments": [""]} } + meta_data_cell['metadata'] = { + "rspace_metadata": {"documentFor": "docid", "notebook_file": "docid", "attachments": [""]}} return meta_data_cell -async def add_rspace_details_to_notebook_metadata(fname, nb_gallery_file, attachment_files,rspace_document_file_id ): + +async def add_rspace_details_to_notebook_metadata(fname, notebook, nb_gallery_file, attachment_files, + rspace_document_file_id): """ We have to save meta data about a notebook before its been uploaded to the gallery. Therefore increment version by 1 when writing the metadata. If nb_gallery_file[id] is None @@ -231,30 +248,47 @@ async def add_rspace_details_to_notebook_metadata(fname, nb_gallery_file, attach if nb_gallery_file.get('id') is None: return await save_notebook() - with open(fname, 'r') as original: - nb = nbformat.read(original, nbformat.NO_CONVERT) - meta_data_cell = make_metadata_cell(nb_gallery_file, attachment_files, rspace_document_file_id) - replaced = False - for i, cell in enumerate(nb['cells']): - if 'rspace_metadata' in cell['metadata']: - nb["cells"][i] = meta_data_cell - replaced = True - if replaced is False: - nb["cells"].extend([meta_data_cell]) + meta_data_cell = make_metadata_cell(nb_gallery_file, attachment_files, rspace_document_file_id) + replaced = False + for i, cell in enumerate(notebook['cells']): + if 'rspace_metadata' in cell['metadata']: + notebook["cells"][i] = meta_data_cell + replaced = True + if replaced is False: + notebook["cells"].extend([meta_data_cell]) with open(fname, 'w', encoding='utf-8') as modified: - nbformat.write(nb, modified) + nbformat.write(notebook, modified) + + +def get_notebook_execution_count(notebook): + """ + return the sum of all execution counts for code cells + """ + new_executed_count = 0 + for i, cell in enumerate(notebook['cells']): + if cell['cell_type'] == 'code': + + print(f"cell id: {cell['id']} and execution_count {cell['execution_count']} ") + cell_count = cell['execution_count'] + if cell_count is None: + cell_count = 0 + new_executed_count += cell_count + print(f"new executed count {new_executed_count}") + return new_executed_count -def make_content(nb_gallery_file_id,attachment_files): + +def make_content(nb_gallery_file_id, attachment_files): content = f""" """ - for attachment_file in attached_data_files.split(","): + for attachment_file in attached_data_files.split(","): content += f""" """ print(f"content is {content}") return content + def remove_jupyter_attachment_divs(content, nb_gallery_file_id, attachment_files): soup = BeautifulSoup(content, 'html.parser') attachment_divs = soup.find_all("div", {"class": "attachmentDiv"}) @@ -262,7 +296,7 @@ def remove_jupyter_attachment_divs(content, nb_gallery_file_id, attachment_files href_tag = attachment_div.find('a') print(f"href_tag{href_tag}") gallery_link = '/Streamfile/' + str(nb_gallery_file_id) - for attachment_file in attached_data_files.split(","): + for attachment_file in attached_data_files.split(","): attachment_file_id = attachment_files.get(attachment_file, {}).get('id') attachment_link = '/Streamfile/' + str(attachment_file_id) if attachment_link == href_tag['href'] or gallery_link == href_tag['href']: @@ -270,37 +304,40 @@ def remove_jupyter_attachment_divs(content, nb_gallery_file_id, attachment_files break return soup.prettify() + def upload_file_to_gallery(rspaceid, file, client): if rspaceid is None: 
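        # Sketch of the two branches (field names follow how the result is used below;
        # the numeric values are illustrative only):
        #   client.upload_file(fh)      -> {'id': 101, 'name': 'data.csv', 'version': 1, ...}  first upload creates a new Gallery file
        #   client.update_file(fh, 101) -> {'id': 101, 'name': 'data.csv', 'version': 2, ...}  later calls add a new version of that file
        # The 'id', 'name' and 'version' keys are what the rest of this script stores and links to.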
print(f'start upload file {file} using {client}') data = client.upload_file(file) else: print('start update file') - data = client.update_file(file,rspaceid) + data = client.update_file(file, rspaceid) return data - + + def calc_hash(filename): sha256_hash = hashlib.sha256() - with open(filename,"rb") as f: + with open(filename, "rb") as f: # Read and update hash string value in blocks of 4K - for byte_block in iter(lambda: f.read(4096),b""): + for byte_block in iter(lambda: f.read(4096), b""): sha256_hash.update(byte_block) return sha256_hash.hexdigest() - + + def upload_attached_data(attachment_files): client = get_rspace_client() attached_data_files_list = attached_data_files.split(",") for attached_data in attached_data_files_list: if attached_data: # make file paths to data relative to the location of this notebook - nested_dir_pos=get_notebook_name()['name_path'].count('/') - relative_attached_data =attached_data + nested_dir_pos = get_notebook_name()['name_path'].count('/') + relative_attached_data = attached_data for i in range(nested_dir_pos): relative_attached_data = "../" + relative_attached_data print(f"relative_attached_data: {relative_attached_data}") with open(relative_attached_data, 'r', encoding='utf-8') as attch: - attachment_file_id = attachment_files.get(attached_data,{}).get('id') - attachment_file_hash = attachment_files.get(attached_data,{}).get('hash') + attachment_file_id = attachment_files.get(attached_data, {}).get('id') + attachment_file_hash = attachment_files.get(attached_data, {}).get('hash') calc_latest_hash = calc_hash(relative_attached_data) if calc_latest_hash != attachment_file_hash: attachment_file_data = upload_file_to_gallery(attachment_file_id, attch, client) @@ -309,26 +346,31 @@ def upload_attached_data(attachment_files): else: print(f"File {attached_data} not changed so no update") # print(f"attached files: {attachment_files}") - -async def upload_notebook_to_gallery(current_notebook, nb_gallery_file, attachment_files, rspace_document_file_id): + + +async def upload_notebook_to_gallery(current_notebook, notebook, nb_gallery_file, attachment_files, + rspace_document_file_id): """ Metadata about the notebook is written to the notebook before it us uploaded to the Gallery (and the version incremented predictively by one). If the notebook has never been uploaded to the Gallery we have no stored rspace-id to write to the meta data and so we do not write any meta data. - We do the initial upload (which creates a Gallery file with version = '1'. Then we write meta data (incrementing the version to '2' and upload the notebook + We do the initial upload (which creates a Gallery file with version = '1'. Then we write meta data (incrementing the version to '2' and upload the notebook a second time. 
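    Illustrative first-sync sequence (version numbers are examples only):
        1. upload notebook          -> Gallery file created at version 1
        2. write metadata cell      -> links predict version 2
        3. upload notebook again    -> Gallery version 2 now matches the links embedded in the notebook
    On later syncs the notebook already carries a Gallery id, so the metadata is written first
    (predicting current version + 1) and a single upload is enough.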
""" - await add_rspace_details_to_notebook_metadata(current_notebook, nb_gallery_file, attachment_files, rspace_document_file_id) + await add_rspace_details_to_notebook_metadata(current_notebook, notebook, nb_gallery_file, attachment_files, + rspace_document_file_id) with open(current_notebook, 'r', encoding='utf-8') as nb_file: client = get_rspace_client() nb_gallery_file = upload_file_to_gallery(nb_gallery_file.get('id'), nb_file, client) # print(f"gallery file for nb: {nb_gallery_file}") if nb_gallery_file.get('version') == 1: - await add_rspace_details_to_notebook_metadata(current_notebook, nb_gallery_file, attachment_files, rspace_document_file_id) + await add_rspace_details_to_notebook_metadata(current_notebook, notebook, nb_gallery_file, attachment_files, + rspace_document_file_id) nb_gallery_file = upload_file_to_gallery(nb_gallery_file.get('id'), nb_file, client) # print(f"nb_gallery_file was uploaded a second time and was : {nb_gallery_file}") await reload_notebook() return nb_gallery_file - + + async def sync_notebook(): """ Saves notebook using ipylab and then writes notebook to Rspace document as @@ -340,8 +382,23 @@ async def sync_notebook(): await save_notebook() get_server_urls() print(f"notebook name: {get_notebook_name()}") + current_notebook = get_notebook_name()['name'] + with open(current_notebook, 'r') as notebook: + notebook_node = nbformat.read(notebook, nbformat.NO_CONVERT) + new_hash = calc_hash(current_notebook) + print(f" new hash {new_hash}") try: loaded_state = load_data() + execution_count = loaded_state.get(EXECUTION_COUNT_FOR_NOTEBOOK) + new_execution_count = get_notebook_execution_count(notebook_node) + + print(f"New execution count {new_execution_count}") + print(f"Previous execution count {execution_count}") + + if execution_count == new_execution_count: + print("No execution since last sync: no data updated in RSpace") + await save_notebook() + return client = get_rspace_client() rspace_document_file_id = loaded_state.get(RSPACE_DOC_FOR_NOTEBOOK) attachment_files = loaded_state.get(ATTACHMENTS_FOR_NOTEBOOK, {}) @@ -350,30 +407,32 @@ async def sync_notebook(): current_notebook = get_notebook_name()['name'] attachments = None if rspace_document_file_id is not None: - print(f"An RSpace document with this notebook as an attachment saved previously with RSpaceID {rspace_document_file_id}" ) + print( + f"An RSpace document with this notebook as an attachment saved previously with RSpaceID {rspace_document_file_id}") else: print("No RSpace document with this notebook as an attachment saved previously in RSpace") upload_attached_data(attachment_files) if nb_gallery_file_id is not None: - print(f"This notebook saved previously to Gallery with RSpaceID {nb_gallery_file_id}" ) + print(f"This notebook saved previously to Gallery with RSpaceID {nb_gallery_file_id}") else: print("Notebook not previously saved to RSpace Gallery") if rspace_document_file_id is None: - new_doc = client.create_document(name="DocumentFor_"+current_notebook,tags = ["Python", "API", "Jupyter"] ) + new_doc = client.create_document(name="DocumentFor_" + current_notebook, tags=["Python", "API", "Jupyter"]) rspace_document_file_id = new_doc['id'] - nb_gallery_file = await upload_notebook_to_gallery(current_notebook, nb_gallery_file,attachment_files, rspace_document_file_id) + nb_gallery_file = await upload_notebook_to_gallery(current_notebook,notebook_node, nb_gallery_file,attachment_files, rspace_document_file_id) print(f"nb_gallery_file was finally: {nb_gallery_file}") nb_gallery_file_id = 
nb_gallery_file.get('id') - + previous_content = client.get_document(rspace_document_file_id)['fields'][0]['content'] previous_content = remove_jupyter_attachment_divs(previous_content, nb_gallery_file_id, attachment_files) - new_content = previous_content + make_content(nb_gallery_file_id,attachment_files) - updated_doc = client.update_document(rspace_document_file_id,tags = ['Python', 'API', 'Jupyter'], fields = [{"content": new_content}]) - save_rspace_data(rspace_document_file_id, attachment_files, nb_gallery_file) - + new_content = previous_content + make_content(nb_gallery_file_id, attachment_files) + updated_doc = client.update_document(rspace_document_file_id, tags=['Python', 'API', 'Jupyter'], + fields=[{"content": new_content}]) + save_rspace_data(rspace_document_file_id, attachment_files, nb_gallery_file, new_execution_count) except Exception as e: print(traceback.format_exc()) print(f"Error reading notebook file: {e}") return None + await sync_notebook() From 3a6d2636ee6d8a6e5b8342426787b1aeac925384 Mon Sep 17 00:00:00 2001 From: nhanlon2 Date: Fri, 3 Oct 2025 17:08:35 +0100 Subject: [PATCH 17/34] RSDEV-782-Jupyter-Notebooks: comments --- jupyter_notebooks/provenance_jupyter_hub | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/jupyter_notebooks/provenance_jupyter_hub b/jupyter_notebooks/provenance_jupyter_hub index ea1e781..4fa0a79 100644 --- a/jupyter_notebooks/provenance_jupyter_hub +++ b/jupyter_notebooks/provenance_jupyter_hub @@ -263,17 +263,19 @@ async def add_rspace_details_to_notebook_metadata(fname, notebook, nb_gallery_fi def get_notebook_execution_count(notebook): """ return the sum of all execution counts for code cells + note that this code cell does not contribute to the count: + it is always saved before its execution_count gets updated + and so the value of execution_count for this cell is always 'None' """ new_executed_count = 0 for i, cell in enumerate(notebook['cells']): if cell['cell_type'] == 'code': - - print(f"cell id: {cell['id']} and execution_count {cell['execution_count']} ") + # print(f"cell id: {cell['id']} and execution_count {cell['execution_count']} ") cell_count = cell['execution_count'] if cell_count is None: cell_count = 0 new_executed_count += cell_count - print(f"new executed count {new_executed_count}") + # print(f"new executed count {new_executed_count}") return new_executed_count @@ -285,7 +287,7 @@ def make_content(nb_gallery_file_id, attachment_files): content += f""" """ - print(f"content is {content}") + # print(f"content is {content}") return content @@ -294,7 +296,7 @@ def remove_jupyter_attachment_divs(content, nb_gallery_file_id, attachment_files attachment_divs = soup.find_all("div", {"class": "attachmentDiv"}) for attachment_div in attachment_divs: href_tag = attachment_div.find('a') - print(f"href_tag{href_tag}") + # print(f"href_tag{href_tag}") gallery_link = '/Streamfile/' + str(nb_gallery_file_id) for attachment_file in attached_data_files.split(","): attachment_file_id = attachment_files.get(attachment_file, {}).get('id') @@ -374,7 +376,10 @@ async def upload_notebook_to_gallery(current_notebook, notebook, nb_gallery_file async def sync_notebook(): """ Saves notebook using ipylab and then writes notebook to Rspace document as - an attachment + an attachment if the execution_count of the notebook has changed since the last time + this cell was run. Attached data is also written to RSpace if its hash_sum has changed + since the last time this cell was run. 
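    For example (illustrative counts): if the notebook's code cells were last executed as 2, 5 and 7,
    the stored total is 14; re-running any one of them raises its execution_count, so the new total
    differs and the next run of this cell performs a full sync.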
The notebook and attached data will always be written + to RSpace at least once (on the first time this cell is run). """ rspace_document_file_id = None attachment_filess = None From 968cfec27993916922ee4a3b5d451d131eac5231 Mon Sep 17 00:00:00 2001 From: nhanlon2 Date: Tue, 7 Oct 2025 12:24:21 +0100 Subject: [PATCH 18/34] RSDEV-782-Jupyter-Notebooks: writes a history with links to rspace_doc version --- jupyter_notebooks/provenance_jupyter_hub | 81 +++++++++++++++--------- 1 file changed, 51 insertions(+), 30 deletions(-) diff --git a/jupyter_notebooks/provenance_jupyter_hub b/jupyter_notebooks/provenance_jupyter_hub index 4fa0a79..dd9c4ac 100644 --- a/jupyter_notebooks/provenance_jupyter_hub +++ b/jupyter_notebooks/provenance_jupyter_hub @@ -28,7 +28,9 @@ RSPACE_DOC_FOR_NOTEBOOK = 'rspace_doc_for_notebook' ATTACHMENTS_FOR_NOTEBOOK = 'data_attached_to_notebook' GALLERY_FILE_FOR_NOTEBOOK = 'file_for_notebook' EXECUTION_COUNT_FOR_NOTEBOOK = 'execution_count_for_notebook' - +HISTORY_DATA = 'history_data' +RSPACE_DOC_URL = 'workspace/editor/structuredDocument/' +RSPACE_DOC_VERSION_URL_START = 'workspace/editor/structuredDocument/audit/view?globalId=' # Your RSpace instance goes here RSPACE_URL = "https://researchspace2.eu.ngrok.io/" @@ -132,7 +134,7 @@ def get_notebook_name(): srv_root = srv_root + "/" if srv_root in nb_path: nb_path = nb_path.replace(srv_root, '') - print(f"nb_path: {nb_path}") + # print(f"nb_path: {nb_path}") ext_pos = ('' + nb_path).rfind('.') ext = nb_path[ext_pos:] return {'name': nb_fname + ext, 'root_name': nb_fname, 'name_path': nb_path} @@ -180,13 +182,13 @@ def get_rspace_client(): return None -def save_rspace_data(rspace_doc, attachments, gallery_file, execution_count): +def save_rspace_data(rspace_doc, attachments, gallery_file, execution_count, history_data): # Define the filename to save the state state_filename = get_notebook_name()['root_name'] + "_state.pkl" print(f"writing to file: {state_filename}") with open(state_filename, 'wb') as f: dill.dump({RSPACE_DOC_FOR_NOTEBOOK: rspace_doc, ATTACHMENTS_FOR_NOTEBOOK: attachments, - GALLERY_FILE_FOR_NOTEBOOK: gallery_file, EXECUTION_COUNT_FOR_NOTEBOOK: execution_count}, f) + GALLERY_FILE_FOR_NOTEBOOK: gallery_file, EXECUTION_COUNT_FOR_NOTEBOOK: execution_count, HISTORY_DATA:history_data }, f) def load_data(): @@ -217,12 +219,17 @@ async def reload_notebook(): await asyncio.sleep(1) -def make_metadata_cell(nb_gallery_file, attachment_files, rspace_document_file_id): +def make_metadata_cell(nb_gallery_file, attachment_files, rspace_doc, history_data): + rspace_document_file_id =str(rspace_doc['id']) + # new content plus new attachment data increments the documentversion by two + rspace_document_version = 2 if rspace_doc['version'] == 1 else rspace_doc['version'] + 2 + rspace_document_name = rspace_doc['name'] + rspace_document_globalId = rspace_doc['globalId']+'v'+str(rspace_document_version) nb_gallery_file_id = nb_gallery_file['id'] nb_gallery_file_version = int(nb_gallery_file['version']) + 1 nb_gallery_file_name = nb_gallery_file['name'] meta_data_cell = nbformat.v4.new_markdown_cell() - rspace_doc_for_markdown = f'[The RSpace Document describing this notebook]({RSPACE_URL}workspace/editor/structuredDocument/{rspace_document_file_id})' + rspace_doc_for_markdown = f'[The RSpace Document describing this notebook, version: {rspace_document_version}]({RSPACE_URL}{RSPACE_DOC_URL}{rspace_document_file_id})' gallery_doc_markdown = f'[This Notebook in RSpace Gallery: {nb_gallery_file_name} version: 
{nb_gallery_file_version}]({RSPACE_URL}gallery/item/{nb_gallery_file_id})' meta_data_cell['source'] = rspace_doc_for_markdown + "
" + gallery_doc_markdown attached_data_files_list = attached_data_files.split(",") @@ -233,13 +240,21 @@ def make_metadata_cell(nb_gallery_file, attachment_files, rspace_document_file_i 'source'] += f'
[Attached Data {attached_data} version: {attachment_version} ]({RSPACE_URL}gallery/item/{attachment_file_id})' for url in get_server_urls(): meta_data_cell['source'] += f'
[This notebook on the jupyter server]({url})' + new_history = f'
RSpace doc [{rspace_document_name} version {rspace_document_version}]({RSPACE_URL}{RSPACE_DOC_VERSION_URL_START}{rspace_document_globalId}) contains this Notebook, version {nb_gallery_file_version}, executed with: ' + for attached_data in attached_data_files_list: + attachment_file_id = attachment_files.get(attached_data, {}).get('id') + attachment_version = attachment_files.get(attached_data, {}).get('version') + new_history += f'Data {attached_data} version: {attachment_version} ' + # print(f'history_data: {history_data}') + history_data['text'] = new_history + history_data['text'] + meta_data_cell[ + 'source'] += f'
{history_data['text']}' meta_data_cell['metadata'] = { "rspace_metadata": {"documentFor": "docid", "notebook_file": "docid", "attachments": [""]}} return meta_data_cell - async def add_rspace_details_to_notebook_metadata(fname, notebook, nb_gallery_file, attachment_files, - rspace_document_file_id): + rspace_doc, history_data): """ We have to save meta data about a notebook before its been uploaded to the gallery. Therefore increment version by 1 when writing the metadata. If nb_gallery_file[id] is None @@ -248,7 +263,7 @@ async def add_rspace_details_to_notebook_metadata(fname, notebook, nb_gallery_fi if nb_gallery_file.get('id') is None: return await save_notebook() - meta_data_cell = make_metadata_cell(nb_gallery_file, attachment_files, rspace_document_file_id) + meta_data_cell = make_metadata_cell(nb_gallery_file, attachment_files, rspace_doc,history_data) replaced = False for i, cell in enumerate(notebook['cells']): if 'rspace_metadata' in cell['metadata']: @@ -336,7 +351,7 @@ def upload_attached_data(attachment_files): relative_attached_data = attached_data for i in range(nested_dir_pos): relative_attached_data = "../" + relative_attached_data - print(f"relative_attached_data: {relative_attached_data}") + # print(f"relative_attached_data: {relative_attached_data}") with open(relative_attached_data, 'r', encoding='utf-8') as attch: attachment_file_id = attachment_files.get(attached_data, {}).get('id') attachment_file_hash = attachment_files.get(attached_data, {}).get('hash') @@ -351,7 +366,7 @@ def upload_attached_data(attachment_files): async def upload_notebook_to_gallery(current_notebook, notebook, nb_gallery_file, attachment_files, - rspace_document_file_id): + rspace_doc, history_data): """ Metadata about the notebook is written to the notebook before it us uploaded to the Gallery (and the version incremented predictively by one). If the notebook has never been uploaded to the Gallery we have no stored rspace-id to write to the meta data and so we do not write any meta data. @@ -359,14 +374,14 @@ async def upload_notebook_to_gallery(current_notebook, notebook, nb_gallery_file a second time. """ await add_rspace_details_to_notebook_metadata(current_notebook, notebook, nb_gallery_file, attachment_files, - rspace_document_file_id) + rspace_doc, history_data) with open(current_notebook, 'r', encoding='utf-8') as nb_file: client = get_rspace_client() nb_gallery_file = upload_file_to_gallery(nb_gallery_file.get('id'), nb_file, client) # print(f"gallery file for nb: {nb_gallery_file}") if nb_gallery_file.get('version') == 1: await add_rspace_details_to_notebook_metadata(current_notebook, notebook, nb_gallery_file, attachment_files, - rspace_document_file_id) + rspace_doc, history_data) nb_gallery_file = upload_file_to_gallery(nb_gallery_file.get('id'), nb_file, client) # print(f"nb_gallery_file was uploaded a second time and was : {nb_gallery_file}") await reload_notebook() @@ -377,21 +392,24 @@ async def sync_notebook(): """ Saves notebook using ipylab and then writes notebook to Rspace document as an attachment if the execution_count of the notebook has changed since the last time - this cell was run. Attached data is also written to RSpace if its hash_sum has changed - since the last time this cell was run. The notebook and attached data will always be written - to RSpace at least once (on the first time this cell is run). + this cell was run. 
Note that the execution count of this cell does not contribute to + the comparison - we will not write data to RSpace if only this cell has been run + since the last time data was written to RSpace. + Attached data is also written to RSpace if its hash_sum has changed. + + The notebook and attached data will always be written to RSpace at least once (on the first time this cell is run). """ - rspace_document_file_id = None + rspace_doc = None attachment_filess = None gallery_file = None await save_notebook() get_server_urls() - print(f"notebook name: {get_notebook_name()}") + # print(f"notebook name: {get_notebook_name()}") current_notebook = get_notebook_name()['name'] with open(current_notebook, 'r') as notebook: notebook_node = nbformat.read(notebook, nbformat.NO_CONVERT) new_hash = calc_hash(current_notebook) - print(f" new hash {new_hash}") + # print(f" new hash {new_hash}") try: loaded_state = load_data() execution_count = loaded_state.get(EXECUTION_COUNT_FOR_NOTEBOOK) @@ -400,20 +418,22 @@ async def sync_notebook(): print(f"New execution count {new_execution_count}") print(f"Previous execution count {execution_count}") + # FIXME if execution_count == new_execution_count: print("No execution since last sync: no data updated in RSpace") await save_notebook() return client = get_rspace_client() - rspace_document_file_id = loaded_state.get(RSPACE_DOC_FOR_NOTEBOOK) + rspace_doc = loaded_state.get(RSPACE_DOC_FOR_NOTEBOOK) attachment_files = loaded_state.get(ATTACHMENTS_FOR_NOTEBOOK, {}) nb_gallery_file = loaded_state.get(GALLERY_FILE_FOR_NOTEBOOK, {}) nb_gallery_file_id = nb_gallery_file.get('id') + history_data = loaded_state.get(HISTORY_DATA,{'text':''}) current_notebook = get_notebook_name()['name'] attachments = None - if rspace_document_file_id is not None: + if rspace_doc is not None: print( - f"An RSpace document with this notebook as an attachment saved previously with RSpaceID {rspace_document_file_id}") + f"An RSpace document with this notebook as an attachment saved previously with RSpaceID {str(rspace_doc['id'])}") else: print("No RSpace document with this notebook as an attachment saved previously in RSpace") upload_attached_data(attachment_files) @@ -421,19 +441,20 @@ async def sync_notebook(): print(f"This notebook saved previously to Gallery with RSpaceID {nb_gallery_file_id}") else: print("Notebook not previously saved to RSpace Gallery") - if rspace_document_file_id is None: - new_doc = client.create_document(name="DocumentFor_" + current_notebook, tags=["Python", "API", "Jupyter"]) - rspace_document_file_id = new_doc['id'] - nb_gallery_file = await upload_notebook_to_gallery(current_notebook,notebook_node, nb_gallery_file,attachment_files, rspace_document_file_id) - print(f"nb_gallery_file was finally: {nb_gallery_file}") + if rspace_doc is None: + rspace_doc = client.create_document(name="DocumentFor_" + current_notebook, tags=["Python", "API", "Jupyter"]) + rspace_document_file_id = str(rspace_doc['id']) + rspace_doc = client.get_document(rspace_document_file_id) + nb_gallery_file = await upload_notebook_to_gallery(current_notebook,notebook_node, nb_gallery_file,attachment_files, rspace_doc, history_data) + # print(f"nb_gallery_file was finally: {nb_gallery_file}") nb_gallery_file_id = nb_gallery_file.get('id') - previous_content = client.get_document(rspace_document_file_id)['fields'][0]['content'] + previous_content = rspace_doc['fields'][0]['content'] previous_content = remove_jupyter_attachment_divs(previous_content, nb_gallery_file_id, attachment_files) 
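As an aside, the stripping step performed by remove_jupyter_attachment_divs just above can be sketched on its own; this assumes only what the surrounding code already shows, namely that the RSpace field wraps each uploaded file in a div of class "attachmentDiv" whose link points at /Streamfile/<gallery id>:

from bs4 import BeautifulSoup

def strip_attachment_divs(field_html, gallery_ids):
    """Drop any attachmentDiv whose anchor targets one of the given gallery file ids."""
    soup = BeautifulSoup(field_html, 'html.parser')
    stale_links = {'/Streamfile/' + str(gallery_id) for gallery_id in gallery_ids}
    for div in soup.find_all('div', {'class': 'attachmentDiv'}):
        anchor = div.find('a')
        if anchor is not None and anchor.get('href') in stale_links:
            div.decompose()  # remove the stale block so a fresh link can be appended afterwards
    return soup.prettify()

# e.g. strip_attachment_divs(previous_content, [nb_gallery_file_id])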
new_content = previous_content + make_content(nb_gallery_file_id, attachment_files) - updated_doc = client.update_document(rspace_document_file_id, tags=['Python', 'API', 'Jupyter'], + rspace_doc = client.update_document(rspace_document_file_id, tags=['Python', 'API', 'Jupyter'], fields=[{"content": new_content}]) - save_rspace_data(rspace_document_file_id, attachment_files, nb_gallery_file, new_execution_count) + save_rspace_data(rspace_doc, attachment_files, nb_gallery_file, new_execution_count, history_data) except Exception as e: print(traceback.format_exc()) print(f"Error reading notebook file: {e}") From c925a385ef11ce56e7fcce4f2620e1172735d35f Mon Sep 17 00:00:00 2001 From: nhanlon2 Date: Tue, 7 Oct 2025 17:31:44 +0100 Subject: [PATCH 19/34] RSDEV-782-Jupyter-Notebooks: saves first upload of notebook to gallery correctly --- jupyter_notebooks/provenance_jupyter_hub | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/jupyter_notebooks/provenance_jupyter_hub b/jupyter_notebooks/provenance_jupyter_hub index dd9c4ac..8f7b0bb 100644 --- a/jupyter_notebooks/provenance_jupyter_hub +++ b/jupyter_notebooks/provenance_jupyter_hub @@ -221,12 +221,13 @@ async def reload_notebook(): def make_metadata_cell(nb_gallery_file, attachment_files, rspace_doc, history_data): rspace_document_file_id =str(rspace_doc['id']) - # new content plus new attachment data increments the documentversion by two + # new content plus new attachment data increments the document version by two rspace_document_version = 2 if rspace_doc['version'] == 1 else rspace_doc['version'] + 2 rspace_document_name = rspace_doc['name'] rspace_document_globalId = rspace_doc['globalId']+'v'+str(rspace_document_version) nb_gallery_file_id = nb_gallery_file['id'] - nb_gallery_file_version = int(nb_gallery_file['version']) + 1 + nb_gallery_file_version = int(nb_gallery_file['version']) + nb_gallery_file_version = 1 if nb_gallery_file_version == 1 else nb_gallery_file_version + 1 nb_gallery_file_name = nb_gallery_file['name'] meta_data_cell = nbformat.v4.new_markdown_cell() rspace_doc_for_markdown = f'[The RSpace Document describing this notebook, version: {rspace_document_version}]({RSPACE_URL}{RSPACE_DOC_URL}{rspace_document_file_id})' @@ -272,6 +273,7 @@ async def add_rspace_details_to_notebook_metadata(fname, notebook, nb_gallery_fi if replaced is False: notebook["cells"].extend([meta_data_cell]) with open(fname, 'w', encoding='utf-8') as modified: + print(f"updated notebook: {notebook}") nbformat.write(notebook, modified) @@ -379,10 +381,13 @@ async def upload_notebook_to_gallery(current_notebook, notebook, nb_gallery_file client = get_rspace_client() nb_gallery_file = upload_file_to_gallery(nb_gallery_file.get('id'), nb_file, client) # print(f"gallery file for nb: {nb_gallery_file}") - if nb_gallery_file.get('version') == 1: - await add_rspace_details_to_notebook_metadata(current_notebook, notebook, nb_gallery_file, attachment_files, - rspace_doc, history_data) - nb_gallery_file = upload_file_to_gallery(nb_gallery_file.get('id'), nb_file, client) + if nb_gallery_file.get('version') == 1: + await asyncio.sleep(1) + await add_rspace_details_to_notebook_metadata(current_notebook, notebook, nb_gallery_file, attachment_files, + rspace_doc, history_data) + with open(current_notebook, 'r', encoding='utf-8') as nb_file: + nb_gallery_file = upload_file_to_gallery(nb_gallery_file.get('id'), nb_file, client) + await asyncio.sleep(1) # print(f"nb_gallery_file was uploaded a second time and was : 
{nb_gallery_file}") await reload_notebook() return nb_gallery_file From 34625eaaadadf6a1a98478ff200e7c3935942f31 Mon Sep 17 00:00:00 2001 From: nhanlon2 Date: Wed, 8 Oct 2025 11:11:11 +0100 Subject: [PATCH 20/34] RSDEV-782-Jupyter-Notebooks: can write to an existing rspace_doc and into a given field --- jupyter_notebooks/provenance_jupyter_hub | 55 +++++++++++++++++++----- 1 file changed, 45 insertions(+), 10 deletions(-) diff --git a/jupyter_notebooks/provenance_jupyter_hub b/jupyter_notebooks/provenance_jupyter_hub index 8f7b0bb..661bda0 100644 --- a/jupyter_notebooks/provenance_jupyter_hub +++ b/jupyter_notebooks/provenance_jupyter_hub @@ -33,7 +33,23 @@ RSPACE_DOC_URL = 'workspace/editor/structuredDocument/' RSPACE_DOC_VERSION_URL_START = 'workspace/editor/structuredDocument/audit/view?globalId=' # Your RSpace instance goes here RSPACE_URL = "https://researchspace2.eu.ngrok.io/" +""" +Default behaviour creates a new RSpace document when this cell is executed and attached this jupyter notebook to the +new document. + +Setting RSPACE_PREEXISITING_DOCUMENT_ID to a value other than None will attach this jupyter notebook to the RSpace document +with the given ID instead of creating a new RSpace document. +""" +# RSPACE_PREEXISITING_DOCUMENT_ID = None +RSPACE_PREEXISITING_DOCUMENT_ID = 155 +""" +Default behaviour writes links to this notebook into the 'first' field in a document (field '0'). Set this to a value +if a different field should be used. +If this is set to a value other than None, RSPACE_PREEXISITING_DOCUMENT_ID must be set to a value other than None. +""" +# RSPACE_DOCUMENT_TARGET_FIELD_ID = None +RSPACE_DOCUMENT_TARGET_FIELD_ID = 34 """ All data that will be saved to RSpace along with this notebook: select the data in the file browser and choose 'copy path' then paste here using a ',' comma to separate files if there is more than one. @@ -54,6 +70,7 @@ get_new_password = False """ This must be set to a the value of the PATH to the notebook (select the notebook in the file browser and choose 'copy path'), if exceptions are thrown trying to determine the notebook name. + If this value is set server_url MUST also be set. 
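A purely illustrative pairing (neither value is taken from this patch): for a notebook kept in a sub-folder and served through a port published as 10000, the overrides might read

notebook_name = 'experiments/my_notebook.ipynb'
server_url = 'http://localhost:10000'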
""" notebook_name = None @@ -74,7 +91,6 @@ server_port = 10000 rspace_client = None app = JupyterFrontEnd() - def get_server_urls(): global server_url all_urls = [] @@ -273,7 +289,6 @@ async def add_rspace_details_to_notebook_metadata(fname, notebook, nb_gallery_fi if replaced is False: notebook["cells"].extend([meta_data_cell]) with open(fname, 'w', encoding='utf-8') as modified: - print(f"updated notebook: {notebook}") nbformat.write(notebook, modified) @@ -392,6 +407,21 @@ async def upload_notebook_to_gallery(current_notebook, notebook, nb_gallery_file await reload_notebook() return nb_gallery_file +def get_field_content(rspace_doc): + if RSPACE_DOCUMENT_TARGET_FIELD_ID is None: + return rspace_doc['fields'][0]['content'] + else: + for field in rspace_doc['fields']: + if field['id'] == RSPACE_DOCUMENT_TARGET_FIELD_ID: + return field['content'] + return None + +def assert_invariants(): + if RSPACE_DOCUMENT_TARGET_FIELD_ID is not None and RSPACE_PREEXISITING_DOCUMENT_ID is None: + raise Exception("If RSPACE_DOCUMENT_TARGET_FIELD_ID has a value RSPACE_PREEXISITING_DOCUMENT_ID must also.") + + if server_url is not None and notebook_name is None or notebook_name is not None and server_url is None: + raise Exception("Both server_url and notebook_name must be either None or have a value") async def sync_notebook(): """ @@ -404,6 +434,7 @@ async def sync_notebook(): The notebook and attached data will always be written to RSpace at least once (on the first time this cell is run). """ + assert_invariants() rspace_doc = None attachment_filess = None gallery_file = None @@ -424,10 +455,10 @@ async def sync_notebook(): print(f"Previous execution count {execution_count}") # FIXME - if execution_count == new_execution_count: - print("No execution since last sync: no data updated in RSpace") - await save_notebook() - return + # if execution_count == new_execution_count: + # print("No execution since last sync: no data updated in RSpace") + # await save_notebook() + # return client = get_rspace_client() rspace_doc = loaded_state.get(RSPACE_DOC_FOR_NOTEBOOK) attachment_files = loaded_state.get(ATTACHMENTS_FOR_NOTEBOOK, {}) @@ -446,18 +477,22 @@ async def sync_notebook(): print(f"This notebook saved previously to Gallery with RSpaceID {nb_gallery_file_id}") else: print("Notebook not previously saved to RSpace Gallery") - if rspace_doc is None: + if rspace_doc is None and RSPACE_PREEXISITING_DOCUMENT_ID is None: rspace_doc = client.create_document(name="DocumentFor_" + current_notebook, tags=["Python", "API", "Jupyter"]) - rspace_document_file_id = str(rspace_doc['id']) + rspace_document_file_id = str(rspace_doc['id']) if RSPACE_PREEXISITING_DOCUMENT_ID is None else RSPACE_PREEXISITING_DOCUMENT_ID rspace_doc = client.get_document(rspace_document_file_id) nb_gallery_file = await upload_notebook_to_gallery(current_notebook,notebook_node, nb_gallery_file,attachment_files, rspace_doc, history_data) # print(f"nb_gallery_file was finally: {nb_gallery_file}") nb_gallery_file_id = nb_gallery_file.get('id') - previous_content = rspace_doc['fields'][0]['content'] + previous_content = get_field_content(rspace_doc) previous_content = remove_jupyter_attachment_divs(previous_content, nb_gallery_file_id, attachment_files) new_content = previous_content + make_content(nb_gallery_file_id, attachment_files) - rspace_doc = client.update_document(rspace_document_file_id, tags=['Python', 'API', 'Jupyter'], + if RSPACE_DOCUMENT_TARGET_FIELD_ID is not None: + rspace_doc = client.update_document(rspace_document_file_id, 
tags=['Python', 'API', 'Jupyter'], + fields=[{'id': RSPACE_DOCUMENT_TARGET_FIELD_ID,"content": new_content}]) + else: + rspace_doc = client.update_document(rspace_document_file_id, tags=['Python', 'API', 'Jupyter'], fields=[{"content": new_content}]) save_rspace_data(rspace_doc, attachment_files, nb_gallery_file, new_execution_count, history_data) except Exception as e: From ff1a60abd6781b561a0d628cf5bcb991181b69b7 Mon Sep 17 00:00:00 2001 From: nhanlon2 Date: Wed, 8 Oct 2025 19:08:40 +0100 Subject: [PATCH 21/34] RSDEV-782-Jupyter-Notebooks: install keyring alt. Bugfix for uploading notebook to gallery twice --- jupyter_notebooks/provenance_jupyter_hub | 33 ++++++++++++++---------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/jupyter_notebooks/provenance_jupyter_hub b/jupyter_notebooks/provenance_jupyter_hub index 661bda0..e6e62ff 100644 --- a/jupyter_notebooks/provenance_jupyter_hub +++ b/jupyter_notebooks/provenance_jupyter_hub @@ -40,16 +40,16 @@ new document. Setting RSPACE_PREEXISITING_DOCUMENT_ID to a value other than None will attach this jupyter notebook to the RSpace document with the given ID instead of creating a new RSpace document. """ -# RSPACE_PREEXISITING_DOCUMENT_ID = None -RSPACE_PREEXISITING_DOCUMENT_ID = 155 +RSPACE_PREEXISITING_DOCUMENT_ID = None +# RSPACE_PREEXISITING_DOCUMENT_ID = 155 """ Default behaviour writes links to this notebook into the 'first' field in a document (field '0'). Set this to a value if a different field should be used. If this is set to a value other than None, RSPACE_PREEXISITING_DOCUMENT_ID must be set to a value other than None. """ -# RSPACE_DOCUMENT_TARGET_FIELD_ID = None -RSPACE_DOCUMENT_TARGET_FIELD_ID = 34 +RSPACE_DOCUMENT_TARGET_FIELD_ID = None +# RSPACE_DOCUMENT_TARGET_FIELD_ID = 34 """ All data that will be saved to RSpace along with this notebook: select the data in the file browser and choose 'copy path' then paste here using a ',' comma to separate files if there is more than one. 
@@ -164,6 +164,8 @@ def get_password(): Retrieves password from (or saves a new password to) keyring """ global get_new_password + from keyring import get_keyring + print("Keyring method: " + str(get_keyring())) try: # TODO - Define the service name (e.g., the notebook name the secret is for) @@ -189,6 +191,9 @@ def get_rspace_client(): global rspace_client if rspace_client is None: retrieved_password = get_password() + if retrieved_password is None: + %pip install keyrings.alt + retrieved_password = get_password() rspace_client = eln.ELNClient(RSPACE_URL, retrieved_password) print(rspace_client.get_status()) return rspace_client @@ -243,7 +248,8 @@ def make_metadata_cell(nb_gallery_file, attachment_files, rspace_doc, history_da rspace_document_globalId = rspace_doc['globalId']+'v'+str(rspace_document_version) nb_gallery_file_id = nb_gallery_file['id'] nb_gallery_file_version = int(nb_gallery_file['version']) - nb_gallery_file_version = 1 if nb_gallery_file_version == 1 else nb_gallery_file_version + 1 + # nb_gallery_file_version = 1 if nb_gallery_file_version == 1 else nb_gallery_file_version + 1 + nb_gallery_file_version = nb_gallery_file_version + 1 nb_gallery_file_name = nb_gallery_file['name'] meta_data_cell = nbformat.v4.new_markdown_cell() rspace_doc_for_markdown = f'[The RSpace Document describing this notebook, version: {rspace_document_version}]({RSPACE_URL}{RSPACE_DOC_URL}{rspace_document_file_id})' @@ -253,8 +259,7 @@ def make_metadata_cell(nb_gallery_file, attachment_files, rspace_doc, history_da for attached_data in attached_data_files_list: attachment_file_id = attachment_files.get(attached_data, {}).get('id') attachment_version = attachment_files.get(attached_data, {}).get('version') - meta_data_cell[ - 'source'] += f'
[Attached Data {attached_data} version: {attachment_version} ]({RSPACE_URL}gallery/item/{attachment_file_id})' + meta_data_cell['source'] += f'
[Attached Data {attached_data} version: {attachment_version} ]({RSPACE_URL}gallery/item/{attachment_file_id})' for url in get_server_urls(): meta_data_cell['source'] += f'
[This notebook on the jupyter server]({url})' new_history = f'
RSpace doc [{rspace_document_name} version {rspace_document_version}]({RSPACE_URL}{RSPACE_DOC_VERSION_URL_START}{rspace_document_globalId}) contains this Notebook, version {nb_gallery_file_version}, executed with: ' @@ -264,8 +269,7 @@ def make_metadata_cell(nb_gallery_file, attachment_files, rspace_doc, history_da new_history += f'Data {attached_data} version: {attachment_version} ' # print(f'history_data: {history_data}') history_data['text'] = new_history + history_data['text'] - meta_data_cell[ - 'source'] += f'
{history_data['text']}' + meta_data_cell['source'] += f"
{history_data['text']}" meta_data_cell['metadata'] = { "rspace_metadata": {"documentFor": "docid", "notebook_file": "docid", "attachments": [""]}} return meta_data_cell @@ -324,6 +328,10 @@ def make_content(nb_gallery_file_id, attachment_files): def remove_jupyter_attachment_divs(content, nb_gallery_file_id, attachment_files): + """ + Iterate all attachments in the document field and if any have ids matching the stored ids for this notebook + then remove them + """ soup = BeautifulSoup(content, 'html.parser') attachment_divs = soup.find_all("div", {"class": "attachmentDiv"}) for attachment_div in attachment_divs: @@ -400,9 +408,9 @@ async def upload_notebook_to_gallery(current_notebook, notebook, nb_gallery_file await asyncio.sleep(1) await add_rspace_details_to_notebook_metadata(current_notebook, notebook, nb_gallery_file, attachment_files, rspace_doc, history_data) - with open(current_notebook, 'r', encoding='utf-8') as nb_file: - nb_gallery_file = upload_file_to_gallery(nb_gallery_file.get('id'), nb_file, client) - await asyncio.sleep(1) + with open(current_notebook, 'r', encoding='utf-8') as nb_file: + nb_gallery_file = upload_file_to_gallery(nb_gallery_file.get('id'), nb_file, client) + await asyncio.sleep(1) # print(f"nb_gallery_file was uploaded a second time and was : {nb_gallery_file}") await reload_notebook() return nb_gallery_file @@ -453,7 +461,6 @@ async def sync_notebook(): print(f"New execution count {new_execution_count}") print(f"Previous execution count {execution_count}") - # FIXME # if execution_count == new_execution_count: # print("No execution since last sync: no data updated in RSpace") From 8c7827d55cefe7026515e963d257327e4016e05d Mon Sep 17 00:00:00 2001 From: nhanlon2 Date: Wed, 8 Oct 2025 20:36:01 +0100 Subject: [PATCH 22/34] RSDEV-782-Jupyter-Notebooks: avoid iterating when there are no attached files --- jupyter_notebooks/provenance_jupyter_hub | 106 ++++++++++++++--------- 1 file changed, 67 insertions(+), 39 deletions(-) diff --git a/jupyter_notebooks/provenance_jupyter_hub b/jupyter_notebooks/provenance_jupyter_hub index e6e62ff..0262d90 100644 --- a/jupyter_notebooks/provenance_jupyter_hub +++ b/jupyter_notebooks/provenance_jupyter_hub @@ -239,7 +239,6 @@ async def reload_notebook(): # 'docmanager:reload' does not appear to hook into any callback invoked when the document is actually reloaded await asyncio.sleep(1) - def make_metadata_cell(nb_gallery_file, attachment_files, rspace_doc, history_data): rspace_document_file_id =str(rspace_doc['id']) # new content plus new attachment data increments the document version by two @@ -255,18 +254,22 @@ def make_metadata_cell(nb_gallery_file, attachment_files, rspace_doc, history_da rspace_doc_for_markdown = f'[The RSpace Document describing this notebook, version: {rspace_document_version}]({RSPACE_URL}{RSPACE_DOC_URL}{rspace_document_file_id})' gallery_doc_markdown = f'[This Notebook in RSpace Gallery: {nb_gallery_file_name} version: {nb_gallery_file_version}]({RSPACE_URL}gallery/item/{nb_gallery_file_id})' meta_data_cell['source'] = rspace_doc_for_markdown + "
" + gallery_doc_markdown - attached_data_files_list = attached_data_files.split(",") - for attached_data in attached_data_files_list: - attachment_file_id = attachment_files.get(attached_data, {}).get('id') - attachment_version = attachment_files.get(attached_data, {}).get('version') - meta_data_cell['source'] += f'
[Attached Data {attached_data} version: {attachment_version} ]({RSPACE_URL}gallery/item/{attachment_file_id})' + if len(attached_data_files) != 0: + attached_data_files_list = attached_data_files.split(",") + for attached_data in attached_data_files_list: + attachment_file_id = attachment_files.get(attached_data, {}).get('id') + attachment_version = attachment_files.get(attached_data, {}).get('version') + meta_data_cell['source'] += f'
[Attached Data {attached_data} version: {attachment_version} ]({RSPACE_URL}gallery/item/{attachment_file_id})' + else: + meta_data_cell['source'] += f'
No Attached Data' for url in get_server_urls(): meta_data_cell['source'] += f'
[This notebook on the jupyter server]({url})' new_history = f'
RSpace doc [{rspace_document_name} version {rspace_document_version}]({RSPACE_URL}{RSPACE_DOC_VERSION_URL_START}{rspace_document_globalId}) contains this Notebook, version {nb_gallery_file_version}, executed with: ' - for attached_data in attached_data_files_list: - attachment_file_id = attachment_files.get(attached_data, {}).get('id') - attachment_version = attachment_files.get(attached_data, {}).get('version') - new_history += f'Data {attached_data} version: {attachment_version} ' + if len(attached_data_files) != 0: + for attached_data in attached_data_files.split(","): + attachment_file_id = attachment_files.get(attached_data, {}).get('id') + attachment_version = attachment_files.get(attached_data, {}).get('version') + new_history += f'Data {attached_data} version: {attachment_version} ' # print(f'history_data: {history_data}') history_data['text'] = new_history + history_data['text'] meta_data_cell['source'] += f"
{history_data['text']}" @@ -319,14 +322,34 @@ def make_content(nb_gallery_file_id, attachment_files): content = f""" """ - for attachment_file in attached_data_files.split(","): - content += f""" - - """ + if len(attached_data_files) != 0: + for attachment_file in attached_data_files.split(","): + content += f""" + + """ # print(f"content is {content}") return content +# def remove_jupyter_attachment_divs(content, nb_gallery_file_id, attachment_files): +# """ +# Iterate all attachments in the document field and if any have ids matching the stored ids for this notebook +# then remove them +# """ +# soup = BeautifulSoup(content, 'html.parser') +# attachment_divs = soup.find_all("div", {"class": "attachmentDiv"}) +# for attachment_div in attachment_divs: +# href_tag = attachment_div.find('a') +# # print(f"href_tag{href_tag}") +# gallery_link = '/Streamfile/' + str(nb_gallery_file_id) +# for attachment_file in attached_data_files.split(","): +# attachment_file_id = attachment_files.get(attachment_file, {}).get('id') +# attachment_link = '/Streamfile/' + str(attachment_file_id) +# if attachment_link == href_tag['href'] or gallery_link == href_tag['href']: +# attachment_div.decompose() +# break +# return soup.prettify() + def remove_jupyter_attachment_divs(content, nb_gallery_file_id, attachment_files): """ Iterate all attachments in the document field and if any have ids matching the stored ids for this notebook @@ -338,12 +361,16 @@ def remove_jupyter_attachment_divs(content, nb_gallery_file_id, attachment_files href_tag = attachment_div.find('a') # print(f"href_tag{href_tag}") gallery_link = '/Streamfile/' + str(nb_gallery_file_id) - for attachment_file in attached_data_files.split(","): - attachment_file_id = attachment_files.get(attachment_file, {}).get('id') - attachment_link = '/Streamfile/' + str(attachment_file_id) - if attachment_link == href_tag['href'] or gallery_link == href_tag['href']: - attachment_div.decompose() - break + if gallery_link == href_tag['href']: + attachment_div.decompose() + continue + if len(attached_data_files) != 0: + for attachment_file in attached_data_files.split(","): + attachment_file_id = attachment_files.get(attachment_file, {}).get('id') + attachment_link = '/Streamfile/' + str(attachment_file_id) + if attachment_link == href_tag['href']: + attachment_div.decompose() + break return soup.prettify() @@ -368,25 +395,26 @@ def calc_hash(filename): def upload_attached_data(attachment_files): client = get_rspace_client() - attached_data_files_list = attached_data_files.split(",") - for attached_data in attached_data_files_list: - if attached_data: - # make file paths to data relative to the location of this notebook - nested_dir_pos = get_notebook_name()['name_path'].count('/') - relative_attached_data = attached_data - for i in range(nested_dir_pos): - relative_attached_data = "../" + relative_attached_data - # print(f"relative_attached_data: {relative_attached_data}") - with open(relative_attached_data, 'r', encoding='utf-8') as attch: - attachment_file_id = attachment_files.get(attached_data, {}).get('id') - attachment_file_hash = attachment_files.get(attached_data, {}).get('hash') - calc_latest_hash = calc_hash(relative_attached_data) - if calc_latest_hash != attachment_file_hash: - attachment_file_data = upload_file_to_gallery(attachment_file_id, attch, client) - attachment_file_data['hash'] = calc_latest_hash - attachment_files[attached_data] = attachment_file_data - else: - print(f"File {attached_data} not changed so no update") + if 
len(attached_data_files) != 0: + attached_data_files_list = attached_data_files.split(",") + for attached_data in attached_data_files_list: + if attached_data: + # make file paths to data relative to the location of this notebook + nested_dir_pos = get_notebook_name()['name_path'].count('/') + relative_attached_data = attached_data + for i in range(nested_dir_pos): + relative_attached_data = "../" + relative_attached_data + # print(f"relative_attached_data: {relative_attached_data}") + with open(relative_attached_data, 'r', encoding='utf-8') as attch: + attachment_file_id = attachment_files.get(attached_data, {}).get('id') + attachment_file_hash = attachment_files.get(attached_data, {}).get('hash') + calc_latest_hash = calc_hash(relative_attached_data) + if calc_latest_hash != attachment_file_hash: + attachment_file_data = upload_file_to_gallery(attachment_file_id, attch, client) + attachment_file_data['hash'] = calc_latest_hash + attachment_files[attached_data] = attachment_file_data + else: + print(f"File {attached_data} not changed so no update") # print(f"attached files: {attachment_files}") From d3067c5476b83fb42ec1f19eb053504ef8bafa68 Mon Sep 17 00:00:00 2001 From: nhanlon2 Date: Thu, 9 Oct 2025 13:03:58 +0100 Subject: [PATCH 23/34] RSDEV-782-Jupyter-Notebooks: use pip install notebook instead of conda install --- jupyter_notebooks/provenance_jupyter_hub | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/jupyter_notebooks/provenance_jupyter_hub b/jupyter_notebooks/provenance_jupyter_hub index 0262d90..1992238 100644 --- a/jupyter_notebooks/provenance_jupyter_hub +++ b/jupyter_notebooks/provenance_jupyter_hub @@ -4,7 +4,7 @@ import json try: from notebook import app except: - %conda install -q notebook + %pip install -q notebook %pip install -q keyring from rspace_client.eln import eln import os @@ -56,6 +56,8 @@ RSPACE_DOCUMENT_TARGET_FIELD_ID = None Example: attached_data_files = "spectroscopy_data.csv, data/spectroscopy_data2.csv, data/spectroscopy_data3.csv" + The code in this cell will calculate paths to the data relative to the location of this notebook. Therefore do + not change the 'paths' to the data, regardless of whether this notebook is in the top directory or in a sub directory. 
If you wish to have no attached data, set this value to be "" (a pair of double quotes) @@ -158,7 +160,6 @@ def get_notebook_name(): print(f"Error getting notebook name, please manually set a value for 'notebook_name'") raise - def get_password(): """ Retrieves password from (or saves a new password to) keyring From 8a536f9d2d165f92b3001dc91c736e4b6bd45a1c Mon Sep 17 00:00:00 2001 From: nhanlon2 Date: Thu, 9 Oct 2025 20:30:47 +0100 Subject: [PATCH 24/34] RSDEV-782-Jupyter-Notebooks: works with google colab --- jupyter_notebooks/provenance_jupyter_hub | 73 +++++++++++++----------- 1 file changed, 40 insertions(+), 33 deletions(-) diff --git a/jupyter_notebooks/provenance_jupyter_hub b/jupyter_notebooks/provenance_jupyter_hub index 1992238..57f2565 100644 --- a/jupyter_notebooks/provenance_jupyter_hub +++ b/jupyter_notebooks/provenance_jupyter_hub @@ -1,28 +1,50 @@ import json -%pip install -q rspace-client==2.6.1 +try: + from rspace_client.eln import eln +except: + %pip install -q rspace-client==2.6.1 + %pip install -q pickleshare try: from notebook import app except: %pip install -q notebook -%pip install -q keyring -from rspace_client.eln import eln +try: + import keyring +except: + %pip install -q keyring import os import hashlib import json -%pip install -q dill +try: + import dill +except: + %pip install -q dill import dill -%pip install -q ipynbname -import ipynbname -%pip install -q ipylab -from ipylab import JupyterFrontEnd +try: + import ipynbname +except: + %pip install -q ipynbname +try: + from ipylab import JupyterFrontEnd +except: + %pip install -q ipylab import traceback %pip install -q lxml from bs4 import BeautifulSoup import nbformat import asyncio import getpass -import keyring + +try: + import google.colab + IN_COLAB = True +except: + IN_COLAB = False + +if IN_COLAB == True: + from google.colab import drive + drive.mount('/content/drive') RSPACE_DOC_FOR_NOTEBOOK = 'rspace_doc_for_notebook' ATTACHMENTS_FOR_NOTEBOOK = 'data_attached_to_notebook' @@ -56,8 +78,6 @@ RSPACE_DOCUMENT_TARGET_FIELD_ID = None Example: attached_data_files = "spectroscopy_data.csv, data/spectroscopy_data2.csv, data/spectroscopy_data3.csv" - The code in this cell will calculate paths to the data relative to the location of this notebook. Therefore do - not change the 'paths' to the data, regardless of whether this notebook is in the top directory or in a sub directory. If you wish to have no attached data, set this value to be "" (a pair of double quotes) @@ -65,6 +85,7 @@ RSPACE_DOCUMENT_TARGET_FIELD_ID = None attached_data_files = "" """ attached_data_files = "spectroscopy_data.csv,data/spectroscopy_data2.csv,data/spectroscopy_data3.csv" +# attached_data_files = "/content/drive/My Drive/Colab Notebooks/spectroscopy_data3.csv,/content/drive/MyDrive/Jupyter_data/spectroscopy_data.csv,/content/drive/MyDrive/Jupyter_data/spectroscopy_data1.csv" """ Set this to true to manually enter a new password """ @@ -76,6 +97,7 @@ if exceptions are thrown trying to determine the notebook name. If this value is set server_url MUST also be set. """ notebook_name = None +# notebook_name = '/content/drive/My Drive/Colab Notebooks/Untitled1.ipynb' """ This must be set to a value if exceptions are thrown or the calculated value is incorrect when trying to determine the server url. Give the url of the server including the port: eg http://localhost:10000 (no trailing '/') @@ -83,6 +105,7 @@ of the server including the port: eg http://localhost:10000 (no trailing '/') If this value is set, notebook_name MUST also be set. 
""" server_url = None +# server_url = 'https://colab.research.google.com/drive/1dXm1-rw31_Nu-VF23OPyxxRPV19s4G5i' """ Set this to a value if server_url is calculated correctly except for the port (which will happen, for example @@ -160,6 +183,7 @@ def get_notebook_name(): print(f"Error getting notebook name, please manually set a value for 'notebook_name'") raise + def get_password(): """ Retrieves password from (or saves a new password to) keyring @@ -332,25 +356,6 @@ def make_content(nb_gallery_file_id, attachment_files): return content -# def remove_jupyter_attachment_divs(content, nb_gallery_file_id, attachment_files): -# """ -# Iterate all attachments in the document field and if any have ids matching the stored ids for this notebook -# then remove them -# """ -# soup = BeautifulSoup(content, 'html.parser') -# attachment_divs = soup.find_all("div", {"class": "attachmentDiv"}) -# for attachment_div in attachment_divs: -# href_tag = attachment_div.find('a') -# # print(f"href_tag{href_tag}") -# gallery_link = '/Streamfile/' + str(nb_gallery_file_id) -# for attachment_file in attached_data_files.split(","): -# attachment_file_id = attachment_files.get(attachment_file, {}).get('id') -# attachment_link = '/Streamfile/' + str(attachment_file_id) -# if attachment_link == href_tag['href'] or gallery_link == href_tag['href']: -# attachment_div.decompose() -# break -# return soup.prettify() - def remove_jupyter_attachment_divs(content, nb_gallery_file_id, attachment_files): """ Iterate all attachments in the document field and if any have ids matching the stored ids for this notebook @@ -478,7 +483,7 @@ async def sync_notebook(): await save_notebook() get_server_urls() # print(f"notebook name: {get_notebook_name()}") - current_notebook = get_notebook_name()['name'] + current_notebook = get_notebook_name()['name'] if IN_COLAB == False else get_notebook_name()['name_path'] with open(current_notebook, 'r') as notebook: notebook_node = nbformat.read(notebook, nbformat.NO_CONVERT) new_hash = calc_hash(current_notebook) @@ -501,7 +506,9 @@ async def sync_notebook(): nb_gallery_file = loaded_state.get(GALLERY_FILE_FOR_NOTEBOOK, {}) nb_gallery_file_id = nb_gallery_file.get('id') history_data = loaded_state.get(HISTORY_DATA,{'text':''}) - current_notebook = get_notebook_name()['name'] + # Modified this line to get the 'name_path' which should be the actual file path + current_notebook_info = get_notebook_name() + current_notebook = current_notebook_info['name_path'] attachments = None if rspace_doc is not None: print( @@ -537,4 +544,4 @@ async def sync_notebook(): return None -await sync_notebook() +await sync_notebook() \ No newline at end of file From 91b458347fcf675f0f001a412c7db578a2eca94d Mon Sep 17 00:00:00 2001 From: nhanlon2 Date: Sat, 11 Oct 2025 18:26:27 +0100 Subject: [PATCH 25/34] RSDEV-782-Jupyter-Notebooks: all functions and private data encapsulated in sync_notebook_to_rspace function --- jupyter_notebooks/provenance_jupyter_hub | 828 ++++++++++++----------- 1 file changed, 426 insertions(+), 402 deletions(-) diff --git a/jupyter_notebooks/provenance_jupyter_hub b/jupyter_notebooks/provenance_jupyter_hub index 57f2565..21a5faf 100644 --- a/jupyter_notebooks/provenance_jupyter_hub +++ b/jupyter_notebooks/provenance_jupyter_hub @@ -1,496 +1,522 @@ import json -try: - from rspace_client.eln import eln -except: - %pip install -q rspace-client==2.6.1 - +%pip install -q rspace-client==2.6.1 %pip install -q pickleshare try: from notebook import app except: %pip install -q notebook -try: - import 
keyring -except: - %pip install -q keyring +%pip install -q keyring +from rspace_client.eln import eln import os import hashlib import json -try: - import dill -except: - %pip install -q dill +%pip install -q dill import dill -try: - import ipynbname -except: - %pip install -q ipynbname -try: - from ipylab import JupyterFrontEnd -except: - %pip install -q ipylab +%pip install -q ipynbname +import ipynbname +%pip install -q ipylab +from ipylab import JupyterFrontEnd import traceback %pip install -q lxml from bs4 import BeautifulSoup import nbformat import asyncio import getpass - -try: - import google.colab - IN_COLAB = True -except: - IN_COLAB = False - -if IN_COLAB == True: - from google.colab import drive - drive.mount('/content/drive') +import keyring +from urllib.parse import urlparse RSPACE_DOC_FOR_NOTEBOOK = 'rspace_doc_for_notebook' -ATTACHMENTS_FOR_NOTEBOOK = 'data_attached_to_notebook' -GALLERY_FILE_FOR_NOTEBOOK = 'file_for_notebook' -EXECUTION_COUNT_FOR_NOTEBOOK = 'execution_count_for_notebook' -HISTORY_DATA = 'history_data' +RSPACE_ATTACHMENTS_FOR_NOTEBOOK = 'data_attached_to_notebook' +RSPACE_GALLERY_FILE_FOR_NOTEBOOK = 'file_for_notebook' +RSPACE_EXECUTION_COUNT_FOR_NOTEBOOK = 'RSPACE_EXECUTION_COUNT_FOR_NOTEBOOK' +RSPACE_HISTORY_DATA = 'RSPACE_HISTORY_DATA' RSPACE_DOC_URL = 'workspace/editor/structuredDocument/' RSPACE_DOC_VERSION_URL_START = 'workspace/editor/structuredDocument/audit/view?globalId=' # Your RSpace instance goes here -RSPACE_URL = "https://researchspace2.eu.ngrok.io/" +# rspace_url = "https://researchspace2.eu.ngrok.io/" """ Default behaviour creates a new RSpace document when this cell is executed and attached this jupyter notebook to the new document. -Setting RSPACE_PREEXISITING_DOCUMENT_ID to a value other than None will attach this jupyter notebook to the RSpace document +Setting rspace_prexisting_document_id to a value other than None will attach this jupyter notebook to the RSpace document with the given ID instead of creating a new RSpace document. """ -RSPACE_PREEXISITING_DOCUMENT_ID = None -# RSPACE_PREEXISITING_DOCUMENT_ID = 155 +# rspace_prexisting_document_id = None +# rspace_prexisting_document_id = 155 """ Default behaviour writes links to this notebook into the 'first' field in a document (field '0'). Set this to a value if a different field should be used. -If this is set to a value other than None, RSPACE_PREEXISITING_DOCUMENT_ID must be set to a value other than None. +If this is set to a value other than None, rspace_prexisting_document_id must be set to a value other than None. """ -RSPACE_DOCUMENT_TARGET_FIELD_ID = None -# RSPACE_DOCUMENT_TARGET_FIELD_ID = 34 +# rspace_document_target_field_id = None +# rspace_document_target_field_id = 34 """ All data that will be saved to RSpace along with this notebook: select the data in the file browser and choose 'copy path' then paste here using a ',' comma to separate files if there is more than one. Example: attached_data_files = "spectroscopy_data.csv, data/spectroscopy_data2.csv, data/spectroscopy_data3.csv" + The code in this cell will calculate paths to the data relative to the location of this notebook. Therefore do + not change the 'paths' to the data, regardless of whether this notebook is in the top directory or in a sub directory. 
If you wish to have no attached data, set this value to be "" (a pair of double quotes) Example: attached_data_files = "" """ -attached_data_files = "spectroscopy_data.csv,data/spectroscopy_data2.csv,data/spectroscopy_data3.csv" -# attached_data_files = "/content/drive/My Drive/Colab Notebooks/spectroscopy_data3.csv,/content/drive/MyDrive/Jupyter_data/spectroscopy_data.csv,/content/drive/MyDrive/Jupyter_data/spectroscopy_data1.csv" +# attached_data_files = "spectroscopy_data.csv,data/spectroscopy_data2.csv,data/spectroscopy_data3.csv" """ Set this to true to manually enter a new password """ -get_new_password = False +# get_new_password = False """ This must be set to a the value of the PATH to the notebook (select the notebook in the file browser and choose 'copy path'), if exceptions are thrown trying to determine the notebook name. -If this value is set server_url MUST also be set. +If this value is set server_url MUST also be set. """ -notebook_name = None -# notebook_name = '/content/drive/My Drive/Colab Notebooks/Untitled1.ipynb' +# notebook_name = None """ This must be set to a value if exceptions are thrown or the calculated value is incorrect when trying to determine the server url. Give the url of the server including the port: eg http://localhost:10000 (no trailing '/') If this value is set, notebook_name MUST also be set. """ -server_url = None -# server_url = 'https://colab.research.google.com/drive/1dXm1-rw31_Nu-VF23OPyxxRPV19s4G5i' +# server_url = None """ -Set this to a value if server_url is calculated correctly except for the port (which will happen, for example -if the port is being mapped inside a docker container to an external port +Set this to a value if server_url is calculated correctly except for the port (which will happen, for example +if the port is being mapped inside a docker container to an external port) """ -server_port = 10000 +# server_port = 10000 rspace_client = None app = JupyterFrontEnd() -def get_server_urls(): - global server_url - all_urls = [] - if (server_url is not None): - all_urls.append(server_url + '/lab/tree/' + notebook_name) - else: - try: - for srv in ipynbname._list_maybe_running_servers(): - srv, path = ipynbname._find_nb_path() - if server_port is not None: - srv_url = srv['url'] - # print(f"srv_url: {srv_url}") - # print(f"root_dir: {srv['root_dir']}") - # print(f"path: {str(path)}") - part_url = srv_url[:srv_url.rfind(':') + 1] - # print(f"part_url: {part_url}") - all_urls.append(part_url + str(server_port) + '/lab/tree/' + str(path)) - else: - all_urls.append(srv['url'] + 'lab/tree/' + str(path)) - except Exception: - print(f"Error determining server urls, please manually set a value for 'server_url'") - raise # Code may fail if server has a password/doesnt use token auth - see ipynbname README - return all_urls +async def sync_notebook_to_rspace(rspace_url="",attached_data_files="",notebook_name=None, server_url=None, rspace_prexisting_document_id=None,rspace_document_target_field_id=None, server_port=None, get_new_password=False ): + """ + Saves notebook using ipylab and then writes notebook to Rspace document as + an attachment if the execution_count of the notebook has changed since the last time + this cell was run. Note that the execution count of this cell does not contribute to + the comparison - we will not write data to RSpace if only this cell has been run + since the last time data was written to RSpace. + Attached data is also written to RSpace if its hash_sum has changed. 
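The hash_sum test mentioned here can be pictured as a plain hashlib digest; the notebook's own calc_hash helper is assumed to do something equivalent, its body not being part of this hunk:

import hashlib

def sha256_of_file(path, chunk_size=8192):
    """Stream a file through SHA-256 and return its hex digest."""
    digest = hashlib.sha256()
    with open(path, 'rb') as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()

# a data file is re-uploaded only when its current digest differs from the digest stored at the last sync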
+ The notebook and attached data will always be written to RSpace at least once (on the first time this cell is run). -def get_server_roots(): - """ - this will only be called if ipyname library is working correctly - """ - all_roots = [] - try: - if len(all_roots) == 0: - for srv in ipynbname._list_maybe_running_servers(): - srv, path = ipynbname._find_nb_path() - root = srv['root_dir'] - all_roots.append(root) - except Exception: - print(f"Error determining server roots, please manually set a value for 'server_url'") - raise # Code may fail if server has a password/doesnt use token auth - see ipynbname README - return all_roots - - -def get_notebook_name(): - global notebook_name - try: - if notebook_name is not None: - if '/' in notebook_name: - notebook_name_alone = notebook_name[notebook_name.rfind('/') + 1:] - else: - notebook_name_alone = notebook_name - return {'name': notebook_name_alone, 'root_name': notebook_name_alone[:notebook_name_alone.rfind('.')], - 'name_path': notebook_name} - nb_fname = ipynbname.name() - nb_path = str(ipynbname.path()) - for srv_root in get_server_roots(): - if not srv_root.endswith("/"): - srv_root = srv_root + "/" - if srv_root in nb_path: - nb_path = nb_path.replace(srv_root, '') - # print(f"nb_path: {nb_path}") - ext_pos = ('' + nb_path).rfind('.') - ext = nb_path[ext_pos:] - return {'name': nb_fname + ext, 'root_name': nb_fname, 'name_path': nb_path} - except Exception as e: - print(f"Error getting notebook name, please manually set a value for 'notebook_name'") - raise + Parameters: + rspace_url : Your RSpace instance goes here -def get_password(): - """ - Retrieves password from (or saves a new password to) keyring - """ - global get_new_password - from keyring import get_keyring - print("Keyring method: " + str(get_keyring())) - try: + attached_data_files : + All data that will be saved to RSpace along with this notebook: select the data in the file browser and choose 'copy path' + then paste here using a ',' comma to separate files if there is more than one. - # TODO - Define the service name (e.g., the notebook name the secret is for) - service_id = "RSpaceJupyterDemoApp" - # TODO - Define the username associated with the secret - username = "myuser" # use your own username + Example: + attached_data_files = "spectroscopy_data.csv, data/spectroscopy_data2.csv, data/spectroscopy_data3.csv" + The code in this cell will calculate paths to the data relative to the location of this notebook. Therefore do + not change the 'paths' to the data, regardless of whether this notebook is in the top directory or in a sub directory. 
- retrieved_password = keyring.get_password(service_id, username) - if retrieved_password is None or get_new_password: - retrieved_password = getpass.getpass("Please enter your RSpace Api key: ") - keyring.set_password(service_id, username, retrieved_password) - return retrieved_password - except Exception as e: - print(f"Error getting password: {e}") - return None + If you wish to have no attached data, set this value to be "" (a pair of double quotes) + Example: + attached_data_files = "" -def get_rspace_client(): - """ - Returns rspace ELN API client - """ - try: - global rspace_client - if rspace_client is None: - retrieved_password = get_password() - if retrieved_password is None: - %pip install keyrings.alt - retrieved_password = get_password() - rspace_client = eln.ELNClient(RSPACE_URL, retrieved_password) - print(rspace_client.get_status()) - return rspace_client - except Exception as e: - print(traceback.format_exc()) - print(f"Error connecting to RSpace: {e}") - return None + notebook_name: + This must be set to a the value of the PATH to the notebook (select the notebook in the file browser and choose 'copy path'), + if exceptions are thrown trying to determine the notebook name. + If this value is set server_url MUST also be set. + server_url: + This must be set to a value if exceptions are thrown or the calculated value is incorrect when trying to determine the server url. Give the url + of the server including the port: eg http://localhost:10000 (no trailing '/') -def save_rspace_data(rspace_doc, attachments, gallery_file, execution_count, history_data): - # Define the filename to save the state - state_filename = get_notebook_name()['root_name'] + "_state.pkl" - print(f"writing to file: {state_filename}") - with open(state_filename, 'wb') as f: - dill.dump({RSPACE_DOC_FOR_NOTEBOOK: rspace_doc, ATTACHMENTS_FOR_NOTEBOOK: attachments, - GALLERY_FILE_FOR_NOTEBOOK: gallery_file, EXECUTION_COUNT_FOR_NOTEBOOK: execution_count, HISTORY_DATA:history_data }, f) + If this value is set, notebook_name MUST also be set. + rspace_prexisting_document_id: + Default behaviour creates a new RSpace document when this cell is executed and attached this jupyter notebook to the new document. -def load_data(): - state_filename = get_notebook_name()['root_name'] + "_state.pkl" + Setting rspace_prexisting_document_id to a value other than None will attach this jupyter notebook to the RSpace document + with the given ID instead of creating a new RSpace document. - if os.path.exists(state_filename): - # Load the variables from the file using dill - with open(state_filename, 'rb') as f: - try: - loaded_state = dill.load(f) - except Exception as e: - loaded_state = {} - else: - loaded_state = {} - print(f"State file '{state_filename}' not found. 
No variables loaded.") - return loaded_state - - -async def save_notebook(): - app.commands.execute('docmanager:save') - # 'docmanager:save' does not appear to hook into any callback invoked when the document is actually saved - await asyncio.sleep(1) - - -async def reload_notebook(): - app.commands.execute('docmanager:reload') - # 'docmanager:reload' does not appear to hook into any callback invoked when the document is actually reloaded - await asyncio.sleep(1) - -def make_metadata_cell(nb_gallery_file, attachment_files, rspace_doc, history_data): - rspace_document_file_id =str(rspace_doc['id']) - # new content plus new attachment data increments the document version by two - rspace_document_version = 2 if rspace_doc['version'] == 1 else rspace_doc['version'] + 2 - rspace_document_name = rspace_doc['name'] - rspace_document_globalId = rspace_doc['globalId']+'v'+str(rspace_document_version) - nb_gallery_file_id = nb_gallery_file['id'] - nb_gallery_file_version = int(nb_gallery_file['version']) - # nb_gallery_file_version = 1 if nb_gallery_file_version == 1 else nb_gallery_file_version + 1 - nb_gallery_file_version = nb_gallery_file_version + 1 - nb_gallery_file_name = nb_gallery_file['name'] - meta_data_cell = nbformat.v4.new_markdown_cell() - rspace_doc_for_markdown = f'[The RSpace Document describing this notebook, version: {rspace_document_version}]({RSPACE_URL}{RSPACE_DOC_URL}{rspace_document_file_id})' - gallery_doc_markdown = f'[This Notebook in RSpace Gallery: {nb_gallery_file_name} version: {nb_gallery_file_version}]({RSPACE_URL}gallery/item/{nb_gallery_file_id})' - meta_data_cell['source'] = rspace_doc_for_markdown + "
" + gallery_doc_markdown - if len(attached_data_files) != 0: - attached_data_files_list = attached_data_files.split(",") - for attached_data in attached_data_files_list: - attachment_file_id = attachment_files.get(attached_data, {}).get('id') - attachment_version = attachment_files.get(attached_data, {}).get('version') - meta_data_cell['source'] += f'
[Attached Data {attached_data} version: {attachment_version} ]({RSPACE_URL}gallery/item/{attachment_file_id})' - else: - meta_data_cell['source'] += f'
No Attached Data' - for url in get_server_urls(): - meta_data_cell['source'] += f'
[This notebook on the jupyter server]({url})' - new_history = f'
RSpace doc [{rspace_document_name} version {rspace_document_version}]({RSPACE_URL}{RSPACE_DOC_VERSION_URL_START}{rspace_document_globalId}) contains this Notebook, version {nb_gallery_file_version}, executed with: ' - if len(attached_data_files) != 0: - for attached_data in attached_data_files.split(","): - attachment_file_id = attachment_files.get(attached_data, {}).get('id') - attachment_version = attachment_files.get(attached_data, {}).get('version') - new_history += f'Data {attached_data} version: {attachment_version} ' - # print(f'history_data: {history_data}') - history_data['text'] = new_history + history_data['text'] - meta_data_cell['source'] += f"
{history_data['text']}" - meta_data_cell['metadata'] = { - "rspace_metadata": {"documentFor": "docid", "notebook_file": "docid", "attachments": [""]}} - return meta_data_cell - -async def add_rspace_details_to_notebook_metadata(fname, notebook, nb_gallery_file, attachment_files, - rspace_doc, history_data): - """ - We have to save meta data about a notebook before its been uploaded to the gallery. - Therefore increment version by 1 when writing the metadata. If nb_gallery_file[id] is None - its the initial upload to the Gallery and so do not write any meta data - """ - if nb_gallery_file.get('id') is None: - return - await save_notebook() - meta_data_cell = make_metadata_cell(nb_gallery_file, attachment_files, rspace_doc,history_data) - replaced = False - for i, cell in enumerate(notebook['cells']): - if 'rspace_metadata' in cell['metadata']: - notebook["cells"][i] = meta_data_cell - replaced = True - if replaced is False: - notebook["cells"].extend([meta_data_cell]) - with open(fname, 'w', encoding='utf-8') as modified: - nbformat.write(notebook, modified) - - -def get_notebook_execution_count(notebook): - """ - return the sum of all execution counts for code cells - note that this code cell does not contribute to the count: - it is always saved before its execution_count gets updated - and so the value of execution_count for this cell is always 'None' - """ - new_executed_count = 0 - for i, cell in enumerate(notebook['cells']): - if cell['cell_type'] == 'code': - # print(f"cell id: {cell['id']} and execution_count {cell['execution_count']} ") - cell_count = cell['execution_count'] - if cell_count is None: - cell_count = 0 - new_executed_count += cell_count - # print(f"new executed count {new_executed_count}") - return new_executed_count - - -def make_content(nb_gallery_file_id, attachment_files): - content = f""" - - """ - if len(attached_data_files) != 0: - for attachment_file in attached_data_files.split(","): - content += f""" - - """ - # print(f"content is {content}") - return content + rspace_document_target_field_id: + Default behaviour writes links to this notebook into the 'first' field in a document (field '0'). Set this to a value + if a different field should be used. + If this is set to a value other than None, rspace_prexisting_document_id must be set to a value other than None. 
+ server_port: + Set this to a value if server_url is calculated correctly except for the port (which will happen, for example + if the port is being mapped inside a docker container to an external port) + + get_new_password: + Set this to true to manually enter a new password -def remove_jupyter_attachment_divs(content, nb_gallery_file_id, attachment_files): - """ - Iterate all attachments in the document field and if any have ids matching the stored ids for this notebook - then remove them """ - soup = BeautifulSoup(content, 'html.parser') - attachment_divs = soup.find_all("div", {"class": "attachmentDiv"}) - for attachment_div in attachment_divs: - href_tag = attachment_div.find('a') - # print(f"href_tag{href_tag}") - gallery_link = '/Streamfile/' + str(nb_gallery_file_id) - if gallery_link == href_tag['href']: - attachment_div.decompose() - continue + def get_server_urls(): + all_urls = [] + if (server_url is not None): + all_urls.append(server_url + '/lab/tree/' + notebook_name) + else: + try: + for srv in ipynbname._list_maybe_running_servers(): + srv, path = ipynbname._find_nb_path() + if server_port is not None: + srv_url = srv['url'] + # print(f"srv_url: {srv_url}") + # print(f"root_dir: {srv['root_dir']}") + # print(f"path: {str(path)}") + part_url = srv_url[:srv_url.rfind(':') + 1] + # print(f"part_url: {part_url}") + all_urls.append(part_url + str(server_port) + '/lab/tree/' + str(path)) + else: + all_urls.append(srv['url'] + 'lab/tree/' + str(path)) + except Exception: + print(f"Error determining server urls, please manually set a value for 'server_url'") + raise # Code may fail if server has a password/doesnt use token auth - see ipynbname README + return all_urls + + + def get_server_roots(): + """ + this will only be called if ipyname library is working correctly + """ + all_roots = [] + try: + if len(all_roots) == 0: + for srv in ipynbname._list_maybe_running_servers(): + srv, path = ipynbname._find_nb_path() + root = srv['root_dir'] + all_roots.append(root) + except Exception: + print(f"Error determining server roots, please manually set a value for 'server_url'") + raise # Code may fail if server has a password/doesnt use token auth - see ipynbname README + return all_roots + + + def get_notebook_name(): + try: + if notebook_name is not None: + if '/' in notebook_name: + notebook_name_alone = notebook_name[notebook_name.rfind('/') + 1:] + else: + notebook_name_alone = notebook_name + return {'name': notebook_name_alone, 'root_name': notebook_name_alone[:notebook_name_alone.rfind('.')], + 'name_path': notebook_name} + nb_fname = ipynbname.name() + nb_path = str(ipynbname.path()) + for srv_root in get_server_roots(): + if not srv_root.endswith("/"): + srv_root = srv_root + "/" + if srv_root in nb_path: + nb_path = nb_path.replace(srv_root, '') + # print(f"nb_path: {nb_path}") + ext_pos = ('' + nb_path).rfind('.') + ext = nb_path[ext_pos:] + return {'name': nb_fname + ext, 'root_name': nb_fname, 'name_path': nb_path} + except Exception as e: + print(f"Error getting notebook name, please manually set a value for 'notebook_name'") + raise + + def get_password(): + """ + Retrieves password from (or saves a new password to) keyring + """ + from keyring import get_keyring + print("Keyring method: " + str(get_keyring())) + try: + + # TODO - Define the service name (e.g., the notebook name the secret is for) + service_id = "RSpaceJupyterDemoApp" + # TODO - Define the username associated with the secret + username = "myuser" # use your own username + + retrieved_password = 
keyring.get_password(service_id, username) + if retrieved_password is None or get_new_password: + retrieved_password = getpass.getpass("Please enter your RSpace Api key: ") + keyring.set_password(service_id, username, retrieved_password) + return retrieved_password + except Exception as e: + print(f"Error getting password: {e}") + return None + + + def get_rspace_client(): + """ + Returns rspace ELN API client + """ + try: + global rspace_client + if rspace_client is None: + retrieved_password = get_password() + if retrieved_password is None: + %pip install keyrings.alt + retrieved_password = get_password() + rspace_client = eln.ELNClient(rspace_url, retrieved_password) + print(rspace_client.get_status()) + return rspace_client + except Exception as e: + print(traceback.format_exc()) + print(f"Error connecting to RSpace: {e}") + return None + + + def save_rspace_data(rspace_doc, attachments, gallery_file, execution_count, history_data): + # Define the filename to save the state + state_filename = get_notebook_name()['root_name'] + "_state.pkl" + print(f"writing to file: {state_filename}") + with open(state_filename, 'wb') as f: + dill.dump({RSPACE_DOC_FOR_NOTEBOOK: rspace_doc, RSPACE_ATTACHMENTS_FOR_NOTEBOOK: attachments, + RSPACE_GALLERY_FILE_FOR_NOTEBOOK: gallery_file, RSPACE_EXECUTION_COUNT_FOR_NOTEBOOK: execution_count, RSPACE_HISTORY_DATA:history_data }, f) + + + def load_data(): + state_filename = get_notebook_name()['root_name'] + "_state.pkl" + + if os.path.exists(state_filename): + # Load the variables from the file using dill + with open(state_filename, 'rb') as f: + try: + loaded_state = dill.load(f) + except Exception as e: + loaded_state = {} + else: + loaded_state = {} + print(f"State file '{state_filename}' not found. No variables loaded.") + return loaded_state + + + async def save_notebook(): + app.commands.execute('docmanager:save') + # 'docmanager:save' does not appear to hook into any callback invoked when the document is actually saved + await asyncio.sleep(1) + + + async def reload_notebook(): + app.commands.execute('docmanager:reload') + # 'docmanager:reload' does not appear to hook into any callback invoked when the document is actually reloaded + await asyncio.sleep(1) + + def make_metadata_cell(nb_gallery_file, attachment_files, rspace_doc, history_data): + rspace_document_file_id =str(rspace_doc['id']) + # new content plus new attachment data increments the document version by two + rspace_document_version = 2 if rspace_doc['version'] == 1 else rspace_doc['version'] + 2 + rspace_document_name = rspace_doc['name'] + rspace_document_globalId = rspace_doc['globalId']+'v'+str(rspace_document_version) + nb_gallery_file_id = nb_gallery_file['id'] + nb_gallery_file_version = int(nb_gallery_file['version']) + # nb_gallery_file_version = 1 if nb_gallery_file_version == 1 else nb_gallery_file_version + 1 + nb_gallery_file_version = nb_gallery_file_version + 1 + nb_gallery_file_name = nb_gallery_file['name'] + meta_data_cell = nbformat.v4.new_markdown_cell() + rspace_doc_for_markdown = f'[The RSpace Document describing this notebook, version: {rspace_document_version}]({rspace_url}{RSPACE_DOC_URL}{rspace_document_file_id})' + gallery_doc_markdown = f'[This Notebook in RSpace Gallery: {nb_gallery_file_name} version: {nb_gallery_file_version}]({rspace_url}gallery/item/{nb_gallery_file_id})' + meta_data_cell['source'] = rspace_doc_for_markdown + "
" + gallery_doc_markdown + if len(attached_data_files) != 0: + attached_data_files_list = attached_data_files.split(",") + for attached_data in attached_data_files_list: + attachment_file_id = attachment_files.get(attached_data, {}).get('id') + attachment_version = attachment_files.get(attached_data, {}).get('version') + meta_data_cell['source'] += f'
[Attached Data {attached_data} version: {attachment_version} ]({rspace_url}gallery/item/{attachment_file_id})' + else: + meta_data_cell['source'] += f'
No Attached Data' + for url in get_server_urls(): + meta_data_cell['source'] += f'
[This notebook on the jupyter server]({url})' + new_history = f'
RSpace doc [{rspace_document_name} version {rspace_document_version}]({rspace_url}{RSPACE_DOC_VERSION_URL_START}{rspace_document_globalId}) contains this Notebook, version {nb_gallery_file_version}, executed with: ' + if len(attached_data_files) != 0: + for attached_data in attached_data_files.split(","): + attachment_file_id = attachment_files.get(attached_data, {}).get('id') + attachment_version = attachment_files.get(attached_data, {}).get('version') + new_history += f'Data {attached_data} version: {attachment_version} ' + # print(f'RSPACE_HISTORY_DATA: {RSPACE_HISTORY_DATA}') + history_data['text'] = new_history + history_data['text'] + meta_data_cell['source'] += f"
{history_data['text']}" + meta_data_cell['metadata'] = { + "rspace_metadata": {"documentFor": "docid", "notebook_file": "docid", "attachments": [""]}} + return meta_data_cell + + async def add_rspace_details_to_notebook_metadata(fname, notebook, nb_gallery_file, attachment_files, + rspace_doc, history_data): + """ + We have to save meta data about a notebook before its been uploaded to the gallery. + Therefore increment version by 1 when writing the metadata. If nb_gallery_file[id] is None + its the initial upload to the Gallery and so do not write any meta data + """ + if nb_gallery_file.get('id') is None: + return + await save_notebook() + meta_data_cell = make_metadata_cell(nb_gallery_file, attachment_files, rspace_doc,history_data) + replaced = False + for i, cell in enumerate(notebook['cells']): + if 'rspace_metadata' in cell['metadata']: + notebook["cells"][i] = meta_data_cell + replaced = True + if replaced is False: + notebook["cells"].extend([meta_data_cell]) + with open(fname, 'w', encoding='utf-8') as modified: + nbformat.write(notebook, modified) + + + def get_notebook_execution_count(notebook): + """ + return the sum of all execution counts for code cells + note that this code cell does not contribute to the count: + it is always saved before its execution_count gets updated + and so the value of execution_count for this cell is always 'None' + """ + new_executed_count = 0 + for i, cell in enumerate(notebook['cells']): + if cell['cell_type'] == 'code': + # print(f"cell id: {cell['id']} and execution_count {cell['execution_count']} ") + cell_count = cell['execution_count'] + if cell_count is None: + cell_count = 0 + new_executed_count += cell_count + # print(f"new executed count {new_executed_count}") + return new_executed_count + + + def make_content(nb_gallery_file_id, attachment_files): + content = f""" + + """ if len(attached_data_files) != 0: for attachment_file in attached_data_files.split(","): - attachment_file_id = attachment_files.get(attachment_file, {}).get('id') - attachment_link = '/Streamfile/' + str(attachment_file_id) - if attachment_link == href_tag['href']: - attachment_div.decompose() - break - return soup.prettify() - - -def upload_file_to_gallery(rspaceid, file, client): - if rspaceid is None: - print(f'start upload file {file} using {client}') - data = client.upload_file(file) - else: - print('start update file') - data = client.update_file(file, rspaceid) - return data - - -def calc_hash(filename): - sha256_hash = hashlib.sha256() - with open(filename, "rb") as f: - # Read and update hash string value in blocks of 4K - for byte_block in iter(lambda: f.read(4096), b""): - sha256_hash.update(byte_block) - return sha256_hash.hexdigest() - - -def upload_attached_data(attachment_files): - client = get_rspace_client() - if len(attached_data_files) != 0: - attached_data_files_list = attached_data_files.split(",") - for attached_data in attached_data_files_list: - if attached_data: - # make file paths to data relative to the location of this notebook - nested_dir_pos = get_notebook_name()['name_path'].count('/') - relative_attached_data = attached_data - for i in range(nested_dir_pos): - relative_attached_data = "../" + relative_attached_data - # print(f"relative_attached_data: {relative_attached_data}") - with open(relative_attached_data, 'r', encoding='utf-8') as attch: - attachment_file_id = attachment_files.get(attached_data, {}).get('id') - attachment_file_hash = attachment_files.get(attached_data, {}).get('hash') - calc_latest_hash = 
calc_hash(relative_attached_data) - if calc_latest_hash != attachment_file_hash: - attachment_file_data = upload_file_to_gallery(attachment_file_id, attch, client) - attachment_file_data['hash'] = calc_latest_hash - attachment_files[attached_data] = attachment_file_data - else: - print(f"File {attached_data} not changed so no update") - # print(f"attached files: {attachment_files}") + content += f""" + + """ + # print(f"content is {content}") + return content + + + def remove_jupyter_attachment_divs(content, nb_gallery_file_id, attachment_files): + """ + Iterate all attachments in the document field and if any have ids matching the stored ids for this notebook + then remove them + """ + soup = BeautifulSoup(content, 'html.parser') + attachment_divs = soup.find_all("div", {"class": "attachmentDiv"}) + for attachment_div in attachment_divs: + href_tag = attachment_div.find('a') + # print(f"href_tag{href_tag}") + gallery_link = '/Streamfile/' + str(nb_gallery_file_id) + if gallery_link == href_tag['href']: + attachment_div.decompose() + continue + if len(attached_data_files) != 0: + for attachment_file in attached_data_files.split(","): + attachment_file_id = attachment_files.get(attachment_file, {}).get('id') + attachment_link = '/Streamfile/' + str(attachment_file_id) + if attachment_link == href_tag['href']: + attachment_div.decompose() + break + return soup.prettify() + + + def upload_file_to_gallery(rspaceid, file, client): + if rspaceid is None: + print(f'start upload file {file} using {client}') + data = client.upload_file(file) + else: + print('start update file') + data = client.update_file(file, rspaceid) + return data -async def upload_notebook_to_gallery(current_notebook, notebook, nb_gallery_file, attachment_files, - rspace_doc, history_data): - """ - Metadata about the notebook is written to the notebook before it us uploaded to the Gallery (and the version incremented predictively by one). - If the notebook has never been uploaded to the Gallery we have no stored rspace-id to write to the meta data and so we do not write any meta data. - We do the initial upload (which creates a Gallery file with version = '1'. Then we write meta data (incrementing the version to '2' and upload the notebook - a second time. 
- """ - await add_rspace_details_to_notebook_metadata(current_notebook, notebook, nb_gallery_file, attachment_files, - rspace_doc, history_data) - with open(current_notebook, 'r', encoding='utf-8') as nb_file: + def calc_hash(filename): + sha256_hash = hashlib.sha256() + with open(filename, "rb") as f: + # Read and update hash string value in blocks of 4K + for byte_block in iter(lambda: f.read(4096), b""): + sha256_hash.update(byte_block) + return sha256_hash.hexdigest() + + + def upload_attached_data(attachment_files): client = get_rspace_client() - nb_gallery_file = upload_file_to_gallery(nb_gallery_file.get('id'), nb_file, client) - # print(f"gallery file for nb: {nb_gallery_file}") - if nb_gallery_file.get('version') == 1: - await asyncio.sleep(1) + if len(attached_data_files) != 0: + attached_data_files_list = attached_data_files.split(",") + for attached_data in attached_data_files_list: + if attached_data: + # make file paths to data relative to the location of this notebook + nested_dir_pos = get_notebook_name()['name_path'].count('/') + relative_attached_data = attached_data + for i in range(nested_dir_pos): + relative_attached_data = "../" + relative_attached_data + # print(f"relative_attached_data: {relative_attached_data}") + with open(relative_attached_data, 'r', encoding='utf-8') as attch: + attachment_file_id = attachment_files.get(attached_data, {}).get('id') + attachment_file_hash = attachment_files.get(attached_data, {}).get('hash') + calc_latest_hash = calc_hash(relative_attached_data) + if calc_latest_hash != attachment_file_hash: + attachment_file_data = upload_file_to_gallery(attachment_file_id, attch, client) + attachment_file_data['hash'] = calc_latest_hash + attachment_files[attached_data] = attachment_file_data + else: + print(f"File {attached_data} not changed so no update") + # print(f"attached files: {attachment_files}") + + + async def upload_notebook_to_gallery(current_notebook, notebook, nb_gallery_file, attachment_files, + rspace_doc, history_data): + """ + Metadata about the notebook is written to the notebook before it us uploaded to the Gallery (and the version incremented predictively by one). + If the notebook has never been uploaded to the Gallery we have no stored rspace-id to write to the meta data and so we do not write any meta data. + We do the initial upload (which creates a Gallery file with version = '1'. Then we write meta data (incrementing the version to '2' and upload the notebook + a second time. 
+ """ await add_rspace_details_to_notebook_metadata(current_notebook, notebook, nb_gallery_file, attachment_files, rspace_doc, history_data) with open(current_notebook, 'r', encoding='utf-8') as nb_file: + client = get_rspace_client() nb_gallery_file = upload_file_to_gallery(nb_gallery_file.get('id'), nb_file, client) + # print(f"gallery file for nb: {nb_gallery_file}") + if nb_gallery_file.get('version') == 1: await asyncio.sleep(1) - # print(f"nb_gallery_file was uploaded a second time and was : {nb_gallery_file}") - await reload_notebook() - return nb_gallery_file - -def get_field_content(rspace_doc): - if RSPACE_DOCUMENT_TARGET_FIELD_ID is None: - return rspace_doc['fields'][0]['content'] - else: - for field in rspace_doc['fields']: - if field['id'] == RSPACE_DOCUMENT_TARGET_FIELD_ID: - return field['content'] - return None - -def assert_invariants(): - if RSPACE_DOCUMENT_TARGET_FIELD_ID is not None and RSPACE_PREEXISITING_DOCUMENT_ID is None: - raise Exception("If RSPACE_DOCUMENT_TARGET_FIELD_ID has a value RSPACE_PREEXISITING_DOCUMENT_ID must also.") - - if server_url is not None and notebook_name is None or notebook_name is not None and server_url is None: - raise Exception("Both server_url and notebook_name must be either None or have a value") - -async def sync_notebook(): - """ - Saves notebook using ipylab and then writes notebook to Rspace document as - an attachment if the execution_count of the notebook has changed since the last time - this cell was run. Note that the execution count of this cell does not contribute to - the comparison - we will not write data to RSpace if only this cell has been run - since the last time data was written to RSpace. - Attached data is also written to RSpace if its hash_sum has changed. + await add_rspace_details_to_notebook_metadata(current_notebook, notebook, nb_gallery_file, attachment_files, + rspace_doc, history_data) + with open(current_notebook, 'r', encoding='utf-8') as nb_file: + nb_gallery_file = upload_file_to_gallery(nb_gallery_file.get('id'), nb_file, client) + await asyncio.sleep(1) + # print(f"nb_gallery_file was uploaded a second time and was : {nb_gallery_file}") + await reload_notebook() + return nb_gallery_file + + def get_field_content(rspace_doc): + if rspace_document_target_field_id is None: + return rspace_doc['fields'][0]['content'] + else: + for field in rspace_doc['fields']: + if field['id'] == rspace_document_target_field_id: + return field['content'] + return None + + def assert_invariants(): + if (len(rspace_url)==0): + raise Exception("You must provide a URL for your RSpace instance.") + + parsed_url = urlparse(rspace_url) + if not (parsed_url.scheme and parsed_url.netloc): + raise Exception("Your value for RSpace url is not a valid url.") + + if rspace_document_target_field_id is not None and rspace_prexisting_document_id is None: + raise Exception("If rspace_document_target_field_id has a value rspace_prexisting_document_id must also.") + + if server_url is not None and notebook_name is None or notebook_name is not None and server_url is None: + raise Exception("Both server_url and notebook_name must be either None or have a value") - The notebook and attached data will always be written to RSpace at least once (on the first time this cell is run). 
- """ assert_invariants() - rspace_doc = None - attachment_filess = None - gallery_file = None await save_notebook() get_server_urls() # print(f"notebook name: {get_notebook_name()}") - current_notebook = get_notebook_name()['name'] if IN_COLAB == False else get_notebook_name()['name_path'] + current_notebook = get_notebook_name()['name'] with open(current_notebook, 'r') as notebook: notebook_node = nbformat.read(notebook, nbformat.NO_CONVERT) new_hash = calc_hash(current_notebook) # print(f" new hash {new_hash}") try: loaded_state = load_data() - execution_count = loaded_state.get(EXECUTION_COUNT_FOR_NOTEBOOK) + execution_count = loaded_state.get(RSPACE_EXECUTION_COUNT_FOR_NOTEBOOK) new_execution_count = get_notebook_execution_count(notebook_node) print(f"New execution count {new_execution_count}") @@ -502,13 +528,11 @@ async def sync_notebook(): # return client = get_rspace_client() rspace_doc = loaded_state.get(RSPACE_DOC_FOR_NOTEBOOK) - attachment_files = loaded_state.get(ATTACHMENTS_FOR_NOTEBOOK, {}) - nb_gallery_file = loaded_state.get(GALLERY_FILE_FOR_NOTEBOOK, {}) + attachment_files = loaded_state.get(RSPACE_ATTACHMENTS_FOR_NOTEBOOK, {}) + nb_gallery_file = loaded_state.get(RSPACE_GALLERY_FILE_FOR_NOTEBOOK, {}) nb_gallery_file_id = nb_gallery_file.get('id') - history_data = loaded_state.get(HISTORY_DATA,{'text':''}) - # Modified this line to get the 'name_path' which should be the actual file path - current_notebook_info = get_notebook_name() - current_notebook = current_notebook_info['name_path'] + history_data = loaded_state.get(RSPACE_HISTORY_DATA,{'text':''}) + current_notebook = get_notebook_name()['name'] attachments = None if rspace_doc is not None: print( @@ -520,9 +544,9 @@ async def sync_notebook(): print(f"This notebook saved previously to Gallery with RSpaceID {nb_gallery_file_id}") else: print("Notebook not previously saved to RSpace Gallery") - if rspace_doc is None and RSPACE_PREEXISITING_DOCUMENT_ID is None: + if rspace_doc is None and rspace_prexisting_document_id is None: rspace_doc = client.create_document(name="DocumentFor_" + current_notebook, tags=["Python", "API", "Jupyter"]) - rspace_document_file_id = str(rspace_doc['id']) if RSPACE_PREEXISITING_DOCUMENT_ID is None else RSPACE_PREEXISITING_DOCUMENT_ID + rspace_document_file_id = str(rspace_doc['id']) if rspace_prexisting_document_id is None else rspace_prexisting_document_id rspace_doc = client.get_document(rspace_document_file_id) nb_gallery_file = await upload_notebook_to_gallery(current_notebook,notebook_node, nb_gallery_file,attachment_files, rspace_doc, history_data) # print(f"nb_gallery_file was finally: {nb_gallery_file}") @@ -531,9 +555,9 @@ async def sync_notebook(): previous_content = get_field_content(rspace_doc) previous_content = remove_jupyter_attachment_divs(previous_content, nb_gallery_file_id, attachment_files) new_content = previous_content + make_content(nb_gallery_file_id, attachment_files) - if RSPACE_DOCUMENT_TARGET_FIELD_ID is not None: + if rspace_document_target_field_id is not None: rspace_doc = client.update_document(rspace_document_file_id, tags=['Python', 'API', 'Jupyter'], - fields=[{'id': RSPACE_DOCUMENT_TARGET_FIELD_ID,"content": new_content}]) + fields=[{'id': rspace_document_target_field_id,"content": new_content}]) else: rspace_doc = client.update_document(rspace_document_file_id, tags=['Python', 'API', 'Jupyter'], fields=[{"content": new_content}]) @@ -544,4 +568,4 @@ async def sync_notebook(): return None -await sync_notebook() \ No newline at end of file +# await 
sync_notebook_to_rspace(rspace_url="https://researchspace2.eu.ngrok.io/", attached_data_files="spectroscopy_data.csv,data/spectroscopy_data2.csv,data/spectroscopy_data3.csv") From 48a8a32482d9c7dee4b666a78a5042406a8f7702 Mon Sep 17 00:00:00 2001 From: nhanlon2 Date: Thu, 23 Oct 2025 20:39:37 +0100 Subject: [PATCH 26/34] RSDEV-782-Jupyter-Notebooks: sync_notebook.py tested --- rspace_client/notebook_sync/sync_notebook.py | 489 +++++++++++++++++++ 1 file changed, 489 insertions(+) create mode 100644 rspace_client/notebook_sync/sync_notebook.py diff --git a/rspace_client/notebook_sync/sync_notebook.py b/rspace_client/notebook_sync/sync_notebook.py new file mode 100644 index 0000000..a50e494 --- /dev/null +++ b/rspace_client/notebook_sync/sync_notebook.py @@ -0,0 +1,489 @@ +from notebook import app +from rspace_client.eln import eln +import os +import hashlib +import dill +import ipynbname +from ipylab import JupyterFrontEnd +import traceback +from bs4 import BeautifulSoup +import nbformat +import asyncio +import getpass +import keyring +from urllib.parse import urlparse +import time + +RSPACE_DOC_FOR_NOTEBOOK = 'rspace_doc_for_notebook' +RSPACE_ATTACHMENTS_FOR_NOTEBOOK = 'data_attached_to_notebook' +RSPACE_GALLERY_FILE_FOR_NOTEBOOK = 'file_for_notebook' +RSPACE_EXECUTION_COUNT_FOR_NOTEBOOK = 'RSPACE_EXECUTION_COUNT_FOR_NOTEBOOK' +RSPACE_HISTORY_DATA = 'RSPACE_HISTORY_DATA' +RSPACE_DOC_URL = 'workspace/editor/structuredDocument/' +RSPACE_DOC_VERSION_URL_START = 'workspace/editor/structuredDocument/audit/view?globalId=' +RSPACE_KEYRING_SERVICE_ID = 'RSpaceSyncJupyterNotebookApp' + +rspace_client = None +app = JupyterFrontEnd() + + +def set_password(rspace_username=None): + """ + sets password for 'rspace_username' in keyring + """ + if rspace_username is None: + raise Exception("You must provide an rspace_username.") + service_id = RSPACE_KEYRING_SERVICE_ID + retrieved_password = getpass.getpass("Please enter the RSpace Api key for the provided username: ") + keyring.set_password(service_id, rspace_username, retrieved_password) + return "password set" + + +async def sync_notebook_to_rspace(rspace_url="", attached_data_files="", notebook_name=None, server_url=None, + rspace_prexisting_document_id=None, rspace_document_target_field=None, + server_port=None, rspace_username=None): + """ + Saves notebook using ipylab and then writes notebook to Rspace document as + an attachment if the execution_count of the notebook has changed since the last time + this cell was run. Note that the execution count of this cell does not contribute to + the comparison - we will not write data to RSpace if only this cell has been run + since the last time data was written to RSpace. + Attached data is also written to RSpace if its hash_sum has changed. + + The notebook and attached data will always be written to RSpace at least once (on the first time this cell is run). + + Parameters: + + rspace_url : Your RSpace instance goes here + + attached_data_files : + All data that will be saved to RSpace along with this notebook: select the data in the file browser and choose 'copy path' + then paste here using a ',' comma to separate files if there is more than one. + + Example: + attached_data_files = "spectroscopy_data.csv, data/spectroscopy_data2.csv, data/spectroscopy_data3.csv" + The code in this cell will calculate paths to the data relative to the location of this notebook. Therefore do + not change the 'paths' to the data, regardless of whether this notebook is in the top directory or in a sub directory. 
+ + If you wish to have no attached data, set this value to be "" (a pair of double quotes) + + Example: + attached_data_files = "" + + notebook_name: + This must be set to a the value of the PATH to the notebook (select the notebook in the file browser and choose 'copy path'), + if exceptions are thrown trying to determine the notebook name. + + If this value is set server_url MUST also be set. + server_url: + This must be set to a value if exceptions are thrown or the calculated value is incorrect when trying to determine the server url. Give the url + of the server including the port: eg http://localhost:10000 (no trailing '/') + + If this value is set, notebook_name MUST also be set. + + rspace_prexisting_document_id: + Default behaviour creates a new RSpace document when this cell is executed and attached this jupyter notebook to the new document. + + Setting rspace_prexisting_document_id to a value other than None will attach this jupyter notebook to the RSpace document + with the given ID instead of creating a new RSpace document. + + rspace_document_target_field: + Default behaviour writes links to this notebook into the 'first' field in an RSpace document (field '0'). Set this to a value + if a different field should be used. For example, to target the third field in a document, use the value '2'. + + If this is set to a value other than None, rspace_prexisting_document_id must be set to a value other than None. + server_port: + Set this to a value if server_url is calculated correctly except for the port (which will happen, for example + if the port is being mapped inside a docker container to an external port) + + rspace_username: + This must be set to the name of the rspace user where the notebook is being saved + """ + + def get_server_urls(): + all_urls = [] + if (server_url is not None): + all_urls.append(server_url + '/lab/tree/' + notebook_name) + else: + try: + for srv in ipynbname._list_maybe_running_servers(): + srv, path = ipynbname._find_nb_path() + if server_port is not None: + srv_url = srv['url'] + part_url = srv_url[:srv_url.rfind(':') + 1] + all_urls.append(part_url + str(server_port) + '/lab/tree/' + str(path)) + else: + all_urls.append(srv['url'] + 'lab/tree/' + str(path)) + except Exception: + print(f"Error determining server urls, please manually set a value for 'server_url'") + raise # Code may fail if server has a password/doesnt use token auth - see ipynbname README + return all_urls + + def get_server_roots(): + """ + this will only be called if ipyname library is working correctly + """ + all_roots = [] + try: + if len(all_roots) == 0: + for srv in ipynbname._list_maybe_running_servers(): + srv, path = ipynbname._find_nb_path() + root = srv['root_dir'] + all_roots.append(root) + except Exception: + print(f"Error determining server roots, please manually set a value for 'server_url'") + raise # Code may fail if server has a password/doesnt use token auth - see ipynbname README + return all_roots + + def get_notebook_name(): + try: + if notebook_name is not None: + if '/' in notebook_name: + notebook_name_alone = notebook_name[notebook_name.rfind('/') + 1:] + else: + notebook_name_alone = notebook_name + return {'name': notebook_name_alone, 'root_name': notebook_name_alone[:notebook_name_alone.rfind('.')], + 'name_path': notebook_name} + nb_fname = ipynbname.name() + nb_path = str(ipynbname.path()) + for srv_root in get_server_roots(): + if not srv_root.endswith("/"): + srv_root = srv_root + "/" + if srv_root in nb_path: + nb_path = 
nb_path.replace(srv_root, '') + ext_pos = ('' + nb_path).rfind('.') + ext = nb_path[ext_pos:] + return {'name': nb_fname + ext, 'root_name': nb_fname, 'name_path': nb_path} + except Exception as e: + print(f"Error getting notebook name, please manually set a value for 'notebook_name'") + raise + + def get_password(): + """ + Retrieves password from (or saves a new password to) keyring + """ + service_id = RSPACE_KEYRING_SERVICE_ID + + retrieved_password = keyring.get_password(service_id, rspace_username) + if retrieved_password is None: + retrieved_password = getpass.getpass("Please enter your RSpace Api key: ") + keyring.set_password(service_id, username, retrieved_password) + return retrieved_password + + def get_rspace_client(): + """ + Returns rspace ELN API client + """ + global rspace_client + if rspace_client is None: + retrieved_password = get_password() + rspace_client = eln.ELNClient(rspace_url, retrieved_password) + return rspace_client + + def save_rspace_data(rspace_doc, attachments, gallery_file, execution_count, history_data): + # Define the filename to save the state + state_filename = get_notebook_name()['root_name'] + "_state.pkl" + with open(state_filename, 'wb') as f: + dill.dump({RSPACE_DOC_FOR_NOTEBOOK: rspace_doc, RSPACE_ATTACHMENTS_FOR_NOTEBOOK: attachments, + RSPACE_GALLERY_FILE_FOR_NOTEBOOK: gallery_file, + RSPACE_EXECUTION_COUNT_FOR_NOTEBOOK: execution_count, RSPACE_HISTORY_DATA: history_data}, f) + + def load_data(): + state_filename = get_notebook_name()['root_name'] + "_state.pkl" + + if os.path.exists(state_filename): + # Load the variables from the file using dill + with open(state_filename, 'rb') as f: + try: + loaded_state = dill.load(f) + except Exception as e: + loaded_state = {} + else: + loaded_state = {} + return loaded_state + + async def save_notebook(): + ''' + 'docmanager:save' does not appear to hook into any callback invoked when the document is actually saved. So we have no + idea when it has completed. Jupyter Notebooks can be (at least) 100MB in size - there are some limitations imposed by the + Tornado web server Jupyter uses. When save is called the entire contents of the notebook are sent as the body of a REST + PUT request, including any images in cell outputs. + + We can write to the notebook data store synchronously using python's file handling API but we cant access the contents of the notebook the user + actually sees because this runs in the browser and the Jupyter API gives no access. The version of the notebook in the browser does + not match the version in the back end file store whenever the notebook is in 'unsaved' state (a black circle appears in its Jupyter notebook tab). + + There is a REST API which can get file contents: + https://github.com/ipython/ipython/wiki/IPEP-27%3A-Contents-Service#get-an-existing-file + and save: + https://github.com/ipython/ipython/wiki/IPEP-27%3A-Contents-Service#save-file + However the GET method of this REST API also fetches its data from the BACK END, not from the document front end contents. + The rest API would also be difficult to use as its not straightforward to obtain the host location for URL endpoints programatically. + + The ipylab library being used in this code wraps widgets in the UI and calling 'docmanager:save' programatically 'clicks' the save button. 
+ + Solution: + (We do not get into a infinite loop when save is called because the notebook has ALWAYS changed - the act + of running the sync code begins *prior to calling save* by outputting text on the screen, causing the notebook to enter 'unsaved' state in the UI.) + + 1) Get modified time of file + 2) Loop until modified time changes + 3) Timeout after 30s - infinite loop can happen when user enters an incorrect notebook name and then mistakely saves a different notebook which is unchanged + ''' + file_path = get_notebook_name()['name_path'] + start_mod_time = os.path.getmtime(file_path) + curr_mod_time = start_mod_time + start_watch_time = time.time() + # this arbitrary 1 second sleep is to allow the UI time to update and register that it is the 'unsaved' state + await asyncio.sleep(1) + app.commands.execute('docmanager:save') + while start_mod_time == curr_mod_time: + await asyncio.sleep(0.1) + curr_mod_time = os.path.getmtime(file_path) + elapsed_time = time.time() - start_watch_time + if elapsed_time > 30: + msg = "TIMEOUT ON SAVE ***** DID YOU MEAN TO SAVE NOTEBOOK: " + file_path + " ?" + raise Exception(msg) + + async def reload_notebook(): + app.commands.execute('docmanager:reload') + # 'docmanager:reload' does not appear to hook into any callback invoked when the document is actually reloaded + await asyncio.sleep(1) + + def make_metadata_cell(nb_gallery_file, attachment_files, rspace_doc, history_data): + rspace_document_file_id = str(rspace_doc['id']) + # new content plus new attachment data increments the document version by two + rspace_document_version = 2 if rspace_doc['version'] == 1 else rspace_doc['version'] + 2 + rspace_document_name = rspace_doc['name'] + rspace_document_globalId = rspace_doc['globalId'] + 'v' + str(rspace_document_version) + nb_gallery_file_id = nb_gallery_file['id'] + nb_gallery_file_version = int(nb_gallery_file['version']) + nb_gallery_file_version = nb_gallery_file_version + 1 + nb_gallery_file_name = nb_gallery_file['name'] + meta_data_cell = nbformat.v4.new_markdown_cell() + rspace_doc_for_markdown = f'[The RSpace Document describing this notebook, version: {rspace_document_version}]({rspace_url}{RSPACE_DOC_URL}{rspace_document_file_id})' + gallery_doc_markdown = f'[This Notebook in RSpace Gallery: {nb_gallery_file_name} version: {nb_gallery_file_version}]({rspace_url}gallery/item/{nb_gallery_file_id})' + meta_data_cell['source'] = rspace_doc_for_markdown + "
" + gallery_doc_markdown + if len(attached_data_files) != 0: + attached_data_files_list = attached_data_files.split(",") + for attached_data in attached_data_files_list: + attachment_file_id = attachment_files.get(attached_data, {}).get('id') + attachment_version = attachment_files.get(attached_data, {}).get('version') + meta_data_cell[ + 'source'] += f'
[Attached Data {attached_data} version: {attachment_version} ]({rspace_url}gallery/item/{attachment_file_id})' + else: + meta_data_cell['source'] += f'
No Attached Data' + for url in get_server_urls(): + meta_data_cell['source'] += f'
[This notebook on the jupyter server]({url})' + new_history = f'
RSpace doc [{rspace_document_name} version {rspace_document_version}]({rspace_url}{RSPACE_DOC_VERSION_URL_START}{rspace_document_globalId}) contains this Notebook, version {nb_gallery_file_version}, executed with: ' + if len(attached_data_files) != 0: + for attached_data in attached_data_files.split(","): + attachment_version = attachment_files.get(attached_data, {}).get('version') + new_history += f'Data {attached_data} version: {attachment_version} ' + history_data['text'] = new_history + history_data['text'] + meta_data_cell['source'] += f"
{history_data['text']}" + meta_data_cell['metadata'] = { + "rspace_metadata": {"documentFor": "docid", "notebook_file": "docid", "attachments": [""]}} + return meta_data_cell + + async def add_rspace_details_to_previously_uploaded_notebook_metadata(fname, notebook, nb_gallery_file, + attachment_files, + rspace_doc, history_data): + """ + We have to save meta data about a notebook before its been uploaded to the gallery. + Therefore increment version by 1 when writing the metadata. + + If nb_gallery_file[id] is None its the initial upload to the Gallery and so do not write any meta data + """ + if nb_gallery_file.get('id') is None: + return + meta_data_cell = make_metadata_cell(nb_gallery_file, attachment_files, rspace_doc, history_data) + replaced = False + for i, cell in enumerate(notebook['cells']): + if 'rspace_metadata' in cell['metadata']: + notebook["cells"][i] = meta_data_cell + replaced = True + if replaced is False: + notebook["cells"].extend([meta_data_cell]) + with open(fname, 'w', encoding='utf-8') as modified: + nbformat.write(notebook, modified) + + def get_notebook_execution_count(notebook): + """ + return the sum of all execution counts for code cells + note that this code cell does not contribute to the count: + it is always saved before its execution_count gets updated + and so the value of execution_count for this cell is always 'None' + """ + new_executed_count = 0 + for i, cell in enumerate(notebook['cells']): + if cell['cell_type'] == 'code': + cell_count = cell['execution_count'] + if cell_count is None: + cell_count = 0 + new_executed_count += cell_count + return new_executed_count + + def make_content(nb_gallery_file_id, attachment_files): + content = f""" + + """ + if len(attached_data_files) != 0: + for attachment_file in attached_data_files.split(","): + content += f""" + + """ + return content + + def remove_jupyter_attachment_divs(content, nb_gallery_file_id, attachment_files): + """ + Iterate all attachments in the document field and if any have ids matching the stored ids for this notebook + then remove them + """ + soup = BeautifulSoup(content, 'html.parser') + attachment_divs = soup.find_all("div", {"class": "attachmentDiv"}) + for attachment_div in attachment_divs: + href_tag = attachment_div.find('a') + gallery_link = '/Streamfile/' + str(nb_gallery_file_id) + if gallery_link == href_tag['href']: + attachment_div.decompose() + continue + if len(attached_data_files) != 0: + for attachment_file in attached_data_files.split(","): + attachment_file_id = attachment_files.get(attachment_file, {}).get('id') + attachment_link = '/Streamfile/' + str(attachment_file_id) + if attachment_link == href_tag['href']: + attachment_div.decompose() + break + return soup.prettify() + + def upload_file_to_gallery(rspaceid, file, client): + if rspaceid is None: + data = client.upload_file(file) + else: + data = client.update_file(file, rspaceid) + return data + + def calc_hash(filename): + sha256_hash = hashlib.sha256() + with open(filename, "rb") as f: + # Read and update hash string value in blocks of 4K + for byte_block in iter(lambda: f.read(4096), b""): + sha256_hash.update(byte_block) + return sha256_hash.hexdigest() + + def upload_attached_data(attachment_files): + client = get_rspace_client() + if len(attached_data_files) != 0: + attached_data_files_list = attached_data_files.split(",") + for attached_data in attached_data_files_list: + if attached_data: + # make file paths to data relative to the location of this notebook + nested_dir_pos = 
get_notebook_name()['name_path'].count('/') + relative_attached_data = attached_data + for i in range(nested_dir_pos): + relative_attached_data = "../" + relative_attached_data + with open(relative_attached_data, 'r', encoding='utf-8') as attch: + attachment_file_id = attachment_files.get(attached_data, {}).get('id') + attachment_file_hash = attachment_files.get(attached_data, {}).get('hash') + calc_latest_hash = calc_hash(relative_attached_data) + if calc_latest_hash != attachment_file_hash: + attachment_file_data = upload_file_to_gallery(attachment_file_id, attch, client) + attachment_file_data['hash'] = calc_latest_hash + attachment_files[attached_data] = attachment_file_data + + async def upload_notebook_to_gallery(current_notebook, notebook, nb_gallery_file, attachment_files, + rspace_doc, history_data): + """ + Metadata about the notebook is written to the notebook before it us uploaded to the Gallery (and the version incremented predictively by one). + If the notebook has never been uploaded to the Gallery we have no stored rspace-id to write to the meta data and so we do not write any meta data. + We do the initial upload (which creates a Gallery file with version = '1'. Then we write meta data (incrementing the version to '2' and upload the notebook + a second time. + """ + await add_rspace_details_to_previously_uploaded_notebook_metadata(current_notebook, notebook, nb_gallery_file, + attachment_files, + rspace_doc, history_data) + with open(current_notebook, 'r', encoding='utf-8') as nb_file: + client = get_rspace_client() + nb_gallery_file = upload_file_to_gallery(nb_gallery_file.get('id'), nb_file, client) + if nb_gallery_file.get('version') == 1: + await asyncio.sleep(1) + await add_rspace_details_to_previously_uploaded_notebook_metadata(current_notebook, notebook, + nb_gallery_file, attachment_files, + rspace_doc, history_data) + with open(current_notebook, 'r', encoding='utf-8') as nb_file: + nb_gallery_file = upload_file_to_gallery(nb_gallery_file.get('id'), nb_file, client) + await asyncio.sleep(1) + return nb_gallery_file + + def get_field_content(rspace_doc): + if rspace_document_target_field is None: + return rspace_doc['fields'][0]['content'] + else: + return rspace_doc['fields'][int(rspace_document_target_field)]['content'] + + def assert_invariants(): + if (len(rspace_url) == 0): + raise Exception("You must provide a URL for your RSpace instance.") + + parsed_url = urlparse(rspace_url) + if not (parsed_url.scheme and parsed_url.netloc): + raise Exception("Your value for RSpace url is not a valid url.") + + if rspace_username is None: + raise Exception("You must provide an rspace_username.") + + if rspace_document_target_field is not None and rspace_prexisting_document_id is None: + raise Exception("If rspace_document_target_field has a value rspace_prexisting_document_id must also.") + + if server_url is not None and notebook_name is None or notebook_name is not None and server_url is None: + raise Exception("Both server_url and notebook_name must be either None or have a value") + + assert_invariants() + current_notebook = get_notebook_name()['name'] + # do not remove this print statement as it is required to ensure notebook is always in modified state when we call save_notebook + print(f'Running sync on notebook:{current_notebook}') + await save_notebook() + get_server_urls() + with open(current_notebook, 'r') as notebook: + notebook_node = nbformat.read(notebook, nbformat.NO_CONVERT) + try: + loaded_state = load_data() + execution_count = 
loaded_state.get(RSPACE_EXECUTION_COUNT_FOR_NOTEBOOK) + new_execution_count = get_notebook_execution_count(notebook_node) + if execution_count == new_execution_count: + print("No execution since last sync: no data updated in RSpace") + return + client = get_rspace_client() + rspace_doc = loaded_state.get(RSPACE_DOC_FOR_NOTEBOOK) + attachment_files = loaded_state.get(RSPACE_ATTACHMENTS_FOR_NOTEBOOK, {}) + nb_gallery_file = loaded_state.get(RSPACE_GALLERY_FILE_FOR_NOTEBOOK, {}) + history_data = loaded_state.get(RSPACE_HISTORY_DATA, {'text': ''}) + current_notebook = get_notebook_name()['name'] + upload_attached_data(attachment_files) + if rspace_doc is None and rspace_prexisting_document_id is None: + rspace_doc = client.create_document(name="DocumentFor_" + current_notebook, + tags=["Python", "API", "Jupyter"]) + if rspace_document_target_field is not None: + rspace_document_target_field_id = str(rspace_doc['fields'][int(rspace_document_target_field)]['id']) + else: + rspace_document_target_field_id = str(rspace_doc['fields'][0]['id']) + rspace_document_file_id = str( + rspace_doc['id']) if rspace_prexisting_document_id is None else rspace_prexisting_document_id + rspace_doc = client.get_document(rspace_document_file_id) + nb_gallery_file = await upload_notebook_to_gallery(current_notebook, notebook_node, nb_gallery_file, + attachment_files, rspace_doc, history_data) + nb_gallery_file_id = nb_gallery_file.get('id') + previous_content = get_field_content(rspace_doc) + previous_content = remove_jupyter_attachment_divs(previous_content, nb_gallery_file_id, attachment_files) + new_content = make_content(nb_gallery_file_id, attachment_files) + previous_content + rspace_doc = client.update_document(rspace_document_file_id, tags=['Python', 'API', 'Jupyter'], + fields=[ + {'id': rspace_document_target_field_id, "content": new_content}]) + await reload_notebook() + save_rspace_data(rspace_doc, attachment_files, nb_gallery_file, new_execution_count, history_data) + return 'success' + except Exception as e: + print(traceback.format_exc()) + print(f"Error reading notebook file: {e}") + return traceback.format_exc() From d3551a1c42317817c601d942e04572e5fe1ee22f Mon Sep 17 00:00:00 2001 From: nhanlon2 Date: Fri, 24 Oct 2025 09:08:56 +0100 Subject: [PATCH 27/34] RSDEV-782-Jupyter-Notebooks: does not attempt to save_notebook unless it is python --- rspace_client/notebook_sync/sync_notebook.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/rspace_client/notebook_sync/sync_notebook.py b/rspace_client/notebook_sync/sync_notebook.py index a50e494..73fe736 100644 --- a/rspace_client/notebook_sync/sync_notebook.py +++ b/rspace_client/notebook_sync/sync_notebook.py @@ -439,11 +439,19 @@ def assert_invariants(): if server_url is not None and notebook_name is None or notebook_name is not None and server_url is None: raise Exception("Both server_url and notebook_name must be either None or have a value") + def notebook_can_be_saved(current_notebook): + with open(current_notebook, 'r') as notebook: + notebook_node = nbformat.read(notebook, nbformat.NO_CONVERT) + if notebook_node.metadata.kernelspec.display_name.lower() == 'python': + return True + return False + assert_invariants() current_notebook = get_notebook_name()['name'] # do not remove this print statement as it is required to ensure notebook is always in modified state when we call save_notebook print(f'Running sync on notebook:{current_notebook}') - await save_notebook() + if notebook_can_be_saved(current_notebook): + 
await save_notebook() get_server_urls() with open(current_notebook, 'r') as notebook: notebook_node = nbformat.read(notebook, nbformat.NO_CONVERT) @@ -478,8 +486,8 @@ def assert_invariants(): previous_content = remove_jupyter_attachment_divs(previous_content, nb_gallery_file_id, attachment_files) new_content = make_content(nb_gallery_file_id, attachment_files) + previous_content rspace_doc = client.update_document(rspace_document_file_id, tags=['Python', 'API', 'Jupyter'], - fields=[ - {'id': rspace_document_target_field_id, "content": new_content}]) + fields=[ + {'id': rspace_document_target_field_id, "content": new_content}]) await reload_notebook() save_rspace_data(rspace_doc, attachment_files, nb_gallery_file, new_execution_count, history_data) return 'success' From f7620b6a0ff50ac048551b6694bae56be642a8c8 Mon Sep 17 00:00:00 2001 From: nhanlon2 Date: Fri, 24 Oct 2025 09:28:31 +0100 Subject: [PATCH 28/34] RSDEV-782-Jupyter-Notebooks: added requirement files and build file changes --- jupyter_notebooks/provenance_jupyter_hub | 571 ------------------ jupyter_notebooks/provenance_jupyter_lite | 191 ------ pyproject.toml | 2 +- rspace_client/__init__.py | 2 + .../notebook_sync_requirements.py | 8 + .../notebook_sync_requirements_r.py | 16 + 6 files changed, 27 insertions(+), 763 deletions(-) delete mode 100644 jupyter_notebooks/provenance_jupyter_hub delete mode 100644 jupyter_notebooks/provenance_jupyter_lite create mode 100644 rspace_client/notebook_sync/notebook_sync_requirements.py create mode 100644 rspace_client/notebook_sync/notebook_sync_requirements_r.py diff --git a/jupyter_notebooks/provenance_jupyter_hub b/jupyter_notebooks/provenance_jupyter_hub deleted file mode 100644 index 21a5faf..0000000 --- a/jupyter_notebooks/provenance_jupyter_hub +++ /dev/null @@ -1,571 +0,0 @@ -import json -%pip install -q rspace-client==2.6.1 -%pip install -q pickleshare -try: - from notebook import app -except: - %pip install -q notebook -%pip install -q keyring -from rspace_client.eln import eln -import os -import hashlib -import json -%pip install -q dill -import dill -%pip install -q ipynbname -import ipynbname -%pip install -q ipylab -from ipylab import JupyterFrontEnd -import traceback -%pip install -q lxml -from bs4 import BeautifulSoup -import nbformat -import asyncio -import getpass -import keyring -from urllib.parse import urlparse - -RSPACE_DOC_FOR_NOTEBOOK = 'rspace_doc_for_notebook' -RSPACE_ATTACHMENTS_FOR_NOTEBOOK = 'data_attached_to_notebook' -RSPACE_GALLERY_FILE_FOR_NOTEBOOK = 'file_for_notebook' -RSPACE_EXECUTION_COUNT_FOR_NOTEBOOK = 'RSPACE_EXECUTION_COUNT_FOR_NOTEBOOK' -RSPACE_HISTORY_DATA = 'RSPACE_HISTORY_DATA' -RSPACE_DOC_URL = 'workspace/editor/structuredDocument/' -RSPACE_DOC_VERSION_URL_START = 'workspace/editor/structuredDocument/audit/view?globalId=' -# Your RSpace instance goes here -# rspace_url = "https://researchspace2.eu.ngrok.io/" -""" -Default behaviour creates a new RSpace document when this cell is executed and attached this jupyter notebook to the -new document. - -Setting rspace_prexisting_document_id to a value other than None will attach this jupyter notebook to the RSpace document -with the given ID instead of creating a new RSpace document. -""" -# rspace_prexisting_document_id = None -# rspace_prexisting_document_id = 155 -""" -Default behaviour writes links to this notebook into the 'first' field in a document (field '0'). Set this to a value -if a different field should be used. 
- -If this is set to a value other than None, rspace_prexisting_document_id must be set to a value other than None. -""" -# rspace_document_target_field_id = None -# rspace_document_target_field_id = 34 -""" - All data that will be saved to RSpace along with this notebook: select the data in the file browser and choose 'copy path' - then paste here using a ',' comma to separate files if there is more than one. - - Example: - attached_data_files = "spectroscopy_data.csv, data/spectroscopy_data2.csv, data/spectroscopy_data3.csv" - The code in this cell will calculate paths to the data relative to the location of this notebook. Therefore do - not change the 'paths' to the data, regardless of whether this notebook is in the top directory or in a sub directory. - - If you wish to have no attached data, set this value to be "" (a pair of double quotes) - - Example: - attached_data_files = "" -""" -# attached_data_files = "spectroscopy_data.csv,data/spectroscopy_data2.csv,data/spectroscopy_data3.csv" -""" -Set this to true to manually enter a new password -""" -# get_new_password = False -""" -This must be set to a the value of the PATH to the notebook (select the notebook in the file browser and choose 'copy path'), -if exceptions are thrown trying to determine the notebook name. - -If this value is set server_url MUST also be set. -""" -# notebook_name = None -""" -This must be set to a value if exceptions are thrown or the calculated value is incorrect when trying to determine the server url. Give the url -of the server including the port: eg http://localhost:10000 (no trailing '/') - -If this value is set, notebook_name MUST also be set. -""" -# server_url = None - -""" -Set this to a value if server_url is calculated correctly except for the port (which will happen, for example -if the port is being mapped inside a docker container to an external port) -""" -# server_port = 10000 - -rspace_client = None -app = JupyterFrontEnd() - -async def sync_notebook_to_rspace(rspace_url="",attached_data_files="",notebook_name=None, server_url=None, rspace_prexisting_document_id=None,rspace_document_target_field_id=None, server_port=None, get_new_password=False ): - """ - Saves notebook using ipylab and then writes notebook to Rspace document as - an attachment if the execution_count of the notebook has changed since the last time - this cell was run. Note that the execution count of this cell does not contribute to - the comparison - we will not write data to RSpace if only this cell has been run - since the last time data was written to RSpace. - Attached data is also written to RSpace if its hash_sum has changed. - - The notebook and attached data will always be written to RSpace at least once (on the first time this cell is run). - - Parameters: - - rspace_url : Your RSpace instance goes here - - attached_data_files : - All data that will be saved to RSpace along with this notebook: select the data in the file browser and choose 'copy path' - then paste here using a ',' comma to separate files if there is more than one. - - Example: - attached_data_files = "spectroscopy_data.csv, data/spectroscopy_data2.csv, data/spectroscopy_data3.csv" - The code in this cell will calculate paths to the data relative to the location of this notebook. Therefore do - not change the 'paths' to the data, regardless of whether this notebook is in the top directory or in a sub directory. 
- - If you wish to have no attached data, set this value to be "" (a pair of double quotes) - - Example: - attached_data_files = "" - - notebook_name: - This must be set to a the value of the PATH to the notebook (select the notebook in the file browser and choose 'copy path'), - if exceptions are thrown trying to determine the notebook name. - - If this value is set server_url MUST also be set. - server_url: - This must be set to a value if exceptions are thrown or the calculated value is incorrect when trying to determine the server url. Give the url - of the server including the port: eg http://localhost:10000 (no trailing '/') - - If this value is set, notebook_name MUST also be set. - - rspace_prexisting_document_id: - Default behaviour creates a new RSpace document when this cell is executed and attached this jupyter notebook to the new document. - - Setting rspace_prexisting_document_id to a value other than None will attach this jupyter notebook to the RSpace document - with the given ID instead of creating a new RSpace document. - - rspace_document_target_field_id: - Default behaviour writes links to this notebook into the 'first' field in a document (field '0'). Set this to a value - if a different field should be used. - - If this is set to a value other than None, rspace_prexisting_document_id must be set to a value other than None. - server_port: - Set this to a value if server_url is calculated correctly except for the port (which will happen, for example - if the port is being mapped inside a docker container to an external port) - - get_new_password: - Set this to true to manually enter a new password - - """ - def get_server_urls(): - all_urls = [] - if (server_url is not None): - all_urls.append(server_url + '/lab/tree/' + notebook_name) - else: - try: - for srv in ipynbname._list_maybe_running_servers(): - srv, path = ipynbname._find_nb_path() - if server_port is not None: - srv_url = srv['url'] - # print(f"srv_url: {srv_url}") - # print(f"root_dir: {srv['root_dir']}") - # print(f"path: {str(path)}") - part_url = srv_url[:srv_url.rfind(':') + 1] - # print(f"part_url: {part_url}") - all_urls.append(part_url + str(server_port) + '/lab/tree/' + str(path)) - else: - all_urls.append(srv['url'] + 'lab/tree/' + str(path)) - except Exception: - print(f"Error determining server urls, please manually set a value for 'server_url'") - raise # Code may fail if server has a password/doesnt use token auth - see ipynbname README - return all_urls - - - def get_server_roots(): - """ - this will only be called if ipyname library is working correctly - """ - all_roots = [] - try: - if len(all_roots) == 0: - for srv in ipynbname._list_maybe_running_servers(): - srv, path = ipynbname._find_nb_path() - root = srv['root_dir'] - all_roots.append(root) - except Exception: - print(f"Error determining server roots, please manually set a value for 'server_url'") - raise # Code may fail if server has a password/doesnt use token auth - see ipynbname README - return all_roots - - - def get_notebook_name(): - try: - if notebook_name is not None: - if '/' in notebook_name: - notebook_name_alone = notebook_name[notebook_name.rfind('/') + 1:] - else: - notebook_name_alone = notebook_name - return {'name': notebook_name_alone, 'root_name': notebook_name_alone[:notebook_name_alone.rfind('.')], - 'name_path': notebook_name} - nb_fname = ipynbname.name() - nb_path = str(ipynbname.path()) - for srv_root in get_server_roots(): - if not srv_root.endswith("/"): - srv_root = srv_root + "/" - if srv_root in 
nb_path: - nb_path = nb_path.replace(srv_root, '') - # print(f"nb_path: {nb_path}") - ext_pos = ('' + nb_path).rfind('.') - ext = nb_path[ext_pos:] - return {'name': nb_fname + ext, 'root_name': nb_fname, 'name_path': nb_path} - except Exception as e: - print(f"Error getting notebook name, please manually set a value for 'notebook_name'") - raise - - def get_password(): - """ - Retrieves password from (or saves a new password to) keyring - """ - from keyring import get_keyring - print("Keyring method: " + str(get_keyring())) - try: - - # TODO - Define the service name (e.g., the notebook name the secret is for) - service_id = "RSpaceJupyterDemoApp" - # TODO - Define the username associated with the secret - username = "myuser" # use your own username - - retrieved_password = keyring.get_password(service_id, username) - if retrieved_password is None or get_new_password: - retrieved_password = getpass.getpass("Please enter your RSpace Api key: ") - keyring.set_password(service_id, username, retrieved_password) - return retrieved_password - except Exception as e: - print(f"Error getting password: {e}") - return None - - - def get_rspace_client(): - """ - Returns rspace ELN API client - """ - try: - global rspace_client - if rspace_client is None: - retrieved_password = get_password() - if retrieved_password is None: - %pip install keyrings.alt - retrieved_password = get_password() - rspace_client = eln.ELNClient(rspace_url, retrieved_password) - print(rspace_client.get_status()) - return rspace_client - except Exception as e: - print(traceback.format_exc()) - print(f"Error connecting to RSpace: {e}") - return None - - - def save_rspace_data(rspace_doc, attachments, gallery_file, execution_count, history_data): - # Define the filename to save the state - state_filename = get_notebook_name()['root_name'] + "_state.pkl" - print(f"writing to file: {state_filename}") - with open(state_filename, 'wb') as f: - dill.dump({RSPACE_DOC_FOR_NOTEBOOK: rspace_doc, RSPACE_ATTACHMENTS_FOR_NOTEBOOK: attachments, - RSPACE_GALLERY_FILE_FOR_NOTEBOOK: gallery_file, RSPACE_EXECUTION_COUNT_FOR_NOTEBOOK: execution_count, RSPACE_HISTORY_DATA:history_data }, f) - - - def load_data(): - state_filename = get_notebook_name()['root_name'] + "_state.pkl" - - if os.path.exists(state_filename): - # Load the variables from the file using dill - with open(state_filename, 'rb') as f: - try: - loaded_state = dill.load(f) - except Exception as e: - loaded_state = {} - else: - loaded_state = {} - print(f"State file '{state_filename}' not found. 
No variables loaded.") - return loaded_state - - - async def save_notebook(): - app.commands.execute('docmanager:save') - # 'docmanager:save' does not appear to hook into any callback invoked when the document is actually saved - await asyncio.sleep(1) - - - async def reload_notebook(): - app.commands.execute('docmanager:reload') - # 'docmanager:reload' does not appear to hook into any callback invoked when the document is actually reloaded - await asyncio.sleep(1) - - def make_metadata_cell(nb_gallery_file, attachment_files, rspace_doc, history_data): - rspace_document_file_id =str(rspace_doc['id']) - # new content plus new attachment data increments the document version by two - rspace_document_version = 2 if rspace_doc['version'] == 1 else rspace_doc['version'] + 2 - rspace_document_name = rspace_doc['name'] - rspace_document_globalId = rspace_doc['globalId']+'v'+str(rspace_document_version) - nb_gallery_file_id = nb_gallery_file['id'] - nb_gallery_file_version = int(nb_gallery_file['version']) - # nb_gallery_file_version = 1 if nb_gallery_file_version == 1 else nb_gallery_file_version + 1 - nb_gallery_file_version = nb_gallery_file_version + 1 - nb_gallery_file_name = nb_gallery_file['name'] - meta_data_cell = nbformat.v4.new_markdown_cell() - rspace_doc_for_markdown = f'[The RSpace Document describing this notebook, version: {rspace_document_version}]({rspace_url}{RSPACE_DOC_URL}{rspace_document_file_id})' - gallery_doc_markdown = f'[This Notebook in RSpace Gallery: {nb_gallery_file_name} version: {nb_gallery_file_version}]({rspace_url}gallery/item/{nb_gallery_file_id})' - meta_data_cell['source'] = rspace_doc_for_markdown + "
" + gallery_doc_markdown - if len(attached_data_files) != 0: - attached_data_files_list = attached_data_files.split(",") - for attached_data in attached_data_files_list: - attachment_file_id = attachment_files.get(attached_data, {}).get('id') - attachment_version = attachment_files.get(attached_data, {}).get('version') - meta_data_cell['source'] += f'
[Attached Data {attached_data} version: {attachment_version} ]({rspace_url}gallery/item/{attachment_file_id})' - else: - meta_data_cell['source'] += f'
No Attached Data' - for url in get_server_urls(): - meta_data_cell['source'] += f'
[This notebook on the jupyter server]({url})' - new_history = f'
RSpace doc [{rspace_document_name} version {rspace_document_version}]({rspace_url}{RSPACE_DOC_VERSION_URL_START}{rspace_document_globalId}) contains this Notebook, version {nb_gallery_file_version}, executed with: ' - if len(attached_data_files) != 0: - for attached_data in attached_data_files.split(","): - attachment_file_id = attachment_files.get(attached_data, {}).get('id') - attachment_version = attachment_files.get(attached_data, {}).get('version') - new_history += f'Data {attached_data} version: {attachment_version} ' - # print(f'RSPACE_HISTORY_DATA: {RSPACE_HISTORY_DATA}') - history_data['text'] = new_history + history_data['text'] - meta_data_cell['source'] += f"
{history_data['text']}" - meta_data_cell['metadata'] = { - "rspace_metadata": {"documentFor": "docid", "notebook_file": "docid", "attachments": [""]}} - return meta_data_cell - - async def add_rspace_details_to_notebook_metadata(fname, notebook, nb_gallery_file, attachment_files, - rspace_doc, history_data): - """ - We have to save meta data about a notebook before its been uploaded to the gallery. - Therefore increment version by 1 when writing the metadata. If nb_gallery_file[id] is None - its the initial upload to the Gallery and so do not write any meta data - """ - if nb_gallery_file.get('id') is None: - return - await save_notebook() - meta_data_cell = make_metadata_cell(nb_gallery_file, attachment_files, rspace_doc,history_data) - replaced = False - for i, cell in enumerate(notebook['cells']): - if 'rspace_metadata' in cell['metadata']: - notebook["cells"][i] = meta_data_cell - replaced = True - if replaced is False: - notebook["cells"].extend([meta_data_cell]) - with open(fname, 'w', encoding='utf-8') as modified: - nbformat.write(notebook, modified) - - - def get_notebook_execution_count(notebook): - """ - return the sum of all execution counts for code cells - note that this code cell does not contribute to the count: - it is always saved before its execution_count gets updated - and so the value of execution_count for this cell is always 'None' - """ - new_executed_count = 0 - for i, cell in enumerate(notebook['cells']): - if cell['cell_type'] == 'code': - # print(f"cell id: {cell['id']} and execution_count {cell['execution_count']} ") - cell_count = cell['execution_count'] - if cell_count is None: - cell_count = 0 - new_executed_count += cell_count - # print(f"new executed count {new_executed_count}") - return new_executed_count - - - def make_content(nb_gallery_file_id, attachment_files): - content = f""" - - """ - if len(attached_data_files) != 0: - for attachment_file in attached_data_files.split(","): - content += f""" - - """ - # print(f"content is {content}") - return content - - - def remove_jupyter_attachment_divs(content, nb_gallery_file_id, attachment_files): - """ - Iterate all attachments in the document field and if any have ids matching the stored ids for this notebook - then remove them - """ - soup = BeautifulSoup(content, 'html.parser') - attachment_divs = soup.find_all("div", {"class": "attachmentDiv"}) - for attachment_div in attachment_divs: - href_tag = attachment_div.find('a') - # print(f"href_tag{href_tag}") - gallery_link = '/Streamfile/' + str(nb_gallery_file_id) - if gallery_link == href_tag['href']: - attachment_div.decompose() - continue - if len(attached_data_files) != 0: - for attachment_file in attached_data_files.split(","): - attachment_file_id = attachment_files.get(attachment_file, {}).get('id') - attachment_link = '/Streamfile/' + str(attachment_file_id) - if attachment_link == href_tag['href']: - attachment_div.decompose() - break - return soup.prettify() - - - def upload_file_to_gallery(rspaceid, file, client): - if rspaceid is None: - print(f'start upload file {file} using {client}') - data = client.upload_file(file) - else: - print('start update file') - data = client.update_file(file, rspaceid) - return data - - - def calc_hash(filename): - sha256_hash = hashlib.sha256() - with open(filename, "rb") as f: - # Read and update hash string value in blocks of 4K - for byte_block in iter(lambda: f.read(4096), b""): - sha256_hash.update(byte_block) - return sha256_hash.hexdigest() - - - def upload_attached_data(attachment_files): - client = 
get_rspace_client() - if len(attached_data_files) != 0: - attached_data_files_list = attached_data_files.split(",") - for attached_data in attached_data_files_list: - if attached_data: - # make file paths to data relative to the location of this notebook - nested_dir_pos = get_notebook_name()['name_path'].count('/') - relative_attached_data = attached_data - for i in range(nested_dir_pos): - relative_attached_data = "../" + relative_attached_data - # print(f"relative_attached_data: {relative_attached_data}") - with open(relative_attached_data, 'r', encoding='utf-8') as attch: - attachment_file_id = attachment_files.get(attached_data, {}).get('id') - attachment_file_hash = attachment_files.get(attached_data, {}).get('hash') - calc_latest_hash = calc_hash(relative_attached_data) - if calc_latest_hash != attachment_file_hash: - attachment_file_data = upload_file_to_gallery(attachment_file_id, attch, client) - attachment_file_data['hash'] = calc_latest_hash - attachment_files[attached_data] = attachment_file_data - else: - print(f"File {attached_data} not changed so no update") - # print(f"attached files: {attachment_files}") - - - async def upload_notebook_to_gallery(current_notebook, notebook, nb_gallery_file, attachment_files, - rspace_doc, history_data): - """ - Metadata about the notebook is written to the notebook before it us uploaded to the Gallery (and the version incremented predictively by one). - If the notebook has never been uploaded to the Gallery we have no stored rspace-id to write to the meta data and so we do not write any meta data. - We do the initial upload (which creates a Gallery file with version = '1'. Then we write meta data (incrementing the version to '2' and upload the notebook - a second time. - """ - await add_rspace_details_to_notebook_metadata(current_notebook, notebook, nb_gallery_file, attachment_files, - rspace_doc, history_data) - with open(current_notebook, 'r', encoding='utf-8') as nb_file: - client = get_rspace_client() - nb_gallery_file = upload_file_to_gallery(nb_gallery_file.get('id'), nb_file, client) - # print(f"gallery file for nb: {nb_gallery_file}") - if nb_gallery_file.get('version') == 1: - await asyncio.sleep(1) - await add_rspace_details_to_notebook_metadata(current_notebook, notebook, nb_gallery_file, attachment_files, - rspace_doc, history_data) - with open(current_notebook, 'r', encoding='utf-8') as nb_file: - nb_gallery_file = upload_file_to_gallery(nb_gallery_file.get('id'), nb_file, client) - await asyncio.sleep(1) - # print(f"nb_gallery_file was uploaded a second time and was : {nb_gallery_file}") - await reload_notebook() - return nb_gallery_file - - def get_field_content(rspace_doc): - if rspace_document_target_field_id is None: - return rspace_doc['fields'][0]['content'] - else: - for field in rspace_doc['fields']: - if field['id'] == rspace_document_target_field_id: - return field['content'] - return None - - def assert_invariants(): - if (len(rspace_url)==0): - raise Exception("You must provide a URL for your RSpace instance.") - - parsed_url = urlparse(rspace_url) - if not (parsed_url.scheme and parsed_url.netloc): - raise Exception("Your value for RSpace url is not a valid url.") - - if rspace_document_target_field_id is not None and rspace_prexisting_document_id is None: - raise Exception("If rspace_document_target_field_id has a value rspace_prexisting_document_id must also.") - - if server_url is not None and notebook_name is None or notebook_name is not None and server_url is None: - raise Exception("Both server_url and 
notebook_name must be either None or have a value") - - assert_invariants() - await save_notebook() - get_server_urls() - # print(f"notebook name: {get_notebook_name()}") - current_notebook = get_notebook_name()['name'] - with open(current_notebook, 'r') as notebook: - notebook_node = nbformat.read(notebook, nbformat.NO_CONVERT) - new_hash = calc_hash(current_notebook) - # print(f" new hash {new_hash}") - try: - loaded_state = load_data() - execution_count = loaded_state.get(RSPACE_EXECUTION_COUNT_FOR_NOTEBOOK) - new_execution_count = get_notebook_execution_count(notebook_node) - - print(f"New execution count {new_execution_count}") - print(f"Previous execution count {execution_count}") - # FIXME - # if execution_count == new_execution_count: - # print("No execution since last sync: no data updated in RSpace") - # await save_notebook() - # return - client = get_rspace_client() - rspace_doc = loaded_state.get(RSPACE_DOC_FOR_NOTEBOOK) - attachment_files = loaded_state.get(RSPACE_ATTACHMENTS_FOR_NOTEBOOK, {}) - nb_gallery_file = loaded_state.get(RSPACE_GALLERY_FILE_FOR_NOTEBOOK, {}) - nb_gallery_file_id = nb_gallery_file.get('id') - history_data = loaded_state.get(RSPACE_HISTORY_DATA,{'text':''}) - current_notebook = get_notebook_name()['name'] - attachments = None - if rspace_doc is not None: - print( - f"An RSpace document with this notebook as an attachment saved previously with RSpaceID {str(rspace_doc['id'])}") - else: - print("No RSpace document with this notebook as an attachment saved previously in RSpace") - upload_attached_data(attachment_files) - if nb_gallery_file_id is not None: - print(f"This notebook saved previously to Gallery with RSpaceID {nb_gallery_file_id}") - else: - print("Notebook not previously saved to RSpace Gallery") - if rspace_doc is None and rspace_prexisting_document_id is None: - rspace_doc = client.create_document(name="DocumentFor_" + current_notebook, tags=["Python", "API", "Jupyter"]) - rspace_document_file_id = str(rspace_doc['id']) if rspace_prexisting_document_id is None else rspace_prexisting_document_id - rspace_doc = client.get_document(rspace_document_file_id) - nb_gallery_file = await upload_notebook_to_gallery(current_notebook,notebook_node, nb_gallery_file,attachment_files, rspace_doc, history_data) - # print(f"nb_gallery_file was finally: {nb_gallery_file}") - nb_gallery_file_id = nb_gallery_file.get('id') - - previous_content = get_field_content(rspace_doc) - previous_content = remove_jupyter_attachment_divs(previous_content, nb_gallery_file_id, attachment_files) - new_content = previous_content + make_content(nb_gallery_file_id, attachment_files) - if rspace_document_target_field_id is not None: - rspace_doc = client.update_document(rspace_document_file_id, tags=['Python', 'API', 'Jupyter'], - fields=[{'id': rspace_document_target_field_id,"content": new_content}]) - else: - rspace_doc = client.update_document(rspace_document_file_id, tags=['Python', 'API', 'Jupyter'], - fields=[{"content": new_content}]) - save_rspace_data(rspace_doc, attachment_files, nb_gallery_file, new_execution_count, history_data) - except Exception as e: - print(traceback.format_exc()) - print(f"Error reading notebook file: {e}") - return None - - -# await sync_notebook_to_rspace(rspace_url="https://researchspace2.eu.ngrok.io/", attached_data_files="spectroscopy_data.csv,data/spectroscopy_data2.csv,data/spectroscopy_data3.csv") diff --git a/jupyter_notebooks/provenance_jupyter_lite b/jupyter_notebooks/provenance_jupyter_lite deleted file mode 100644 index 
dda01d6..0000000 --- a/jupyter_notebooks/provenance_jupyter_lite +++ /dev/null @@ -1,191 +0,0 @@ -import json -%pip install -q rspace-client==2.6.1 -%pip install -q pickleshare -from rspace_client.eln import eln -import os -import hashlib -import json -%pip install -q dill -import dill - -rspace_client = None - -def get_rspace_client(): - """ - Returns rspace ELN API client - """ - try: - import getpass - retrieved_password = getpass.getpass("Please enter your RSpace Api key: ") - URL='https://researchspace2.eu.ngrok.io/' - global rspace_client - if rspace_client is None: - rspace_client = eln.ELNClient(URL, retrieved_password) - print(rspace_client.get_status()) - return rspace_client - except Exception as e: - print(f"Error connecting to RSpace: {e}") - return None - -def save_data(rspace_doc, attachments, gallery_file): - # Define the filename to save the state - state_filename = "notebook_state.pkl" - - # Save the variables to the file using dill - with open(state_filename, 'wb') as f: - dill.dump({'rspace_doc_for_notebook': rspace_doc, 'attachments_for_notebook': attachments, 'gallery_file_for_notebook': gallery_file}, f) - print(f"Variables saved to {state_filename}") - -def load_data(): - # Define the filename where the state was saved - state_filename = "notebook_state.pkl" - - # Check if the state file exists before attempting to load - if os.path.exists(state_filename): - # Load the variables from the file using dill - with open(state_filename, 'rb') as f: - try: - loaded_state = dill.load(f) - except Exception as e: - loaded_state = {} - rspace_doc = loaded_state.get('rspace_doc_for_notebook') - attachments = loaded_state.get('attachments_for_notebook') - gallery_file = loaded_state.get('gallery_file_for_notebook') - - print(f"Variables loaded from {state_filename}") - print(f"rspace_doc: {rspace_doc}") - print(f"attachments: {attachments}") - print(f"gallery_file: {gallery_file}") - - else: - print(f"State file '{state_filename}' not found. 
No variables loaded.") - -def get_notebook_as_dict(): - """ - Saves notebook using ipylab and then writes notebook to Rspace document as - an attachment - """ - # %pip install -q ipylab - # from ipylab import JupyterFrontEnd - # from ipywidgets import Output - # app = JupyterFrontEnd() - # app.commands.execute('docmanager:save') - # %history - # print(%dirs) - # print(app) - # print(app.sessions) - # print(app.sessions.running()) - # print(app.sessions.current_session) - # print(globals()['__session__']) - # print(os.environ) - # %pip install -q ipynbname - # import ipynbname - # nb_fname = ipynbname.name() - # nb_path = ipynbname.path() - # print(f"{nb_fname=}") - # print(f"{nb_path=}") - try: - import glob - load_data() - notebook_files = glob.glob("*.ipynb") - if notebook_files: - raw_notebook_file_id = 477 - # gallery_file_id = None - attachment_file_id = 476 - # raw_notebook_file_id = 444 - gallery_file_id = 443 - - save_data(raw_notebook_file_id,attachment_file_id,gallery_file_id) - # FIXME - Uses the most recently modified notebook which might not be this notebook - # latest_notebook = max(notebook_files, key=os.path.getmtime) - latest_notebook = 'RSpaceDemoCopyData.ipynb' - attachedData = "spectroscopy_data.csv" - attachments = None - updateDocAttachments = False - if raw_notebook_file_id: - print(f"A document with attachement to this notebook saved previously with RSpaceID {raw_notebook_file_id}" ) - else: - print("NO document with attachement to this notebook saved previously in RSpace") - if gallery_file_id: - print(f"This notebook saved previously to Gallery with RSpaceID {gallery_file_id}" ) - else: - print("Notebook not previously saved to RSpace Gallery") - # with open(latest_notebook) as f: - # d = json.load(f) - # print(d) - with open(attachedData, 'r', encoding='utf-8') as attch: - client = get_rspace_client() - if attachment_file_id is None: - print('start upload attachments') - attachment_file = client.upload_file(attch)['id'] - print(f"Attachment file ID is: {attachment_file}") - print('done upload attachments') - else: - print('start update attachments') - attachment_file = client.update_file(attch,attachment_file_id)['id'] - print(f"Attachment file ID is: {attachment_file}") - print('done update attachments') - attachment_file_id = attachment_file - with open(latest_notebook, 'r', encoding='utf-8') as f: - client = get_rspace_client() - if gallery_file_id is None: - print('start upload to gallery') - gallery_file_id = client.upload_file(f)['id'] - print(f"Gallery file ID is: {gallery_file_id}") - print('done upload to gallery') - else: - print('start update to gallery') - gallery_file_id = client.update_file(f,gallery_file_id)['id'] - print('end update to gallery') - print(f"Gallery file ID is: {gallery_file_id}") - location = os.getcwd() - if raw_notebook_file_id is None: - new_doc = client.create_document(name="DocumentFor_"+latest_notebook) - content = f""" -

A link to jupyter notebook inserted into gallery. - Notebook located at :{location} on server - data: -

-

A link to data used by this notebook. - data: -

- """ - - updated_doc = client.append_content(new_doc['id'], content) - print(f"Document with this notebook as attachement has ID: {new_doc['id']}") - elif updateDocAttachments: - updated_doc = client.append_content(raw_notebook_file_id, newContent) - print(f"Updated document with this notebook as attachement has ID: {new_doc['id']}") - # with open(latest_notebook, 'r', encoding='utf-8') as f: - # notebook_dict = json.load(f) - # docName = f.name - # client = get_rspace_client() - # if(raw_notebook_file_id): - # print('start doc update') - # raw_data_file = client.update_document( - # raw_notebook_file_id, - # name = docName, - # tags = ["Python", "API", "example"], - # fields = [{"content": json.dumps(notebook_dict)}], - # ) - # print(f"Updated notebook: {latest_notebook}") - # else: - # raw_data_file = client.create_document( - # name = docName, - # tags = ["Python", "API", "example"], - # fields = [{"content": json.dumps(notebook_dict)}], - # ) - # print(json.dumps(notebook_dict)) - # print(f"Created notebook: {latest_notebook}") - # raw_notebook_file_id = raw_data_file['id'] - # print(raw_notebook_file_id) - else: - print("No .ipynb files found in current directory") - return None - - except Exception as e: - print(f"Error reading notebook file: {e}") - return None - -notebook_dict = get_notebook_as_dict() - diff --git a/pyproject.toml b/pyproject.toml index dbb443e..2cb92ad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "rspace-client" -version = "2.6.1" +version = "2.6.2" description = "A client for calling RSpace ELN and Inventory APIs" license = "Apache-2.0" authors = ["Research Innovations Ltd "] diff --git a/rspace_client/__init__.py b/rspace_client/__init__.py index d553760..e0a0857 100644 --- a/rspace_client/__init__.py +++ b/rspace_client/__init__.py @@ -6,6 +6,7 @@ from .eln.advanced_query_builder import AdvancedQueryBuilder from .utils import createELNClient from .eln.field_content import FieldContent +from .notebook_sync import sync_notebook __all__ = [ "ELNClient", @@ -13,4 +14,5 @@ "AdvancedQueryBuilder", "createELNClient", "FieldContent", + "sync_notebook.py" ] diff --git a/rspace_client/notebook_sync/notebook_sync_requirements.py b/rspace_client/notebook_sync/notebook_sync_requirements.py new file mode 100644 index 0000000..897e5f8 --- /dev/null +++ b/rspace_client/notebook_sync/notebook_sync_requirements.py @@ -0,0 +1,8 @@ +get_ipython().run_line_magic('pip', 'install -q pickleshare') +get_ipython().run_line_magic('pip', 'install -q notebook') +get_ipython().run_line_magic('pip', 'install -q keyring') +get_ipython().run_line_magic('pip', 'install -q keyrings.alt') +get_ipython().run_line_magic('pip', 'install -q dill') +get_ipython().run_line_magic('pip', 'install -q ipynbname') +get_ipython().run_line_magic('pip', 'install -q ipylab') +get_ipython().run_line_magic('pip', 'install -q lxml') \ No newline at end of file diff --git a/rspace_client/notebook_sync/notebook_sync_requirements_r.py b/rspace_client/notebook_sync/notebook_sync_requirements_r.py new file mode 100644 index 0000000..de07cac --- /dev/null +++ b/rspace_client/notebook_sync/notebook_sync_requirements_r.py @@ -0,0 +1,16 @@ +install.packages('reticulate') +library('reticulate') +library("future") +plan(multisession) +py_require(c('pickleshare')) +py_require(c('notebook')) +py_require(c('keyring')) +py_require(c('dill')) +py_require(c('ipynbname')) +py_require(c('ipylab')) +py_require(c('lxml')) +py_require(c('keyrings.alt')) +py_require(c('bs4')) 
+py_require(c('nbformat')) +py_require(c('keyring')) +asyncio <- import("asyncio") \ No newline at end of file From d190fe41a81c1b1da7d6e52d330d00d7c62f1e30 Mon Sep 17 00:00:00 2001 From: nhanlon2 Date: Fri, 24 Oct 2025 12:40:13 +0100 Subject: [PATCH 29/34] RSDEV-782-Jupyter-Notebooks: test for kernal contains python --- rspace_client/notebook_sync/sync_notebook.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rspace_client/notebook_sync/sync_notebook.py b/rspace_client/notebook_sync/sync_notebook.py index 73fe736..72d2f8e 100644 --- a/rspace_client/notebook_sync/sync_notebook.py +++ b/rspace_client/notebook_sync/sync_notebook.py @@ -442,7 +442,8 @@ def assert_invariants(): def notebook_can_be_saved(current_notebook): with open(current_notebook, 'r') as notebook: notebook_node = nbformat.read(notebook, nbformat.NO_CONVERT) - if notebook_node.metadata.kernelspec.display_name.lower() == 'python': + kernel_type = notebook_node.metadata.kernelspec.display_name.lower() + if 'python' in kernel_type: return True return False From f6f29d8d388a7089c9937702da61967ba532183d Mon Sep 17 00:00:00 2001 From: nhanlon2 Date: Sun, 26 Oct 2025 11:17:59 +0000 Subject: [PATCH 30/34] RSDEV-782-Jupyter-Notebooks: only save or reload if code cell runs in same notebook as being synced and is python --- pyproject.toml | 8 +++ rspace_client/__init__.py | 3 +- .../notebook_sync_requirements.py | 8 --- .../notebook_sync_requirements_r.py | 16 ----- rspace_client/notebook_sync/sync_notebook.py | 64 +++++++++++++++---- 5 files changed, 61 insertions(+), 38 deletions(-) delete mode 100644 rspace_client/notebook_sync/notebook_sync_requirements.py delete mode 100644 rspace_client/notebook_sync/notebook_sync_requirements_r.py diff --git a/pyproject.toml b/pyproject.toml index 2cb92ad..e701543 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,14 @@ python = "^3.9" requests = "^2.25.1" beautifulsoup4 = "^4.9.3" fs = "^2.4.16" +#pickleshare = "^0.7.5" +#notebook = "^7.3.2" +#keyring = "^25.6.0" +#'keyrings.alt' = "5.0.2" +#dill = "^0.3.9" +#ipynbname = "^2025.8.0.0" +#ipylab = "^1.1.0" +#lxml = "^6.0.2" [tool.poetry.group.dev.dependencies] python-dotenv = "^1.1.1" diff --git a/rspace_client/__init__.py b/rspace_client/__init__.py index e0a0857..89418c9 100644 --- a/rspace_client/__init__.py +++ b/rspace_client/__init__.py @@ -6,7 +6,6 @@ from .eln.advanced_query_builder import AdvancedQueryBuilder from .utils import createELNClient from .eln.field_content import FieldContent -from .notebook_sync import sync_notebook __all__ = [ "ELNClient", @@ -14,5 +13,5 @@ "AdvancedQueryBuilder", "createELNClient", "FieldContent", - "sync_notebook.py" + "notebook_sync" ] diff --git a/rspace_client/notebook_sync/notebook_sync_requirements.py b/rspace_client/notebook_sync/notebook_sync_requirements.py deleted file mode 100644 index 897e5f8..0000000 --- a/rspace_client/notebook_sync/notebook_sync_requirements.py +++ /dev/null @@ -1,8 +0,0 @@ -get_ipython().run_line_magic('pip', 'install -q pickleshare') -get_ipython().run_line_magic('pip', 'install -q notebook') -get_ipython().run_line_magic('pip', 'install -q keyring') -get_ipython().run_line_magic('pip', 'install -q keyrings.alt') -get_ipython().run_line_magic('pip', 'install -q dill') -get_ipython().run_line_magic('pip', 'install -q ipynbname') -get_ipython().run_line_magic('pip', 'install -q ipylab') -get_ipython().run_line_magic('pip', 'install -q lxml') \ No newline at end of file diff --git a/rspace_client/notebook_sync/notebook_sync_requirements_r.py 
b/rspace_client/notebook_sync/notebook_sync_requirements_r.py deleted file mode 100644 index de07cac..0000000 --- a/rspace_client/notebook_sync/notebook_sync_requirements_r.py +++ /dev/null @@ -1,16 +0,0 @@ -install.packages('reticulate') -library('reticulate') -library("future") -plan(multisession) -py_require(c('pickleshare')) -py_require(c('notebook')) -py_require(c('keyring')) -py_require(c('dill')) -py_require(c('ipynbname')) -py_require(c('ipylab')) -py_require(c('lxml')) -py_require(c('keyrings.alt')) -py_require(c('bs4')) -py_require(c('nbformat')) -py_require(c('keyring')) -asyncio <- import("asyncio") \ No newline at end of file diff --git a/rspace_client/notebook_sync/sync_notebook.py b/rspace_client/notebook_sync/sync_notebook.py index 72d2f8e..10df414 100644 --- a/rspace_client/notebook_sync/sync_notebook.py +++ b/rspace_client/notebook_sync/sync_notebook.py @@ -1,3 +1,11 @@ +get_ipython().run_line_magic('pip', 'install -q pickleshare') +get_ipython().run_line_magic('pip', 'install -q notebook') +get_ipython().run_line_magic('pip', 'install -q keyring') +get_ipython().run_line_magic('pip', 'install -q keyrings.alt') +get_ipython().run_line_magic('pip', 'install -q dill') +get_ipython().run_line_magic('pip', 'install -q ipynbname') +get_ipython().run_line_magic('pip', 'install -q ipylab') +get_ipython().run_line_magic('pip', 'install -q lxml') from notebook import app from rspace_client.eln import eln import os @@ -136,14 +144,21 @@ def get_server_roots(): return all_roots def get_notebook_name(): + if notebook_name is not None: + if '/' in notebook_name: + notebook_name_alone = notebook_name[notebook_name.rfind('/') + 1:] + else: + notebook_name_alone = notebook_name + return {'name': notebook_name_alone, 'root_name': notebook_name_alone[:notebook_name_alone.rfind('.')], + 'name_path': notebook_name} + else: + return calculate_notebook_name() + + def calculate_notebook_name(): + """ + This code only works for python notebooks and a browser refresh is required after first install + """ try: - if notebook_name is not None: - if '/' in notebook_name: - notebook_name_alone = notebook_name[notebook_name.rfind('/') + 1:] - else: - notebook_name_alone = notebook_name - return {'name': notebook_name_alone, 'root_name': notebook_name_alone[:notebook_name_alone.rfind('.')], - 'name_path': notebook_name} nb_fname = ipynbname.name() nb_path = str(ipynbname.path()) for srv_root in get_server_roots(): @@ -167,7 +182,7 @@ def get_password(): retrieved_password = keyring.get_password(service_id, rspace_username) if retrieved_password is None: retrieved_password = getpass.getpass("Please enter your RSpace Api key: ") - keyring.set_password(service_id, username, retrieved_password) + keyring.set_password(service_id, rspace_username, retrieved_password) return retrieved_password def get_rspace_client(): @@ -242,7 +257,8 @@ async def save_notebook(): curr_mod_time = os.path.getmtime(file_path) elapsed_time = time.time() - start_watch_time if elapsed_time > 30: - msg = "TIMEOUT ON SAVE ***** DID YOU MEAN TO SAVE NOTEBOOK: " + file_path + " ?" + # ******* save will time out if user does not refresh browser tab running jupyterlab after they first install the ipylab dependency! ******* + msg = "TIMEOUT ON SAVE ***** DID YOU MEAN TO SAVE NOTEBOOK: " + file_path + " ? If this is your first installation of the code, refresh the browser tab." 
raise Exception(msg) async def reload_notebook(): @@ -439,7 +455,17 @@ def assert_invariants(): if server_url is not None and notebook_name is None or notebook_name is not None and server_url is None: raise Exception("Both server_url and notebook_name must be either None or have a value") - def notebook_can_be_saved(current_notebook): + if notebook_name is not None and '.' not in notebook_name: + raise Exception( + "This is not a valid notebook name - it should have a suffix preceeded by a dot: '.' For example 'notebook.ipynb' is a valid name, 'notebook' with no suffix is invalid.") + + def notebook_should_be_saved(current_notebook): + return this_nb_pythonic_and_target_notebook_runs_this_code(current_notebook) + + def notebook_should_be_reloaded(current_notebook): + return this_nb_pythonic_and_target_notebook_runs_this_code(current_notebook) + + def notebook_is_python_based(current_notebook): with open(current_notebook, 'r') as notebook: notebook_node = nbformat.read(notebook, nbformat.NO_CONVERT) kernel_type = notebook_node.metadata.kernelspec.display_name.lower() @@ -447,11 +473,24 @@ def notebook_can_be_saved(current_notebook): return True return False + def this_nb_pythonic_and_target_notebook_runs_this_code(current_notebook): + """ + True if this code is run in a cell of a python notebook being synced with RSpace and False if + this code is running in the cell of one notebook but syncing another notebook OR if this notebook is + not a python notebook + """ + try: + calculated_notebook_name = calculate_notebook_name() + except: + calculated_notebook_name is None + + return notebook_is_python_based(current_notebook) and get_notebook_name() == calculated_notebook_name + assert_invariants() current_notebook = get_notebook_name()['name'] # do not remove this print statement as it is required to ensure notebook is always in modified state when we call save_notebook print(f'Running sync on notebook:{current_notebook}') - if notebook_can_be_saved(current_notebook): + if notebook_should_be_saved(current_notebook): await save_notebook() get_server_urls() with open(current_notebook, 'r') as notebook: @@ -489,7 +528,8 @@ def notebook_can_be_saved(current_notebook): rspace_doc = client.update_document(rspace_document_file_id, tags=['Python', 'API', 'Jupyter'], fields=[ {'id': rspace_document_target_field_id, "content": new_content}]) - await reload_notebook() + if notebook_should_be_reloaded(current_notebook): + await reload_notebook() save_rspace_data(rspace_doc, attachment_files, nb_gallery_file, new_execution_count, history_data) return 'success' except Exception as e: From 755eb9cc6196a453192d1701638aad2e7d3a7167 Mon Sep 17 00:00:00 2001 From: nhanlon2 Date: Sun, 26 Oct 2025 11:43:07 +0000 Subject: [PATCH 31/34] RSDEV-782-Jupyter-Notebooks: clean up .toml --- pyproject.toml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e701543..2cb92ad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,14 +25,6 @@ python = "^3.9" requests = "^2.25.1" beautifulsoup4 = "^4.9.3" fs = "^2.4.16" -#pickleshare = "^0.7.5" -#notebook = "^7.3.2" -#keyring = "^25.6.0" -#'keyrings.alt' = "5.0.2" -#dill = "^0.3.9" -#ipynbname = "^2025.8.0.0" -#ipylab = "^1.1.0" -#lxml = "^6.0.2" [tool.poetry.group.dev.dependencies] python-dotenv = "^1.1.1" From a31a7ee3bea61b65414450aaae7c56a51303a42b Mon Sep 17 00:00:00 2001 From: nhanlon2 Date: Mon, 27 Oct 2025 11:48:03 +0000 Subject: [PATCH 32/34] RSDEV-782-Jupyter-Notebooks: fixed bug with save when notebook in sub directory --- 
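Note: the two preceding patches gate 'docmanager:save' and 'docmanager:reload' on the synced notebook using a Python kernel and on the sync cell actually running inside the notebook being synced. A minimal sketch of that gating is shown below, assuming nbformat is installed; should_sync and its two helper parameters are illustrative names standing in for the library's own functions, not part of its API.

import nbformat

def notebook_is_python_based(notebook_path):
    # Read the notebook without version conversion and inspect its kernelspec
    # display name, mirroring the check added in the patch above.
    with open(notebook_path, 'r') as fh:
        node = nbformat.read(fh, nbformat.NO_CONVERT)
    # Substring match covers 'Python 3', 'Python 3 (ipykernel)', and similar names.
    return 'python' in node.metadata.kernelspec.display_name.lower()

def should_sync(notebook_path, get_notebook_name, calculate_notebook_name):
    # Save/reload only when this cell runs inside the notebook being synced
    # and that notebook is Python based; otherwise skip the docmanager calls.
    try:
        calculated = calculate_notebook_name()
    except Exception:
        calculated = None
    return notebook_is_python_based(notebook_path) and get_notebook_name() == calculated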
.../rspace-demo-kaggle-v11.ipynb | 252 +----------------- rspace_client/notebook_sync/sync_notebook.py | 8 +- 2 files changed, 7 insertions(+), 253 deletions(-) diff --git a/jupyter_notebooks/rspace-demo-kaggle-v11.ipynb b/jupyter_notebooks/rspace-demo-kaggle-v11.ipynb index 887835f..3b6687c 100644 --- a/jupyter_notebooks/rspace-demo-kaggle-v11.ipynb +++ b/jupyter_notebooks/rspace-demo-kaggle-v11.ipynb @@ -1,251 +1 @@ -{ - "metadata": { - "kernelspec": { - "language": "python", - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.7.12", - "mimetype": "text/x-python", - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "pygments_lexer": "ipython3", - "nbconvert_exporter": "python", - "file_extension": ".py" - } - }, - "nbformat_minor": 4, - "nbformat": 4, - "cells": [ - { - "cell_type": "raw", - "source": [ - "### Working with RSpace data.\n", - "\n", - "This notebook illustrates a workflow to get data from RSpace, analyse it, and send back results to RSpace.\n", - "To work with this tutorial, you'll need an account on RSpace, an RSpace API key and Python 3.6 or later.\n", - "\n", - "This project requires modules `rspace_client` (Version 2) , `pandas` and `matplotlib`.\n", - "\n", - "To install rspace client `pip install rspace-client==2.0.1`\n", - "\n", - "The top-level README.md has more information on getting set up. \n", - "\n", - "The notebook is split into 3 sections:\n", - "\n", - "1. Adding some data to RSpace to analyse. In reality, this might be done manually by a wet-lab scientist or be delivered programmatically by an instrument. \n", - "2. Getting the datasets to analyse\n", - "3. Sending back the analysis linked to an experimental record" - ], - "metadata": { - "collapsed": false - } - }, - { - "cell_type": "markdown", - "source": "#### Setup Step 1 - configuring the RSpace Client. `rspace_client` is available from pip.\n\nIt's good practice to store API keys as environment variables rather than hard-coding it.", - "metadata": {} - }, - { - "cell_type": "code", - "source": "!pip install rspace-client==2.0.1\nprint(\"Kernel running OK\")", - "metadata": { - "execution": { - "iopub.status.busy": "2021-12-02T09:34:56.205067Z", - "iopub.execute_input": "2021-12-02T09:34:56.20544Z", - "iopub.status.idle": "2021-12-02T09:35:05.672998Z", - "shell.execute_reply.started": "2021-12-02T09:34:56.205406Z", - "shell.execute_reply": "2021-12-02T09:35:05.671894Z" - }, - "trusted": true - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": "## get your API key and set your RSpace URL. 
Change this code as needed for your own environment\nfrom kaggle_secrets import UserSecretsClient\napi_key_label = \"mp_demos_key\"\nAPI_KEY = UserSecretsClient().get_secret(api_key_label)\nprint (f\"Retrieved API key {API_KEY[0:4]}...\")\nURL=\"https://demos.researchspace.com\"", - "metadata": {}, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": "from rspace_client.eln import eln\nimport os\n\napi = eln.ELNClient(URL, API_KEY)\n\n## sanity check that that the client is configured correctly\nprint(api.get_status())", - "metadata": { - "execution": { - "iopub.status.busy": "2021-12-02T09:11:50.086056Z", - "iopub.execute_input": "2021-12-02T09:11:50.086431Z", - "iopub.status.idle": "2021-12-02T09:11:51.361265Z", - "shell.execute_reply.started": "2021-12-02T09:11:50.086396Z", - "shell.execute_reply": "2021-12-02T09:11:51.360371Z" - }, - "trusted": true - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": "#### Setup Step 2 - adding some test data.\n\nHere we'll add a CSV file to RSpace, containing some synthetic weather-related data.", - "metadata": {} - }, - { - "cell_type": "code", - "source": "import os\ndata_input_dir='/kaggle/input/rspacedemofiles'\ntemp_data_path=os.path.join(data_input_dir, 'temp_data.csv')\n\nwith open (temp_data_path) as f:\n raw_data_file = api.upload_file(f)['id']\nraw_data_file_id= raw_data_file\nprint(f\"Temperature data uploaded to RSpace with ID {raw_data_file_id}\")", - "metadata": { - "execution": { - "iopub.status.busy": "2021-12-02T09:29:29.73737Z", - "iopub.execute_input": "2021-12-02T09:29:29.737804Z", - "iopub.status.idle": "2021-12-02T09:29:30.742869Z", - "shell.execute_reply.started": "2021-12-02T09:29:29.737762Z", - "shell.execute_reply": "2021-12-02T09:29:30.741818Z" - }, - "trusted": true - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": "#### Analysis Step 1 - retrieving dataset\n\nOK, now we can start working with this dataset. If this dataset had been uploaded by a colleague, we could have been notified by Slack, Teams, email or within RSpace itself that this file was available for analysis.", - "metadata": {} - }, - { - "cell_type": "code", - "source": "file_name = \"downloaded_\"+(api.get_file_info(raw_data_file_id)['name'])\nprint(file_name)\n\n## retrieve from RSpace - here we are downloading the file\nraw_temp_data = api.download_file(raw_data_file_id, file_name)", - "metadata": { - "execution": { - "iopub.status.busy": "2021-12-02T09:29:43.868586Z", - "iopub.execute_input": "2021-12-02T09:29:43.86891Z", - "iopub.status.idle": "2021-12-02T09:29:45.244744Z", - "shell.execute_reply.started": "2021-12-02T09:29:43.868874Z", - "shell.execute_reply": "2021-12-02T09:29:45.243908Z" - }, - "trusted": true - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": "#### Analysis Step 2 - the analysis\n\nHere is where you do your actual analytis of the data... 
here we'll just plot the data and generate a summary, saving both to file.", - "metadata": {} - }, - { - "cell_type": "code", - "source": "import pandas as pd;\ndf = pd.read_csv(file_name)\nsummary_stats = df.describe()\n\ndf = df.set_index('city_id')\nplot = df.plot(ylabel='Celsius', title=f'Temperature plots from dataset {raw_data_file_id}')\nimg_f= f'Temperature_per_city-{raw_data_file_id}'\nplot.get_figure().savefig(img_f)\n\nsummary_stats_csv = f'{file_name[:file_name.rindex(\".\")]}-summarystats.csv'\nsummary_stats.to_csv(summary_stats_csv)", - "metadata": { - "execution": { - "iopub.status.busy": "2021-12-02T09:30:05.805607Z", - "iopub.execute_input": "2021-12-02T09:30:05.805916Z", - "iopub.status.idle": "2021-12-02T09:30:06.227801Z", - "shell.execute_reply.started": "2021-12-02T09:30:05.805882Z", - "shell.execute_reply": "2021-12-02T09:30:06.226521Z" - }, - "trusted": true - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": "#### Analysis Step 3 - uploading back to RSpace\n\nYou can add captions to the file to help describe your analysis", - "metadata": {} - }, - { - "cell_type": "code", - "source": "with open(summary_stats_csv, 'rb') as f:\n summary_file = api.upload_file(f, caption=f\"Summary data for {raw_data_file_id}\")\n print(f\"uploaded id = {summary_file['id']}\")\nwith open(img_f+\".png\", 'rb') as f:\n uploaded_image = api.upload_file(f, caption=f\"City vs temperature for {raw_data_file_id}\")\n print(f\"uploaded id = {uploaded_image['id']}\") ", - "metadata": { - "execution": { - "iopub.status.busy": "2021-12-02T09:30:13.165456Z", - "iopub.execute_input": "2021-12-02T09:30:13.165815Z", - "iopub.status.idle": "2021-12-02T09:30:15.074355Z", - "shell.execute_reply.started": "2021-12-02T09:30:13.165782Z", - "shell.execute_reply": "2021-12-02T09:30:15.073388Z" - }, - "trusted": true - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": "There are several options now:\n\n* You can create an RSpace document, and insert these files, and share the document with your group or colleage. \n* Your colleagues may have already created and shared document describing an experiment that generated these files, in which case you would already have access to a document.\n\nHere we'll go with a simple flow where we create a new RSpace document to share with the rest of our research group.\n\nThe content we'll insert will be HTML. However you don't need to figure out how to display the linked files. Just include file links as `` syntax and RSpace will turn these into formatted links\n", - "metadata": {} - }, - { - "cell_type": "code", - "source": "new_doc = api.create_document(name=f\"Analysis of dataset {raw_data_file_id}\")\ncontent = f\"\"\"\n

Analysis of temperature dataset from our standard locations.\n

No variation between locations:\nRaw data: \n

\nStatistical summary: \n

\nLocation vs temperature: \n\"\"\"\n\nupdated_doc = api.append_content(new_doc['id'], content)\n\n## a simple utility function so you can get a link to view the updated contents in a browser.\ndef api_to_browser(link):\n return '/globalId/SD'.join(link.split('/api/v1/documents/'))\n\nprint(f\"You can view this in a browser at {api_to_browser(updated_doc['_links'][0]['link'])}\")\n", - "metadata": { - "execution": { - "iopub.status.busy": "2021-12-02T09:30:33.035126Z", - "iopub.execute_input": "2021-12-02T09:30:33.036313Z", - "iopub.status.idle": "2021-12-02T09:30:38.342404Z", - "shell.execute_reply.started": "2021-12-02T09:30:33.036246Z", - "shell.execute_reply": "2021-12-02T09:30:38.341291Z" - }, - "trusted": true - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": "If you're in a group, you can now share this with your group. You can get your groups' IDs: ", - "metadata": {} - }, - { - "cell_type": "code", - "source": "groups = api.get_groups()\nfor gp in groups:\n print(f\"{gp['name']:30}{gp['id']}\")\nchosen_group = None\n#chosen_group = input(\"please enter a group ID to share with\")\nchosen_group = chosen_group or groups[0]['id'] ## if not running interactively, choose 1st group", - "metadata": { - "execution": { - "iopub.status.busy": "2021-12-02T09:48:52.764733Z", - "iopub.execute_input": "2021-12-02T09:48:52.765029Z", - "iopub.status.idle": "2021-12-02T09:48:53.294518Z", - "shell.execute_reply.started": "2021-12-02T09:48:52.764997Z", - "shell.execute_reply": "2021-12-02T09:48:53.293245Z" - }, - "trusted": true - }, - "execution_count": 53, - "outputs": [] - }, - { - "cell_type": "code", - "source": "api.shareDocuments([new_doc['id']], chosen_group, permission=\"EDIT\")", - "metadata": { - "execution": { - "iopub.status.busy": "2021-12-02T09:31:28.045633Z", - "iopub.execute_input": "2021-12-02T09:31:28.045924Z", - "iopub.status.idle": "2021-12-02T09:31:29.469595Z", - "shell.execute_reply.started": "2021-12-02T09:31:28.045892Z", - "shell.execute_reply": "2021-12-02T09:31:29.468475Z" - }, - "trusted": true - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": "### tidy up - remove output files\noutfile_dir=\"/kaggle/working\"\nfor root,dirs,files in os.walk(outfile_dir):\n for f in files:\n os.remove(f)\nprint (\"output files removed\")", - "metadata": { - "execution": { - "iopub.status.busy": "2021-12-02T09:32:58.850987Z", - "iopub.execute_input": "2021-12-02T09:32:58.851459Z", - "iopub.status.idle": "2021-12-02T09:32:58.859278Z", - "shell.execute_reply.started": "2021-12-02T09:32:58.851405Z", - "shell.execute_reply": "2021-12-02T09:32:58.858292Z" - }, - "trusted": true - }, - "execution_count": null, - "outputs": [] - } - ] -} +{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.7.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"### Working with RSpace data.\n\nThis notebook illustrates a workflow to get data from RSpace, analyse it, and send back results to RSpace.\nTo work with this tutorial, you'll need an account on RSpace, an RSpace API key and Python 3.6 or later.\n\nThis project requires modules `rspace_client` (Version 2) , `pandas` and `matplotlib`.\n\nTo install rspace client `pip install rspace-client==2.0.1`\n\nThe 
top-level README.md has more information on getting set up. \n\nThe notebook is split into 3 sections:\n\n1. Adding some data to RSpace to analyse. In reality, this might be done manually by a wet-lab scientist or be delivered programmatically by an instrument. \n2. Getting the datasets to analyse\n3. Sending back the analysis linked to an experimental record","metadata":{}},{"cell_type":"markdown","source":"#### Setup Step 1 - configuring the RSpace Client. `rspace_client` is available from pip.\n\nIt's good practice to store API keys as environment variables rather than hard-coding it.","metadata":{}},{"cell_type":"code","source":"!pip install rspace-client==2.0.1\nprint(\"Kernel running OK\")","metadata":{"execution":{"iopub.status.busy":"2021-12-02T09:34:56.205067Z","iopub.execute_input":"2021-12-02T09:34:56.20544Z","iopub.status.idle":"2021-12-02T09:35:05.672998Z","shell.execute_reply.started":"2021-12-02T09:34:56.205406Z","shell.execute_reply":"2021-12-02T09:35:05.671894Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"## get your API key and set your RSpace URL. Change this code as needed for your own environment\nfrom kaggle_secrets import UserSecretsClient\napi_key_label = \"mp_demos_key\"\nAPI_KEY = UserSecretsClient().get_secret(api_key_label)\nprint (f\"Retrieved API key {API_KEY[0:4]}...\")\nURL=\"https://demos.researchspace.com\"","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from rspace_client.eln import eln\nimport os\n\napi = eln.ELNClient(URL, API_KEY)\n\n## sanity check that that the client is configured correctly\nprint(api.get_status())","metadata":{"execution":{"iopub.status.busy":"2021-12-02T09:11:50.086056Z","iopub.execute_input":"2021-12-02T09:11:50.086431Z","iopub.status.idle":"2021-12-02T09:11:51.361265Z","shell.execute_reply.started":"2021-12-02T09:11:50.086396Z","shell.execute_reply":"2021-12-02T09:11:51.360371Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"#### Setup Step 2 - adding some test data.\n\nHere we'll add a CSV file to RSpace, containing some synthetic weather-related data.","metadata":{}},{"cell_type":"code","source":"import os\ndata_input_dir='/kaggle/input/rspacedemofiles'\ntemp_data_path=os.path.join(data_input_dir, 'temp_data.csv')\n\nwith open (temp_data_path) as f:\n raw_data_file = api.upload_file(f)['id']\nraw_data_file_id= raw_data_file\nprint(f\"Temperature data uploaded to RSpace with ID {raw_data_file_id}\")","metadata":{"execution":{"iopub.status.busy":"2021-12-02T09:29:29.73737Z","iopub.execute_input":"2021-12-02T09:29:29.737804Z","iopub.status.idle":"2021-12-02T09:29:30.742869Z","shell.execute_reply.started":"2021-12-02T09:29:29.737762Z","shell.execute_reply":"2021-12-02T09:29:30.741818Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"#### Analysis Step 1 - retrieving dataset\n\nOK, now we can start working with this dataset. 
If this dataset had been uploaded by a colleague, we could have been notified by Slack, Teams, email or within RSpace itself that this file was available for analysis.","metadata":{}},{"cell_type":"code","source":"file_name = \"downloaded_\"+(api.get_file_info(raw_data_file_id)['name'])\nprint(file_name)\n\n## retrieve from RSpace - here we are downloading the file\nraw_temp_data = api.download_file(raw_data_file_id, file_name)","metadata":{"execution":{"iopub.status.busy":"2021-12-02T09:29:43.868586Z","iopub.execute_input":"2021-12-02T09:29:43.86891Z","iopub.status.idle":"2021-12-02T09:29:45.244744Z","shell.execute_reply.started":"2021-12-02T09:29:43.868874Z","shell.execute_reply":"2021-12-02T09:29:45.243908Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"#### Analysis Step 2 - the analysis\n\nHere is where you do your actual analytis of the data... here we'll just plot the data and generate a summary, saving both to file.","metadata":{}},{"cell_type":"code","source":"import pandas as pd;\ndf = pd.read_csv(file_name)\nsummary_stats = df.describe()\n\ndf = df.set_index('city_id')\nplot = df.plot(ylabel='Celsius', title=f'Temperature plots from dataset {raw_data_file_id}')\nimg_f= f'Temperature_per_city-{raw_data_file_id}'\nplot.get_figure().savefig(img_f)\n\nsummary_stats_csv = f'{file_name[:file_name.rindex(\".\")]}-summarystats.csv'\nsummary_stats.to_csv(summary_stats_csv)","metadata":{"execution":{"iopub.status.busy":"2021-12-02T09:30:05.805607Z","iopub.execute_input":"2021-12-02T09:30:05.805916Z","iopub.status.idle":"2021-12-02T09:30:06.227801Z","shell.execute_reply.started":"2021-12-02T09:30:05.805882Z","shell.execute_reply":"2021-12-02T09:30:06.226521Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"#### Analysis Step 3 - uploading back to RSpace\n\nYou can add captions to the file to help describe your analysis","metadata":{}},{"cell_type":"code","source":"with open(summary_stats_csv, 'rb') as f:\n summary_file = api.upload_file(f, caption=f\"Summary data for {raw_data_file_id}\")\n print(f\"uploaded id = {summary_file['id']}\")\nwith open(img_f+\".png\", 'rb') as f:\n uploaded_image = api.upload_file(f, caption=f\"City vs temperature for {raw_data_file_id}\")\n print(f\"uploaded id = {uploaded_image['id']}\") ","metadata":{"execution":{"iopub.status.busy":"2021-12-02T09:30:13.165456Z","iopub.execute_input":"2021-12-02T09:30:13.165815Z","iopub.status.idle":"2021-12-02T09:30:15.074355Z","shell.execute_reply.started":"2021-12-02T09:30:13.165782Z","shell.execute_reply":"2021-12-02T09:30:15.073388Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"There are several options now:\n\n* You can create an RSpace document, and insert these files, and share the document with your group or colleage. \n* Your colleagues may have already created and shared document describing an experiment that generated these files, in which case you would already have access to a document.\n\nHere we'll go with a simple flow where we create a new RSpace document to share with the rest of our research group.\n\nThe content we'll insert will be HTML. However you don't need to figure out how to display the linked files. Just include file links as `` syntax and RSpace will turn these into formatted links\n","metadata":{}},{"cell_type":"code","source":"new_doc = api.create_document(name=f\"Analysis of dataset {raw_data_file_id}\")\ncontent = f\"\"\"\n

Analysis of temperature dataset from our standard locations.\n\nNo variation between locations:\nRaw data: <fileId={raw_data_file_id}>\n\nStatistical summary: <fileId={summary_file['id']}>\n

\nLocation vs temperature: \n\"\"\"\n\nupdated_doc = api.append_content(new_doc['id'], content)\n\n## a simple utility function so you can get a link to view the updated contents in a browser.\ndef api_to_browser(link):\n return '/globalId/SD'.join(link.split('/api/v1/documents/'))\n\nprint(f\"You can view this in a browser at {api_to_browser(updated_doc['_links'][0]['link'])}\")\n","metadata":{"execution":{"iopub.status.busy":"2021-12-02T09:30:33.035126Z","iopub.execute_input":"2021-12-02T09:30:33.036313Z","iopub.status.idle":"2021-12-02T09:30:38.342404Z","shell.execute_reply.started":"2021-12-02T09:30:33.036246Z","shell.execute_reply":"2021-12-02T09:30:38.341291Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"If you're in a group, you can now share this with your group. You can get your groups' IDs: ","metadata":{}},{"cell_type":"code","source":"groups = api.get_groups()\nfor gp in groups:\n print(f\"{gp['name']:30}{gp['id']}\")\nchosen_group = None\n#chosen_group = input(\"please enter a group ID to share with\")\nchosen_group = chosen_group or groups[0]['id'] ## if not running interactively, choose 1st group","metadata":{"execution":{"iopub.status.busy":"2021-12-02T09:48:52.764733Z","iopub.execute_input":"2021-12-02T09:48:52.765029Z","iopub.status.idle":"2021-12-02T09:48:53.294518Z","shell.execute_reply.started":"2021-12-02T09:48:52.764997Z","shell.execute_reply":"2021-12-02T09:48:53.293245Z"},"trusted":true},"execution_count":53,"outputs":[]},{"cell_type":"code","source":"api.shareDocuments([new_doc['id']], chosen_group, permission=\"EDIT\")","metadata":{"execution":{"iopub.status.busy":"2021-12-02T09:31:28.045633Z","iopub.execute_input":"2021-12-02T09:31:28.045924Z","iopub.status.idle":"2021-12-02T09:31:29.469595Z","shell.execute_reply.started":"2021-12-02T09:31:28.045892Z","shell.execute_reply":"2021-12-02T09:31:29.468475Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"### tidy up - remove output files\noutfile_dir=\"/kaggle/working\"\nfor root,dirs,files in os.walk(outfile_dir):\n for f in files:\n os.remove(f)\nprint (\"output files removed\")","metadata":{"execution":{"iopub.status.busy":"2021-12-02T09:32:58.850987Z","iopub.execute_input":"2021-12-02T09:32:58.851459Z","iopub.status.idle":"2021-12-02T09:32:58.859278Z","shell.execute_reply.started":"2021-12-02T09:32:58.851405Z","shell.execute_reply":"2021-12-02T09:32:58.858292Z"},"trusted":true},"execution_count":null,"outputs":[]}]} \ No newline at end of file diff --git a/rspace_client/notebook_sync/sync_notebook.py b/rspace_client/notebook_sync/sync_notebook.py index 10df414..8fd4b19 100644 --- a/rspace_client/notebook_sync/sync_notebook.py +++ b/rspace_client/notebook_sync/sync_notebook.py @@ -246,7 +246,11 @@ async def save_notebook(): 3) Timeout after 30s - infinite loop can happen when user enters an incorrect notebook name and then mistakely saves a different notebook which is unchanged ''' file_path = get_notebook_name()['name_path'] - start_mod_time = os.path.getmtime(file_path) + nested_dir_pos = file_path.count('/') + relative_path = file_path + for i in range(nested_dir_pos): + relative_path = "../" + relative_path + start_mod_time = os.path.getmtime(relative_path) curr_mod_time = start_mod_time start_watch_time = time.time() # this arbitrary 1 second sleep is to allow the UI time to update and register that it is the 'unsaved' state @@ -254,7 +258,7 @@ async def save_notebook(): app.commands.execute('docmanager:save') while start_mod_time 
== curr_mod_time: await asyncio.sleep(0.1) - curr_mod_time = os.path.getmtime(file_path) + curr_mod_time = os.path.getmtime(relative_path) elapsed_time = time.time() - start_watch_time if elapsed_time > 30: # ******* save will time out if user does not refresh browser tab running jupyterlab after they first install the ipylab dependency! ******* From e0225e39a2eb50fb5d22d9af98a3164713fe757e Mon Sep 17 00:00:00 2001 From: nhanlon2 Date: Mon, 27 Oct 2025 16:05:22 +0000 Subject: [PATCH 33/34] RSDEV-782-Jupyter-Notebooks: calculates relative paths to target notebook correctly --- rspace_client/notebook_sync/sync_notebook.py | 82 ++++++++++++-------- 1 file changed, 49 insertions(+), 33 deletions(-) diff --git a/rspace_client/notebook_sync/sync_notebook.py b/rspace_client/notebook_sync/sync_notebook.py index 8fd4b19..f28245b 100644 --- a/rspace_client/notebook_sync/sync_notebook.py +++ b/rspace_client/notebook_sync/sync_notebook.py @@ -143,7 +143,10 @@ def get_server_roots(): raise # Code may fail if server has a password/doesnt use token auth - see ipynbname README return all_roots - def get_notebook_name(): + def get_target_notebook_name(): + """ + This refers to the notebook being synced to RSpace and might not be the notebook actually running this code. + """ if notebook_name is not None: if '/' in notebook_name: notebook_name_alone = notebook_name[notebook_name.rfind('/') + 1:] @@ -157,6 +160,7 @@ def get_notebook_name(): def calculate_notebook_name(): """ This code only works for python notebooks and a browser refresh is required after first install + Certain jupyter installations might not be able to run this code (if the server is password protected) """ try: nb_fname = ipynbname.name() @@ -197,14 +201,14 @@ def get_rspace_client(): def save_rspace_data(rspace_doc, attachments, gallery_file, execution_count, history_data): # Define the filename to save the state - state_filename = get_notebook_name()['root_name'] + "_state.pkl" + state_filename = get_target_notebook_name()['root_name'] + "_state.pkl" with open(state_filename, 'wb') as f: dill.dump({RSPACE_DOC_FOR_NOTEBOOK: rspace_doc, RSPACE_ATTACHMENTS_FOR_NOTEBOOK: attachments, RSPACE_GALLERY_FILE_FOR_NOTEBOOK: gallery_file, RSPACE_EXECUTION_COUNT_FOR_NOTEBOOK: execution_count, RSPACE_HISTORY_DATA: history_data}, f) def load_data(): - state_filename = get_notebook_name()['root_name'] + "_state.pkl" + state_filename = get_target_notebook_name()['root_name'] + "_state.pkl" if os.path.exists(state_filename): # Load the variables from the file using dill @@ -217,7 +221,7 @@ def load_data(): loaded_state = {} return loaded_state - async def save_notebook(): + async def save_notebook(relative_notebook_path): ''' 'docmanager:save' does not appear to hook into any callback invoked when the document is actually saved. So we have no idea when it has completed. 
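# A condensed sketch of the mtime-watch approach this docstring goes on to describe, gathered into one
# hypothetical helper (wait_for_save is an assumed name; app, os, time and asyncio are the same
# objects/modules the surrounding code already uses): trigger the save, then poll the notebook file's
# modification time until it changes or a timeout is reached.
async def wait_for_save(app, path, timeout=30):
    start_mod_time = os.path.getmtime(path)
    start_watch_time = time.time()
    await asyncio.sleep(1)                      # give the UI time to register the 'unsaved' state
    app.commands.execute('docmanager:save')
    while os.path.getmtime(path) == start_mod_time:
        await asyncio.sleep(0.1)
        if time.time() - start_watch_time > timeout:
            raise TimeoutError(f"{path} was not saved within {timeout}s")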
Jupyter Notebooks can be (at least) 100MB in size - there are some limitations imposed by the @@ -245,12 +249,7 @@ async def save_notebook(): 2) Loop until modified time changes 3) Timeout after 30s - infinite loop can happen when user enters an incorrect notebook name and then mistakely saves a different notebook which is unchanged ''' - file_path = get_notebook_name()['name_path'] - nested_dir_pos = file_path.count('/') - relative_path = file_path - for i in range(nested_dir_pos): - relative_path = "../" + relative_path - start_mod_time = os.path.getmtime(relative_path) + start_mod_time = os.path.getmtime(relative_notebook_path) curr_mod_time = start_mod_time start_watch_time = time.time() # this arbitrary 1 second sleep is to allow the UI time to update and register that it is the 'unsaved' state @@ -258,7 +257,7 @@ async def save_notebook(): app.commands.execute('docmanager:save') while start_mod_time == curr_mod_time: await asyncio.sleep(0.1) - curr_mod_time = os.path.getmtime(relative_path) + curr_mod_time = os.path.getmtime(relative_notebook_path) elapsed_time = time.time() - start_watch_time if elapsed_time > 30: # ******* save will time out if user does not refresh browser tab running jupyterlab after they first install the ipylab dependency! ******* @@ -399,10 +398,7 @@ def upload_attached_data(attachment_files): for attached_data in attached_data_files_list: if attached_data: # make file paths to data relative to the location of this notebook - nested_dir_pos = get_notebook_name()['name_path'].count('/') - relative_attached_data = attached_data - for i in range(nested_dir_pos): - relative_attached_data = "../" + relative_attached_data + relative_attached_data = get_relative_path_to_this_notebook(attached_data) with open(relative_attached_data, 'r', encoding='utf-8') as attch: attachment_file_id = attachment_files.get(attached_data, {}).get('id') attachment_file_hash = attachment_files.get(attached_data, {}).get('hash') @@ -463,41 +459,61 @@ def assert_invariants(): raise Exception( "This is not a valid notebook name - it should have a suffix preceeded by a dot: '.' 
For example 'notebook.ipynb' is a valid name, 'notebook' with no suffix is invalid.") - def notebook_should_be_saved(current_notebook): - return this_nb_pythonic_and_target_notebook_runs_this_code(current_notebook) + def notebook_should_be_saved(relative_notebook_path): + return this_nb_pythonic_and_target_notebook_runs_this_code_and_notebook_name_can_be_calculated(relative_notebook_path) - def notebook_should_be_reloaded(current_notebook): - return this_nb_pythonic_and_target_notebook_runs_this_code(current_notebook) + def notebook_should_be_reloaded(relative_notebook_path): + return this_nb_pythonic_and_target_notebook_runs_this_code_and_notebook_name_can_be_calculated(relative_notebook_path) - def notebook_is_python_based(current_notebook): - with open(current_notebook, 'r') as notebook: + def notebook_is_python_based(relative_notebook_path): + with open(relative_notebook_path, 'r') as notebook: notebook_node = nbformat.read(notebook, nbformat.NO_CONVERT) kernel_type = notebook_node.metadata.kernelspec.display_name.lower() if 'python' in kernel_type: return True return False - def this_nb_pythonic_and_target_notebook_runs_this_code(current_notebook): + def this_nb_pythonic_and_target_notebook_runs_this_code_and_notebook_name_can_be_calculated(relative_notebook_path): """ True if this code is run in a cell of a python notebook being synced with RSpace and False if this code is running in the cell of one notebook but syncing another notebook OR if this notebook is - not a python notebook + not a python notebook. Also false if notebook name cannot be calculated - we use this code to determine + if the save_notebook and reload_notebook functions should be called and they dont work in jupyter setups + where notebook name cannot be calculated. """ try: calculated_notebook_name = calculate_notebook_name() except: calculated_notebook_name is None + # if we cant calculate a notebook name return false + return notebook_is_python_based(relative_notebook_path) and get_target_notebook_name() == calculated_notebook_name - return notebook_is_python_based(current_notebook) and get_notebook_name() == calculated_notebook_name + def get_relative_path_to_this_notebook(file_path): + """ + This calculates paths relative to the notebook running the sync code. + This is not **necessarily** the get_target_notebook_name()['name_path'] - we might have set that to a different notebook. + This will be true when the notebook running the sync code is not the actual notebook being synced + + Note - unsure how this will behave on windows os. 
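# A portable way to get the same result without counting '/' by hand (and one that should also behave
# on Windows) is os.path.relpath. This is only a sketch, under the assumption stated above that
# file_path is expressed relative to the server root (the home directory); the function name is
# illustrative, not part of this change.
def get_relative_path_to_this_notebook_via_relpath(file_path):
    server_root = os.path.expanduser("~")                # same root the 'cd ~ ; pwd' shell call resolves
    absolute_target = os.path.join(server_root, file_path)
    return os.path.relpath(absolute_target, start=os.getcwd())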
+ """ + root = get_ipython().getoutput('cd ~ ; pwd')[0] + path_to_this_dir = os.getcwd() + path_to_this_dir = path_to_this_dir.replace(str(root), '') + nested_dir_pos = path_to_this_dir.count('/') + relative_file_path = file_path + for i in range(nested_dir_pos): + relative_file_path = "../" + relative_file_path + return relative_file_path assert_invariants() - current_notebook = get_notebook_name()['name'] + target_nb_file_path = get_target_notebook_name()['name_path'] + relative_notebook_path = get_relative_path_to_this_notebook(target_nb_file_path) # do not remove this print statement as it is required to ensure notebook is always in modified state when we call save_notebook - print(f'Running sync on notebook:{current_notebook}') - if notebook_should_be_saved(current_notebook): - await save_notebook() + print(f'Running sync on notebook:{relative_notebook_path}') + if notebook_should_be_saved(relative_notebook_path): + await save_notebook(relative_notebook_path) get_server_urls() - with open(current_notebook, 'r') as notebook: + with open(relative_notebook_path, 'r') as notebook: notebook_node = nbformat.read(notebook, nbformat.NO_CONVERT) try: loaded_state = load_data() @@ -511,10 +527,10 @@ def this_nb_pythonic_and_target_notebook_runs_this_code(current_notebook): attachment_files = loaded_state.get(RSPACE_ATTACHMENTS_FOR_NOTEBOOK, {}) nb_gallery_file = loaded_state.get(RSPACE_GALLERY_FILE_FOR_NOTEBOOK, {}) history_data = loaded_state.get(RSPACE_HISTORY_DATA, {'text': ''}) - current_notebook = get_notebook_name()['name'] + target_notebook_name = get_target_notebook_name()['name'] upload_attached_data(attachment_files) if rspace_doc is None and rspace_prexisting_document_id is None: - rspace_doc = client.create_document(name="DocumentFor_" + current_notebook, + rspace_doc = client.create_document(name="DocumentFor_" + target_notebook_name, tags=["Python", "API", "Jupyter"]) if rspace_document_target_field is not None: rspace_document_target_field_id = str(rspace_doc['fields'][int(rspace_document_target_field)]['id']) @@ -523,7 +539,7 @@ def this_nb_pythonic_and_target_notebook_runs_this_code(current_notebook): rspace_document_file_id = str( rspace_doc['id']) if rspace_prexisting_document_id is None else rspace_prexisting_document_id rspace_doc = client.get_document(rspace_document_file_id) - nb_gallery_file = await upload_notebook_to_gallery(current_notebook, notebook_node, nb_gallery_file, + nb_gallery_file = await upload_notebook_to_gallery(relative_notebook_path, notebook_node, nb_gallery_file, attachment_files, rspace_doc, history_data) nb_gallery_file_id = nb_gallery_file.get('id') previous_content = get_field_content(rspace_doc) @@ -532,7 +548,7 @@ def this_nb_pythonic_and_target_notebook_runs_this_code(current_notebook): rspace_doc = client.update_document(rspace_document_file_id, tags=['Python', 'API', 'Jupyter'], fields=[ {'id': rspace_document_target_field_id, "content": new_content}]) - if notebook_should_be_reloaded(current_notebook): + if notebook_should_be_reloaded(relative_notebook_path): await reload_notebook() save_rspace_data(rspace_doc, attachment_files, nb_gallery_file, new_execution_count, history_data) return 'success' From b7f56e37922a39f501e369500278c8c40b2daad5 Mon Sep 17 00:00:00 2001 From: nhanlon2 Date: Mon, 27 Oct 2025 17:10:21 +0000 Subject: [PATCH 34/34] RSDEV-782-Jupyter-Notebooks: correctly inserts at specific fields in notebook --- rspace_client/notebook_sync/sync_notebook.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git 
a/rspace_client/notebook_sync/sync_notebook.py b/rspace_client/notebook_sync/sync_notebook.py
index f28245b..6110f9f 100644
--- a/rspace_client/notebook_sync/sync_notebook.py
+++ b/rspace_client/notebook_sync/sync_notebook.py
@@ -529,16 +529,20 @@ def get_relative_path_to_this_notebook(file_path):
         history_data = loaded_state.get(RSPACE_HISTORY_DATA, {'text': ''})
         target_notebook_name = get_target_notebook_name()['name']
         upload_attached_data(attachment_files)
+        # always get the latest version of the RSpace doc from RSpace, don't use the locally stored copy
         if rspace_doc is None and rspace_prexisting_document_id is None:
             rspace_doc = client.create_document(name="DocumentFor_" + target_notebook_name,
                                                 tags=["Python", "API", "Jupyter"])
+        elif rspace_prexisting_document_id is not None:
+            rspace_doc = client.get_document(rspace_prexisting_document_id)
+        else:
+            rspace_doc = client.get_document(rspace_doc['id'])
         if rspace_document_target_field is not None:
             rspace_document_target_field_id = str(rspace_doc['fields'][int(rspace_document_target_field)]['id'])
         else:
             rspace_document_target_field_id = str(rspace_doc['fields'][0]['id'])
         rspace_document_file_id = str(
             rspace_doc['id']) if rspace_prexisting_document_id is None else rspace_prexisting_document_id
-        rspace_doc = client.get_document(rspace_document_file_id)
         nb_gallery_file = await upload_notebook_to_gallery(relative_notebook_path, notebook_node, nb_gallery_file,
                                                            attachment_files, rspace_doc, history_data)
         nb_gallery_file_id = nb_gallery_file.get('id')
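The create-or-fetch decision introduced in this final hunk boils down to a small piece of logic. The helper below is only an illustrative sketch (its name and signature are assumptions, not part of the patch), but the three branches mirror the change: reuse a pre-existing document id if one was supplied, create a fresh document when nothing exists yet, and otherwise re-fetch the cached document so the server copy always wins over the locally pickled one.

def resolve_rspace_doc(client, cached_doc, preexisting_id, target_notebook_name):
    # Always prefer the server's view of the document over the dill-cached copy.
    if preexisting_id is not None:
        return client.get_document(preexisting_id)
    if cached_doc is None:
        return client.create_document(name="DocumentFor_" + target_notebook_name,
                                      tags=["Python", "API", "Jupyter"])
    return client.get_document(cached_doc['id'])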