From 85510c2997cac30742d4166367922d185331f626 Mon Sep 17 00:00:00 2001
From: Joe Futrelle
Date: Thu, 14 Aug 2025 12:37:29 -0400
Subject: [PATCH 1/3] bin path caching, no change to accession logic yet

---
 .../0047_bin_data_directory_bin_path.py | 24 +++++++++++++++++++
 ifcbdb/dashboard/models.py              | 19 +++++++++------
 2 files changed, 36 insertions(+), 7 deletions(-)
 create mode 100644 ifcbdb/dashboard/migrations/0047_bin_data_directory_bin_path.py

diff --git a/ifcbdb/dashboard/migrations/0047_bin_data_directory_bin_path.py b/ifcbdb/dashboard/migrations/0047_bin_data_directory_bin_path.py
new file mode 100644
index 00000000..9d9f4ecc
--- /dev/null
+++ b/ifcbdb/dashboard/migrations/0047_bin_data_directory_bin_path.py
@@ -0,0 +1,24 @@
+# Generated by Django 4.2.21 on 2025-08-14 16:19
+
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('dashboard', '0046_auto_20250721_2039'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='bin',
+            name='data_directory',
+            field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='dashboard.datadirectory'),
+        ),
+        migrations.AddField(
+            model_name='bin',
+            name='path',
+            field=models.CharField(blank=True, max_length=1024),
+        ),
+    ]
diff --git a/ifcbdb/dashboard/models.py b/ifcbdb/dashboard/models.py
index 6a2c7655..def85be2 100644
--- a/ifcbdb/dashboard/models.py
+++ b/ifcbdb/dashboard/models.py
@@ -417,6 +417,9 @@ class Bin(models.Model):
     instrument = models.ForeignKey('Instrument', related_name='bins', null=True, on_delete=models.SET_NULL)
     # many-to-many relationship with datasets
     datasets = models.ManyToManyField('Dataset', related_name='bins')
+    # most recently located path of dataset, and which data directory it came from
+    path = models.CharField(max_length=1024, blank=True)
+    data_directory = models.ForeignKey('DataDirectory', null=True, blank=True, on_delete=models.SET_NULL)
     # accession
     added = models.DateTimeField(auto_now_add=True, null=True)
     # qaqc flags
@@ -523,17 +526,19 @@ def _directories(self, kind=DataDirectory.RAW, version=None):
             yield directory

     def _get_bin(self):
-        cache_key = '{}_path'.format(self.pid)
-        cached_path = cache.get(cache_key)
-        if cached_path is not None and os.path.exists(cached_path+'.adc'):
-            return FilesetBin(Fileset(cached_path))
         # return the underlying ifcb.Bin object backed by the raw filesets
-        for directory in self._directories(kind=DataDirectory.RAW):
+        if self.path and os.path.exists(self.path+'.adc'):
+            return FilesetBin(Fileset(self.path))
+        to_search = [] if not self.data_directory else [self.data_directory]
+        to_search.extend(self._directories(kind=DataDirectory.RAW))
+        for directory in to_search:
             dd = directory.get_raw_directory()
             try:
                 b = dd[self.pid]
-                basepath, _ = os.path.splitext(b.fileset.adc_path)
-                cache.set(cache_key, basepath)
+                if not self.path: # cache path of first found fileset
+                    self.path, _ = os.path.splitext(b.fileset.adc_path)
+                    self.data_directory = directory
+                    self.save()
                 return b
             except KeyError:
                 pass # keep searching
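
Note on PATCH 1: Bin._get_bin() now consults the persisted path before touching any data directory, and falls back to a scan (the cached data directory first, then the raw directories) only when the cached fileset is missing; the first fileset found is cached on the model if no path was stored yet. The standalone sketch below illustrates that lookup order; it is illustration only, locate_fileset, search_dirs, and lid are made-up names, and the real method resolves pids through pyifcb DataDirectory objects rather than os.path.join.

    import os

    def locate_fileset(cached_path, search_dirs, lid):
        # fast path: the previously saved base path still points at a fileset on disk
        if cached_path and os.path.exists(cached_path + '.adc'):
            return cached_path
        # slow path: cached directory first, then raw directories in priority order;
        # in the patch, the result is persisted only when no path was cached yet
        for d in search_dirs:
            candidate = os.path.join(d, lid)
            if os.path.exists(candidate + '.adc'):
                return candidate
        return None
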
From db56f516cfab07992998ea314146671a3a1d48cb Mon Sep 17 00:00:00 2001
From: Joe Futrelle
Date: Thu, 14 Aug 2025 13:47:46 -0400
Subject: [PATCH 2/3] adjusted accession logic for bin caching

---
 ifcbdb/dashboard/accession.py | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/ifcbdb/dashboard/accession.py b/ifcbdb/dashboard/accession.py
index 1954bf1a..2943d333 100644
--- a/ifcbdb/dashboard/accession.py
+++ b/ifcbdb/dashboard/accession.py
@@ -61,17 +61,21 @@ def scan(self):
                 continue # skip and continue searching
             directory = ifcb.DataDirectory(dd.path)
             for b in directory:
-                yield b
+                yield (b, dd)
     def sync_one(self, pid):
         bin = None
+        dd_found = None
        for dd in self.dataset.directories.filter(kind=DataDirectory.RAW).order_by('priority'):
             if not os.path.exists(dd.path):
                 continue # skip and continue searching
             directory = ifcb.DataDirectory(dd.path)
             try:
                 bin = directory[pid]
+                dd_found = dd
             except KeyError:
                 continue
+            if bin is not None:
+                break
         if bin is None:
             return 'bin {} not found'.format(pid)
         # create instrument if necessary
@@ -86,11 +90,12 @@ def sync_one(self, pid):
             'timestamp': timestamp,
             'sample_time': timestamp,
             'instrument': instrument,
+            'path': os.path.splitext(bin.fileset.adc_path)[0], # path without extension
+            'data_directory': dd_found,
             'skip': True, # in case accession is interrupted
         })
-        if not created and not self.dataset in b.datasets:
-            self.dataset.bins.add(b)
-            return
+        if not created:
+            return
         b2s, error = self.add_bin(bin, b)
         if error is not None:
             # there was an error. if we created a bin, delete it
@@ -115,13 +120,14 @@ def sync(self, progress_callback=do_nothing, log_callback=do_nothing):
         start_time = self.start_time()
         errors = {}
         while True:
-            bins = list(islice(scanner, self.batch_size))
-            if not bins:
+            bin_dds = list(islice(scanner, self.batch_size))
+            if not bin_dds:
                 break
-            total_bins += len(bins)
+            total_bins += len(bin_dds)
             # create instrument(s)
             instruments = {} # keyed by instrument number
-            for bin in bins:
+            for bin_dd in bin_dds:
+                bin, dd = bin_dd
                 i = bin.pid.instrument
                 if not i in instruments:
                     version = bin.pid.schema_version
@@ -132,7 +138,8 @@
             # create bins
             then = time.time()
             bins2save = []
-            for bin in bins:
+            for bin_dd in bin_dds:
+                bin, dd = bin_dd
                 pid = bin.lid
                 most_recent_bin_id = pid
                 log_callback('{} found'.format(pid))
@@ -144,10 +151,11 @@ def sync(self, progress_callback=do_nothing, log_callback=do_nothing):
                     'timestamp': timestamp,
                     'sample_time': timestamp,
                     'instrument': instrument,
+                    'path': os.path.splitext(bin.fileset.adc_path)[0], # path without extension
+                    'data_directory': dd,
                     'skip': True, # in case accession is interrupted
                 })
                 if not created:
-                    self.dataset.bins.add(b)
                     continue
                 b2s, error = self.add_bin(bin, b)
                 if error is not None:
@@ -200,6 +208,9 @@ def add_bin(self, bin, b): # IFCB bin, Bin instance
         except Exception as e:
             b.qc_bad = True
             return b, 'ml_analyzed: {}'.format(str(e))
+        # paths
+        if b.path is None:
+            b.path, _ = os.path.splitext(bin.fileset.adc_path)
         # metadata
         try:
             headers = bin.hdr_attributes
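
Note on PATCH 2: scan() now yields (bin, data_directory) pairs so that sync() and sync_one() can record, at accession time, both where a bin's fileset was found and which data directory it came from. The stored Bin.path value is the ADC path with its extension stripped. In the short example below the directory is hypothetical, but the .adc/.hdr/.roi sibling-file naming is the standard IFCB raw fileset convention.

    import os

    adc_path = '/data/ifcb/raw/D20250814T123456_IFCB127.adc'  # hypothetical location
    base = os.path.splitext(adc_path)[0]
    print(base)           # /data/ifcb/raw/D20250814T123456_IFCB127
    print(base + '.hdr')  # header file of the same fileset
    print(base + '.roi')  # ROI data of the same fileset
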
From f938cf228d97cab720057785fb57bbdb8939c104 Mon Sep 17 00:00:00 2001
From: Joe Futrelle
Date: Thu, 14 Aug 2025 14:06:30 -0400
Subject: [PATCH 3/3] adding --cache-paths directive

---
 ifcbdb/dashboard/management/commands/bintool.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/ifcbdb/dashboard/management/commands/bintool.py b/ifcbdb/dashboard/management/commands/bintool.py
index 9ab058f1..243593f6 100644
--- a/ifcbdb/dashboard/management/commands/bintool.py
+++ b/ifcbdb/dashboard/management/commands/bintool.py
@@ -16,6 +16,7 @@ def add_arguments(self, parser):
         parser.add_argument('--sample-type', type=str, help='Sample type')
         parser.add_argument('--remove-dataset', type=str, help='Dataset name to remove filtered bins from')
         parser.add_argument('--add-dataset', type=str, help='Dataset name to add filtered bins to')
+        parser.add_argument('--cache-paths', action='store_true', help='Cache paths for filtered bins')

     def handle(self, *args, **options):
         dataset_name = options['dataset']
@@ -58,5 +59,9 @@ def handle(self, *args, **options):
         except Dataset.DoesNotExist:
             self.stderr.write(f"Dataset '{add_dataset_name}' does not exist.")

+        if options['cache_paths']:
+            for b in bins:
+                b._get_bin() # this will cache the path if it isn't already
+
         for bin_id in bin_ids:
             self.stdout.write(bin_id)
\ No newline at end of file
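
Usage note for PATCH 3: assuming the dataset selector read by options['dataset'] is the command's existing positional argument (it is defined outside the hunk shown above), the new flag can be run from the shell as python manage.py bintool <dataset> --cache-paths, or programmatically with Django's call_command as sketched below; 'mydataset' is a placeholder. Either form populates Bin.path and Bin.data_directory for any filtered bin that does not yet have a cached path, by triggering _get_bin() on each.

    from django.core.management import call_command

    # roughly equivalent to: python manage.py bintool mydataset --cache-paths
    call_command('bintool', 'mydataset', cache_paths=True)
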