Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 21 additions & 10 deletions ifcbdb/dashboard/accession.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,17 +61,21 @@ def scan(self):
continue # skip and continue searching
directory = ifcb.DataDirectory(dd.path)
for b in directory:
yield b
yield (b, dd)
def sync_one(self, pid):
bin = None
dd_found = None
for dd in self.dataset.directories.filter(kind=DataDirectory.RAW).order_by('priority'):
if not os.path.exists(dd.path):
continue # skip and continue searching
directory = ifcb.DataDirectory(dd.path)
try:
bin = directory[pid]
dd_found = dd
except KeyError:
continue
if bin is not None:
break
if bin is None:
return 'bin {} not found'.format(pid)
# create instrument if necessary
Expand All @@ -86,11 +90,12 @@ def sync_one(self, pid):
'timestamp': timestamp,
'sample_time': timestamp,
'instrument': instrument,
'path': os.path.splitext(bin.fileset.adc_path)[0], # path without extension
'data_directory': dd_found,
'skip': True, # in case accession is interrupted
})
if not created and not self.dataset in b.datasets:
self.dataset.bins.add(b)
return
if not created:
return
b2s, error = self.add_bin(bin, b)
if error is not None:
# there was an error. if we created a bin, delete it
Expand All @@ -115,13 +120,14 @@ def sync(self, progress_callback=do_nothing, log_callback=do_nothing):
start_time = self.start_time()
errors = {}
while True:
bins = list(islice(scanner, self.batch_size))
if not bins:
bin_dds = list(islice(scanner, self.batch_size))
if not bin_dds:
break
total_bins += len(bins)
total_bins += len(bin_dds)
# create instrument(s)
instruments = {} # keyed by instrument number
for bin in bins:
for bin_dd in bin_dds:
bin, dd = bin_dd
i = bin.pid.instrument
if not i in instruments:
version = bin.pid.schema_version
Expand All @@ -132,7 +138,8 @@ def sync(self, progress_callback=do_nothing, log_callback=do_nothing):
# create bins
then = time.time()
bins2save = []
for bin in bins:
for bin_dd in bin_dds:
bin, dd = bin_dd
pid = bin.lid
most_recent_bin_id = pid
log_callback('{} found'.format(pid))
Expand All @@ -144,10 +151,11 @@ def sync(self, progress_callback=do_nothing, log_callback=do_nothing):
'timestamp': timestamp,
'sample_time': timestamp,
'instrument': instrument,
'path': os.path.splitext(bin.fileset.adc_path)[0], # path without extension
'data_directory': dd,
'skip': True, # in case accession is interrupted
})
if not created:
self.dataset.bins.add(b)
continue
b2s, error = self.add_bin(bin, b)
if error is not None:
Expand Down Expand Up @@ -200,6 +208,9 @@ def add_bin(self, bin, b): # IFCB bin, Bin instance
except Exception as e:
b.qc_bad = True
return b, 'ml_analyzed: {}'.format(str(e))
# paths
if b.path is None:
b.path, _ = os.path.splitext(bin.fileset.adc_path)
# metadata
try:
headers = bin.hdr_attributes
Expand Down
5 changes: 5 additions & 0 deletions ifcbdb/dashboard/management/commands/bintool.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def add_arguments(self, parser):
parser.add_argument('--sample-type', type=str, help='Sample type')
parser.add_argument('--remove-dataset', type=str, help='Dataset name to remove filtered bins from')
parser.add_argument('--add-dataset', type=str, help='Dataset name to add filtered bins to')
parser.add_argument('--cache-paths', action='store_true', help='Cache paths for filtered bins')

def handle(self, *args, **options):
dataset_name = options['dataset']
Expand Down Expand Up @@ -58,5 +59,9 @@ def handle(self, *args, **options):
except Dataset.DoesNotExist:
self.stderr.write(f"Dataset '{add_dataset_name}' does not exist.")

if options['cache_paths']:
for b in bins:
b._get_bin() # this will cache the path if it isn't already

for bin_id in bin_ids:
self.stdout.write(bin_id)
24 changes: 24 additions & 0 deletions ifcbdb/dashboard/migrations/0047_bin_data_directory_bin_path.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Generated by Django 4.2.21 on 2025-08-14 16:19

from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):
    """Add path-caching fields to Bin.

    Adds two nullable/blank fields to the ``Bin`` model so that the most
    recently located raw-fileset path (and the DataDirectory it was found
    in) can be cached on the model row instead of being re-discovered by
    scanning data directories on every access:

    * ``data_directory`` -- FK to ``dashboard.DataDirectory``; SET_NULL on
      delete so removing a directory does not cascade-delete bins.
    * ``path`` -- filesystem base path (without extension) of the bin's
      fileset; blank until first resolved.
    """

    dependencies = [
        # Must apply after the previous dashboard migration.
        ('dashboard', '0046_auto_20250721_2039'),
    ]

    operations = [
        migrations.AddField(
            model_name='bin',
            name='data_directory',
            # null+blank: existing rows have no directory yet; SET_NULL keeps
            # the bin row alive if its DataDirectory record is deleted.
            field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='dashboard.datadirectory'),
        ),
        migrations.AddField(
            model_name='bin',
            name='path',
            # blank (empty string) default means "not yet cached".
            field=models.CharField(blank=True, max_length=1024),
        ),
    ]
19 changes: 12 additions & 7 deletions ifcbdb/dashboard/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,9 @@ class Bin(models.Model):
instrument = models.ForeignKey('Instrument', related_name='bins', null=True, on_delete=models.SET_NULL)
# many-to-many relationship with datasets
datasets = models.ManyToManyField('Dataset', related_name='bins')
# most recently located path of dataset, and which data directory it came from
path = models.CharField(max_length=1024, blank=True)
data_directory = models.ForeignKey('DataDirectory', null=True, blank=True, on_delete=models.SET_NULL)
# accession
added = models.DateTimeField(auto_now_add=True, null=True)
# qaqc flags
Expand Down Expand Up @@ -523,17 +526,19 @@ def _directories(self, kind=DataDirectory.RAW, version=None):
yield directory

def _get_bin(self):
cache_key = '{}_path'.format(self.pid)
cached_path = cache.get(cache_key)
if cached_path is not None and os.path.exists(cached_path+'.adc'):
return FilesetBin(Fileset(cached_path))
# return the underlying ifcb.Bin object backed by the raw filesets
for directory in self._directories(kind=DataDirectory.RAW):
if self.path and os.path.exists(self.path+'.adc'):
return FilesetBin(Fileset(self.path))
to_search = [] if not self.data_directory else [self.data_directory]
to_search.extend(self._directories(kind=DataDirectory.RAW))
for directory in to_search:
dd = directory.get_raw_directory()
try:
b = dd[self.pid]
basepath, _ = os.path.splitext(b.fileset.adc_path)
cache.set(cache_key, basepath)
if not self.path: # cache path of first found fileset
self.path, _ = os.path.splitext(b.fileset.adc_path)
self.data_directory = directory
self.save()
return b
except KeyError:
pass # keep searching
Expand Down