Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 21 additions & 10 deletions ifcbdb/dashboard/accession.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,17 +61,21 @@ def scan(self):
continue # skip and continue searching
directory = ifcb.DataDirectory(dd.path)
for b in directory:
yield b
yield (b, dd)
def sync_one(self, pid):
bin = None
dd_found = None
for dd in self.dataset.directories.filter(kind=DataDirectory.RAW).order_by('priority'):
if not os.path.exists(dd.path):
continue # skip and continue searching
directory = ifcb.DataDirectory(dd.path)
try:
bin = directory[pid]
dd_found = dd
except KeyError:
continue
if bin is not None:
break
if bin is None:
return 'bin {} not found'.format(pid)
# create instrument if necessary
Expand All @@ -86,11 +90,12 @@ def sync_one(self, pid):
'timestamp': timestamp,
'sample_time': timestamp,
'instrument': instrument,
'path': os.path.splitext(bin.fileset.adc_path)[0], # path without extension
'data_directory': dd_found,
'skip': True, # in case accession is interrupted
})
if not created and not self.dataset in b.datasets:
self.dataset.bins.add(b)
return
if not created:
return
b2s, error = self.add_bin(bin, b)
if error is not None:
# there was an error. if we created a bin, delete it
Expand All @@ -115,13 +120,14 @@ def sync(self, progress_callback=do_nothing, log_callback=do_nothing):
start_time = self.start_time()
errors = {}
while True:
bins = list(islice(scanner, self.batch_size))
if not bins:
bin_dds = list(islice(scanner, self.batch_size))
if not bin_dds:
break
total_bins += len(bins)
total_bins += len(bin_dds)
# create instrument(s)
instruments = {} # keyed by instrument number
for bin in bins:
for bin_dd in bin_dds:
bin, dd = bin_dd
i = bin.pid.instrument
if not i in instruments:
version = bin.pid.schema_version
Expand All @@ -132,7 +138,8 @@ def sync(self, progress_callback=do_nothing, log_callback=do_nothing):
# create bins
then = time.time()
bins2save = []
for bin in bins:
for bin_dd in bin_dds:
bin, dd = bin_dd
pid = bin.lid
most_recent_bin_id = pid
log_callback('{} found'.format(pid))
Expand All @@ -144,10 +151,11 @@ def sync(self, progress_callback=do_nothing, log_callback=do_nothing):
'timestamp': timestamp,
'sample_time': timestamp,
'instrument': instrument,
'path': os.path.splitext(bin.fileset.adc_path)[0], # path without extension
'data_directory': dd,
'skip': True, # in case accession is interrupted
})
if not created:
self.dataset.bins.add(b)
continue
b2s, error = self.add_bin(bin, b)
if error is not None:
Expand Down Expand Up @@ -200,6 +208,9 @@ def add_bin(self, bin, b): # IFCB bin, Bin instance
except Exception as e:
b.qc_bad = True
return b, 'ml_analyzed: {}'.format(str(e))
# paths
if b.path is None:
b.path, _ = os.path.splitext(bin.fileset.adc_path)
# metadata
try:
headers = bin.hdr_attributes
Expand Down
5 changes: 5 additions & 0 deletions ifcbdb/dashboard/management/commands/bintool.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def add_arguments(self, parser):
parser.add_argument('--sample-type', type=str, help='Sample type')
parser.add_argument('--remove-dataset', type=str, help='Dataset name to remove filtered bins from')
parser.add_argument('--add-dataset', type=str, help='Dataset name to add filtered bins to')
parser.add_argument('--cache-paths', action='store_true', help='Cache paths for filtered bins')

def handle(self, *args, **options):
dataset_name = options['dataset']
Expand Down Expand Up @@ -58,5 +59,9 @@ def handle(self, *args, **options):
except Dataset.DoesNotExist:
self.stderr.write(f"Dataset '{add_dataset_name}' does not exist.")

if options['cache_paths']:
for b in bins:
b._get_bin() # this will cache the path if it isn't already

for bin_id in bin_ids:
self.stdout.write(bin_id)
24 changes: 24 additions & 0 deletions ifcbdb/dashboard/migrations/0047_bin_data_directory_bin_path.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Generated by Django 4.2.21 on 2025-08-14 16:19

from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):
    """Add path-caching fields to Bin.

    Adds two nullable/blank fields to the ``Bin`` model so that the most
    recently located raw-fileset path (and the DataDirectory it was found
    in) can be cached on the model row instead of being re-discovered by
    scanning data directories on every access:

    * ``data_directory`` -- FK to ``dashboard.DataDirectory``; SET_NULL on
      delete so removing a directory does not cascade-delete bins.
    * ``path`` -- filesystem base path (without extension) of the bin's
      fileset; blank until first resolved.
    """

    dependencies = [
        # Must apply after the previous dashboard migration.
        ('dashboard', '0046_auto_20250721_2039'),
    ]

    operations = [
        migrations.AddField(
            model_name='bin',
            name='data_directory',
            # null+blank: existing rows have no directory yet; SET_NULL keeps
            # the bin row alive if its DataDirectory record is deleted.
            field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='dashboard.datadirectory'),
        ),
        migrations.AddField(
            model_name='bin',
            name='path',
            # blank (empty string) default means "not yet cached".
            field=models.CharField(blank=True, max_length=1024),
        ),
    ]
19 changes: 12 additions & 7 deletions ifcbdb/dashboard/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,9 @@ class Bin(models.Model):
instrument = models.ForeignKey('Instrument', related_name='bins', null=True, on_delete=models.SET_NULL)
# many-to-many relationship with datasets
datasets = models.ManyToManyField('Dataset', related_name='bins')
# most recently located path of dataset, and which data directory it came from
path = models.CharField(max_length=1024, blank=True)
data_directory = models.ForeignKey('DataDirectory', null=True, blank=True, on_delete=models.SET_NULL)
# accession
added = models.DateTimeField(auto_now_add=True, null=True)
# qaqc flags
Expand Down Expand Up @@ -523,17 +526,19 @@ def _directories(self, kind=DataDirectory.RAW, version=None):
yield directory

def _get_bin(self):
cache_key = '{}_path'.format(self.pid)
cached_path = cache.get(cache_key)
if cached_path is not None and os.path.exists(cached_path+'.adc'):
return FilesetBin(Fileset(cached_path))
# return the underlying ifcb.Bin object backed by the raw filesets
for directory in self._directories(kind=DataDirectory.RAW):
if self.path and os.path.exists(self.path+'.adc'):
return FilesetBin(Fileset(self.path))
to_search = [] if not self.data_directory else [self.data_directory]
to_search.extend(self._directories(kind=DataDirectory.RAW))
for directory in to_search:
dd = directory.get_raw_directory()
try:
b = dd[self.pid]
basepath, _ = os.path.splitext(b.fileset.adc_path)
cache.set(cache_key, basepath)
if not self.path: # cache path of first found fileset
self.path, _ = os.path.splitext(b.fileset.adc_path)
self.data_directory = directory
self.save()
return b
except KeyError:
pass # keep searching
Expand Down