diff --git a/dev b/dev
index 48be8a2..1a282ab 160000
--- a/dev
+++ b/dev
@@ -1 +1 @@
-Subproject commit 48be8a29f0a78ba17aff96be116a62b7c8f7976f
+Subproject commit 1a282ab38d94ea7f3fd12ec2f10580e6a40cc3ae
diff --git a/scidk/app.py b/scidk/app.py
index 81a0215..cda9048 100644
--- a/scidk/app.py
+++ b/scidk/app.py
@@ -972,6 +972,12 @@ def api_scan():
folders = []
files_skipped = 0
files_hashed = 0
+ # Cache variables (initialized for all providers)
+ use_cache = False
+ prev_scan_id = None
+ cache_hits = 0
+ cache_misses = 0
+
if provider_id in ('local_fs', 'mounted_fs'):
# Local/Mounted: enumerate filesystem and ingest into SQLite index
base = Path(path)
@@ -991,25 +997,98 @@ def api_scan():
fs.last_scan_source = 'python'
except Exception:
fs.last_scan_source = 'python'
+
+ # Cache-aware traversal optimization: check for previous scan and reuse if unchanged
+    use_cache = os.environ.get('SCIDK_CACHE_SCAN', '1').strip().lower() in ('1', 'true', 'yes', 'on')
+
+ if use_cache:
+ try:
+ prev_scan_id = pix.get_previous_scan_for_path(str(base))
+ except Exception:
+ prev_scan_id = None
+
+ def _dir_unchanged(dir_path: Path, prev_sid: Optional[str]) -> bool:
+ """Check if directory listing hasn't changed since previous scan."""
+ if not prev_sid:
+ return False
+ try:
+ cached_children = pix.get_cached_directory(prev_sid, str(dir_path))
+ if cached_children is None:
+ return False
+            # Compare the live directory listing against the cached one
+            current_children = {child.name for child in dir_path.iterdir()}
+            return set(cached_children) == current_children
+ except Exception:
+ return False
+
try:
if recursive:
- for p in base.rglob('*'):
- try:
- if p.is_dir():
- items_dirs.add(p)
- else:
- items_files.append(p)
- # ensure parent chain exists in dirs set
- parent = p.parent
- while parent and parent != parent.parent and str(parent).startswith(str(base)):
- items_dirs.add(parent)
- if parent == base:
- break
- parent = parent.parent
- except Exception:
+ # For recursive scans, use cache-aware traversal
+                from collections import deque  # BFS queue; deque gives O(1) pops
+                dirs_to_scan = deque([base])
+ visited = set()
+
+ while dirs_to_scan:
+                    current_dir = dirs_to_scan.popleft()
+ if str(current_dir) in visited:
continue
- # include base itself as a folder
- items_dirs.add(base)
+ visited.add(str(current_dir))
+ items_dirs.add(current_dir)
+
+                    # Check if we can use cached data for this directory
+                    if use_cache and prev_scan_id and _dir_unchanged(current_dir, prev_scan_id):
+                        # Reuse the cached listing for this directory. Buffer the
+                        # children and commit only on success so a mid-read failure
+                        # cannot double-count stats or enqueue duplicate entries.
+                        try:
+                            cached_children = pix.get_cached_directory(prev_scan_id, str(current_dir)) or []
+                            new_dirs = []
+                            new_files = []
+                            for child_name in cached_children:
+                                child_path = current_dir / child_name
+                                if child_path.exists():
+                                    if child_path.is_dir():
+                                        new_dirs.append(child_path)
+                                    else:
+                                        new_files.append(child_path)
+                            dirs_to_scan.extend(new_dirs)
+                            items_files.extend(new_files)
+                            cache_hits += 1
+                            # ensure parent chain exists
+                            parent = current_dir.parent
+                            while parent and parent != parent.parent and str(parent).startswith(str(base)):
+                                items_dirs.add(parent)
+                                if parent == base:
+                                    break
+                                parent = parent.parent
+                        except Exception:
+                            cache_misses += 1
+                            # Fallback to filesystem scan for this directory
+                            for child in current_dir.iterdir():
+                                try:
+                                    if child.is_dir():
+                                        dirs_to_scan.append(child)
+                                    else:
+                                        items_files.append(child)
+                                except Exception:
+                                    continue
+                    else:
+                        # Filesystem scan for this directory
+                        if use_cache:
+                            cache_misses += 1
+ try:
+ for child in current_dir.iterdir():
+ try:
+ if child.is_dir():
+ dirs_to_scan.append(child)
+ else:
+ items_files.append(child)
+ # ensure parent chain exists
+ parent = child.parent
+ while parent and parent != parent.parent and str(parent).startswith(str(base)):
+ items_dirs.add(parent)
+ if parent == base:
+ break
+ parent = parent.parent
+ except Exception:
+ continue
+ except Exception:
+ continue
else:
for p in base.iterdir():
try:
@@ -1074,6 +1153,31 @@ def _row_from_local(pth: Path, typ: str) -> tuple:
for fpath in items_files:
rows.append(_row_from_local(fpath, 'file'))
ingested = pix.batch_insert_files(rows)
+
+ # Populate scan_items and directory_cache for selective scanning optimization
+ try:
+ scan_item_rows = []
+ dir_cache_map = {} # path -> list of children names
+            for row in rows:
+                # row format: (path, parent, name, depth, type, size, mtime, ext, mime, etag, hash, remote, scan_id, extra)
+                full_path, parent, name, depth, typ, size, mtime, ext, mime, etag, ahash, remote, _, extra = row
+                # Build scan_items row: (path, type, size, modified_time, file_extension, mime_type, etag, hash, extra_json)
+                scan_item_rows.append((full_path, typ, size, mtime, ext, mime, etag, ahash, extra))
+                # Build directory cache: track children per parent directory
+                if typ != 'file':
+                    # Ensure empty directories still get a (empty) cache entry
+                    dir_cache_map.setdefault(full_path, [])
+                if parent:
+                    dir_cache_map.setdefault(parent, []).append(name)
+
+ # Insert scan_items
+ if scan_item_rows:
+ pix.record_scan_items(scan_id, scan_item_rows)
+
+ # Insert directory_cache for each directory
+ for dir_path, children_names in dir_cache_map.items():
+ pix.cache_directory_listing(scan_id, dir_path, children_names)
+ except Exception as cache_err:
+ # Non-fatal: log but continue
+ app.extensions['scidk'].setdefault('telemetry', {})['last_cache_error'] = str(cache_err)
+
# Also create in-memory datasets (keep legacy behavior)
count = 0
for fpath in items_files:
@@ -1354,6 +1458,12 @@ def _add_folder(full_path: str, name: str, parent: str):
'source': app.extensions['scidk'].get('interpreters', {}).get('source', 'default'),
}
},
+ 'cache_stats': {
+ 'enabled': use_cache,
+ 'prev_scan_id': prev_scan_id,
+ 'cache_hits': cache_hits,
+ 'cache_misses': cache_misses,
+ },
}
scans = app.extensions['scidk'].setdefault('scans', {})
scans[scan_id] = scan
@@ -1400,7 +1510,23 @@ def _add_folder(full_path: str, name: str, parent: str):
'root_label': root_label,
})
drec.setdefault('scan_ids', []).append(scan_id)
- return jsonify({"status": "ok", "scan_id": scan_id, "scanned": count, "folder_count": len(folders), "ingested_rows": int(ingested), "duration_sec": duration, "path": str(path), "recursive": bool(recursive), "provider_id": provider_id}), 200
+ return jsonify({
+ "status": "ok",
+ "scan_id": scan_id,
+ "scanned": count,
+ "folder_count": len(folders),
+ "ingested_rows": int(ingested),
+ "duration_sec": duration,
+ "path": str(path),
+ "recursive": bool(recursive),
+ "provider_id": provider_id,
+ "cache_stats": {
+ 'enabled': use_cache,
+ 'prev_scan_id': prev_scan_id,
+ 'cache_hits': cache_hits,
+ 'cache_misses': cache_misses,
+ }
+ }), 200
except Exception as e:
return jsonify({"status": "error", "error": str(e)}), 400
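For reference, the new `cache_stats` block in the `/api/scan` response can be consumed as in this minimal sketch, using Flask's test client the same way the tests below do; the scanned path here is hypothetical:

```python
from scidk.app import create_app

app = create_app()
app.config['TESTING'] = True

with app.test_client() as client:
    resp = client.post('/api/scan', json={
        'path': '/data/project',   # hypothetical local directory
        'recursive': True,
        'provider_id': 'local_fs',
    })
    body = resp.get_json()
    stats = body.get('cache_stats', {})
    print('cache enabled:', stats.get('enabled'))
    print('hits/misses:', stats.get('cache_hits'), '/', stats.get('cache_misses'))
```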
diff --git a/scidk/core/path_index_sqlite.py b/scidk/core/path_index_sqlite.py
index 86ee6cc..40f4270 100644
--- a/scidk/core/path_index_sqlite.py
+++ b/scidk/core/path_index_sqlite.py
@@ -338,3 +338,150 @@ def apply_basic_change_history(scan_id: str, target_root: str) -> dict:
return {"created": int(created), "modified": int(modified), "deleted": int(deleted)}
finally:
conn.close()
+
+
+def record_scan_items(scan_id: str, rows: Iterable[Tuple], batch_size: int = 10000) -> int:
+ """
+ Record scan items into scan_items table for caching.
+ Rows: (path, type, size, modified_time, file_extension, mime_type, etag, hash, extra_json)
+ Returns total inserted.
+ """
+ from .migrations import migrate
+ conn = connect()
+ migrate(conn)
+ total = 0
+ try:
+ cur = conn.cursor()
+ buf: List[Tuple] = []
+ for r in rows:
+ # Expand row to match scan_items schema
+ buf.append((scan_id,) + r)
+ if len(buf) >= batch_size:
+ cur.executemany(
+ """INSERT INTO scan_items(scan_id, path, type, size, modified_time, file_extension, mime_type, etag, hash, extra_json)
+ VALUES (?,?,?,?,?,?,?,?,?,?)""",
+ buf,
+ )
+ conn.commit()
+ total += len(buf)
+ buf.clear()
+ if buf:
+ cur.executemany(
+ """INSERT INTO scan_items(scan_id, path, type, size, modified_time, file_extension, mime_type, etag, hash, extra_json)
+ VALUES (?,?,?,?,?,?,?,?,?,?)""",
+ buf,
+ )
+ conn.commit()
+ total += len(buf)
+ return total
+ finally:
+ conn.close()
+
+
+def cache_directory_listing(scan_id: str, dir_path: str, children: List[str]) -> None:
+ """
+ Cache directory listing in directory_cache table.
+ children: list of child file/folder names (not full paths)
+ """
+ import json
+ import time
+ from .migrations import migrate
+ conn = connect()
+ migrate(conn)
+ try:
+ children_json = json.dumps(children)
+ created = time.time()
+ conn.execute(
+ """INSERT OR REPLACE INTO directory_cache(scan_id, path, children_json, created)
+ VALUES (?,?,?,?)""",
+ (scan_id, dir_path, children_json, created)
+ )
+ conn.commit()
+ finally:
+ conn.close()
+
+
+def get_cached_directory(scan_id: str, dir_path: str) -> Optional[List[str]]:
+ """
+ Retrieve cached directory listing from directory_cache.
+ Returns list of child names or None if not cached.
+ """
+ import json
+ from .migrations import migrate
+ conn = connect()
+ migrate(conn)
+ try:
+ cur = conn.cursor()
+ cur.execute(
+ "SELECT children_json FROM directory_cache WHERE scan_id=? AND path=?",
+ (scan_id, dir_path)
+ )
+ row = cur.fetchone()
+ if not row:
+ return None
+ try:
+ return json.loads(row[0] or "[]")
+ except Exception:
+ return None
+ finally:
+ conn.close()
+
+
+def get_previous_scan_for_path(path: str) -> Optional[str]:
+ """
+ Find the most recent scan_id that includes this path.
+ Returns scan_id or None.
+ """
+    from .migrations import migrate
+    conn = connect()
+    migrate(conn)  # scan_items is created by migrations, same as the other helpers
+ try:
+ cur = conn.cursor()
+ # Try scan_items first (more structured)
+ cur.execute(
+ "SELECT scan_id FROM scan_items WHERE path=? ORDER BY rowid DESC LIMIT 1",
+ (path,)
+ )
+ row = cur.fetchone()
+ if row:
+ return row[0]
+ # Fallback to files table
+ cur.execute(
+ "SELECT scan_id FROM files WHERE path=? ORDER BY rowid DESC LIMIT 1",
+ (path,)
+ )
+ row = cur.fetchone()
+ return row[0] if row else None
+ finally:
+ conn.close()
+
+
+def get_scan_item(scan_id: str, path: str) -> Optional[Dict]:
+ """
+ Retrieve scan item metadata from scan_items table.
+ Returns dict with path, type, size, modified_time, hash, etc. or None.
+ """
+ from .migrations import migrate
+ conn = connect()
+ migrate(conn)
+ try:
+ cur = conn.cursor()
+ cur.execute(
+ """SELECT path, type, size, modified_time, file_extension, mime_type, etag, hash
+ FROM scan_items WHERE scan_id=? AND path=?""",
+ (scan_id, path)
+ )
+ row = cur.fetchone()
+ if not row:
+ return None
+ return {
+ 'path': row[0],
+ 'type': row[1],
+ 'size': row[2],
+ 'modified_time': row[3],
+ 'file_extension': row[4],
+ 'mime_type': row[5],
+ 'etag': row[6],
+ 'hash': row[7],
+ }
+ finally:
+ conn.close()
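The `scan_items` and `directory_cache` tables these helpers rely on are created by `scidk.core.migrations`, which is not part of this diff. A schema consistent with the INSERT/SELECT statements above would look like the sketch below; the column types and constraints are inferred, so treat them as an assumption. Note that `directory_cache` needs a uniqueness constraint on `(scan_id, path)` for the `INSERT OR REPLACE` in `cache_directory_listing` to overwrite rather than append:

```python
import sqlite3

# Assumed DDL, inferred from the statements in this diff; the real schema
# lives in scidk.core.migrations.
DDL = """
CREATE TABLE IF NOT EXISTS scan_items (
    scan_id         TEXT NOT NULL,
    path            TEXT NOT NULL,
    type            TEXT,
    size            INTEGER,
    modified_time   REAL,
    file_extension  TEXT,
    mime_type       TEXT,
    etag            TEXT,
    hash            TEXT,
    extra_json      TEXT
);
CREATE TABLE IF NOT EXISTS directory_cache (
    scan_id       TEXT NOT NULL,
    path          TEXT NOT NULL,
    children_json TEXT,
    created       REAL,
    PRIMARY KEY (scan_id, path)  -- lets INSERT OR REPLACE overwrite, not duplicate
);
"""

conn = sqlite3.connect(':memory:')
conn.executescript(DDL)
conn.close()
```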
diff --git a/scidk/ui/templates/datasets.html b/scidk/ui/templates/datasets.html
index b2f4c27..e058ade 100644
--- a/scidk/ui/templates/datasets.html
+++ b/scidk/ui/templates/datasets.html
@@ -47,19 +47,20 @@ Files
         Select a provider, root, and item to see details.
-        <form id="prov-scan-form">
-          <label><input type="checkbox" id="prov-scan-recursive"> Recursive</label>
-          <button type="submit">Scan</button>
-        </form>
-        <span id="prov-scan-msg"></span>
+        <div>
+          <strong>Current Location:</strong>
+          <span id="prov-current-path">No folder selected</span>
+        </div>
+        <button id="prov-scan-btn" disabled>🔍 Scan This Folder</button>
+        <p>
+          Select a folder to enable scanning. All scans run as background tasks with progress tracking.
+        </p>
@@ -236,8 +237,8 @@ Start Background Scan
const provList = document.getElementById('prov-list');
const provCrumb = document.getElementById('prov-crumb');
const provPanel = document.getElementById('prov-panel-content');
- const provScanForm = document.getElementById('prov-scan-form');
- const provScanMsg = document.getElementById('prov-scan-msg');
+ const provScanBtn = document.getElementById('prov-scan-btn');
+ const provCurrentPath = document.getElementById('prov-current-path');
const btnROC = document.getElementById('open-rocrate');
const btnROCClose = document.getElementById('close-rocrate');
const rocFrame = document.getElementById('rocrate-frame');
@@ -342,7 +343,18 @@ Start Background Scan
}).join('');
         provList.innerHTML = rows || '<tr><td>Empty folder.</td></tr>';
attachProvHandlers();
-      } catch(e){ provList.innerHTML = '<tr><td>Browse failed.</td></tr>'; }
+
+ // Update current path display and enable scan button
+ if (provCurrentPath) {
+ provCurrentPath.textContent = fullPath || '(root)';
+ }
+ if (provScanBtn) {
+ provScanBtn.disabled = false;
+ }
+ } catch(e){
+        provList.innerHTML = '<tr><td>Browse failed.</td></tr>';
+ if (provScanBtn) provScanBtn.disabled = true;
+ }
}
function attachProvHandlers(){
@@ -433,75 +445,65 @@ Start Background Scan
if (chkFast) chkFast.addEventListener('change', reBrowse);
if (inpDepth) inpDepth.addEventListener('change', reBrowse);
}
- if (provScanForm){
- provScanForm.addEventListener('submit', async (e) => {
- e.preventDefault();
- // Derive values from UI at submit-time to avoid race with async init
+
+ // Unified scan button handler - uses background tasks
+ if (provScanBtn){
+ provScanBtn.addEventListener('click', async () => {
+ // Get current browse context
const provId = (provSelect && provSelect.value) || currentProv || 'local_fs';
const rootId = (rootSelect && rootSelect.value) || currentRoot || '/';
const inputPath = (provPathInput && provPathInput.value && provPathInput.value.trim()) || '';
const relOrAbs = inputPath || currentPath || '';
- // Compose full scan target for Rclone; keep other providers untouched
+
+ // Compose full scan path
let scanPath = relOrAbs || rootId || '/';
if (provId === 'rclone') {
- scanPath = composePath(provId, rootId, relOrAbs); // e.g., "dropbox:AIPT"
+ scanPath = composePath(provId, rootId, relOrAbs);
}
- currentProv = provId;
- currentRoot = rootId;
- currentPath = relOrAbs;
- if (!provId || !scanPath){ provScanMsg.textContent = 'Select a provider and folder first.'; return; }
- const recursive = document.getElementById('prov-scan-recursive').checked;
- const chkFast = document.getElementById('prov-browse-fast-list');
- const fastList = !!(chkFast && chkFast.checked);
- const btn = provScanForm.querySelector('button[type="submit"]');
-      if (btn) { btn.disabled = true; btn.textContent = 'Scanning…'; }
-      provScanMsg.textContent = `Starting scan for ${scanPath}…`;
- // Add a local pseudo-task so status appears with other progress bars
- const localId = 'provscan-' + Date.now();
- const localTask = { id: localId, type: 'scan', status: 'running', path: scanPath, processed: 0, total: null, progress: 0 };
- try { (window.scidkLocalTasks||[]).push(localTask); } catch(_) { /* ignore */ }
- fetchTasks(); // trigger re-render with local task
+
+ if (!provId || !scanPath){
+ alert('Please select a provider and folder first.');
+ return;
+ }
+
+ // Populate background scan form and trigger it
+ const bgPathInput = document.getElementById('scan-path');
+ const bgRecursive = document.getElementById('scan-recursive');
+ if (bgPathInput) bgPathInput.value = scanPath;
+ if (bgRecursive) bgRecursive.checked = true;
+
+ // Trigger background scan via tasks API (same as "Start Background Scan" button)
+ const recursive = true;
+ provScanBtn.disabled = true;
+ provScanBtn.textContent = 'Starting scan...';
+
try {
- const r = await fetch('/api/scan', { method: 'POST', headers: { 'Content-Type':'application/json' }, body: JSON.stringify({ provider_id: provId, root_id: rootId||'/', path: scanPath, recursive, fast_list: fastList }) });
- const ctype = (r.headers && r.headers.get('content-type')) || '';
- let j = null;
- if (ctype.includes('application/json')){
- try { j = await r.json(); } catch(_) { j = null; }
- } else {
- // Non-JSON response (proxy/gateway error). Read text for clarity
- try { const txt = await r.text(); throw new Error(`HTTP ${r.status}: ${txt}`); } catch(e){ throw e; }
- }
- if (r.ok && j){
- const files = j.scanned || 0;
- const folders = (j.folder_count !== undefined) ? j.folder_count : undefined;
- const dur = j.duration_sec ? (Math.round(j.duration_sec*10)/10+'s') : '';
-        let msg = `Scan complete: ${j.scan_id} - files: ${files}`;
-        if (folders !== undefined) msg += `, folders: ${folders}`;
-        if (files === 0 && folders > 0 && !recursive){ msg += ' - Only folders found. Enable Recursive to include files in subfolders.'; }
- if (dur) msg += ` (${dur})`;
- provScanMsg.textContent = msg;
- // Mark local task completed
- localTask.status = 'completed';
- localTask.processed = files;
- localTask.total = files || localTask.processed;
- localTask.progress = 1;
- localTask.scan_id = j.scan_id;
+ const payload = {
+ type: 'scan',
+ path: scanPath,
+ recursive,
+ provider_id: provId,
+ root_id: rootId
+ };
+ const r = await fetch('/api/tasks', {
+ method: 'POST',
+ headers: { 'Content-Type':'application/json' },
+ body: JSON.stringify(payload)
+ });
+
+ if (r.status === 202){
+ startPolling();
+ fetchTasks();
+ alert(`Background scan started for: ${scanPath}\nCheck progress in "Scans Summary" section below.`);
} else {
- const err = (j && (j.error||j.message)) || `HTTP ${r.status}`;
- provScanMsg.textContent = `Scan error: ${err}`;
- localTask.status = 'error';
- localTask.error = err;
- localTask.progress = 1;
+ const j = await r.json();
+ alert('Scan error: ' + (j.error || r.status));
}
} catch(err){
- provScanMsg.textContent = 'Scan error: ' + err;
- localTask.status = 'error';
- localTask.error = (err && err.message) ? err.message : String(err);
- localTask.progress = 1;
- }
- finally {
- if (btn) { btn.disabled = false; btn.textContent = 'Scan'; }
- fetchTasks(); // refresh tasks view to reflect final status
+ alert('Scan error: ' + err);
+ } finally {
+ provScanBtn.disabled = false;
+        provScanBtn.textContent = '🔍 Scan This Folder';
}
});
}
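The button above assumes the background-task contract that the new e2e tests below also exercise: POST `/api/tasks` answers 202 Accepted with a `task_id`, and GET `/api/tasks/<task_id>` reports progress until `status` becomes `completed`. A minimal polling sketch of that contract, using a test client and a hypothetical directory:

```python
import time
from scidk.app import create_app

app = create_app()
app.config['TESTING'] = True

with app.test_client() as client:
    # Kick off a background scan (202 Accepted + task_id on success)
    r = client.post('/api/tasks', json={
        'type': 'scan',
        'path': '/data/project',   # hypothetical directory
        'recursive': True,
        'provider_id': 'local_fs',
        'root_id': '/',
    })
    assert r.status_code == 202
    task_id = r.get_json()['task_id']

    # Poll until the task completes (or give up after ~10s)
    deadline = time.time() + 10
    while time.time() < deadline:
        task = client.get(f'/api/tasks/{task_id}').get_json()
        if task['status'] == 'completed':
            print('scan_id:', task.get('scan_id'))
            break
        time.sleep(0.5)
```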
diff --git a/tests/test_files_page_e2e.py b/tests/test_files_page_e2e.py
new file mode 100644
index 0000000..6b87ae4
--- /dev/null
+++ b/tests/test_files_page_e2e.py
@@ -0,0 +1,308 @@
+"""
+End-to-end tests for Files page UX workflows.
+
+Validates the consolidated scan functionality and browser-to-scan integration.
+"""
+import time
+from pathlib import Path
+from bs4 import BeautifulSoup
+
+
+def test_files_page_loads_successfully():
+ """Test that the Files page loads without errors."""
+ from scidk.app import create_app
+ app = create_app()
+ app.config['TESTING'] = True
+
+ with app.test_client() as client:
+ resp = client.get('/datasets')
+ assert resp.status_code == 200
+ assert b'Files' in resp.data
+ assert b'Provider' in resp.data
+
+
+def test_scan_button_uses_background_tasks_only():
+ """Verify that the scan button uses /api/tasks, not /api/scan."""
+ from scidk.app import create_app
+ app = create_app()
+ app.config['TESTING'] = True
+
+ with app.test_client() as client:
+ resp = client.get('/datasets')
+ assert resp.status_code == 200
+
+ # Check that the template has the new unified scan button
+ html = resp.data.decode('utf-8')
+ assert 'prov-scan-btn' in html
+        assert '🔍 Scan This Folder' in html
+
+ # Check that the old sync scan form is removed
+ assert 'prov-scan-form' not in html
+ assert 'prov-scan-recursive' not in html # old checkbox removed
+
+
+def test_browse_and_scan_integration(tmp_path: Path):
+ """Test the full workflow: browse folder β scan it via background task."""
+ from scidk.app import create_app
+ app = create_app()
+ app.config['TESTING'] = True
+
+ # Create test directory
+ test_dir = tmp_path / 'test_project'
+ test_dir.mkdir()
+ (test_dir / 'file1.txt').write_text('content1', encoding='utf-8')
+ (test_dir / 'file2.txt').write_text('content2', encoding='utf-8')
+ (test_dir / 'subdir').mkdir()
+ (test_dir / 'subdir' / 'file3.txt').write_text('content3', encoding='utf-8')
+
+ with app.test_client() as client:
+ # Browse the directory
+ browse_resp = client.get(f'/api/browse?provider_id=local_fs&root_id=/&path={str(test_dir)}')
+ assert browse_resp.status_code == 200
+ browse_data = browse_resp.get_json()
+ assert 'entries' in browse_data
+ assert len(browse_data['entries']) >= 3 # 2 files + 1 subdir
+
+ # Trigger scan via background task (unified mechanism)
+ scan_resp = client.post('/api/tasks', json={
+ 'type': 'scan',
+ 'path': str(test_dir),
+ 'recursive': True,
+ 'provider_id': 'local_fs',
+ 'root_id': '/'
+ })
+ assert scan_resp.status_code == 202 # Accepted
+ scan_data = scan_resp.get_json()
+ assert 'task_id' in scan_data
+ task_id = scan_data['task_id']
+
+ # Poll for task completion (max 10 seconds)
+ max_wait = 10
+ start_time = time.time()
+ task_completed = False
+
+ while time.time() - start_time < max_wait:
+ task_resp = client.get(f'/api/tasks/{task_id}')
+ assert task_resp.status_code == 200
+ task_data = task_resp.get_json()
+
+ if task_data['status'] == 'completed':
+ task_completed = True
+ assert task_data['processed'] >= 3
+ break
+
+ time.sleep(0.5)
+
+ assert task_completed, "Scan task did not complete in time"
+
+
+def test_scan_history_unified_display(tmp_path: Path):
+ """Test that all scans appear in unified history."""
+ from scidk.app import create_app
+ app = create_app()
+ app.config['TESTING'] = True
+
+ test_dir = tmp_path / 'scan_test'
+ test_dir.mkdir()
+ (test_dir / 'test.txt').write_text('test', encoding='utf-8')
+
+ with app.test_client() as client:
+ # Create first scan
+ resp1 = client.post('/api/tasks', json={
+ 'type': 'scan',
+ 'path': str(test_dir),
+ 'recursive': True,
+ 'provider_id': 'local_fs',
+ 'root_id': '/'
+ })
+ assert resp1.status_code == 202
+
+ time.sleep(1) # Allow scan to process
+
+ # Get all scans
+ scans_resp = client.get('/api/scans')
+ assert scans_resp.status_code == 200
+ scans = scans_resp.get_json()
+ assert isinstance(scans, list)
+ assert len(scans) >= 1
+
+ # Verify scan appears in summary
+ found = any(s.get('path') == str(test_dir) for s in scans)
+ assert found, "Scan not found in unified history"
+
+
+def test_rclone_scan_with_options():
+ """Test that rclone-specific options are handled correctly."""
+ from scidk.app import create_app
+ app = create_app()
+ app.config['TESTING'] = True
+
+ with app.test_client() as client:
+ # Mock rclone scan with fast-list option
+ # Note: This will fail in test without actual rclone, but validates API contract
+ resp = client.post('/api/tasks', json={
+ 'type': 'scan',
+ 'path': 'dropbox:test',
+ 'recursive': True,
+ 'provider_id': 'rclone',
+ 'root_id': 'dropbox:',
+ 'fast_list': True
+ })
+
+ # Should accept the request format (will fail on execution without rclone)
+ assert resp.status_code in (202, 400, 500) # 202 if rclone available, error otherwise
+
+
+def test_snapshot_browser_after_scan(tmp_path: Path):
+ """Test viewing scan snapshot after completion."""
+ from scidk.app import create_app
+ app = create_app()
+ app.config['TESTING'] = True
+
+ test_dir = tmp_path / 'snapshot_test'
+ test_dir.mkdir()
+ (test_dir / 'data.csv').write_text('col1,col2\n1,2\n', encoding='utf-8')
+
+ with app.test_client() as client:
+ # Perform scan
+ scan_resp = client.post('/api/tasks', json={
+ 'type': 'scan',
+ 'path': str(test_dir),
+ 'recursive': True,
+ 'provider_id': 'local_fs',
+ 'root_id': '/'
+ })
+ assert scan_resp.status_code == 202
+ task_id = scan_resp.get_json()['task_id']
+
+ # Wait for completion
+ max_wait = 10
+ start_time = time.time()
+ scan_id = None
+
+ while time.time() - start_time < max_wait:
+ task_resp = client.get(f'/api/tasks/{task_id}')
+ task_data = task_resp.get_json()
+
+ if task_data['status'] == 'completed':
+ scan_id = task_data.get('scan_id')
+ break
+
+ time.sleep(0.5)
+
+ assert scan_id is not None, "Scan did not complete"
+
+ # Browse snapshot
+ snapshot_resp = client.get(f'/api/scans/{scan_id}/browse')
+ assert snapshot_resp.status_code == 200
+ snapshot_data = snapshot_resp.get_json()
+
+ assert 'entries' in snapshot_data
+ # Should find the data.csv file or parent folder
+ assert len(snapshot_data['entries']) >= 1
+
+
+def test_no_synchronous_scan_in_ui():
+ """Verify that synchronous /api/scan is NOT used by the Files page UI."""
+ from scidk.app import create_app
+ app = create_app()
+ app.config['TESTING'] = True
+
+ with app.test_client() as client:
+ resp = client.get('/datasets')
+ html = resp.data.decode('utf-8')
+
+ # Check that the JavaScript does NOT call /api/scan from provider panel
+ # (it should only use /api/tasks)
+        # Allow at most one incidental mention (e.g., in a comment), but no active call
+        assert html.count("'/api/scan'") <= 1
+
+ # Verify /api/tasks is used instead
+ assert "'/api/tasks'" in html
+
+
+def test_current_location_display_updates():
+ """Test that the 'Current Location' panel updates when browsing."""
+ from scidk.app import create_app
+ app = create_app()
+ app.config['TESTING'] = True
+
+ with app.test_client() as client:
+ resp = client.get('/datasets')
+ html = resp.data.decode('utf-8')
+
+ # Check that current location display exists
+ assert 'prov-current-path' in html
+ assert 'Current Location:' in html
+
+ # Verify scan button is present and starts disabled
+ assert 'prov-scan-btn' in html
+ assert 'disabled' in html # Button should start disabled
+
+
+def test_scan_button_integration_with_background_form():
+ """Test that clicking scan button populates background scan form."""
+ from scidk.app import create_app
+ app = create_app()
+ app.config['TESTING'] = True
+
+ with app.test_client() as client:
+ resp = client.get('/datasets')
+ html = resp.data.decode('utf-8')
+
+ # Verify the scan button handler references background scan form elements
+ assert 'scan-path' in html # Background scan path input
+ assert 'scan-recursive' in html # Background scan recursive checkbox
+
+ # The JavaScript should populate these when scan button is clicked
+ # (Verified by manual testing and code inspection)
+
+
+def test_files_page_structure_consolidated():
+ """Verify that redundant sections have been removed/consolidated."""
+ from scidk.app import create_app
+ app = create_app()
+ app.config['TESTING'] = True
+
+ with app.test_client() as client:
+ resp = client.get('/datasets')
+ html = resp.data.decode('utf-8')
+ soup = BeautifulSoup(html, 'html.parser')
+
+ # Count h2 headings (main sections)
+ sections = soup.find_all('h2')
+ section_titles = [s.get_text() for s in sections]
+
+ # Should have core sections: Files, Snapshot browse, Scans Summary, Background Scan
+ assert 'Files' in section_titles
+ assert 'Snapshot (scanned) browse' in section_titles or 'Snapshot browse' in section_titles
+ assert 'Scans Summary' in section_titles
+ assert 'Start Background Scan' in section_titles
+
+ # Verify old sync scan form is gone
+ old_form = soup.find('form', id='prov-scan-form')
+ assert old_form is None, "Old synchronous scan form still present"
+
+
+def test_provider_selector_and_roots_load():
+ """Test that providers and roots load correctly."""
+ from scidk.app import create_app
+ app = create_app()
+ app.config['TESTING'] = True
+
+ with app.test_client() as client:
+ # Get providers
+ prov_resp = client.get('/api/providers')
+ assert prov_resp.status_code == 200
+ providers = prov_resp.get_json()
+ assert isinstance(providers, list)
+ assert len(providers) > 0
+
+ # Get roots for first provider
+ first_prov = providers[0]['id']
+ roots_resp = client.get(f'/api/provider_roots?provider_id={first_prov}')
+ assert roots_resp.status_code == 200
+ roots = roots_resp.get_json()
+ assert isinstance(roots, list)
diff --git a/tests/test_selective_scan_cache.py b/tests/test_selective_scan_cache.py
new file mode 100644
index 0000000..3f69836
--- /dev/null
+++ b/tests/test_selective_scan_cache.py
@@ -0,0 +1,220 @@
+"""
+Test selective scan cache integration using scan_items and directory_cache.
+"""
+import os
+import time
+from pathlib import Path
+
+
+def test_scan_populates_cache_tables(tmp_path: Path):
+ """Test that scans populate scan_items and directory_cache tables."""
+ from scidk.core import path_index_sqlite as pix
+ from scidk.core.migrations import migrate
+
+ # Setup test directory structure
+ (tmp_path / 'dir1').mkdir()
+ (tmp_path / 'dir1' / 'file1.txt').write_text('content1', encoding='utf-8')
+ (tmp_path / 'dir1' / 'file2.txt').write_text('content2', encoding='utf-8')
+ (tmp_path / 'dir2').mkdir()
+ (tmp_path / 'dir2' / 'file3.txt').write_text('content3', encoding='utf-8')
+
+ # Run a scan via the API
+ from scidk.app import create_app
+ app = create_app()
+ app.config['TESTING'] = True
+
+ with app.test_client() as client:
+ resp = client.post('/api/scan', json={
+ 'path': str(tmp_path),
+ 'recursive': True,
+ 'provider_id': 'local_fs'
+ })
+ assert resp.status_code == 200
+ data = resp.get_json()
+ assert data['status'] == 'ok'
+ scan_id = data['scan_id']
+
+ # Verify scan_items table is populated
+ conn = pix.connect()
+ migrate(conn)
+ cur = conn.cursor()
+ cur.execute("SELECT COUNT(*) FROM scan_items WHERE scan_id=?", (scan_id,))
+ count = cur.fetchone()[0]
+    assert count >= 5  # 2 subdirs + 3 files (the base dir is recorded as well)
+
+ # Verify directory_cache table is populated
+ cur.execute("SELECT COUNT(*) FROM directory_cache WHERE scan_id=?", (scan_id,))
+ cache_count = cur.fetchone()[0]
+ assert cache_count >= 2 # At least tmp_path and dir1/dir2
+ conn.close()
+
+
+def test_rescan_uses_cache(tmp_path: Path):
+ """Test that rescanning unchanged directories uses cache and is faster."""
+ from scidk.app import create_app
+ app = create_app()
+ app.config['TESTING'] = True
+
+ # Create a larger directory structure
+ for i in range(5):
+ subdir = tmp_path / f'dir{i}'
+ subdir.mkdir()
+ for j in range(10):
+ (subdir / f'file{j}.txt').write_text(f'content{i}{j}', encoding='utf-8')
+
+ # First scan (cold)
+ with app.test_client() as client:
+ resp1 = client.post('/api/scan', json={
+ 'path': str(tmp_path),
+ 'recursive': True,
+ 'provider_id': 'local_fs'
+ })
+ assert resp1.status_code == 200
+ data1 = resp1.get_json()
+ duration1 = data1['duration_sec']
+ cache_stats1 = data1.get('cache_stats', {})
+ hits1 = cache_stats1.get('cache_hits', 0)
+
+ # First scan may or may not have cache hits depending on DB state
+ # (if test DB persists, there might be cached data from previous runs)
+
+ time.sleep(0.1) # Small delay to ensure different scan_id
+
+ # Second scan (warm - should use cache)
+ resp2 = client.post('/api/scan', json={
+ 'path': str(tmp_path),
+ 'recursive': True,
+ 'provider_id': 'local_fs'
+ })
+ assert resp2.status_code == 200
+ data2 = resp2.get_json()
+ duration2 = data2['duration_sec']
+ cache_stats2 = data2.get('cache_stats', {})
+ hits2 = cache_stats2.get('cache_hits', 0)
+
+ # Second scan should have at least as many cache hits as first
+ # (directory contents unchanged, so cache should be effective)
+ assert hits2 >= hits1
+
+
+def test_cache_detects_changes(tmp_path: Path):
+ """Test that cache correctly detects when directories change."""
+ from scidk.app import create_app
+ app = create_app()
+ app.config['TESTING'] = True
+
+ # Create initial structure
+ (tmp_path / 'dir1').mkdir()
+ (tmp_path / 'dir1' / 'file1.txt').write_text('content1', encoding='utf-8')
+
+ # First scan
+ with app.test_client() as client:
+ resp1 = client.post('/api/scan', json={
+ 'path': str(tmp_path),
+ 'recursive': True,
+ 'provider_id': 'local_fs'
+ })
+ assert resp1.status_code == 200
+
+ # Add a new file (changes directory)
+ (tmp_path / 'dir1' / 'file2.txt').write_text('content2', encoding='utf-8')
+ time.sleep(0.1)
+
+ # Second scan - cache should miss because directory changed
+ resp2 = client.post('/api/scan', json={
+ 'path': str(tmp_path),
+ 'recursive': True,
+ 'provider_id': 'local_fs'
+ })
+ assert resp2.status_code == 200
+ data2 = resp2.get_json()
+
+ # Should detect the new file
+ assert data2['scanned'] >= 2
+
+
+def test_cache_can_be_disabled(tmp_path: Path):
+ """Test that cache can be disabled via environment variable."""
+ from scidk.app import create_app
+
+ # Create test structure
+ (tmp_path / 'file.txt').write_text('content', encoding='utf-8')
+
+ # Disable cache
+ original = os.environ.get('SCIDK_CACHE_SCAN')
+ try:
+ os.environ['SCIDK_CACHE_SCAN'] = '0'
+ app = create_app()
+ app.config['TESTING'] = True
+
+ with app.test_client() as client:
+ # First scan
+ resp1 = client.post('/api/scan', json={
+ 'path': str(tmp_path),
+ 'recursive': True,
+ 'provider_id': 'local_fs'
+ })
+ assert resp1.status_code == 200
+ data1 = resp1.get_json()
+ cache_stats1 = data1.get('cache_stats', {})
+
+ # Cache should be disabled
+ assert cache_stats1.get('enabled') is False
+
+ time.sleep(0.1)
+
+ # Second scan
+ resp2 = client.post('/api/scan', json={
+ 'path': str(tmp_path),
+ 'recursive': True,
+ 'provider_id': 'local_fs'
+ })
+ assert resp2.status_code == 200
+ data2 = resp2.get_json()
+ cache_stats2 = data2.get('cache_stats', {})
+
+ # Cache should still be disabled
+ assert cache_stats2.get('enabled') is False
+ finally:
+ if original is not None:
+ os.environ['SCIDK_CACHE_SCAN'] = original
+ elif 'SCIDK_CACHE_SCAN' in os.environ:
+ del os.environ['SCIDK_CACHE_SCAN']
+
+
+def test_cache_helpers():
+ """Test cache helper functions in path_index_sqlite."""
+ import hashlib
+ import time
+ from scidk.core import path_index_sqlite as pix
+ from scidk.core.migrations import migrate
+
+ # Use unique scan_id to avoid conflicts
+ scan_id = f"test_scan_{hashlib.sha1(str(time.time()).encode()).hexdigest()[:12]}"
+
+ # Test record_scan_items
+ rows = [
+ ('/tmp/file1.txt', 'file', 100, 1234567890.0, '.txt', 'text/plain', None, 'hash1', None),
+ ('/tmp/file2.txt', 'file', 200, 1234567891.0, '.txt', 'text/plain', None, 'hash2', None),
+ ]
+ inserted = pix.record_scan_items(scan_id, rows)
+ assert inserted == 2
+
+ # Test cache_directory_listing
+ pix.cache_directory_listing(scan_id, '/tmp', ['file1.txt', 'file2.txt'])
+
+ # Test get_cached_directory
+ cached = pix.get_cached_directory(scan_id, '/tmp')
+ assert cached == ['file1.txt', 'file2.txt']
+
+ # Test get_previous_scan_for_path
+ prev = pix.get_previous_scan_for_path('/tmp/file1.txt')
+ assert prev == scan_id
+
+ # Test get_scan_item
+ item = pix.get_scan_item(scan_id, '/tmp/file1.txt')
+ assert item is not None
+ assert item['path'] == '/tmp/file1.txt'
+ assert item['type'] == 'file'
+ assert item['size'] == 100