diff --git a/dev b/dev index 48be8a2..1a282ab 160000 --- a/dev +++ b/dev @@ -1 +1 @@ -Subproject commit 48be8a29f0a78ba17aff96be116a62b7c8f7976f +Subproject commit 1a282ab38d94ea7f3fd12ec2f10580e6a40cc3ae diff --git a/scidk/app.py b/scidk/app.py index 81a0215..cda9048 100644 --- a/scidk/app.py +++ b/scidk/app.py @@ -972,6 +972,12 @@ def api_scan(): folders = [] files_skipped = 0 files_hashed = 0 + # Cache variables (initialized for all providers) + use_cache = False + prev_scan_id = None + cache_hits = 0 + cache_misses = 0 + if provider_id in ('local_fs', 'mounted_fs'): # Local/Mounted: enumerate filesystem and ingest into SQLite index base = Path(path) @@ -991,25 +997,98 @@ def api_scan(): fs.last_scan_source = 'python' except Exception: fs.last_scan_source = 'python' + + # Cache-aware traversal optimization: check for previous scan and reuse if unchanged + use_cache = os.environ.get('SCIDK_CACHE_SCAN', '1').strip() in ('1', 'true', 'yes', 'on') + + if use_cache: + try: + prev_scan_id = pix.get_previous_scan_for_path(str(base)) + except Exception: + prev_scan_id = None + + def _dir_unchanged(dir_path: Path, prev_sid: Optional[str]) -> bool: + """Check if directory listing hasn't changed since previous scan.""" + if not prev_sid: + return False + try: + cached_children = pix.get_cached_directory(prev_sid, str(dir_path)) + if cached_children is None: + return False + # Check if current directory listing matches cache + current_children = set() + for child in dir_path.iterdir(): + current_children.add(child.name) + return set(cached_children) == current_children + except Exception: + return False + try: if recursive: - for p in base.rglob('*'): - try: - if p.is_dir(): - items_dirs.add(p) - else: - items_files.append(p) - # ensure parent chain exists in dirs set - parent = p.parent - while parent and parent != parent.parent and str(parent).startswith(str(base)): - items_dirs.add(parent) - if parent == base: - break - parent = parent.parent - except Exception: + # For recursive scans, use cache-aware traversal + dirs_to_scan = [base] + visited = set() + + while dirs_to_scan: + current_dir = dirs_to_scan.pop(0) + if str(current_dir) in visited: continue - # include base itself as a folder - items_dirs.add(base) + visited.add(str(current_dir)) + items_dirs.add(current_dir) + + # Check if we can use cached data for this directory + if use_cache and prev_scan_id and _dir_unchanged(current_dir, prev_scan_id): + # Use cached scan_items for this directory subtree + cache_hits += 1 + try: + cached_children = pix.get_cached_directory(prev_scan_id, str(current_dir)) + if cached_children: + for child_name in cached_children: + child_path = current_dir / child_name + if child_path.exists(): + if child_path.is_dir(): + dirs_to_scan.append(child_path) + else: + items_files.append(child_path) + # ensure parent chain exists + parent = current_dir.parent + while parent and parent != parent.parent and str(parent).startswith(str(base)): + items_dirs.add(parent) + if parent == base: + break + parent = parent.parent + except Exception: + cache_misses += 1 + # Fallback to filesystem scan for this directory + for child in current_dir.iterdir(): + try: + if child.is_dir(): + dirs_to_scan.append(child) + else: + items_files.append(child) + except Exception: + continue + else: + # Filesystem scan for this directory + cache_misses += 1 + try: + for child in current_dir.iterdir(): + try: + if child.is_dir(): + dirs_to_scan.append(child) + else: + items_files.append(child) + # ensure parent chain exists + parent = 
child.parent + while parent and parent != parent.parent and str(parent).startswith(str(base)): + items_dirs.add(parent) + if parent == base: + break + parent = parent.parent + except Exception: + continue + except Exception: + continue else: for p in base.iterdir(): try: @@ -1074,6 +1153,31 @@ def _row_from_local(pth: Path, typ: str) -> tuple: for fpath in items_files: rows.append(_row_from_local(fpath, 'file')) ingested = pix.batch_insert_files(rows) + + # Populate scan_items and directory_cache for selective scanning optimization + try: + scan_item_rows = [] + dir_cache_map = {} # path -> list of children names + for row in rows: + # row format: (path, parent, name, depth, type, size, mtime, ext, mime, etag, hash, remote, scan_id, extra) + full_path, parent, name, depth, typ, size, mtime, ext, mime, etag, ahash, remote, _, extra = row + # Build scan_items row: (path, type, size, modified_time, file_extension, mime_type, etag, hash, extra_json) + scan_item_rows.append((full_path, typ, size, mtime, ext, mime, etag, ahash, extra)) + # Build directory cache: track children per parent directory + if parent: + dir_cache_map.setdefault(parent, []).append(name) + + # Insert scan_items + if scan_item_rows: + pix.record_scan_items(scan_id, scan_item_rows) + + # Insert directory_cache for each directory + for dir_path, children_names in dir_cache_map.items(): + pix.cache_directory_listing(scan_id, dir_path, children_names) + except Exception as cache_err: + # Non-fatal: log but continue + app.extensions['scidk'].setdefault('telemetry', {})['last_cache_error'] = str(cache_err) + # Also create in-memory datasets (keep legacy behavior) count = 0 for fpath in items_files: @@ -1354,6 +1458,12 @@ def _add_folder(full_path: str, name: str, parent: str): 'source': app.extensions['scidk'].get('interpreters', {}).get('source', 'default'), } }, + 'cache_stats': { + 'enabled': use_cache, + 'prev_scan_id': prev_scan_id, + 'cache_hits': cache_hits, + 'cache_misses': cache_misses, + }, } scans = app.extensions['scidk'].setdefault('scans', {}) scans[scan_id] = scan @@ -1400,7 +1510,23 @@ def _add_folder(full_path: str, name: str, parent: str): 'root_label': root_label, }) drec.setdefault('scan_ids', []).append(scan_id) - return jsonify({"status": "ok", "scan_id": scan_id, "scanned": count, "folder_count": len(folders), "ingested_rows": int(ingested), "duration_sec": duration, "path": str(path), "recursive": bool(recursive), "provider_id": provider_id}), 200 + return jsonify({ + "status": "ok", + "scan_id": scan_id, + "scanned": count, + "folder_count": len(folders), + "ingested_rows": int(ingested), + "duration_sec": duration, + "path": str(path), + "recursive": bool(recursive), + "provider_id": provider_id, + "cache_stats": { + 'enabled': use_cache, + 'prev_scan_id': prev_scan_id, + 'cache_hits': cache_hits, + 'cache_misses': cache_misses, + } + }), 200 except Exception as e: return jsonify({"status": "error", "error": str(e)}), 400 diff --git a/scidk/core/path_index_sqlite.py b/scidk/core/path_index_sqlite.py index 86ee6cc..40f4270 100644 --- a/scidk/core/path_index_sqlite.py +++ b/scidk/core/path_index_sqlite.py @@ -338,3 +338,150 @@ def apply_basic_change_history(scan_id: str, target_root: str) -> dict: return {"created": int(created), "modified": int(modified), "deleted": int(deleted)} finally: conn.close() + + +def record_scan_items(scan_id: str, rows: Iterable[Tuple], batch_size: int = 10000) -> int: + """ + Record scan items into scan_items table for caching. 
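+    Used by api_scan to snapshot per-scan file metadata so that later
+    rescans can reuse unchanged directory listings. An illustrative call
+    (values are placeholders, not real data):
+
+        rows = [('/data/a.txt', 'file', 12, 1700000000.0, '.txt',
+                 'text/plain', None, 'deadbeef', None)]
+        record_scan_items('scan-1', rows)
+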
+ Rows: (path, type, size, modified_time, file_extension, mime_type, etag, hash, extra_json) + Returns total inserted. + """ + from .migrations import migrate + conn = connect() + migrate(conn) + total = 0 + try: + cur = conn.cursor() + buf: List[Tuple] = [] + for r in rows: + # Expand row to match scan_items schema + buf.append((scan_id,) + r) + if len(buf) >= batch_size: + cur.executemany( + """INSERT INTO scan_items(scan_id, path, type, size, modified_time, file_extension, mime_type, etag, hash, extra_json) + VALUES (?,?,?,?,?,?,?,?,?,?)""", + buf, + ) + conn.commit() + total += len(buf) + buf.clear() + if buf: + cur.executemany( + """INSERT INTO scan_items(scan_id, path, type, size, modified_time, file_extension, mime_type, etag, hash, extra_json) + VALUES (?,?,?,?,?,?,?,?,?,?)""", + buf, + ) + conn.commit() + total += len(buf) + return total + finally: + conn.close() + + +def cache_directory_listing(scan_id: str, dir_path: str, children: List[str]) -> None: + """ + Cache directory listing in directory_cache table. + children: list of child file/folder names (not full paths) + """ + import json + import time + from .migrations import migrate + conn = connect() + migrate(conn) + try: + children_json = json.dumps(children) + created = time.time() + conn.execute( + """INSERT OR REPLACE INTO directory_cache(scan_id, path, children_json, created) + VALUES (?,?,?,?)""", + (scan_id, dir_path, children_json, created) + ) + conn.commit() + finally: + conn.close() + + +def get_cached_directory(scan_id: str, dir_path: str) -> Optional[List[str]]: + """ + Retrieve cached directory listing from directory_cache. + Returns list of child names or None if not cached. + """ + import json + from .migrations import migrate + conn = connect() + migrate(conn) + try: + cur = conn.cursor() + cur.execute( + "SELECT children_json FROM directory_cache WHERE scan_id=? AND path=?", + (scan_id, dir_path) + ) + row = cur.fetchone() + if not row: + return None + try: + return json.loads(row[0] or "[]") + except Exception: + return None + finally: + conn.close() + + +def get_previous_scan_for_path(path: str) -> Optional[str]: + """ + Find the most recent scan_id that includes this path. + Returns scan_id or None. + """ + conn = connect() + init_db(conn) + try: + cur = conn.cursor() + # Try scan_items first (more structured) + cur.execute( + "SELECT scan_id FROM scan_items WHERE path=? ORDER BY rowid DESC LIMIT 1", + (path,) + ) + row = cur.fetchone() + if row: + return row[0] + # Fallback to files table + cur.execute( + "SELECT scan_id FROM files WHERE path=? ORDER BY rowid DESC LIMIT 1", + (path,) + ) + row = cur.fetchone() + return row[0] if row else None + finally: + conn.close() + + +def get_scan_item(scan_id: str, path: str) -> Optional[Dict]: + """ + Retrieve scan item metadata from scan_items table. + Returns dict with path, type, size, modified_time, hash, etc. or None. + """ + from .migrations import migrate + conn = connect() + migrate(conn) + try: + cur = conn.cursor() + cur.execute( + """SELECT path, type, size, modified_time, file_extension, mime_type, etag, hash + FROM scan_items WHERE scan_id=? 
AND path=?""", + (scan_id, path) + ) + row = cur.fetchone() + if not row: + return None + return { + 'path': row[0], + 'type': row[1], + 'size': row[2], + 'modified_time': row[3], + 'file_extension': row[4], + 'mime_type': row[5], + 'etag': row[6], + 'hash': row[7], + } + finally: + conn.close() diff --git a/scidk/ui/templates/datasets.html b/scidk/ui/templates/datasets.html index b2f4c27..e058ade 100644 --- a/scidk/ui/templates/datasets.html +++ b/scidk/ui/templates/datasets.html @@ -47,19 +47,20 @@

Files

-
-      <div id="prov-scan-msg" class="muted">Select a provider, root, and item to see details.</div>
-      <form id="prov-scan-form">
-        <label>Scan selected folder (recursive):
-          <input type="checkbox" id="prov-scan-recursive" checked>
-        </label>
-        <button type="submit">Scan</button>
-      </form>
+      <div>
+        <strong>Current Location:</strong>
+        <span id="prov-current-path" class="muted">No folder selected</span>
+      </div>
+      <div>
+        <button id="prov-scan-btn" disabled>πŸ” Scan This Folder</button>
+      </div>
+      <div class="muted">Select a folder to enable scanning. All scans run as background tasks with progress tracking.</div>
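The new button introduced here delegates to the background tasks API rather than the synchronous /api/scan endpoint. A minimal sketch of that contract from Python, using the Flask test client the same way the test suite does (the path is a placeholder):

    from scidk.app import create_app

    app = create_app()
    app.config['TESTING'] = True
    with app.test_client() as client:
        # Enqueue a scan; the endpoint replies 202 Accepted with a task id.
        resp = client.post('/api/tasks', json={
            'type': 'scan',
            'path': '/tmp/example',   # placeholder path
            'recursive': True,
            'provider_id': 'local_fs',
            'root_id': '/',
        })
        task_id = resp.get_json()['task_id']
        # Poll until the background worker reports completion.
        status = client.get(f'/api/tasks/{task_id}').get_json()['status']
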
@@ -236,8 +237,8 @@

Start Background Scan

const provList = document.getElementById('prov-list'); const provCrumb = document.getElementById('prov-crumb'); const provPanel = document.getElementById('prov-panel-content'); - const provScanForm = document.getElementById('prov-scan-form'); - const provScanMsg = document.getElementById('prov-scan-msg'); + const provScanBtn = document.getElementById('prov-scan-btn'); + const provCurrentPath = document.getElementById('prov-current-path'); const btnROC = document.getElementById('open-rocrate'); const btnROCClose = document.getElementById('close-rocrate'); const rocFrame = document.getElementById('rocrate-frame'); @@ -342,7 +343,18 @@

Start Background Scan

}).join(''); provList.innerHTML = rows || 'Empty folder.'; attachProvHandlers(); - } catch(e){ provList.innerHTML = 'Browse failed.'; } + + // Update current path display and enable scan button + if (provCurrentPath) { + provCurrentPath.textContent = fullPath || '(root)'; + } + if (provScanBtn) { + provScanBtn.disabled = false; + } + } catch(e){ + provList.innerHTML = 'Browse failed.'; + if (provScanBtn) provScanBtn.disabled = true; + } } function attachProvHandlers(){ @@ -433,75 +445,65 @@

Start Background Scan

if (chkFast) chkFast.addEventListener('change', reBrowse); if (inpDepth) inpDepth.addEventListener('change', reBrowse); } - if (provScanForm){ - provScanForm.addEventListener('submit', async (e) => { - e.preventDefault(); - // Derive values from UI at submit-time to avoid race with async init + + // Unified scan button handler - uses background tasks + if (provScanBtn){ + provScanBtn.addEventListener('click', async () => { + // Get current browse context const provId = (provSelect && provSelect.value) || currentProv || 'local_fs'; const rootId = (rootSelect && rootSelect.value) || currentRoot || '/'; const inputPath = (provPathInput && provPathInput.value && provPathInput.value.trim()) || ''; const relOrAbs = inputPath || currentPath || ''; - // Compose full scan target for Rclone; keep other providers untouched + + // Compose full scan path let scanPath = relOrAbs || rootId || '/'; if (provId === 'rclone') { - scanPath = composePath(provId, rootId, relOrAbs); // e.g., "dropbox:AIPT" + scanPath = composePath(provId, rootId, relOrAbs); } - currentProv = provId; - currentRoot = rootId; - currentPath = relOrAbs; - if (!provId || !scanPath){ provScanMsg.textContent = 'Select a provider and folder first.'; return; } - const recursive = document.getElementById('prov-scan-recursive').checked; - const chkFast = document.getElementById('prov-browse-fast-list'); - const fastList = !!(chkFast && chkFast.checked); - const btn = provScanForm.querySelector('button[type="submit"]'); - if (btn) { btn.disabled = true; btn.textContent = 'Scanning…'; } - provScanMsg.textContent = `Starting scan for ${scanPath}…`; - // Add a local pseudo-task so status appears with other progress bars - const localId = 'provscan-' + Date.now(); - const localTask = { id: localId, type: 'scan', status: 'running', path: scanPath, processed: 0, total: null, progress: 0 }; - try { (window.scidkLocalTasks||[]).push(localTask); } catch(_) { /* ignore */ } - fetchTasks(); // trigger re-render with local task + + if (!provId || !scanPath){ + alert('Please select a provider and folder first.'); + return; + } + + // Populate background scan form and trigger it + const bgPathInput = document.getElementById('scan-path'); + const bgRecursive = document.getElementById('scan-recursive'); + if (bgPathInput) bgPathInput.value = scanPath; + if (bgRecursive) bgRecursive.checked = true; + + // Trigger background scan via tasks API (same as "Start Background Scan" button) + const recursive = true; + provScanBtn.disabled = true; + provScanBtn.textContent = 'Starting scan...'; + try { - const r = await fetch('/api/scan', { method: 'POST', headers: { 'Content-Type':'application/json' }, body: JSON.stringify({ provider_id: provId, root_id: rootId||'/', path: scanPath, recursive, fast_list: fastList }) }); - const ctype = (r.headers && r.headers.get('content-type')) || ''; - let j = null; - if (ctype.includes('application/json')){ - try { j = await r.json(); } catch(_) { j = null; } - } else { - // Non-JSON response (proxy/gateway error). Read text for clarity - try { const txt = await r.text(); throw new Error(`HTTP ${r.status}: ${txt}`); } catch(e){ throw e; } - } - if (r.ok && j){ - const files = j.scanned || 0; - const folders = (j.folder_count !== undefined) ? j.folder_count : undefined; - const dur = j.duration_sec ? 
(Math.round(j.duration_sec*10)/10+'s') : ''; - let msg = `Scan complete: ${j.scan_id} β€” files: ${files}`; - if (folders !== undefined) msg += `, folders: ${folders}`; - if (files === 0 && folders > 0 && !recursive){ msg += ' β€” Only folders found. Enable Recursive to include files in subfolders.'; } - if (dur) msg += ` (${dur})`; - provScanMsg.textContent = msg; - // Mark local task completed - localTask.status = 'completed'; - localTask.processed = files; - localTask.total = files || localTask.processed; - localTask.progress = 1; - localTask.scan_id = j.scan_id; + const payload = { + type: 'scan', + path: scanPath, + recursive, + provider_id: provId, + root_id: rootId + }; + const r = await fetch('/api/tasks', { + method: 'POST', + headers: { 'Content-Type':'application/json' }, + body: JSON.stringify(payload) + }); + + if (r.status === 202){ + startPolling(); + fetchTasks(); + alert(`Background scan started for: ${scanPath}\nCheck progress in "Scans Summary" section below.`); } else { - const err = (j && (j.error||j.message)) || `HTTP ${r.status}`; - provScanMsg.textContent = `Scan error: ${err}`; - localTask.status = 'error'; - localTask.error = err; - localTask.progress = 1; + const j = await r.json(); + alert('Scan error: ' + (j.error || r.status)); } } catch(err){ - provScanMsg.textContent = 'Scan error: ' + err; - localTask.status = 'error'; - localTask.error = (err && err.message) ? err.message : String(err); - localTask.progress = 1; - } - finally { - if (btn) { btn.disabled = false; btn.textContent = 'Scan'; } - fetchTasks(); // refresh tasks view to reflect final status + alert('Scan error: ' + err); + } finally { + provScanBtn.disabled = false; + provScanBtn.textContent = 'πŸ” Scan This Folder'; } }); } diff --git a/tests/test_files_page_e2e.py b/tests/test_files_page_e2e.py new file mode 100644 index 0000000..6b87ae4 --- /dev/null +++ b/tests/test_files_page_e2e.py @@ -0,0 +1,308 @@ +""" +End-to-end tests for Files page UX workflows. + +Validates the consolidated scan functionality and browser-to-scan integration. 
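+
+Run from a local checkout (assumes pytest and beautifulsoup4 are installed):
+
+    pytest tests/test_files_page_e2e.py -v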
+""" +import os +import time +from pathlib import Path +import pytest +from bs4 import BeautifulSoup + + +def test_files_page_loads_successfully(): + """Test that the Files page loads without errors.""" + from scidk.app import create_app + app = create_app() + app.config['TESTING'] = True + + with app.test_client() as client: + resp = client.get('/datasets') + assert resp.status_code == 200 + assert b'Files' in resp.data + assert b'Provider' in resp.data + + +def test_scan_button_uses_background_tasks_only(): + """Verify that the scan button uses /api/tasks, not /api/scan.""" + from scidk.app import create_app + app = create_app() + app.config['TESTING'] = True + + with app.test_client() as client: + resp = client.get('/datasets') + assert resp.status_code == 200 + + # Check that the template has the new unified scan button + html = resp.data.decode('utf-8') + assert 'prov-scan-btn' in html + assert 'πŸ” Scan This Folder' in html + + # Check that the old sync scan form is removed + assert 'prov-scan-form' not in html + assert 'prov-scan-recursive' not in html # old checkbox removed + + +def test_browse_and_scan_integration(tmp_path: Path): + """Test the full workflow: browse folder β†’ scan it via background task.""" + from scidk.app import create_app + app = create_app() + app.config['TESTING'] = True + + # Create test directory + test_dir = tmp_path / 'test_project' + test_dir.mkdir() + (test_dir / 'file1.txt').write_text('content1', encoding='utf-8') + (test_dir / 'file2.txt').write_text('content2', encoding='utf-8') + (test_dir / 'subdir').mkdir() + (test_dir / 'subdir' / 'file3.txt').write_text('content3', encoding='utf-8') + + with app.test_client() as client: + # Browse the directory + browse_resp = client.get(f'/api/browse?provider_id=local_fs&root_id=/&path={str(test_dir)}') + assert browse_resp.status_code == 200 + browse_data = browse_resp.get_json() + assert 'entries' in browse_data + assert len(browse_data['entries']) >= 3 # 2 files + 1 subdir + + # Trigger scan via background task (unified mechanism) + scan_resp = client.post('/api/tasks', json={ + 'type': 'scan', + 'path': str(test_dir), + 'recursive': True, + 'provider_id': 'local_fs', + 'root_id': '/' + }) + assert scan_resp.status_code == 202 # Accepted + scan_data = scan_resp.get_json() + assert 'task_id' in scan_data + task_id = scan_data['task_id'] + + # Poll for task completion (max 10 seconds) + max_wait = 10 + start_time = time.time() + task_completed = False + + while time.time() - start_time < max_wait: + task_resp = client.get(f'/api/tasks/{task_id}') + assert task_resp.status_code == 200 + task_data = task_resp.get_json() + + if task_data['status'] == 'completed': + task_completed = True + assert task_data['processed'] >= 3 + break + + time.sleep(0.5) + + assert task_completed, "Scan task did not complete in time" + + +def test_scan_history_unified_display(tmp_path: Path): + """Test that all scans appear in unified history.""" + from scidk.app import create_app + app = create_app() + app.config['TESTING'] = True + + test_dir = tmp_path / 'scan_test' + test_dir.mkdir() + (test_dir / 'test.txt').write_text('test', encoding='utf-8') + + with app.test_client() as client: + # Create first scan + resp1 = client.post('/api/tasks', json={ + 'type': 'scan', + 'path': str(test_dir), + 'recursive': True, + 'provider_id': 'local_fs', + 'root_id': '/' + }) + assert resp1.status_code == 202 + + time.sleep(1) # Allow scan to process + + # Get all scans + scans_resp = client.get('/api/scans') + assert scans_resp.status_code == 
200 + scans = scans_resp.get_json() + assert isinstance(scans, list) + assert len(scans) >= 1 + + # Verify scan appears in summary + found = any(s.get('path') == str(test_dir) for s in scans) + assert found, "Scan not found in unified history" + + +def test_rclone_scan_with_options(): + """Test that rclone-specific options are handled correctly.""" + from scidk.app import create_app + app = create_app() + app.config['TESTING'] = True + + with app.test_client() as client: + # Mock rclone scan with fast-list option + # Note: This will fail in test without actual rclone, but validates API contract + resp = client.post('/api/tasks', json={ + 'type': 'scan', + 'path': 'dropbox:test', + 'recursive': True, + 'provider_id': 'rclone', + 'root_id': 'dropbox:', + 'fast_list': True + }) + + # Should accept the request format (will fail on execution without rclone) + assert resp.status_code in (202, 400, 500) # 202 if rclone available, error otherwise + + +def test_snapshot_browser_after_scan(tmp_path: Path): + """Test viewing scan snapshot after completion.""" + from scidk.app import create_app + app = create_app() + app.config['TESTING'] = True + + test_dir = tmp_path / 'snapshot_test' + test_dir.mkdir() + (test_dir / 'data.csv').write_text('col1,col2\n1,2\n', encoding='utf-8') + + with app.test_client() as client: + # Perform scan + scan_resp = client.post('/api/tasks', json={ + 'type': 'scan', + 'path': str(test_dir), + 'recursive': True, + 'provider_id': 'local_fs', + 'root_id': '/' + }) + assert scan_resp.status_code == 202 + task_id = scan_resp.get_json()['task_id'] + + # Wait for completion + max_wait = 10 + start_time = time.time() + scan_id = None + + while time.time() - start_time < max_wait: + task_resp = client.get(f'/api/tasks/{task_id}') + task_data = task_resp.get_json() + + if task_data['status'] == 'completed': + scan_id = task_data.get('scan_id') + break + + time.sleep(0.5) + + assert scan_id is not None, "Scan did not complete" + + # Browse snapshot + snapshot_resp = client.get(f'/api/scans/{scan_id}/browse') + assert snapshot_resp.status_code == 200 + snapshot_data = snapshot_resp.get_json() + + assert 'entries' in snapshot_data + # Should find the data.csv file or parent folder + assert len(snapshot_data['entries']) >= 1 + + +def test_no_synchronous_scan_in_ui(): + """Verify that synchronous /api/scan is NOT used by the Files page UI.""" + from scidk.app import create_app + app = create_app() + app.config['TESTING'] = True + + with app.test_client() as client: + resp = client.get('/datasets') + html = resp.data.decode('utf-8') + + # Check that the JavaScript does NOT call /api/scan from provider panel + # (it should only use /api/tasks) + assert "'/api/scan'" not in html or html.count("'/api/scan'") <= 1 + # Allow one mention in comments/strings, but not active code + + # Verify /api/tasks is used instead + assert "'/api/tasks'" in html + + +def test_current_location_display_updates(): + """Test that the 'Current Location' panel updates when browsing.""" + from scidk.app import create_app + app = create_app() + app.config['TESTING'] = True + + with app.test_client() as client: + resp = client.get('/datasets') + html = resp.data.decode('utf-8') + + # Check that current location display exists + assert 'prov-current-path' in html + assert 'Current Location:' in html + + # Verify scan button is present and starts disabled + assert 'prov-scan-btn' in html + assert 'disabled' in html # Button should start disabled + + +def test_scan_button_integration_with_background_form(): + """Test 
that clicking scan button populates background scan form.""" + from scidk.app import create_app + app = create_app() + app.config['TESTING'] = True + + with app.test_client() as client: + resp = client.get('/datasets') + html = resp.data.decode('utf-8') + + # Verify the scan button handler references background scan form elements + assert 'scan-path' in html # Background scan path input + assert 'scan-recursive' in html # Background scan recursive checkbox + + # The JavaScript should populate these when scan button is clicked + # (Verified by manual testing and code inspection) + + +def test_files_page_structure_consolidated(): + """Verify that redundant sections have been removed/consolidated.""" + from scidk.app import create_app + app = create_app() + app.config['TESTING'] = True + + with app.test_client() as client: + resp = client.get('/datasets') + html = resp.data.decode('utf-8') + soup = BeautifulSoup(html, 'html.parser') + + # Count h2 headings (main sections) + sections = soup.find_all('h2') + section_titles = [s.get_text() for s in sections] + + # Should have core sections: Files, Snapshot browse, Scans Summary, Background Scan + assert 'Files' in section_titles + assert 'Snapshot (scanned) browse' in section_titles or 'Snapshot browse' in section_titles + assert 'Scans Summary' in section_titles + assert 'Start Background Scan' in section_titles + + # Verify old sync scan form is gone + old_form = soup.find('form', id='prov-scan-form') + assert old_form is None, "Old synchronous scan form still present" + + +def test_provider_selector_and_roots_load(): + """Test that providers and roots load correctly.""" + from scidk.app import create_app + app = create_app() + app.config['TESTING'] = True + + with app.test_client() as client: + # Get providers + prov_resp = client.get('/api/providers') + assert prov_resp.status_code == 200 + providers = prov_resp.get_json() + assert isinstance(providers, list) + assert len(providers) > 0 + + # Get roots for first provider + first_prov = providers[0]['id'] + roots_resp = client.get(f'/api/provider_roots?provider_id={first_prov}') + assert roots_resp.status_code == 200 + roots = roots_resp.get_json() + assert isinstance(roots, list) diff --git a/tests/test_selective_scan_cache.py b/tests/test_selective_scan_cache.py new file mode 100644 index 0000000..3f69836 --- /dev/null +++ b/tests/test_selective_scan_cache.py @@ -0,0 +1,220 @@ +""" +Test selective scan cache integration using scan_items and directory_cache. 
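+
+Caching is gated by the SCIDK_CACHE_SCAN environment variable (enabled by
+default); set SCIDK_CACHE_SCAN=0 to force a full filesystem traversal.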
+""" +import os +import time +from pathlib import Path +import pytest + + +def test_scan_populates_cache_tables(tmp_path: Path): + """Test that scans populate scan_items and directory_cache tables.""" + from scidk.core import path_index_sqlite as pix + from scidk.core.migrations import migrate + + # Setup test directory structure + (tmp_path / 'dir1').mkdir() + (tmp_path / 'dir1' / 'file1.txt').write_text('content1', encoding='utf-8') + (tmp_path / 'dir1' / 'file2.txt').write_text('content2', encoding='utf-8') + (tmp_path / 'dir2').mkdir() + (tmp_path / 'dir2' / 'file3.txt').write_text('content3', encoding='utf-8') + + # Run a scan via the API + from scidk.app import create_app + app = create_app() + app.config['TESTING'] = True + + with app.test_client() as client: + resp = client.post('/api/scan', json={ + 'path': str(tmp_path), + 'recursive': True, + 'provider_id': 'local_fs' + }) + assert resp.status_code == 200 + data = resp.get_json() + assert data['status'] == 'ok' + scan_id = data['scan_id'] + + # Verify scan_items table is populated + conn = pix.connect() + migrate(conn) + cur = conn.cursor() + cur.execute("SELECT COUNT(*) FROM scan_items WHERE scan_id=?", (scan_id,)) + count = cur.fetchone()[0] + assert count >= 5 # 2 dirs + 3 files + + # Verify directory_cache table is populated + cur.execute("SELECT COUNT(*) FROM directory_cache WHERE scan_id=?", (scan_id,)) + cache_count = cur.fetchone()[0] + assert cache_count >= 2 # At least tmp_path and dir1/dir2 + conn.close() + + +def test_rescan_uses_cache(tmp_path: Path): + """Test that rescanning unchanged directories uses cache and is faster.""" + from scidk.app import create_app + app = create_app() + app.config['TESTING'] = True + + # Create a larger directory structure + for i in range(5): + subdir = tmp_path / f'dir{i}' + subdir.mkdir() + for j in range(10): + (subdir / f'file{j}.txt').write_text(f'content{i}{j}', encoding='utf-8') + + # First scan (cold) + with app.test_client() as client: + resp1 = client.post('/api/scan', json={ + 'path': str(tmp_path), + 'recursive': True, + 'provider_id': 'local_fs' + }) + assert resp1.status_code == 200 + data1 = resp1.get_json() + duration1 = data1['duration_sec'] + cache_stats1 = data1.get('cache_stats', {}) + hits1 = cache_stats1.get('cache_hits', 0) + + # First scan may or may not have cache hits depending on DB state + # (if test DB persists, there might be cached data from previous runs) + + time.sleep(0.1) # Small delay to ensure different scan_id + + # Second scan (warm - should use cache) + resp2 = client.post('/api/scan', json={ + 'path': str(tmp_path), + 'recursive': True, + 'provider_id': 'local_fs' + }) + assert resp2.status_code == 200 + data2 = resp2.get_json() + duration2 = data2['duration_sec'] + cache_stats2 = data2.get('cache_stats', {}) + hits2 = cache_stats2.get('cache_hits', 0) + + # Second scan should have at least as many cache hits as first + # (directory contents unchanged, so cache should be effective) + assert hits2 >= hits1 + + +def test_cache_detects_changes(tmp_path: Path): + """Test that cache correctly detects when directories change.""" + from scidk.app import create_app + app = create_app() + app.config['TESTING'] = True + + # Create initial structure + (tmp_path / 'dir1').mkdir() + (tmp_path / 'dir1' / 'file1.txt').write_text('content1', encoding='utf-8') + + # First scan + with app.test_client() as client: + resp1 = client.post('/api/scan', json={ + 'path': str(tmp_path), + 'recursive': True, + 'provider_id': 'local_fs' + }) + assert resp1.status_code == 
200 + + # Add a new file (changes directory) + (tmp_path / 'dir1' / 'file2.txt').write_text('content2', encoding='utf-8') + time.sleep(0.1) + + # Second scan - cache should miss because directory changed + resp2 = client.post('/api/scan', json={ + 'path': str(tmp_path), + 'recursive': True, + 'provider_id': 'local_fs' + }) + assert resp2.status_code == 200 + data2 = resp2.get_json() + + # Should detect the new file + assert data2['scanned'] >= 2 + + +def test_cache_can_be_disabled(tmp_path: Path): + """Test that cache can be disabled via environment variable.""" + from scidk.app import create_app + + # Create test structure + (tmp_path / 'file.txt').write_text('content', encoding='utf-8') + + # Disable cache + original = os.environ.get('SCIDK_CACHE_SCAN') + try: + os.environ['SCIDK_CACHE_SCAN'] = '0' + app = create_app() + app.config['TESTING'] = True + + with app.test_client() as client: + # First scan + resp1 = client.post('/api/scan', json={ + 'path': str(tmp_path), + 'recursive': True, + 'provider_id': 'local_fs' + }) + assert resp1.status_code == 200 + data1 = resp1.get_json() + cache_stats1 = data1.get('cache_stats', {}) + + # Cache should be disabled + assert cache_stats1.get('enabled') is False + + time.sleep(0.1) + + # Second scan + resp2 = client.post('/api/scan', json={ + 'path': str(tmp_path), + 'recursive': True, + 'provider_id': 'local_fs' + }) + assert resp2.status_code == 200 + data2 = resp2.get_json() + cache_stats2 = data2.get('cache_stats', {}) + + # Cache should still be disabled + assert cache_stats2.get('enabled') is False + finally: + if original is not None: + os.environ['SCIDK_CACHE_SCAN'] = original + elif 'SCIDK_CACHE_SCAN' in os.environ: + del os.environ['SCIDK_CACHE_SCAN'] + + +def test_cache_helpers(): + """Test cache helper functions in path_index_sqlite.""" + import hashlib + import time + from scidk.core import path_index_sqlite as pix + from scidk.core.migrations import migrate + + # Use unique scan_id to avoid conflicts + scan_id = f"test_scan_{hashlib.sha1(str(time.time()).encode()).hexdigest()[:12]}" + + # Test record_scan_items + rows = [ + ('/tmp/file1.txt', 'file', 100, 1234567890.0, '.txt', 'text/plain', None, 'hash1', None), + ('/tmp/file2.txt', 'file', 200, 1234567891.0, '.txt', 'text/plain', None, 'hash2', None), + ] + inserted = pix.record_scan_items(scan_id, rows) + assert inserted == 2 + + # Test cache_directory_listing + pix.cache_directory_listing(scan_id, '/tmp', ['file1.txt', 'file2.txt']) + + # Test get_cached_directory + cached = pix.get_cached_directory(scan_id, '/tmp') + assert cached == ['file1.txt', 'file2.txt'] + + # Test get_previous_scan_for_path + prev = pix.get_previous_scan_for_path('/tmp/file1.txt') + assert prev == scan_id + + # Test get_scan_item + item = pix.get_scan_item(scan_id, '/tmp/file1.txt') + assert item is not None + assert item['path'] == '/tmp/file1.txt' + assert item['type'] == 'file' + assert item['size'] == 100
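
The quickest way to observe the optimization end to end is to run the same scan twice and compare the cache_stats block that /api/scan now returns. A minimal sketch mirroring the tests above (the path is a placeholder):

    import time
    from scidk.app import create_app

    app = create_app()
    app.config['TESTING'] = True
    with app.test_client() as client:
        payload = {'path': '/tmp/example', 'recursive': True, 'provider_id': 'local_fs'}
        first = client.post('/api/scan', json=payload).get_json()
        time.sleep(0.1)  # give the second scan a distinct scan_id
        second = client.post('/api/scan', json=payload).get_json()
        # On an unchanged tree the second pass should report cache hits and
        # carry the first scan's id as prev_scan_id.
        print(first['cache_stats'])
        print(second['cache_stats'])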