From 3743fa72a598968707d687749888ccf3b9028619 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 11 Nov 2025 11:03:54 +0000 Subject: [PATCH 1/6] Initial plan From 875dd2a02009aa303e3f5201b98e40fb7e1359b3 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 11 Nov 2025 11:12:01 +0000 Subject: [PATCH 2/6] Fix vector search, add file content retrieval, markdown rendering, and incremental indexing Co-authored-by: Mte90 <403283+Mte90@users.noreply.github.com> --- ai/analyzer.py | 31 ++++++++- db/models.py | 2 + endpoints/project_endpoints.py | 20 ++++-- endpoints/query_endpoints.py | 7 ++- endpoints/web_endpoints.py | 45 ++++++++++--- services/search_service.py | 14 +++-- templates/index.html | 112 ++++++++++++++++++++++++++++++--- 7 files changed, 199 insertions(+), 32 deletions(-) diff --git a/ai/analyzer.py b/ai/analyzer.py index 2b59993..7789c82 100644 --- a/ai/analyzer.py +++ b/ai/analyzer.py @@ -440,17 +440,42 @@ def analyze_local_path_background(local_path: str, database_path: str, venv_path -def search_semantic(query: str, database_path: str, top_k: int = 5): +def search_semantic(query: str, database_path: str, top_k: int = 5, include_content: bool = True): """ Uses sqlite-vector's vector_full_scan to retrieve best-matching chunks and returns - a list of {file_id, path, chunk_index, score}. + a list of {file_id, path, chunk_index, score, content (optional)}. + + Args: + query: Search query text + database_path: Path to the SQLite database + top_k: Number of results to return + include_content: Whether to retrieve and include the actual chunk text + + Returns: + List of dicts with file_id, path, chunk_index, score, and optionally content """ q_emb = _embedding_client.embed_text(query, file_path="", chunk_index=0) if not q_emb: return [] try: - return _search_vectors(database_path, q_emb, top_k=top_k) + results = _search_vectors(database_path, q_emb, top_k=top_k) + + # If content is requested, retrieve chunk text for each result + if include_content: + for result in results: + try: + chunk_text = _get_chunk_text( + database_path, + result["file_id"], + result["chunk_index"] + ) + result["content"] = chunk_text or "" + except Exception as e: + logger.warning(f"Failed to retrieve chunk text for {result['path']} chunk {result['chunk_index']}: {e}") + result["content"] = "" + + return results except Exception: raise diff --git a/db/models.py b/db/models.py index 41e7608..fb82fc3 100644 --- a/db/models.py +++ b/db/models.py @@ -12,10 +12,12 @@ class CreateProjectRequest(BaseModel): class IndexProjectRequest(BaseModel): project_id: str + incremental: Optional[bool] = True # Default to incremental indexing class QueryRequest(BaseModel): project_id: str query: str top_k: Optional[int] = 5 + include_content: Optional[bool] = True # Whether to include file content in results diff --git a/endpoints/project_endpoints.py b/endpoints/project_endpoints.py index 73e5eb0..4d96007 100644 --- a/endpoints/project_endpoints.py +++ b/endpoints/project_endpoints.py @@ -160,11 +160,12 @@ def api_index_project(http_request: Request, request: IndexProjectRequest, backg Index or re-index a project in the background. - **project_id**: Unique project identifier + - **incremental**: If True (default), only index new/changed files. If False, re-index all files. 
Starts background indexing process: - Scans project directory for code files - Generates embeddings for semantic search - - Uses incremental indexing (skips unchanged files) + - Uses incremental indexing by default (skips unchanged files) Rate limit: 10 requests per minute per IP. @@ -195,20 +196,31 @@ def api_index_project(http_request: Request, request: IndexProjectRequest, backg # Update status to indexing update_project_status(request.project_id, "indexing") - # Start background indexing + # Start background indexing with incremental flag venv_path = CFG.get("venv_path") + incremental = request.incremental if request.incremental is not None else True def index_callback(): try: - analyze_local_path_background(project_path, db_path, venv_path, MAX_FILE_SIZE, CFG) + from ai.analyzer import analyze_local_path_sync + # Use sync version directly with incremental flag + analyze_local_path_sync(project_path, db_path, venv_path, MAX_FILE_SIZE, CFG, incremental=incremental) update_project_status(request.project_id, "ready", datetime.utcnow().isoformat()) except Exception as e: + logger.exception(f"Indexing failed for project {request.project_id}: {e}") update_project_status(request.project_id, "error") raise background_tasks.add_task(index_callback) - return JSONResponse({"status": "indexing", "project_id": request.project_id}) + indexing_type = "incremental" if incremental else "full" + logger.info(f"Started {indexing_type} indexing for project {request.project_id}") + + return JSONResponse({ + "status": "indexing", + "project_id": request.project_id, + "incremental": incremental + }) except Exception as e: logger.exception(f"Error starting project indexing: {e}") return JSONResponse({"error": "Failed to start indexing"}, status_code=500) diff --git a/endpoints/query_endpoints.py b/endpoints/query_endpoints.py index 51674db..f8ba75c 100644 --- a/endpoints/query_endpoints.py +++ b/endpoints/query_endpoints.py @@ -29,16 +29,18 @@ def api_query(http_request: Request, request: QueryRequest): - **project_id**: Unique project identifier - **query**: Search query text - **top_k**: Number of results to return (default: 5, max: 20) + - **include_content**: Whether to include file content in results (default: True) Performs semantic search using vector embeddings: - Generates embedding for query - Finds most similar code chunks - Returns ranked results with scores + - Optionally includes actual file content Rate limit: 100 requests per minute per IP. 
Returns: - - **results**: Array of matching code chunks + - **results**: Array of matching code chunks (with content if requested) - **project_id**: Project identifier - **query**: Original query text """ @@ -58,7 +60,8 @@ def api_query(http_request: Request, request: QueryRequest): project_id=request.project_id, query=request.query, top_k=request.top_k, - use_cache=True + use_cache=True, + include_content=request.include_content if request.include_content is not None else True ) return JSONResponse(result) except ValueError as e: diff --git a/endpoints/web_endpoints.py b/endpoints/web_endpoints.py index b1866ae..279e499 100644 --- a/endpoints/web_endpoints.py +++ b/endpoints/web_endpoints.py @@ -184,24 +184,53 @@ async def code_endpoint(request: Request): # If RAG requested, perform semantic search and build context if use_rag: try: - retrieved = search_semantic(prompt, database_path, top_k=top_k) - # Build context WITHOUT including snippets: only include file references and scores + # Retrieve with content included + retrieved = search_semantic(prompt, database_path, top_k=top_k, include_content=True) + # Build context WITH actual file content for better RAG results context_parts = [] total_len = len(combined_context) for r in retrieved: - part = f"File: {r.get('path')} (score: {r.get('score', 0):.4f})\n" + content = r.get("content", "") + path = r.get("path", "") + score = r.get("score", 0) + + # Include file path, score, and actual content + part = f"File: {path} (score: {score:.4f})\n{content}\n" + if total_len + len(part) > TOTAL_CONTEXT_LIMIT: + # If full content doesn't fit, try to include at least partial content + remaining = TOTAL_CONTEXT_LIMIT - total_len + if remaining > 200: # Only include if we have meaningful space + truncated_content = content[:remaining - 100] + "..." + part = f"File: {path} (score: {score:.4f})\n{truncated_content}\n" + context_parts.append(part) + used_context.append({ + "path": path, + "score": score, + "content": truncated_content, + "file_id": r.get("file_id"), + "chunk_index": r.get("chunk_index") + }) break + context_parts.append(part) total_len += len(part) - used_context.append({"path": r.get("path"), "score": r.get("score")}) + used_context.append({ + "path": path, + "score": score, + "content": content, + "file_id": r.get("file_id"), + "chunk_index": r.get("chunk_index") + }) + if context_parts: - retrieved_text = "\n".join(context_parts) + retrieved_text = "\n---\n".join(context_parts) if combined_context: - combined_context = combined_context + "\n\nRetrieved:\n" + retrieved_text + combined_context = combined_context + "\n\nRetrieved Context:\n" + retrieved_text else: - combined_context = "Retrieved:\n" + retrieved_text - except Exception: + combined_context = "Retrieved Context:\n" + retrieved_text + except Exception as e: + logger.exception(f"RAG search failed: {e}") used_context = [] # Call the coding model with prompt and combined_context diff --git a/services/search_service.py b/services/search_service.py index df71187..588e4dc 100644 --- a/services/search_service.py +++ b/services/search_service.py @@ -24,7 +24,8 @@ def semantic_search( project_id: str, query: str, top_k: int = 5, - use_cache: bool = True + use_cache: bool = True, + include_content: bool = True ) -> Dict[str, Any]: """ Perform semantic search on a project. 
@@ -34,6 +35,7 @@ def semantic_search(
         query: Search query text
         top_k: Number of results to return
         use_cache: Whether to use result caching
+        include_content: Whether to include actual file content in results
 
     Returns:
         Dictionary with results, project_id, and query
@@ -53,8 +55,8 @@
         if stats.get("file_count", 0) == 0:
             raise ValueError(f"Project not indexed: {project_id}")
 
-        # Check cache
-        if use_cache:
+        # Check cache (skipped when content is requested, since cached entries never include chunk text)
+        if use_cache and not include_content:
             cache_key = SearchService._make_cache_key(project_id, query, top_k)
             cached = search_cache.get(cache_key)
             if cached is not None:
@@ -63,7 +65,7 @@
 
         # Perform search
         try:
-            results = search_semantic(query, db_path, top_k=top_k)
+            results = search_semantic(query, db_path, top_k=top_k, include_content=include_content)
 
             response = {
                 "results": results,
@@ -72,8 +74,8 @@
                 "count": len(results)
             }
 
-            # Cache results
-            if use_cache:
+            # Cache results (only if content not included, to keep cache size reasonable)
+            if use_cache and not include_content:
                 search_cache.set(cache_key, response)
 
             logger.info(f"Search completed: {len(results)} results for '{query[:50]}'")
diff --git a/templates/index.html b/templates/index.html
index c9bb355..d0fe043 100644
--- a/templates/index.html
+++ b/templates/index.html
@@ -5,6 +5,10 @@
     <title>PicoCode - Local Codebase Assistant</title>
+
+
+
+
 
 
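
A few usage sketches for the behavior this patch introduces (illustrative notes, not part of the diff).

To exercise the updated retrieval path directly, a minimal sketch; the module path ai.analyzer and the result keys come from this patch, while the database path is a placeholder:

# Minimal smoke test for the updated search_semantic (paths are placeholders).
from ai.analyzer import search_semantic

db_path = "projects/example/index.db"  # hypothetical database location

# With include_content=True (the new default) each hit carries its chunk text.
for hit in search_semantic("where are embeddings generated?", db_path, top_k=3):
    print(f"{hit['path']} [chunk {hit['chunk_index']}] score={hit['score']:.4f}")
    print(hit.get("content", "")[:200])  # content is "" if chunk retrieval failed

# include_content=False restores the old lightweight behavior:
# file references and scores only, which also keeps results cacheable.
refs = search_semantic("where are embeddings generated?", db_path, top_k=3,
                       include_content=False)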
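
At the API level, both new request fields can be checked end to end. The sketch below uses only the standard library; the route paths and port are assumptions, since the patch shows only the handler bodies:

# Hypothetical end-to-end check of the new request fields (route paths and
# port are assumptions; the request fields are what this patch defines).
import json
import urllib.request

BASE = "http://localhost:8000"  # assumed dev server address

def post(path: str, payload: dict) -> dict:
    req = urllib.request.Request(
        BASE + path,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        return json.load(resp)

# Force a full re-index by opting out of the incremental default.
print(post("/api/projects/index", {"project_id": "demo", "incremental": False}))

# Query with content included (the default); top_k is capped at 20 server-side.
result = post("/api/query", {"project_id": "demo", "query": "auth middleware",
                             "top_k": 5, "include_content": True})
for r in result["results"]:
    print(r["path"], r["score"])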
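
The truncation logic in web_endpoints.py is worth restating in isolation: retrieved chunks are appended until the shared character budget is exhausted, and the final chunk is trimmed only when at least ~200 characters of budget remain. A simplified, self-contained restatement; the constant's value is assumed, as the patch does not show it:

# Simplified restatement of the patch's context-budget logic (illustrative only;
# the real handler also records file_id/chunk_index in used_context).
TOTAL_CONTEXT_LIMIT = 8000  # assumed value

def build_context(hits: list[dict], already_used: int = 0) -> str:
    parts, total = [], already_used
    for hit in hits:
        part = f"File: {hit['path']} (score: {hit['score']:.4f})\n{hit['content']}\n"
        if total + len(part) > TOTAL_CONTEXT_LIMIT:
            remaining = TOTAL_CONTEXT_LIMIT - total
            if remaining > 200:  # trim the last chunk only if meaningful space is left
                part = (f"File: {hit['path']} (score: {hit['score']:.4f})\n"
                        f"{hit['content'][:remaining - 100]}...\n")
                parts.append(part)
            break
        parts.append(part)
        total += len(part)
    return "\n---\n".join(parts)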
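
One consequence of the search_service.py change deserves a note: the cache key covers only (project_id, query, top_k), so responses are cached and served from cache only when content is excluded; content-bearing queries always hit the vector index. A sketch of that interplay, assuming semantic_search is exposed as a static method on SearchService, as the _make_cache_key call suggests:

# Sketch of the cache interplay at the service layer (the static-method call
# style is an assumption; the project "demo" must already be indexed).
from services.search_service import SearchService

# Content-bearing results bypass the cache on both read and write.
rich = SearchService.semantic_search("demo", "token refresh", top_k=5,
                                     include_content=True)

# Reference-only results are cacheable: a repeat call with identical
# (project_id, query, top_k) can be served from search_cache.
refs = SearchService.semantic_search("demo", "token refresh", top_k=5,
                                     include_content=False)
refs_again = SearchService.semantic_search("demo", "token refresh", top_k=5,
                                           include_content=False)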