From cba1446dd69fc890b44c2271ebdf5cea77e2ecc2 Mon Sep 17 00:00:00 2001
From: David Sariel
Date: Thu, 8 May 2025 20:08:03 +0300
Subject: [PATCH 1/4] Make TEMPEST_TEST_PATTERN less restrictive

And TOBIKO_TEST_PATTERN as well. The previous patterns were too
restrictive:

TEMPEST_TEST_PATTERN = "tempest-tests"
TOBIKO_TEST_PATTERN = "tobiko-tests"

They missed paths like
logs/controller-0/ci-framework-data/tests/test_operator/post-deployment-tempest-tempest//stestr_results.html

After the change the patterns match any path that contains 'tempest-' or
'tobiko-' in the test suite name.
---
 data_scraper/processors/ci_logs_provider.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/data_scraper/processors/ci_logs_provider.py b/data_scraper/processors/ci_logs_provider.py
index 7ec16d0..d1316ec 100644
--- a/data_scraper/processors/ci_logs_provider.py
+++ b/data_scraper/processors/ci_logs_provider.py
@@ -36,8 +36,8 @@
 # Test path constants
 TEST_OPERATOR_PATH = "logs/controller-0/ci-framework-data/tests/test_operator"
-TEMPEST_TEST_PATTERN = "tempest-tests"
-TOBIKO_TEST_PATTERN = "tobiko-tests"
+TEMPEST_TEST_PATTERN = "tempest-"
+TOBIKO_TEST_PATTERN = "tobiko-"
 
 
 async def fetch_with_gssapi(url, params=None, timeout=30.0):
     """
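As an illustration of the intended effect, a minimal sketch follows, assuming the provider matches these patterns with a plain substring check against the log path; the matching code in ci_logs_provider.py is not part of this diff, so treat the snippet as hypothetical usage:

    # Hypothetical illustration; the real matching logic in ci_logs_provider.py is not shown in this series.
    path = ("logs/controller-0/ci-framework-data/tests/test_operator/"
            "post-deployment-tempest-tempest//stestr_results.html")

    old_pattern = "tempest-tests"
    new_pattern = "tempest-"

    print(old_pattern in path)  # False: the old, stricter pattern misses this path
    print(new_pattern in path)  # True: the relaxed pattern matches it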
From 2126b1141d2dc0cd173c5ecb6739ab8a2e09b5cb Mon Sep 17 00:00:00 2001
From: David Sariel
Date: Mon, 12 May 2025 17:55:33 +0300
Subject: [PATCH 2/4] Scraper.run method gets fields to combine record_id from

Previously record_id was based on record['url']. This change makes it
possible to specify which record fields record_id is built from. It is
used in ci_logs_scraper to combine url with test_name.
---
 data_scraper/core/ci_logs_scraper.py |  4 ++--
 data_scraper/core/scraper.py         | 15 +++++++++------
 data_scraper/main.py                 |  2 +-
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/data_scraper/core/ci_logs_scraper.py b/data_scraper/core/ci_logs_scraper.py
index dac2407..3063d84 100644
--- a/data_scraper/core/ci_logs_scraper.py
+++ b/data_scraper/core/ci_logs_scraper.py
@@ -14,7 +14,7 @@ class CILogsRecord(TypedDict):
     """CILogs data point."""
 
     url: str
-    topic: str
+    test_name: str
     text: str
     components: list[str]
     kind: str
@@ -37,7 +37,7 @@ def get_records(self, documents: list[dict]) -> list[CILogsRecord]:
         for document in documents:
             ci_logs_records.append({
                 "url": document["url"],
-                "topic": document["test_name"],
+                "test_name": document["test_name"],
                 "text": document["traceback"],
                 "components": [],
                 "kind": "zuul_jobs",
diff --git a/data_scraper/core/scraper.py b/data_scraper/core/scraper.py
index 0807aaa..0d4cf3c 100644
--- a/data_scraper/core/scraper.py
+++ b/data_scraper/core/scraper.py
@@ -92,7 +92,10 @@ def record_postprocessing(self, record: dict) -> None:
         after it has been created but before storing in vectorDB."""
         raise NotImplementedError
 
-    def store_records(self, records: list, recreate: bool = True) -> None:
+    def store_records(self,
+                      records: list,
+                      record_fields_for_key: tuple[str, ...],
+                      recreate: bool = True) -> None:
         """Process text and store embeddings in database."""
         vector_size = self.get_embedding_dimension()
 
@@ -107,9 +110,9 @@ def store_records(self, records: list, recreate: bool = True) -> None:
             raise IOError
 
         for record in tqdm(records, desc="Processing embeddings"):
-            if record['url']:
-                record_id = str(uuid.uuid5(uuid.NAMESPACE_URL, record["url"]))
-            else:
+            combined_key = "_".join([record[field] for field in record_fields_for_key])
+            record_id = str(uuid.uuid5(uuid.NAMESPACE_URL, combined_key))
+            if not record['url']:
                 LOG.error("Missing required URL field")
                 continue
 
@@ -148,7 +151,7 @@ def get_records(self, documents: List[Dict]) -> list[dict]:
         """Convert raw data into list of dictionaries."""
         raise NotImplementedError
 
-    def run(self):
+    def run(self, record_fields_for_key=("url",)):
         """Main execution method."""
         documents = self.get_documents()
         if not documents:
@@ -159,7 +162,7 @@ def run(self):
         records = self.cleanup_records(records)
 
         # Process and store embeddings
-        self.store_records(records, self.config["recreate_collection"])
+        self.store_records(records, record_fields_for_key, self.config["recreate_collection"])
 
         # Print final stats
         stats = self.db_manager.get_collection_stats(self.config["db_collection_name"])
diff --git a/data_scraper/main.py b/data_scraper/main.py
index 8bfcfc3..7db72fa 100644
--- a/data_scraper/main.py
+++ b/data_scraper/main.py
@@ -248,7 +248,7 @@ def ci_logs_scraper() -> None:
 
     # when json is ready, proceed with tracebacks and store them to QdrantDB
     scraper = CILogsScraper(config_args)
-    scraper.run()
+    scraper.run(("url", "test_name"))
 
 
 def solutions_scraper() -> None:

From e425ad0bce28cb94b32c8ff34605e30a351d911f Mon Sep 17 00:00:00 2001
From: Emilien Macchi
Date: Tue, 13 May 2025 09:27:40 -0400
Subject: [PATCH 3/4] Address comments

---
 data_scraper/core/scraper.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/data_scraper/core/scraper.py b/data_scraper/core/scraper.py
index 0d4cf3c..310a9b7 100644
--- a/data_scraper/core/scraper.py
+++ b/data_scraper/core/scraper.py
@@ -113,6 +113,17 @@ def store_records(self,
             combined_key = "_".join([record[field] for field in record_fields_for_key])
             record_id = str(uuid.uuid5(uuid.NAMESPACE_URL, combined_key))
             if not record['url']:
+                # Check if all required fields for the key are present
+                missing_fields = [
+                    field for field in record_fields_for_key
+                    if field not in record or not record[field]
+                ]
+                if missing_fields:
+                    LOG.error("Missing required fields for key generation: %s", missing_fields)
+                    continue
+
+                combined_key = "_".join([record[field] for field in record_fields_for_key])
+                record_id = str(uuid.uuid5(uuid.NAMESPACE_URL, combined_key))
                 LOG.error("Missing required URL field")
                 continue
 
@@ -151,7 +162,7 @@ def get_records(self, documents: List[Dict]) -> list[dict]:
         """Convert raw data into list of dictionaries."""
         raise NotImplementedError
 
-    def run(self, record_fields_for_key=("url",)):
+    def run(self, record_fields_for_key: tuple[str,...] = ("url",)):
         """Main execution method."""
         documents = self.get_documents()
         if not documents:
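As an aside, a minimal sketch of the record-ID scheme introduced by these two commits, using a made-up record for illustration (the field values are hypothetical and the Scraper class itself is not reproduced here):

    import uuid

    # Hypothetical record; in ci_logs_scraper these values come from the scraped documents.
    record = {"url": "https://example.com/job/123", "test_name": "tempest.api.compute.test_servers"}
    record_fields_for_key = ("url", "test_name")  # what main.py now passes to scraper.run()

    combined_key = "_".join([record[field] for field in record_fields_for_key])
    record_id = str(uuid.uuid5(uuid.NAMESPACE_URL, combined_key))

    # uuid5 is deterministic, so the same url and test_name always yield the same record_id.
    print(record_id)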
= ("url",)): """Main execution method.""" documents = self.get_documents() if not documents: From 9fa89a77f7925ce8aa5df7fd5df62f013cf57e86 Mon Sep 17 00:00:00 2001 From: Lukas Piwowarski Date: Tue, 13 May 2025 17:10:35 +0200 Subject: [PATCH 4/4] Check for missing keys and values --- data_scraper/core/scraper.py | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/data_scraper/core/scraper.py b/data_scraper/core/scraper.py index 310a9b7..043b908 100644 --- a/data_scraper/core/scraper.py +++ b/data_scraper/core/scraper.py @@ -110,22 +110,16 @@ def store_records(self, raise IOError for record in tqdm(records, desc="Processing embeddings"): + missing_fields = [ + field for field in record_fields_for_key + if field not in record or not record[field] + ] + if missing_fields: + LOG.error("Missing required fields for key generation: %s", missing_fields) + continue + combined_key = "_".join([record[field] for field in record_fields_for_key]) record_id = str(uuid.uuid5(uuid.NAMESPACE_URL, combined_key)) - if not record['url']: - # Check if all required fields for the key are present - missing_fields = [ - field for field in record_fields_for_key - if field not in record or not record[field] - ] - if missing_fields: - LOG.error("Missing required fields for key generation: %s", missing_fields) - continue - - combined_key = "_".join([record[field] for field in record_fields_for_key]) - record_id = str(uuid.uuid5(uuid.NAMESPACE_URL, combined_key)) - LOG.error("Missing required URL field") - continue chunks: list[str] = self.get_chunks(record)