diff --git a/data_scraper/core/ci_logs_scraper.py b/data_scraper/core/ci_logs_scraper.py
index 3063d84..5497395 100644
--- a/data_scraper/core/ci_logs_scraper.py
+++ b/data_scraper/core/ci_logs_scraper.py
@@ -59,7 +59,7 @@ def record_postprocessing(self, record):
         # pylint: disable=R0801

     def cleanup_records(
-        self, records: list, backup_path: str = "ci_logs_all_data.pickle"
+        self, records: list, backup: bool, backup_path: str
     ) -> list:

         df = pd.DataFrame(records)
@@ -72,8 +72,9 @@ def cleanup_records(
         LOG.info("Records stats AFTER cleanup:")
         LOG.info(df.info())

-        LOG.info("Saving backup to: %s", backup_path)
-        df.to_pickle(backup_path)
+        if backup:
+            LOG.info("Saving backup to: %s", backup_path)
+            df.to_pickle(backup_path)

         return [CILogsRecord(**row) for row in df.to_dict(orient="records")]

diff --git a/data_scraper/core/errata_scraper.py b/data_scraper/core/errata_scraper.py
index 7828833..20e7041 100644
--- a/data_scraper/core/errata_scraper.py
+++ b/data_scraper/core/errata_scraper.py
@@ -149,7 +149,7 @@ def record_postprocessing(self, record):
         pass

     def cleanup_records(
-        self, records: list, backup_path: str = "errata_all_data.pickle"
+        self, records: list, backup: bool, backup_path: str
     ) -> list:

         df = pd.DataFrame(records)
@@ -162,7 +162,8 @@ def cleanup_records(
         LOG.info("Records stats AFTER cleanup:")
         LOG.info(df.info())

-        LOG.info("Saving backup to: %s", backup_path)
-        df.to_pickle(backup_path)
+        if backup:
+            LOG.info("Saving backup to: %s", backup_path)
+            df.to_pickle(backup_path)

         return [ErrataRecord(**row) for row in df.to_dict(orient="records")]
diff --git a/data_scraper/core/scraper.py b/data_scraper/core/scraper.py
index 043b908..089d98f 100644
--- a/data_scraper/core/scraper.py
+++ b/data_scraper/core/scraper.py
@@ -74,6 +74,8 @@ def __init__(self, config: Dict):
             organization="",
             api_key=config["llm_api_key"],
         )
+        self.backup = config["backup"]
+        self.backup_path = config["backup_path"]

     def get_embedding_dimension(self) -> int:
         """Get embedding dimension for the model."""
@@ -142,7 +144,7 @@ def store_records(self,
             self.db_manager.upsert_data(self.config["db_collection_name"], [point])

     def cleanup_records(
-        self, records: list, backup_path: str = "all_data.pickle"
+        self, records: list, backup: bool, backup_path: str
     ) -> list:
         """Cleanup Records"""

@@ -164,7 +166,7 @@ def run(self, record_fields_for_key: tuple[str,...] = ("url",)):
             return

         records = self.get_records(documents)
-        records = self.cleanup_records(records)
+        records = self.cleanup_records(records, backup=self.backup, backup_path=self.backup_path)

         # Process and store embeddings
         self.store_records(records, record_fields_for_key, self.config["recreate_collection"])
@@ -295,7 +297,7 @@ def get_chunks(self, record: dict) -> list[str]:
         return chunks

     def cleanup_records(
-        self, records: list[JiraRecord], backup_path: str = "jira_all_bugs.pickle"
+        self, records: list[JiraRecord], backup: bool, backup_path: str
     ) -> list[JiraRecord]:
         """Cleanup Jira Records"""
         df = pd.DataFrame(records)
@@ -309,8 +311,9 @@ def cleanup_records(
         LOG.info("Jira records stats AFTER cleanup:")
         LOG.info(df.info())

-        LOG.info("Saving backup to: %s", backup_path)
-        df.to_pickle(backup_path)
+        if backup:
+            LOG.info("Saving backup to: %s", backup_path)
+            df.to_pickle(backup_path)

         return [JiraRecord(**row) for row in df.to_dict(orient="records")]

@@ -410,7 +413,7 @@ def get_chunks(self, record: dict) -> list[str]:
         return self.text_processor.split_text(record["text"])

     def cleanup_records(
-        self, records: list[dict], backup_path: str = "osp_all_docs.pickle"
+        self, records: list[dict], backup: bool, backup_path: str
     ) -> list[dict]:
         """Cleanup document records"""
         df = pd.DataFrame(records)
@@ -424,7 +427,8 @@ def cleanup_records(
         LOG.info("Document records stats AFTER cleanup:")
         LOG.info(df.info())

-        LOG.info("Saving backup to: %s", backup_path)
-        df.to_pickle(backup_path)
+        if backup:
+            LOG.info("Saving backup to: %s", backup_path)
+            df.to_pickle(backup_path)

         return [JiraRecord(**row) for row in df.to_dict(orient="records")]
diff --git a/data_scraper/core/solutions_scraper.py b/data_scraper/core/solutions_scraper.py
index fed0b21..a867c1a 100644
--- a/data_scraper/core/solutions_scraper.py
+++ b/data_scraper/core/solutions_scraper.py
@@ -76,7 +76,7 @@ def record_postprocessing(self, record):
         pass

     def cleanup_records(
-        self, records: list, backup_path: str = "solutions_all_data.csv"
+        self, records: list, backup: bool, backup_path: str
     ) -> list:

         df = pd.DataFrame(records)
@@ -87,7 +87,8 @@ def cleanup_records(

         LOG.info("Records stats AFTER cleanup: %d", df.shape[0])

-        LOG.info("Saving backup to: %s", backup_path)
-        df.to_csv(backup_path)
+        if backup:
+            LOG.info("Saving backup to: %s", backup_path)
+            df.to_csv(backup_path)

         return [SolutionsRecord(**row) for row in df.to_dict(orient="records")]
diff --git a/data_scraper/main.py b/data_scraper/main.py
index 4e978ff..47a8fe3 100644
--- a/data_scraper/main.py
+++ b/data_scraper/main.py
@@ -48,6 +48,10 @@ def jira_scraper():
     )
     parser.add_argument("--recreate_collection", action='store_true', default=False,
                         help="Recreate database collection from scratch.")
+    parser.add_argument("--backup", action='store_true', default=False,
+                        help="Save scraper output to a file")
+    parser.add_argument("--backup_path", type=str,
+                        default="all_data.pickle")
     args = parser.parse_args()

     config_args = {
@@ -65,6 +69,8 @@ def jira_scraper():
         "date_cutoff": args.date_cutoff,
         "scraper_processes": args.scraper_processes,
         "recreate_collection": args.recreate_collection,
+        "backup": args.backup,
+        "backup_path": args.backup_path,
     }

     scraper = JiraScraper(config_args)
@@ -100,6 +106,10 @@ def osp_doc_scraper():
     parser.add_argument(
         "--rhoso_docs_path", type=str, default="",
         help="Path to downstream RHOSO docs generated by get_rhoso_plaintext_docs.sh")
+    parser.add_argument("--backup", action='store_true', default=False,
+                        help="Save scraper output to a file")
+    parser.add_argument("--backup_path", type=str,
+                        default="osp_all_docs.pickle")
     args = parser.parse_args()

     config_args = {
@@ -114,6 +124,8 @@ def osp_doc_scraper():
         "osp_version": args.osp_version,
         "recreate_collection": args.recreate_collection,
         "rhoso_docs_path": args.rhoso_docs_path,
+        "backup": args.backup,
+        "backup_path": args.backup_path,
     }

     scraper = OSPDocScraper(config_args)
@@ -156,6 +168,10 @@ def errata_scraper() -> None:
     )
     parser.add_argument("--recreate_collection", action='store_true', default=False,
                         help="Recreate database collection from scratch.")
+    parser.add_argument("--backup", action='store_true', default=False,
+                        help="Save scraper output to a file")
+    parser.add_argument("--backup_path", type=str,
+                        default="errata_all_data.pickle")
     args = parser.parse_args()

     config_args = {
@@ -176,6 +192,8 @@ def errata_scraper() -> None:
         "scraper_processes": args.scraper_processes,
         "date_cutoff": args.date_cutoff,
         "recreate_collection": args.recreate_collection,
+        "backup": args.backup,
+        "backup_path": args.backup_path,
     }

     scraper = ErrataScraper(config_args)
@@ -215,6 +233,10 @@ def ci_logs_scraper() -> None:
                         default=constants.DEFAULT_ZULL_TENANTS)
     parser.add_argument("--populate_db_from_json", type=bool, default=False,
                         help="Used from Zuul jobs that create json file at the end of their runs.")
+    parser.add_argument("--backup", action='store_true', default=False,
+                        help="Save scraper output to a file")
+    parser.add_argument("--backup_path", type=str,
+                        default="ci_logs_all_data.pickle")
     args = parser.parse_args()

     config_args = {
@@ -230,7 +252,9 @@
         "recreate_collection": args.recreate_collection,
         "pipelines": args.pipelines,
         "tenants": args.tenants,
-        "tracebacks_json": "/tmp/tracebacks.json"
+        "tracebacks_json": "/tmp/tracebacks.json",
+        "backup": args.backup,
+        "backup_path": args.backup_path,
     }


@@ -277,6 +301,10 @@
                         default=constants.SOLUTIONS_PRODUCT_NAME)
     parser.add_argument("--recreate_collection", action='store_true', default=False,
                         help="Recreate database collection from scratch.")
+    parser.add_argument("--backup", action='store_true', default=False,
+                        help="Save scraper output to a file")
+    parser.add_argument("--backup_path", type=str,
+                        default="solutions_all_data.csv")
     args = parser.parse_args()

     config_args = {
@@ -292,6 +320,8 @@
         "product_name": args.product_name,
         "max_results": args.max_results,
         "recreate_collection": args.recreate_collection,
+        "backup": args.backup,
+        "backup_path": args.backup_path,
     }

     scraper = SolutionsScraper(config_args)
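The sketch below condenses how the new options are wired together, using only names the diff introduces; the bare ArgumentParser is a stand-in for the per-scraper parsers in main.py, not project code.

    import argparse

    # Each entry point in main.py gains the same pair of options.
    parser = argparse.ArgumentParser()
    parser.add_argument("--backup", action="store_true", default=False,
                        help="Save scraper output to a file")
    parser.add_argument("--backup_path", type=str, default="all_data.pickle")
    args = parser.parse_args()

    # The flags ride along in the config dict handed to the scraper ...
    config_args = {"backup": args.backup, "backup_path": args.backup_path}

    # ... Scraper.__init__ stores them as self.backup / self.backup_path, and
    # run() forwards them, so cleanup_records() writes its pickle/CSV backup
    # only when --backup was passed:
    #     records = self.cleanup_records(records, backup=self.backup,
    #                                    backup_path=self.backup_path)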