7 changes: 4 additions & 3 deletions data_scraper/core/ci_logs_scraper.py
@@ -59,7 +59,7 @@ def record_postprocessing(self, record):
 
     # pylint: disable=R0801
     def cleanup_records(
-        self, records: list, backup_path: str = "ci_logs_all_data.pickle"
+        self, records: list, backup: bool, backup_path: str
     ) -> list:
         df = pd.DataFrame(records)
 
@@ -72,8 +72,9 @@ def cleanup_records(
         LOG.info("Records stats AFTER cleanup:")
         LOG.info(df.info())
 
-        LOG.info("Saving backup to: %s", backup_path)
-        df.to_pickle(backup_path)
+        if backup:
+            LOG.info("Saving backup to: %s", backup_path)
+            df.to_pickle(backup_path)
 
         return [CILogsRecord(**row) for row in df.to_dict(orient="records")]
 
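The same change repeats in every scraper below: the backup becomes opt-in and the destination path is passed in instead of being a per-class default. A minimal, self-contained sketch of that pattern, assuming pandas is available (the sample record and the drop_duplicates step are illustrative, not taken from the repository):

import logging

import pandas as pd

LOG = logging.getLogger(__name__)


def cleanup_records(records: list, backup: bool, backup_path: str) -> list:
    """Clean up records and optionally persist a backup of the cleaned frame."""
    df = pd.DataFrame(records).drop_duplicates()  # stand-in for the real cleanup steps

    # The backup file is written only when explicitly requested.
    if backup:
        LOG.info("Saving backup to: %s", backup_path)
        df.to_pickle(backup_path)

    return df.to_dict(orient="records")


# Example: nothing is written to disk unless backup=True.
cleaned = cleanup_records(
    [{"url": "https://example.com/log/1"}],
    backup=False,
    backup_path="/tmp/ci_logs_all_data.pickle",
)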
7 changes: 4 additions & 3 deletions data_scraper/core/errata_scraper.py
@@ -149,7 +149,7 @@ def record_postprocessing(self, record):
         pass
 
     def cleanup_records(
-        self, records: list, backup_path: str = "errata_all_data.pickle"
+        self, records: list, backup: bool, backup_path: str
     ) -> list:
         df = pd.DataFrame(records)
 
@@ -162,7 +162,8 @@ def cleanup_records(
         LOG.info("Records stats AFTER cleanup:")
         LOG.info(df.info())
 
-        LOG.info("Saving backup to: %s", backup_path)
-        df.to_pickle(backup_path)
+        if backup:
+            LOG.info("Saving backup to: %s", backup_path)
+            df.to_pickle(backup_path)
 
         return [ErrataRecord(**row) for row in df.to_dict(orient="records")]
20 changes: 12 additions & 8 deletions data_scraper/core/scraper.py
@@ -74,6 +74,8 @@ def __init__(self, config: Dict):
             organization="",
             api_key=config["llm_api_key"],
         )
+        self.backup = config["backup"]
+        self.backup_path = config["backup_path"]
 
     def get_embedding_dimension(self) -> int:
         """Get embedding dimension for the model."""
@@ -142,7 +144,7 @@ def store_records(self,
         self.db_manager.upsert_data(self.config["db_collection_name"], [point])
 
     def cleanup_records(
-        self, records: list, backup_path: str = "all_data.pickle"
+        self, records: list, backup: bool, backup_path: str
     ) -> list:
         """Cleanup Records"""
 
@@ -164,7 +166,7 @@ def run(self, record_fields_for_key: tuple[str,...] = ("url",)):
             return
 
         records = self.get_records(documents)
-        records = self.cleanup_records(records)
+        records = self.cleanup_records(records, backup=self.backup, backup_path=self.backup_path)
 
         # Process and store embeddings
         self.store_records(records, record_fields_for_key, self.config["recreate_collection"])
@@ -295,7 +297,7 @@ def get_chunks(self, record: dict) -> list[str]:
         return chunks
 
     def cleanup_records(
-        self, records: list[JiraRecord], backup_path: str = "jira_all_bugs.pickle"
+        self, records: list[JiraRecord], backup: bool, backup_path: str
     ) -> list[JiraRecord]:
         """Cleanup Jira Records"""
         df = pd.DataFrame(records)
@@ -309,8 +311,9 @@ def cleanup_records(
         LOG.info("Jira records stats AFTER cleanup:")
         LOG.info(df.info())
 
-        LOG.info("Saving backup to: %s", backup_path)
-        df.to_pickle(backup_path)
+        if backup:
+            LOG.info("Saving backup to: %s", backup_path)
+            df.to_pickle(backup_path)
 
         return [JiraRecord(**row) for row in df.to_dict(orient="records")]
 
@@ -410,7 +413,7 @@ def get_chunks(self, record: dict) -> list[str]:
         return self.text_processor.split_text(record["text"])
 
     def cleanup_records(
-        self, records: list[dict], backup_path: str = "osp_all_docs.pickle"
+        self, records: list[dict], backup: bool, backup_path: str
     ) -> list[dict]:
         """Cleanup document records"""
         df = pd.DataFrame(records)
@@ -424,7 +427,8 @@ def cleanup_records(
         LOG.info("Document records stats AFTER cleanup:")
         LOG.info(df.info())
 
-        LOG.info("Saving backup to: %s", backup_path)
-        df.to_pickle(backup_path)
+        if backup:
+            LOG.info("Saving backup to: %s", backup_path)
+            df.to_pickle(backup_path)
 
         return [JiraRecord(**row) for row in df.to_dict(orient="records")]
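Taken together, the base-class hunks mean run() no longer depends on per-method path defaults: the flag and path are read once from the config in __init__ and forwarded to cleanup_records. A rough sketch of that wiring, assuming the config dict always carries both keys (class and method bodies here are trimmed placeholders, not the repository code):

from typing import Dict, List


class ScraperSketch:
    """Illustrative stand-in for the base Scraper's backup wiring."""

    def __init__(self, config: Dict):
        self.config = config
        self.backup = config["backup"]
        self.backup_path = config["backup_path"]

    def cleanup_records(self, records: List, backup: bool, backup_path: str) -> List:
        # Subclasses override this; the signature now takes the flag and the path.
        return records

    def run(self, records: List) -> List:
        # A single call site decides whether any backup file gets written.
        return self.cleanup_records(
            records, backup=self.backup, backup_path=self.backup_path
        )


scraper = ScraperSketch({"backup": False, "backup_path": "all_data.pickle"})
scraper.run([{"url": "https://example.com"}])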
7 changes: 4 additions & 3 deletions data_scraper/core/solutions_scraper.py
@@ -76,7 +76,7 @@ def record_postprocessing(self, record):
         pass
 
     def cleanup_records(
-        self, records: list, backup_path: str = "solutions_all_data.csv"
+        self, records: list, backup: bool, backup_path: str
     ) -> list:
         df = pd.DataFrame(records)
 
@@ -87,7 +87,8 @@ def cleanup_records(
 
         LOG.info("Records stats AFTER cleanup: %d", df.shape[0])
 
-        LOG.info("Saving backup to: %s", backup_path)
-        df.to_csv(backup_path)
+        if backup:
+            LOG.info("Saving backup to: %s", backup_path)
+            df.to_csv(backup_path)
 
         return [SolutionsRecord(**row) for row in df.to_dict(orient="records")]
32 changes: 31 additions & 1 deletion data_scraper/main.py
@@ -48,6 +48,10 @@ def jira_scraper():
     )
     parser.add_argument("--recreate_collection", action='store_true', default=False,
                         help="Recreate database collection from scratch.")
+    parser.add_argument("--backup", action='store_true', default=False,
+                        help="Save a scraper output to file")
+    parser.add_argument("--backup_path", type=str,
+                        default="all_data.pickle")
     args = parser.parse_args()
 
     config_args = {
@@ -65,6 +69,8 @@ def jira_scraper():
         "date_cutoff": args.date_cutoff,
         "scraper_processes": args.scraper_processes,
         "recreate_collection": args.recreate_collection,
+        "backup": args.backup,
+        "backup_path": args.backup_path,
     }
 
     scraper = JiraScraper(config_args)
@@ -100,6 +106,10 @@ def osp_doc_scraper():
     parser.add_argument(
         "--rhoso_docs_path", type=str, default="",
         help="Path to downstream RHOSO docs generated by get_rhoso_plaintext_docs.sh")
+    parser.add_argument("--backup", action='store_true', default=False,
+                        help="Save a scraper output to file")
+    parser.add_argument("--backup_path", type=str,
+                        default="osp_all_docs.pickle")
     args = parser.parse_args()
 
     config_args = {
@@ -114,6 +124,8 @@ def osp_doc_scraper():
         "osp_version": args.osp_version,
         "recreate_collection": args.recreate_collection,
         "rhoso_docs_path": args.rhoso_docs_path,
+        "backup": args.backup,
+        "backup_path": args.backup_path,
     }
 
     scraper = OSPDocScraper(config_args)
@@ -156,6 +168,10 @@ def errata_scraper() -> None:
     )
     parser.add_argument("--recreate_collection", action='store_true', default=False,
                         help="Recreate database collection from scratch.")
+    parser.add_argument("--backup", action='store_true', default=False,
+                        help="Save a scraper output to file")
+    parser.add_argument("--backup_path", type=str,
+                        default="errata_all_data.pickle")
     args = parser.parse_args()
 
     config_args = {
@@ -176,6 +192,8 @@ def errata_scraper() -> None:
         "scraper_processes": args.scraper_processes,
         "date_cutoff": args.date_cutoff,
         "recreate_collection": args.recreate_collection,
+        "backup": args.backup,
+        "backup_path": args.backup_path,
     }
 
     scraper = ErrataScraper(config_args)
@@ -215,6 +233,10 @@ def ci_logs_scraper() -> None:
                         default=constants.DEFAULT_ZULL_TENANTS)
     parser.add_argument("--populate_db_from_json", type=bool, default=False,
                         help="Used from Zuul jobs that create json file at the end of their runs.")
+    parser.add_argument("--backup", action='store_true', default=False,
+                        help="Save a scraper output to file")
+    parser.add_argument("--backup_path", type=str,
+                        default="aci_logs_all_data.pickle")
     args = parser.parse_args()
 
     config_args = {
@@ -230,7 +252,9 @@ def ci_logs_scraper() -> None:
         "recreate_collection": args.recreate_collection,
         "pipelines": args.pipelines,
         "tenants": args.tenants,
-        "tracebacks_json": "/tmp/tracebacks.json"
+        "tracebacks_json": "/tmp/tracebacks.json",
+        "backup": args.backup,
+        "backup_path": args.backup_path,
     }
 
 
@@ -277,6 +301,10 @@ def solutions_scraper() -> None:
                         default=constants.SOLUTIONS_PRODUCT_NAME)
     parser.add_argument("--recreate_collection", action='store_true', default=False,
                         help="Recreate database collection from scratch.")
+    parser.add_argument("--backup", action='store_true', default=False,
+                        help="Save a scraper output to file")
+    parser.add_argument("--backup_path", type=str,
+                        default="solutions_all_data.csv")
     args = parser.parse_args()
 
     config_args = {
@@ -292,6 +320,8 @@ def solutions_scraper() -> None:
         "product_name": args.product_name,
         "max_results": args.max_results,
         "recreate_collection": args.recreate_collection,
+        "backup": args.backup,
+        "backup_path": args.backup_path,
     }
 
     scraper = SolutionsScraper(config_args)
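Each entry point gains the same two options, so a backup is only written when --backup is passed, and --backup_path can override the per-scraper default file name. A hedged sketch of how the parsed flags land in the config dict, assuming Python's standard argparse behavior (the prog name and the simulated command line are examples only):

import argparse

parser = argparse.ArgumentParser(prog="example_scraper")
parser.add_argument("--backup", action="store_true", default=False,
                    help="Save a scraper output to file")
parser.add_argument("--backup_path", type=str,
                    default="all_data.pickle")

# Simulate a CLI invocation that requests a backup at a custom location.
args = parser.parse_args(["--backup", "--backup_path", "/tmp/all_data.pickle"])

config_args = {
    "backup": args.backup,            # False unless --backup is given
    "backup_path": args.backup_path,  # pickle/CSV destination used by cleanup_records
}
print(config_args)  # {'backup': True, 'backup_path': '/tmp/all_data.pickle'}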