10 changes: 10 additions & 0 deletions data_scraper/common/constants.py
@@ -3,13 +3,23 @@
JIRA_COLLECTION_NAME = "rca-knowledge-base"
OSP_DOCS_COLLECTION_NAME = "rca-osp-docs-knowledge-base"
ERRATA_COLLECTION_NAME = "rca-errata"
CI_LOGS_COLLECTION_NAME = "rca-ci"
DEFAULT_EMBEDDING_MODEL = "BAAI/bge-m3"
DEFAULT_JIRA_URL = "https://issues.redhat.com"
DEFAULT_JIRA_PROJECTS = {
"OSP",
"OSPCIX",
"OSPRH",
}
DEFAULT_ZULL_PIPELINES = {
"openstack-uni-jobs-periodic-integration-rhoso18.0-rhel9",
"openstack-uni-update-jobs-periodic-integration-rhoso18.0-rhel9",
"ci-framework-integrity",
}

DEFAULT_ZULL_TENANTS = {
"components-integration",
}
DEFAULT_CHUNK_SIZE = 1024
DEFAULT_MAX_RESULTS = 10000
DEFAULT_DATE_CUTOFF = "2000-01-01T00:00:00Z"
125 changes: 125 additions & 0 deletions data_scraper/core/ci_logs_scraper.py
@@ -0,0 +1,125 @@
"""Code for scraping test operator logs data"""
import logging
from typing import TypedDict, List, Dict
import json

import pandas as pd

from data_scraper.core.scraper import Scraper

LOG = logging.getLogger(__name__)
LOG.setLevel(logging.INFO)


class CILogsRecord(TypedDict):
"""CILogs data point."""
url: str
topic: str
text: str
components: list[str]
kind: str
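# Illustrative CILogsRecord instance, as built by CILogsScraper.get_records()
# below; all values are hypothetical examples, not taken from a real run:
# {
#     "url": "https://logserver.example.com/periodic/job-x",
#     "topic": "tempest.api.compute.servers.test_servers.ServersTest.test_create_server",
#     "text": "Traceback (most recent call last): ...",
#     "components": [],
#     "kind": "zuul_jobs",
# }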

class CILogsScraper(Scraper):
"""Main class for test operator logs scraping and processing."""

def __init__(self, config: dict):
super().__init__(config=config)
self.config = config

def get_documents(self) -> list[dict]:
all_failures = self._load_test_operator_results()

return all_failures

def get_records(self, documents: list[dict]) -> list[CILogsRecord]:
ci_logs_records: list[CILogsRecord] = []

for document in documents:
ci_logs_records.append({
"url": document["url"],
"topic": document["test_name"],
"text": document["traceback"],
"components": [],
"kind": "zuul_jobs",
})

return ci_logs_records

def get_chunks(self, record: dict) -> list[str]:
chunks = []

for field in ["text"]:
chunks += self.text_processor.split_text(record[field])

return chunks

def record_postprocessing(self, record):
# Postprocessing is not required
pass

# pylint: disable=R0801
def cleanup_records(
self, records: list, backup_path: str = "ci_logs_all_data.pickle"
) -> list:
df = pd.DataFrame(records)

LOG.info("Records stats BEFORE cleanup:")
LOG.info(df.info())

df = df.dropna()
df = df.drop_duplicates(subset=["text"])

LOG.info("Records stats AFTER cleanup:")
LOG.info(df.info())

LOG.info("Saving backup to: %s", backup_path)
df.to_pickle(backup_path)

return [CILogsRecord(**row) for row in df.to_dict(orient="records")]

def _get_job_url(self, url: str) -> str:
"""
Extract the job URL from a test report URL by removing everything from 'logs/controller-0'.

Args:
url (str): The complete report URL

Returns:
str: The job URL
"""
split_marker = "logs/controller-0"
if split_marker in url:
base_url = url.split(split_marker)[0]
return base_url.rstrip('/')

return url

def _load_test_operator_results(self) -> List[Dict[str, str]]:
"""
Load test results from the tracebacks JSON file referenced by
self.config["tracebacks_json"].

Returns:
    List of dictionaries, one per failed test, each containing the job URL,
    test name and traceback.
"""
try:
file_name = self.config["tracebacks_json"]
with open(file_name, "r", encoding="utf-8") as f:
tracebacks = json.load(f)

results = []
for report in tracebacks:
job_url = self._get_job_url(report.get("url", "Unknown url"))
for test in report.get("failed_tests", []):
results.append({
"url": job_url,
"test_name": test.get("name", "Unknown test"),
"traceback": test.get("traceback", "No traceback available")
})
return results

except json.JSONDecodeError:
LOG.error("Error parsing tracebacks JSON file: %s", file_name)
return []
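
For reference, here is a minimal sketch of the tracebacks JSON consumed by _load_test_operator_results, written as the equivalent Python structure. Only the keys ("url", "failed_tests", "name", "traceback") are taken from the code above; the URL and test values are purely illustrative:

sample_tracebacks = [
    {
        "url": "https://logserver.example.com/periodic/job-x/logs/controller-0/tempest/report.html",
        "failed_tests": [
            {
                "name": "tempest.api.compute.servers.test_servers.ServersTest.test_create_server",
                "traceback": "Traceback (most recent call last): ...",
            },
        ],
    },
]
# Dumping this structure with json.dump() to the file named in
# config["tracebacks_json"] makes get_documents() return one entry per
# failed test, with the job URL shortened by _get_job_url().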
74 changes: 73 additions & 1 deletion data_scraper/main.py
@@ -6,7 +6,8 @@
from data_scraper.common import constants
from data_scraper.core.scraper import JiraScraper, OSPDocScraper
from data_scraper.core.errata_scraper import ErrataScraper

from data_scraper.core.ci_logs_scraper import CILogsScraper
from data_scraper.processors.ci_logs_provider import TestOperatorReportsProvider
logging.basicConfig(
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
@@ -178,3 +179,74 @@ def errata_scraper() -> None:

scraper = ErrataScraper(config_args)
scraper.run()


def ci_logs_scraper() -> None:
"""Entry point for command line execution."""
parser = ArgumentParser("ci_logs_scraper")

# Required arguments
parser.add_argument("--database_client_url", type=str, required=True)
parser.add_argument("--llm_server_url", type=str, required=True)
parser.add_argument("--llm_api_key", type=str, required=True)
parser.add_argument("--database_api_key", type=str, required=True)
parser.add_argument("--zuul-url", type=str, required=True)

# Optional arguments
parser.add_argument("--chunk_size", type=int,
default=constants.DEFAULT_CHUNK_SIZE)
parser.add_argument("--embedding_model", type=str,
default=constants.DEFAULT_EMBEDDING_MODEL)
parser.add_argument("--db_collection_name", type=str,
default=constants.CI_LOGS_COLLECTION_NAME)
parser.add_argument("--date_cutoff", type=datetime.fromisoformat,
default=datetime.fromisoformat(constants.DEFAULT_DATE_CUTOFF),
help=(
"No issues from before this date will be used. "
"Date must follow ISO format 'YYYY-MM-DD'"
)
)
parser.add_argument("--recreate_collection", type=bool, default=True,
help="Recreate database collection from scratch.")
parser.add_argument("--pipelines", nargs='+', type=str,
default=constants.DEFAULT_ZULL_PIPELINES)
parser.add_argument("--tenants", nargs='+', type=str,
default=constants.DEFAULT_ZULL_TENANTS)
parser.add_argument("--populate_db_from_json", type=bool, default=False,
help="Used from Zuul jobs that create json file at the end of their runs.")
args = parser.parse_args()

config_args = {
"database_client_url": args.database_client_url,
"llm_server_url": args.llm_server_url,
"llm_api_key": args.llm_api_key,
"database_api_key": args.database_api_key,
"chunk_size": args.chunk_size,
"embedding_model": args.embedding_model,
"db_collection_name": args.db_collection_name,
"zuul_url": args.zuul_url,
"date_cutoff": args.date_cutoff,
"recreate_collection": args.recreate_collection,
"pipelines": args.pipelines,
"tenants": args.tenants,
"tracebacks_json": "/tmp/tracebacks.json"
}

tenant = list(constants.DEFAULT_ZULL_TENANTS)[0]  # just one tenant is tested ATM
pipelines = list(constants.DEFAULT_ZULL_PIPELINES)  # just RHOSO pipelines are tested ATM

if not args.populate_db_from_json:
# Get test operator reports, save tracebacks and create json
provider = TestOperatorReportsProvider(args.zuul_url,
tenant,
pipelines,
config_args["tracebacks_json"])
provider.run()

# When the JSON file is ready, process the tracebacks and store them in the Qdrant DB
scraper = CILogsScraper(config_args)
scraper.run()
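
For reference, a minimal sketch of the config dictionary that ci_logs_scraper() assembles for CILogsScraper, usable for driving the scraper directly (e.g. from a test); the server URLs and API keys are hypothetical placeholders, everything else mirrors the CLI defaults above:

from datetime import datetime

from data_scraper.common import constants
from data_scraper.core.ci_logs_scraper import CILogsScraper

config = {
    "database_client_url": "https://qdrant.example.com:6333",  # placeholder
    "llm_server_url": "https://llm.example.com/v1",  # placeholder
    "llm_api_key": "example-llm-key",  # placeholder
    "database_api_key": "example-db-key",  # placeholder
    "chunk_size": constants.DEFAULT_CHUNK_SIZE,
    "embedding_model": constants.DEFAULT_EMBEDDING_MODEL,
    "db_collection_name": constants.CI_LOGS_COLLECTION_NAME,
    "zuul_url": "https://zuul.example.com",  # placeholder
    "date_cutoff": datetime.fromisoformat(constants.DEFAULT_DATE_CUTOFF),
    "recreate_collection": True,
    "pipelines": list(constants.DEFAULT_ZULL_PIPELINES),
    "tenants": list(constants.DEFAULT_ZULL_TENANTS),
    "tracebacks_json": "/tmp/tracebacks.json",
}
CILogsScraper(config).run()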