diff --git a/data_scraper/common/constants.py b/data_scraper/common/constants.py index c42ec9b..6d417a8 100644 --- a/data_scraper/common/constants.py +++ b/data_scraper/common/constants.py @@ -3,6 +3,7 @@ JIRA_COLLECTION_NAME = "rca-knowledge-base" OSP_DOCS_COLLECTION_NAME = "rca-osp-docs-knowledge-base" ERRATA_COLLECTION_NAME = "rca-errata" +CI_LOGS_COLLECTION_NAME = "rca-ci" DEFAULT_EMBEDDING_MODEL = "BAAI/bge-m3" DEFAULT_JIRA_URL = "https://issues.redhat.com" DEFAULT_JIRA_PROJECTS = { @@ -10,6 +11,15 @@ "OSPCIX", "OSPRH", } +DEFAULT_ZULL_PIPELINES = { + "openstack-uni-jobs-periodic-integration-rhoso18.0-rhel9", + "openstack-uni-update-jobs-periodic-integration-rhoso18.0-rhel9", + "ci-framework-integrity", +} + +DEFAULT_ZULL_TENANTS = { + "components-integration", +} DEFAULT_CHUNK_SIZE = 1024 DEFAULT_MAX_RESULTS = 10000 DEFAULT_DATE_CUTOFF = "2000-01-01T00:00:00Z" diff --git a/data_scraper/core/ci_logs_scraper.py b/data_scraper/core/ci_logs_scraper.py new file mode 100644 index 0000000..dac2407 --- /dev/null +++ b/data_scraper/core/ci_logs_scraper.py @@ -0,0 +1,125 @@ +"""Code for scraping test operator logs data""" +import logging +from typing import TypedDict, List, Dict +import json + +import pandas as pd + +from data_scraper.core.scraper import Scraper + +LOG = logging.getLogger(__name__) +LOG.setLevel(logging.INFO) + + +class CILogsRecord(TypedDict): + """CILogs data point.""" + url: str + topic: str + text: str + components: list[str] + kind: str + +class CILogsScraper(Scraper): + """Main class for test operator logs scraping and processing.""" + + def __init__(self, config: dict): + super().__init__(config=config) + self.config = config + + def get_documents(self) -> list[dict]: + all_failures = self._load_test_operator_results() + + return all_failures + + def get_records(self, documents: list[dict]) -> list[CILogsRecord]: + ci_logs_records: list[CILogsRecord] = [] + + for document in documents: + ci_logs_records.append({ + "url": document["url"], + "topic": document["test_name"], + "text": document["traceback"], + "components": [], + "kind": "zuul_jobs", + }) + + return ci_logs_records + + def get_chunks(self, record: dict) -> list[str]: + chunks = [] + + for field in ["text"]: + chunks += self.text_processor.split_text(record[field]) + + return chunks + + def record_postprocessing(self, record): + # Postprocessing is not required + pass + + # pylint: disable=R0801 + def cleanup_records( + self, records: list, backup_path: str = "ci_logs_all_data.pickle" + ) -> list: + df = pd.DataFrame(records) + + LOG.info("Records stats BEFORE cleanup:") + LOG.info(df.info()) + + df = df.dropna() + df = df.drop_duplicates(subset=["text"]) + + LOG.info("Records stats AFTER cleanup:") + LOG.info(df.info()) + + LOG.info("Saving backup to: %s", backup_path) + df.to_pickle(backup_path) + + return [CILogsRecord(**row) for row in df.to_dict(orient="records")] + + def _get_job_url(self, url: str) -> str: + """ + Extract the job URL from a test report URL by removing everything from 'logs/controller-0'. 
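Reviewer note: the URL trimming described in the docstring above is easy to sanity-check in isolation. A minimal sketch of the same behaviour (the standalone function name and the example URL are illustrative, not taken from a real job):

```python
SPLIT_MARKER = "logs/controller-0"

def job_url_from_report_url(url: str) -> str:
    """Mirror of _get_job_url: drop everything from 'logs/controller-0' onwards."""
    if SPLIT_MARKER in url:
        return url.split(SPLIT_MARKER)[0].rstrip("/")
    return url  # returned unchanged when the marker is absent

# Hypothetical tempest report URL -> job URL
report = ("https://logserver.example.com/periodic/run/42/"
          "logs/controller-0/ci-framework-data/tests/test_operator/"
          "tempest-tests/stestr_results.html")
assert job_url_from_report_url(report) == "https://logserver.example.com/periodic/run/42"
```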
+ + Args: + url (str): The complete report URL + + Returns: + str: The job URL + """ + split_marker = "logs/controller-0" + if split_marker in url: + base_url = url.split(split_marker)[0] + return base_url.rstrip('/') + + return url + + def _load_test_operator_results(self) -> List[Dict[str, str]]: + """ + Loads test results from the tempest_tests_tracebacks.json + + Args: + url: URL of the tempest report (used as a key to find matching results) + + Returns: + List of dictionaries containing test names and tracebacks + """ + try: + file_name = self.config["tracebacks_json"] + with open(file_name, "r", encoding="utf-8") as f: + tracebacks = json.load(f) + + results = [] + for report in tracebacks: + job_url = self._get_job_url(report.get("url", "Unknown url")) + for test in report.get("failed_tests", []): + results.append({ + "url": job_url, + "test_name": test.get("name", "Unknown test"), + "traceback": test.get("traceback", "No traceback available") + }) + return results + + except json.JSONDecodeError: + LOG.error("%s", "Error parsing tempest_tests_tracebacks.json file") + return [] diff --git a/data_scraper/main.py b/data_scraper/main.py index 54ef3ce..43940d2 100644 --- a/data_scraper/main.py +++ b/data_scraper/main.py @@ -6,7 +6,8 @@ from data_scraper.common import constants from data_scraper.core.scraper import JiraScraper, OSPDocScraper from data_scraper.core.errata_scraper import ErrataScraper - +from data_scraper.core.ci_logs_scraper import CILogsScraper +from data_scraper.processors.ci_logs_provider import TestOperatorReportsProvider logging.basicConfig( format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" ) @@ -178,3 +179,74 @@ def errata_scraper() -> None: scraper = ErrataScraper(config_args) scraper.run() + + + + +def ci_logs_scraper() -> None: + """Entry point for command line execution.""" + parser = ArgumentParser("ci_logs_scraper") + + # Required arguments + parser.add_argument("--database_client_url", type=str, required=True) + parser.add_argument("--llm_server_url", type=str, required=True) + parser.add_argument("--llm_api_key", type=str, required=True) + parser.add_argument("--database_api_key", type=str, required=True) + parser.add_argument("--zuul-url", type=str, required=True) + + # Optional arguments + parser.add_argument("--chunk_size", type=int, + default=constants.DEFAULT_CHUNK_SIZE) + parser.add_argument("--embedding_model", type=str, + default=constants.DEFAULT_EMBEDDING_MODEL) + parser.add_argument("--db_collection_name", type=str, + default=constants.CI_LOGS_COLLECTION_NAME) + parser.add_argument("--date_cutoff", type=datetime.fromisoformat, + default=datetime.fromisoformat(constants.DEFAULT_DATE_CUTOFF), + help=( + "No issues from before this date will be used. 
" + "Date must follow ISO format 'YYYY-MM-DD'" + ) + ) + parser.add_argument("--recreate_collection", type=bool, default=True, + help="Recreate database collection from scratch.") + parser.add_argument("--pipelines", nargs='+', type=str, + default=constants.DEFAULT_ZULL_PIPELINES) + parser.add_argument("--tenants", nargs='+', type=str, + default=constants.DEFAULT_ZULL_TENANTS) + parser.add_argument("--populate_db_from_json", type=bool, default=False, + help="Used from Zuul jobs that create json file at the end of their runs.") + args = parser.parse_args() + + config_args = { + "database_client_url": args.database_client_url, + "llm_server_url": args.llm_server_url, + "llm_api_key": args.llm_api_key, + "database_api_key": args.database_api_key, + "chunk_size": args.chunk_size, + "embedding_model": args.embedding_model, + "db_collection_name": args.db_collection_name, + "zuul_url": args.zuul_url, + "date_cutoff": args.date_cutoff, + "recreate_collection": args.recreate_collection, + "pipelines": args.pipelines, + "tenants": args.tenants, + "tracebacks_json": "/tmp/tracebacks.json" + } + + + + tenant = list(constants.DEFAULT_ZULL_TENANTS)[0] # just one tenenat is testsed ATM + pipelines = list(constants.DEFAULT_ZULL_PIPELINES) # Just RHOSO pipelines are tested ATM + + if not args.populate_db_from_json: + # Get test operator reports, save tracebacks and create json + provider = TestOperatorReportsProvider(args.zuul_url, + tenant, + pipelines, + config_args["tracebacks_json"]) + provider.run() + + # when json is ready, proceed with tracebacks and store them to QdrantDB + scraper = CILogsScraper(config_args) + scraper.run() diff --git a/data_scraper/processors/ci_logs_provider.py b/data_scraper/processors/ci_logs_provider.py new file mode 100644 index 0000000..7ec16d0 --- /dev/null +++ b/data_scraper/processors/ci_logs_provider.py @@ -0,0 +1,679 @@ +"""Code for test operator logs data provisioning""" +# Standard library imports +import asyncio +import json +import logging +import os +import re +import urllib.parse +import warnings +from datetime import datetime, timedelta + +import browser_cookie3 +import httpx +import requests +from bs4 import BeautifulSoup +from httpx_gssapi import HTTPSPNEGOAuth +from httpx_gssapi.gssapi_ import OPTIONAL + +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) +LOG = logging.getLogger(__name__) + +# API endpoint constants +API_BASE = "zuul/api" +API_TENANT = API_BASE + "/tenant/{tenant}" +API_BUILDS = API_TENANT + "/builds" + +# Filter constants +FILTER_RESULT_FAILURE = "FAILURE" +FILTER_LIMIT = 20000 + +# Time filter constant - builds older than this will be excluded +CUTOFF_TIME = datetime.now() - timedelta(days=14) # 2 weeks ago + +# Test path constants +TEST_OPERATOR_PATH = "logs/controller-0/ci-framework-data/tests/test_operator" +TEMPEST_TEST_PATTERN = "tempest-tests" +TOBIKO_TEST_PATTERN = "tobiko-tests" + +async def fetch_with_gssapi(url, params=None, timeout=30.0): + """ + Fetch content using Kerberos authentication. 
+ + Args: + url: URL to fetch + params: Optional query parameters + timeout: Request timeout in seconds + + Returns: + Response text content + """ + async with httpx.AsyncClient( + verify=False, + follow_redirects=True, + timeout=timeout + ) as session: + response = await session.get( + url, + params=params, + auth=HTTPSPNEGOAuth(mutual_authentication=OPTIONAL) + ) + response.raise_for_status() + return response.text + +def make_authenticated_request(url, params=None, timeout=30.0): + """ + Make an authenticated request to a URL using multiple authentication methods. + + Args: + url: URL to fetch + params: Optional query parameters + timeout: Request timeout in seconds + + Returns: + Response content as text, or None if an error occurs + For JSON responses, the caller will need to parse the text + """ + parsed_url = urllib.parse.urlparse(url) + domain = parsed_url.netloc + + verify = False + warnings.filterwarnings('ignore', message='Unverified HTTPS request') + + # First try with Kerberos + try: + LOG.info("Attempting to authenticate using Kerberos...") + content = asyncio.run(fetch_with_gssapi(url, params, timeout)) + LOG.info("Kerberos authentication successful") + return content + + except (httpx.HTTPError, httpx.RequestError, httpx.TimeoutException) as e: + LOG.warning("Kerberos authentication failed due to HTTP error: %s", e) + LOG.info("Falling back to browser cookies authentication...") + + # Second try with cookies from browsers + cookies = None + if 'redhat.com' in domain: + try: + cookies = browser_cookie3.chrome(domain_name=domain) + LOG.info("Using Chrome cookies for domain: %s", domain) + except (ImportError, RuntimeError, FileNotFoundError) as chrome_error: + LOG.warning("Could not get Chrome cookies: %s", chrome_error) + + try: + cookies = browser_cookie3.firefox(domain_name=domain) + LOG.info("Using Firefox cookies for domain: %s", domain) + except (ImportError, RuntimeError, FileNotFoundError) as firefox_error: + LOG.warning("Could not get Firefox cookies: %s", firefox_error) + + try: + response = requests.get( + url, + params=params, + cookies=cookies, + verify=verify, + timeout=timeout + ) + response.raise_for_status() + return response.text + + except (requests.RequestException, requests.HTTPError, requests.ConnectionError, + requests.Timeout, requests.TooManyRedirects) as request_error: + LOG.error("Error fetching from %s: %s", url, request_error) + return None + +class TempestResultsParser: + """Parser for tempest test HTML reports.""" + + def __init__(self, source=None): + self.source = source + self.html_content = None + self.soup = None + self.failed_tests = [] + + if source: + self.load_content(source) + + def load_content(self, source): + """Load content from a URL or file.""" + self.source = source + self.html_content = self._get_content(source) + if self.html_content: + self.soup = BeautifulSoup(self.html_content, 'html.parser') + return self + + def _get_content(self, source): + """Retrieve content from a URL or file.""" + if source.startswith('http://') or source.startswith('https://'): + LOG.info("Downloading from URL: %s", source) + return make_authenticated_request(source, timeout=30.0) + + LOG.info("Reading local file: %s", source) + try: + with open(source, 'r', encoding='utf-8') as f: + content = f.read() + return content + except IOError as e: + LOG.error("IO error reading file %s: %s", source, e) + return None + + def parse(self): + """Parse the HTML content and extract failed tests.""" + if not self.soup: + LOG.warning("No content loaded or parsing 
failed. Nothing to parse.") + return self.failed_tests + + self.failed_tests = [] + self._parse_html_format() + return self.failed_tests + + + def _extract_test_name(self, test_name_part: str) -> str: + """Extract the test name from the text before the traceback.""" + # Extract the test name using a regex pattern + test_name_match = re.search(r'ft\d+\.\d+:\s*(.*?)\)?testtools', test_name_part) + if test_name_match: + test_name = test_name_match.group(1).strip() + if test_name.endswith('('): + test_name = test_name[:-1].strip() + else: + # Try alternative pattern for different formats + test_name_match = re.search(r'ft\d+\.\d+:\s*(.*?)$', test_name_part) + if test_name_match: + test_name = test_name_match.group(1).strip() + else: + test_name = "Unknown Test Name" + + # Remove any content within square brackets + # e.g. test_tagged_boot_devices[id-a2e65a6c,image,network,slow,volume] + # becomes test_tagged_boot_devices + test_name = re.sub(r'\[.*?\]', '', test_name).strip() + + # Remove any content within parentheses + test_name = re.sub(r'\(.*?\)', '', test_name).strip() + + return test_name + + def _parse_html_format(self): + """Parse HTML formatted tempest test results.""" + + soup = self.soup + + # The code is copy/past-ed from src/api.py#L101-L141 + # https://github.com/RCAccelerator/chatbot/. Get rid + # of duplication at some stage. ATM it's a QnD solution to have exaclty the same + # parsing of tempest test reports both at the endpoint and in scraper. + failed_test_rows = soup.find_all('tr', id=re.compile(r'^ft\d+\.\d+')) + + for row in failed_test_rows: + row_text = row.get_text().strip() + + traceback_start_marker = "Traceback (most recent call last):" + traceback_start_index = row_text.find(traceback_start_marker) + + if traceback_start_index != -1: + test_name_part = row_text[:traceback_start_index].strip() + test_name = self._extract_test_name(test_name_part) + + traceback_text = row_text[traceback_start_index:] + end_marker_index = traceback_text.find("}}}") + if end_marker_index != -1: + traceback_text = traceback_text[:end_marker_index].strip() + else: + traceback_text = traceback_text.strip() + + self.failed_tests.append((test_name, traceback_text or "No traceback found")) + + + def has_failures(self): + """Check if any failed tests were found.""" + return len(self.failed_tests) > 0 + + def get_failure_summary(self): + """Get a summary of the failed tests.""" + if not self.failed_tests: + return "No failed tests found." + + summary = f"Found {len(self.failed_tests)} failed tests:\n" + for test_name, _ in self.failed_tests: + summary += f"- {test_name}\n" + + return summary + + +class ZuulClient: + """Client for fetching Zuul build data and analyzing test reports.""" + + def __init__(self, base_url): + """ + Initialize the client with the base URL. + + Args: + base_url: Base URL for the Zuul server + """ + self.base_url = base_url.rstrip('/') + + def retrive_failed_builds(self, tenant, pipeline, limit=FILTER_LIMIT): + """ + Get failed builds for a specific pipeline as JSON. 
+ + Args: + tenant: Tenant name + pipeline: Pipeline name + limit: Maximum number of builds to return + + Returns: + JSON response as dictionary with builds filtered by cutoff time + """ + endpoint = API_BUILDS.format(tenant=tenant) + url = f"{self.base_url}/{endpoint}" + + params = { + "limit": limit, + "result": FILTER_RESULT_FAILURE, + "pipeline": pipeline + } + + LOG.info("Fetching data from: %s with params: %s", url, params) + + try: + content_text = make_authenticated_request(url, params=params, timeout=60.0) + if not content_text: + return [] + + result = json.loads(content_text) + LOG.info("Successfully fetched %d builds", len(result)) + + filtered_builds = self._filter_builds_by_date(result) + return filtered_builds + + except (requests.RequestException, requests.HTTPError, requests.ConnectionError, + requests.Timeout, requests.TooManyRedirects) as e: + LOG.error("Error fetching data: %s", e) + return [] + + def _filter_builds_by_date(self, builds): + """ + Filter builds based on the cutoff time. + + Args: + builds: List of build dictionaries + + Returns: + Filtered list of build dictionaries + """ + filtered_builds = [] + + for build in builds: + end_time = build.get("end_time") + if not end_time: + end_time = build.get("start_time") + + # Skip if no timestamp available + if not end_time: + continue + + # Parse the timestamp + try: + # Handle different timestamp formats + if 'Z' in end_time: + # ISO format with Z for UTC + build_time = datetime.fromisoformat(end_time.replace('Z', '+00:00')) + elif 'T' in end_time and '+' in end_time: + # ISO format with timezone offset + build_time = datetime.fromisoformat(end_time) + else: + # Assume simple format + build_time = datetime.fromisoformat(end_time) + + # Include only builds newer than the cutoff time + if build_time >= CUTOFF_TIME: + filtered_builds.append(build) + except ValueError: + LOG.warning("%s", "Could not parse timestamp") + # Include builds with unparseable timestamps by default + filtered_builds.append(build) + + LOG.info("Filtered to %d builds after cutoff time", len(filtered_builds)) + return filtered_builds + + def fetch_html_content(self, url): + """ + Fetch HTML content from a URL. + + Args: + url: URL to fetch + + Returns: + HTML content as string, or None if an error occurs + """ + LOG.info("Fetching HTML from: %s", url) + return make_authenticated_request(url) + + def find_test_reports(self, builds_json): + """ + Find test reports in the build logs. + + Args: + builds_json: JSON dictionary of builds + + Returns: + Dictionary mapping build UUIDs to test report information + """ + results = {} + + for build in builds_json: + build_uuid = build.get("uuid", "unknown") + log_url = build.get("log_url") + + if not log_url: + LOG.info("Build %s has no log_url", build_uuid) + continue + + LOG.info("Processing build %s with log URL %s", build_uuid, log_url) + + results[build_uuid] = self._process_build_reports(log_url) + + return results + + def _process_build_reports(self, log_url): + """ + Process test reports for a single build. 
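One caveat in the date filtering above: CUTOFF_TIME is a naive datetime (datetime.now()), while a timestamp containing 'Z' or an offset parses to a timezone-aware one, and Python raises TypeError (not the ValueError being caught) when the two are compared. If the Zuul API ever returns aware timestamps, normalising both sides avoids that; a possible sketch:

```python
from datetime import datetime, timezone

def parse_build_time(raw: str) -> datetime:
    """Parse a Zuul timestamp and return a naive UTC datetime for comparison."""
    ts = datetime.fromisoformat(raw.replace('Z', '+00:00'))
    if ts.tzinfo is not None:
        # Convert aware timestamps to UTC and drop tzinfo so they
        # compare cleanly against the naive CUTOFF_TIME.
        ts = ts.astimezone(timezone.utc).replace(tzinfo=None)
    return ts

# parse_build_time("2025-05-01T12:00:00Z") and parse_build_time("2025-05-01T12:00:00")
# can now both be compared with CUTOFF_TIME without raising.
```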
+ + Args: + log_url: URL to the build logs + + Returns: + Dictionary with test report information + """ + test_operator_path = f"{log_url.rstrip('/')}/{TEST_OPERATOR_PATH}" + operator_html = self.fetch_html_content(test_operator_path) + + if not operator_html: + LOG.info("Log URL %s contains no test_operator directory", log_url) + return { + "log_url": log_url, + "status": "no_test_operator_dir", + "tempest_reports": [], + "tobiko_reports": [] + } + + test_dirs = self._find_test_directories(operator_html) + + if not test_dirs: + LOG.info("No test directories found in %s", test_operator_path) + return { + "log_url": log_url, + "status": "no_test_directories", + "tempest_reports": [], + "tobiko_reports": [] + } + + tempest_reports = [] + tobiko_reports = [] + + for test_dir in test_dirs: + test_dir_url = f"{test_operator_path}/{test_dir}" + + if TEMPEST_TEST_PATTERN in test_dir: + tempest_reports.extend(self._process_tempest_directory(test_dir_url, test_dir)) + elif TOBIKO_TEST_PATTERN in test_dir: + tobiko_reports.extend(self._process_tobiko_directory(test_dir_url, test_dir)) + + status = "test_reports_found" if (tempest_reports or tobiko_reports) else "no_test_reports" + + return { + "log_url": log_url, + "status": status, + "tempest_reports": tempest_reports, + "tobiko_reports": tobiko_reports + } + + def _find_test_directories(self, html_content): + """Find test directories in HTML content.""" + soup = BeautifulSoup(html_content, "html.parser") + test_dirs = [] + + for link in soup.find_all("a", href=True): + href = link["href"] + # Look for directories (ends with /) + if href.endswith("/") and not href.startswith("/") and href != "../": + test_dirs.append(href) + + return test_dirs + + def _process_tempest_directory(self, dir_url, dir_name): + """Process a tempest test directory.""" + LOG.info("Checking tempest test directory: %s", dir_url) + reports = [] + html_files = self._find_html_files(dir_url) + + for html_file in html_files: + parser = TempestResultsParser(html_file) + parser.parse() + has_failures = parser.has_failures() + + report = { + "url": html_file, + "directory": dir_name, + "has_failures": has_failures, + "failed_tests": [] + } + + if has_failures: + for test_name, traceback in parser.failed_tests: + report["failed_tests"].append({ + "name": test_name, + "traceback": traceback + }) + + reports.append(report) + + return reports + + def _process_tobiko_directory(self, dir_url, dir_name): + """Process a tobiko test directory.""" + LOG.info("Checking tobiko test directory: %s", dir_url) + reports = [] + html_files = self._find_html_files(dir_url) + + for html_file in html_files: + # For now, we're not checking tobiko reports for failures + reports.append({ + "url": html_file, + "directory": dir_name, + "has_failures": False, # Placeholder + "failed_tests": [] # Placeholder + }) + + return reports + + def _find_html_files(self, directory_url): + """ + Find HTML files in a directory. 
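To make the per-build structure explicit (this is what find_test_reports stores under each build UUID), a sketch with placeholder values:

```python
# Result shape produced by _process_build_reports (values are illustrative).
build_report = {
    "log_url": "https://logserver.example.com/run/42/",
    # one of: test_reports_found / no_test_reports / no_test_directories / no_test_operator_dir
    "status": "test_reports_found",
    "tempest_reports": [
        {
            "url": ("https://logserver.example.com/run/42/logs/controller-0/"
                    "ci-framework-data/tests/test_operator/tempest-tests/stestr_results.html"),
            "directory": "tempest-tests/",
            "has_failures": True,
            "failed_tests": [
                {"name": "test_volume_boot_pattern",
                 "traceback": "Traceback (most recent call last): ..."},
            ],
        },
    ],
    "tobiko_reports": [],  # collected but not yet scanned for failures
}
```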
+ + Args: + directory_url: URL of the directory to search + + Returns: + List of HTML file URLs + """ + html_files = [] + + dir_html = self.fetch_html_content(directory_url) + + if not dir_html: + LOG.info("Could not access directory: %s", directory_url) + return html_files + + dir_soup = BeautifulSoup(dir_html, "html.parser") + + for link in dir_soup.find_all("a", href=True): + href = link["href"] + if href.endswith(".html") and not href.startswith("/"): + html_url = f"{directory_url}/{href}" + html_files.append(html_url) + LOG.info("Found HTML file: %s", html_url) + + return html_files + + +def save_failed_tempest_paths(report_results): + """ + Save only the failed tempest report paths to a file. + + Args: + report_results: Dictionary mapping build UUIDs to test report information + """ + failed_paths = [] + + for _, result in report_results.items(): + if result["status"] == "test_reports_found": + tempest_reports = result["tempest_reports"] + for report in tempest_reports: + if report["has_failures"]: + failed_paths.append(report["url"]) + + with open("failed_tempest_paths.txt", "w", encoding='utf-8') as f: + for path in failed_paths: + f.write(f"{path}\n") + + return failed_paths + + +def create_tempest_failures_json(report_results, tracebacks_json): + """ + Create a JSON file containing tempest report URLs and their failed tests. + + Args: + report_results: Dictionary mapping build UUIDs to test report information + """ + tempest_failures = [] + + for _, result in report_results.items(): + if result["status"] == "test_reports_found": + tempest_reports = result["tempest_reports"] + for report in tempest_reports: + if report["has_failures"]: + tempest_failures.append({ + "url": report["url"], + "failed_tests": report["failed_tests"] + }) + + with open(tracebacks_json, "w", encoding='utf-8') as f: + json.dump(tempest_failures, f, indent=2) + + return tempest_failures + + +# # pylint: disable=too-few-public-methods + +# class TestOperatorReportsProvider: +# """Class responsible for retrieving and processing test operator reports.""" + +# def __init__(self, server_url, tenant, pipeline, tracebacks_json): +# """Initialize the TestOperatorReportsProvider with a server URL. + +# Args: +# server_url (str): The URL of the Zuul server. +# """ +# self.server_url = server_url +# self.tenant = tenant +# self.pipeline = pipeline +# self.tracebacks_json = tracebacks_json + +# def run(self): +# """Entry point of TestOperatorReportsProvider. +# """ +# server_url = self.server_url +# tenant = self.tenant +# pipeline = self.pipeline + +# client = ZuulClient(server_url) + + +# builds = client.get_failed_builds_json(tenant, pipeline) + +# if not builds: +# LOG.error("%s", "No builds found or authentication failed") +# return + +# LOG.info("Found %d failed builds within the last 2 weeks (after %s)", +# len(builds), CUTOFF_TIME.isoformat()) + +# with open("failed_builds.json", "w", encoding='utf-8') as f: +# json.dump(builds, f, indent=2) + +# report_results = client.find_test_reports(builds) + +# failed_paths = save_failed_tempest_paths(report_results) + +# for path in failed_paths: +# print(path) + +# create_tempest_failures_json(report_results,self.tracebacks_json ) + + +class TestOperatorReportsProvider: + """Class responsible for retrieving and processing test operator reports.""" + + def __init__(self, server_url, tenant, pipelines, tracebacks_json): + """Initialize the TestOperatorReportsProvider with a server URL. + + Args: + server_url (str): The URL of the Zuul server. 
+ tenant (str): The tenant name. + pipelines (list): A list of pipeline names. + tracebacks_json (str): Path to the tracebacks JSON file. + """ + self.server_url = server_url + self.tenant = tenant + self.pipelines = pipelines + self.tracebacks_json = tracebacks_json + + if os.path.exists("failed_builds.txt"): + os.remove("failed_builds.txt") + LOG.info("Deleted existing failed_builds.txt file") + + if os.path.exists(self.tracebacks_json): + os.remove(self.tracebacks_json) + LOG.info("Deleted existing %s file", self.tracebacks_json) + + def run(self): + """Entry point of TestOperatorReportsProvider. + """ + server_url = self.server_url + tenant = self.tenant + + client = ZuulClient(server_url) + + all_builds = [] + + for pipeline in self.pipelines: + LOG.info("Processing pipeline: %s", pipeline) + builds = client.retrive_failed_builds(tenant, pipeline) + + if not builds: + LOG.warning("No builds found for pipeline %s", pipeline) + continue + + LOG.info("Found %d failed builds in pipeline %s within the last 2 weeks (after %s)", + len(builds), pipeline, CUTOFF_TIME.isoformat()) + + all_builds.extend(builds) + + with open("failed_builds.txt", "a", encoding='utf-8') as f: + for build in builds: + f.write(f"{build}\n") + + report_results = client.find_test_reports(all_builds) + + failed_paths = save_failed_tempest_paths(report_results) + + for path in failed_paths: + print(path) + + create_tempest_failures_json(report_results,self.tracebacks_json) diff --git a/data_scraper/processors/vector_store.py b/data_scraper/processors/vector_store.py index 5370f48..919cb2e 100644 --- a/data_scraper/processors/vector_store.py +++ b/data_scraper/processors/vector_store.py @@ -35,8 +35,10 @@ def __init__(self, client_url: str, api_key: str, timeout=100): self.client = QdrantClient(client_url, api_key=api_key, timeout=timeout) def recreate_collection(self, collection_name: str, vector_size: int): - """Recreate the collection with specified parameters.""" + """Recreate the collection with specified parameters, preserving a snapshot first.""" + if self.check_collection(collection_name): + self.client.create_snapshot(collection_name=collection_name) self.client.delete_collection(collection_name) self.client.create_collection( diff --git a/pdm.lock b/pdm.lock index ba2a914..7316ebb 100644 --- a/pdm.lock +++ b/pdm.lock @@ -5,7 +5,7 @@ groups = ["default", "dev"] strategy = ["inherit_metadata"] lock_version = "4.5.0" -content_hash = "sha256:88441772af1a0d4a2337bc05a1d8b8176519cac6addfde716906daf61bd3bfbe" +content_hash = "sha256:df11e293dd0f226387cc80c9feb53cf4c65c3b8f59486e6dd80dab479ea53c2b" [[metadata.targets]] requires_python = "==3.12.*" @@ -70,6 +70,23 @@ files = [ {file = "beautifulsoup4-4.13.4.tar.gz", hash = "sha256:dbb3c4e1ceae6aefebdaf2423247260cd062430a410e38c66f2baa50a8437195"}, ] +[[package]] +name = "browser-cookie3" +version = "0.20.1" +summary = "Loads cookies from your browser into a cookiejar object so can download with urllib and other libraries the same content you see in the web browser." 
+groups = ["default"] +dependencies = [ + "dbus-python; \"bsd\" in sys_platform and python_version < \"3.7\" or sys_platform == \"linux\" and python_version < \"3.7\"", + "jeepney; \"bsd\" in sys_platform and python_version >= \"3.7\" or sys_platform == \"linux\" and python_version >= \"3.7\"", + "lz4", + "pycryptodomex", + "shadowcopy; python_version >= \"3.7\" and platform_system == \"Windows\"", +] +files = [ + {file = "browser_cookie3-0.20.1-py3-none-any.whl", hash = "sha256:4b38bf669d386250733c8339f0036e1cf09c3d8e4d326fd507b9afb84def13d6"}, + {file = "browser_cookie3-0.20.1.tar.gz", hash = "sha256:6d8d0744bf42a5327c951bdbcf77741db3455b8b4e840e18bab266d598368a12"}, +] + [[package]] name = "bs4" version = "0.0.2" @@ -218,7 +235,6 @@ version = "5.2.1" requires_python = ">=3.8" summary = "Decorators for Humans" groups = ["default"] -marker = "sys_platform != \"win32\"" files = [ {file = "decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a"}, {file = "decorator-5.2.1.tar.gz", hash = "sha256:65f266143752f734b0a7cc83c46f4618af75b8c5911b00ccb61d0ac9b6da0360"}, @@ -316,7 +332,7 @@ files = [ [[package]] name = "google-api-python-client" -version = "2.167.0" +version = "2.169.0" requires_python = ">=3.7" summary = "Google API Client Library for Python" groups = ["default"] @@ -328,13 +344,13 @@ dependencies = [ "uritemplate<5,>=3.0.1", ] files = [ - {file = "google_api_python_client-2.167.0-py2.py3-none-any.whl", hash = "sha256:ce25290cc229505d770ca5c8d03850e0ae87d8e998fc6dd743ecece018baa396"}, - {file = "google_api_python_client-2.167.0.tar.gz", hash = "sha256:a458d402572e1c2caf9db090d8e7b270f43ff326bd9349c731a86b19910e3995"}, + {file = "google_api_python_client-2.169.0-py3-none-any.whl", hash = "sha256:dae3e882dc0e6f28e60cf09c1f13fedfd881db84f824dd418aa9e44def2fe00d"}, + {file = "google_api_python_client-2.169.0.tar.gz", hash = "sha256:0585bb97bd5f5bf3ed8d4bf624593e4c5a14d06c811d1952b07a1f94b4d12c51"}, ] [[package]] name = "google-auth" -version = "2.39.0" +version = "2.40.1" requires_python = ">=3.7" summary = "Google Authentication Library" groups = ["default"] @@ -344,8 +360,8 @@ dependencies = [ "rsa<5,>=3.1.4", ] files = [ - {file = "google_auth-2.39.0-py2.py3-none-any.whl", hash = "sha256:0150b6711e97fb9f52fe599f55648950cc4540015565d8fbb31be2ad6e1548a2"}, - {file = "google_auth-2.39.0.tar.gz", hash = "sha256:73222d43cdc35a3aeacbfdcaf73142a97839f10de930550d89ebfe1d0a00cde7"}, + {file = "google_auth-2.40.1-py2.py3-none-any.whl", hash = "sha256:ed4cae4f5c46b41bae1d19c036e06f6c371926e97b19e816fc854eff811974ee"}, + {file = "google_auth-2.40.1.tar.gz", hash = "sha256:58f0e8416a9814c1d86c9b7f6acf6816b51aba167b2c76821965271bac275540"}, ] [[package]] @@ -462,7 +478,6 @@ version = "1.9.0" requires_python = ">=3.8" summary = "Python GSSAPI Wrapper" groups = ["default"] -marker = "sys_platform != \"win32\"" dependencies = [ "decorator", ] @@ -561,6 +576,21 @@ files = [ {file = "httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc"}, ] +[[package]] +name = "httpx-gssapi" +version = "0.4" +requires_python = ">=3.9" +summary = "A Python GSSAPI authentication handler for HTTPX" +groups = ["default"] +dependencies = [ + "gssapi", + "httpx<0.29,>=0.16", +] +files = [ + {file = "httpx_gssapi-0.4-py3-none-any.whl", hash = "sha256:ea390c62015d3b58e594936af75b3fbf6421e53e334d587db76623f2f07c93e8"}, + {file = "httpx_gssapi-0.4.tar.gz", hash = 
"sha256:8c9a4fa526484c1323c0a4436ef1198f34c0f1d6dde0973ea38f960d51e16ef0"}, +] + [[package]] name = "httpx" version = "0.28.1" @@ -630,6 +660,18 @@ files = [ {file = "isort-5.13.2.tar.gz", hash = "sha256:48fdfcb9face5d58a4f6dde2e72a1fb8dcaf8ab26f95ab49fab84c2ddefb0109"}, ] +[[package]] +name = "jeepney" +version = "0.9.0" +requires_python = ">=3.7" +summary = "Low-level, pure Python DBus protocol wrapper." +groups = ["default"] +marker = "\"bsd\" in sys_platform and python_version >= \"3.7\" or sys_platform == \"linux\" and python_version >= \"3.7\"" +files = [ + {file = "jeepney-0.9.0-py3-none-any.whl", hash = "sha256:97e5714520c16fc0a45695e5365a2e11b81ea79bba796e26f9f1d178cb182683"}, + {file = "jeepney-0.9.0.tar.gz", hash = "sha256:cf0e9e845622b81e4a28df94c40345400256ec608d0e55bb8a3feaa9163f5732"}, +] + [[package]] name = "jiter" version = "0.9.0" @@ -746,6 +788,24 @@ files = [ {file = "langsmith-0.3.33.tar.gz", hash = "sha256:0f439e945528c6d14140137b918cc048aea04c6a987525926dbfda2560002924"}, ] +[[package]] +name = "lz4" +version = "4.4.4" +requires_python = ">=3.9" +summary = "LZ4 Bindings for Python" +groups = ["default"] +files = [ + {file = "lz4-4.4.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:23ae267494fdd80f0d2a131beff890cf857f1b812ee72dbb96c3204aab725553"}, + {file = "lz4-4.4.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fff9f3a1ed63d45cb6514bfb8293005dc4141341ce3500abdfeb76124c0b9b2e"}, + {file = "lz4-4.4.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1ea7f07329f85a8eda4d8cf937b87f27f0ac392c6400f18bea2c667c8b7f8ecc"}, + {file = "lz4-4.4.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8ccab8f7f7b82f9fa9fc3b0ba584d353bd5aa818d5821d77d5b9447faad2aaad"}, + {file = "lz4-4.4.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e43e9d48b2daf80e486213128b0763deed35bbb7a59b66d1681e205e1702d735"}, + {file = "lz4-4.4.4-cp312-cp312-win32.whl", hash = "sha256:33e01e18e4561b0381b2c33d58e77ceee850a5067f0ece945064cbaac2176962"}, + {file = "lz4-4.4.4-cp312-cp312-win_amd64.whl", hash = "sha256:d21d1a2892a2dcc193163dd13eaadabb2c1b803807a5117d8f8588b22eaf9f12"}, + {file = "lz4-4.4.4-cp312-cp312-win_arm64.whl", hash = "sha256:2f4f2965c98ab254feddf6b5072854a6935adab7bc81412ec4fe238f07b85f62"}, + {file = "lz4-4.4.4.tar.gz", hash = "sha256:070fd0627ec4393011251a094e08ed9fdcc78cb4e7ab28f507638eee4e39abda"}, +] + [[package]] name = "mccabe" version = "0.7.0" @@ -994,6 +1054,26 @@ files = [ {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"}, ] +[[package]] +name = "pycryptodomex" +version = "3.22.0" +requires_python = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +summary = "Cryptographic library for Python" +groups = ["default"] +files = [ + {file = "pycryptodomex-3.22.0-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:aef4590263b9f2f6283469e998574d0bd45c14fb262241c27055b82727426157"}, + {file = "pycryptodomex-3.22.0-cp37-abi3-macosx_10_9_x86_64.whl", hash = "sha256:5ac608a6dce9418d4f300fab7ba2f7d499a96b462f2b9b5c90d8d994cd36dcad"}, + {file = "pycryptodomex-3.22.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7a24f681365ec9757ccd69b85868bbd7216ba451d0f86f6ea0eed75eeb6975db"}, + {file = "pycryptodomex-3.22.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:259664c4803a1fa260d5afb322972813c5fe30ea8b43e54b03b7e3a27b30856b"}, + {file = 
"pycryptodomex-3.22.0-cp37-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7127d9de3c7ce20339e06bcd4f16f1a1a77f1471bcf04e3b704306dde101b719"}, + {file = "pycryptodomex-3.22.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ee75067b35c93cc18b38af47b7c0664998d8815174cfc66dd00ea1e244eb27e6"}, + {file = "pycryptodomex-3.22.0-cp37-abi3-musllinux_1_2_i686.whl", hash = "sha256:1a8b0c5ba061ace4bcd03496d42702c3927003db805b8ec619ea6506080b381d"}, + {file = "pycryptodomex-3.22.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:bfe4fe3233ef3e58028a3ad8f28473653b78c6d56e088ea04fe7550c63d4d16b"}, + {file = "pycryptodomex-3.22.0-cp37-abi3-win32.whl", hash = "sha256:2cac9ed5c343bb3d0075db6e797e6112514764d08d667c74cb89b931aac9dddd"}, + {file = "pycryptodomex-3.22.0-cp37-abi3-win_amd64.whl", hash = "sha256:ff46212fda7ee86ec2f4a64016c994e8ad80f11ef748131753adb67e9b722ebd"}, + {file = "pycryptodomex-3.22.0.tar.gz", hash = "sha256:a1da61bacc22f93a91cbe690e3eb2022a03ab4123690ab16c46abb693a9df63d"}, +] + [[package]] name = "pydantic" version = "2.11.3" @@ -1362,6 +1442,21 @@ files = [ {file = "setuptools-79.0.1.tar.gz", hash = "sha256:128ce7b8f33c3079fd1b067ecbb4051a66e8526e7b65f6cec075dfc650ddfa88"}, ] +[[package]] +name = "shadowcopy" +version = "0.0.4" +requires_python = ">=3.7" +summary = "A project for shadowcopy" +groups = ["default"] +marker = "python_version >= \"3.7\" and platform_system == \"Windows\"" +dependencies = [ + "wmi", +] +files = [ + {file = "shadowcopy-0.0.4-py3-none-any.whl", hash = "sha256:fc51e59a639dc6a5a3a7a9b4e3ecadc71989e339f2d995d90aaa491acd4ba4eb"}, + {file = "shadowcopy-0.0.4.tar.gz", hash = "sha256:ed89817dda065f893607a04c0b7d6b3b34c3507a4711f441111a4bcb1b1826c0"}, +] + [[package]] name = "six" version = "1.17.0" @@ -1636,6 +1731,20 @@ files = [ {file = "virtualenv-20.30.0.tar.gz", hash = "sha256:800863162bcaa5450a6e4d721049730e7f2dae07720e0902b0e4040bd6f9ada8"}, ] +[[package]] +name = "wmi" +version = "1.5.1" +summary = "Windows Management Instrumentation" +groups = ["default"] +marker = "python_version >= \"3.7\" and platform_system == \"Windows\"" +dependencies = [ + "pywin32", +] +files = [ + {file = "WMI-1.5.1-py2.py3-none-any.whl", hash = "sha256:1d6b085e5c445141c475476000b661f60fff1aaa19f76bf82b7abb92e0ff4942"}, + {file = "WMI-1.5.1.tar.gz", hash = "sha256:b6a6be5711b1b6c8d55bda7a8befd75c48c12b770b9d227d31c1737dbf0d40a6"}, +] + [[package]] name = "zstandard" version = "0.23.0" diff --git a/pyproject.toml b/pyproject.toml index fc9b009..f4f5a14 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,8 @@ dependencies = [ "urllib3>=2.4.0", "bs4>=0.0.2", "requests-kerberos>=0.15.0", + "browser-cookie3>=0.19.1", + "httpx-gssapi>=0.2.0", ] requires-python = "==3.12.*" @@ -35,6 +37,7 @@ py-modules = [] [project.scripts] jira_scraper = "data_scraper.main:jira_scraper" errata_scraper = "data_scraper.main:errata_scraper" +ci_logs_scraper = "data_scraper.main:ci_logs_scraper" feedback_exporter = "feedback_exporter.export_feedback:main" evaluation = "evaluation.evaluation:main" osp_doc_scraper = "data_scraper.main:osp_doc_scraper"