diff --git a/data_scraper/common/constants.py b/data_scraper/common/constants.py index c42ec9b..6d417a8 100644 --- a/data_scraper/common/constants.py +++ b/data_scraper/common/constants.py @@ -3,6 +3,7 @@ JIRA_COLLECTION_NAME = "rca-knowledge-base" OSP_DOCS_COLLECTION_NAME = "rca-osp-docs-knowledge-base" ERRATA_COLLECTION_NAME = "rca-errata" +CI_LOGS_COLLECTION_NAME = "rca-ci" DEFAULT_EMBEDDING_MODEL = "BAAI/bge-m3" DEFAULT_JIRA_URL = "https://issues.redhat.com" DEFAULT_JIRA_PROJECTS = { @@ -10,6 +11,15 @@ "OSPCIX", "OSPRH", } +DEFAULT_ZULL_PIPELINES = { + "openstack-uni-jobs-periodic-integration-rhoso18.0-rhel9", + "openstack-uni-update-jobs-periodic-integration-rhoso18.0-rhel9", + "ci-framework-integrity", +} + +DEFAULT_ZULL_TENANTS = { + "components-integration", +} DEFAULT_CHUNK_SIZE = 1024 DEFAULT_MAX_RESULTS = 10000 DEFAULT_DATE_CUTOFF = "2000-01-01T00:00:00Z" diff --git a/data_scraper/core/ci_logs_scraper.py b/data_scraper/core/ci_logs_scraper.py new file mode 100644 index 0000000..dac2407 --- /dev/null +++ b/data_scraper/core/ci_logs_scraper.py @@ -0,0 +1,125 @@ +"""Code for scraping test operator logs data""" +import logging +from typing import TypedDict, List, Dict +import json + +import pandas as pd + +from data_scraper.core.scraper import Scraper + +LOG = logging.getLogger(__name__) +LOG.setLevel(logging.INFO) + + +class CILogsRecord(TypedDict): + """CILogs data point.""" + url: str + topic: str + text: str + components: list[str] + kind: str + +class CILogsScraper(Scraper): + """Main class for test operator logs scraping and processing.""" + + def __init__(self, config: dict): + super().__init__(config=config) + self.config = config + + def get_documents(self) -> list[dict]: + all_failures = self._load_test_operator_results() + + return all_failures + + def get_records(self, documents: list[dict]) -> list[CILogsRecord]: + ci_logs_records: list[CILogsRecord] = [] + + for document in documents: + ci_logs_records.append({ + "url": document["url"], + "topic": document["test_name"], + "text": document["traceback"], + "components": [], + "kind": "zuul_jobs", + }) + + return ci_logs_records + + def get_chunks(self, record: dict) -> list[str]: + chunks = [] + + for field in ["text"]: + chunks += self.text_processor.split_text(record[field]) + + return chunks + + def record_postprocessing(self, record): + # Postprocessing is not required + pass + + # pylint: disable=R0801 + def cleanup_records( + self, records: list, backup_path: str = "ci_logs_all_data.pickle" + ) -> list: + df = pd.DataFrame(records) + + LOG.info("Records stats BEFORE cleanup:") + LOG.info(df.info()) + + df = df.dropna() + df = df.drop_duplicates(subset=["text"]) + + LOG.info("Records stats AFTER cleanup:") + LOG.info(df.info()) + + LOG.info("Saving backup to: %s", backup_path) + df.to_pickle(backup_path) + + return [CILogsRecord(**row) for row in df.to_dict(orient="records")] + + def _get_job_url(self, url: str) -> str: + """ + Extract the job URL from a test report URL by removing everything from 'logs/controller-0'. 
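Reviewer note: the URL trimming described in the docstring above is easy to sanity-check in isolation. A minimal sketch of the same behaviour (the standalone function name and the example URL are illustrative, not taken from a real job):

```python
SPLIT_MARKER = "logs/controller-0"

def job_url_from_report_url(url: str) -> str:
    """Mirror of _get_job_url: drop everything from 'logs/controller-0' onwards."""
    if SPLIT_MARKER in url:
        return url.split(SPLIT_MARKER)[0].rstrip("/")
    return url  # returned unchanged when the marker is absent

# Hypothetical tempest report URL -> job URL
report = ("https://logserver.example.com/periodic/run/42/"
          "logs/controller-0/ci-framework-data/tests/test_operator/"
          "tempest-tests/stestr_results.html")
assert job_url_from_report_url(report) == "https://logserver.example.com/periodic/run/42"
```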
+ + Args: + url (str): The complete report URL + + Returns: + str: The job URL + """ + split_marker = "logs/controller-0" + if split_marker in url: + base_url = url.split(split_marker)[0] + return base_url.rstrip('/') + + return url + + def _load_test_operator_results(self) -> List[Dict[str, str]]: + """ + Loads test results from the tempest_tests_tracebacks.json + + Args: + url: URL of the tempest report (used as a key to find matching results) + + Returns: + List of dictionaries containing test names and tracebacks + """ + try: + file_name = self.config["tracebacks_json"] + with open(file_name, "r", encoding="utf-8") as f: + tracebacks = json.load(f) + + results = [] + for report in tracebacks: + job_url = self._get_job_url(report.get("url", "Unknown url")) + for test in report.get("failed_tests", []): + results.append({ + "url": job_url, + "test_name": test.get("name", "Unknown test"), + "traceback": test.get("traceback", "No traceback available") + }) + return results + + except json.JSONDecodeError: + LOG.error("%s", "Error parsing tempest_tests_tracebacks.json file") + return [] diff --git a/data_scraper/main.py b/data_scraper/main.py index 54ef3ce..43940d2 100644 --- a/data_scraper/main.py +++ b/data_scraper/main.py @@ -6,7 +6,8 @@ from data_scraper.common import constants from data_scraper.core.scraper import JiraScraper, OSPDocScraper from data_scraper.core.errata_scraper import ErrataScraper - +from data_scraper.core.ci_logs_scraper import CILogsScraper +from data_scraper.processors.ci_logs_provider import TestOperatorReportsProvider logging.basicConfig( format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" ) @@ -178,3 +179,74 @@ def errata_scraper() -> None: scraper = ErrataScraper(config_args) scraper.run() + + + + +def ci_logs_scraper() -> None: + """Entry point for command line execution.""" + parser = ArgumentParser("ci_logs_scraper") + + # Required arguments + parser.add_argument("--database_client_url", type=str, required=True) + parser.add_argument("--llm_server_url", type=str, required=True) + parser.add_argument("--llm_api_key", type=str, required=True) + parser.add_argument("--database_api_key", type=str, required=True) + parser.add_argument("--zuul-url", type=str, required=True) + + # Optional arguments + parser.add_argument("--chunk_size", type=int, + default=constants.DEFAULT_CHUNK_SIZE) + parser.add_argument("--embedding_model", type=str, + default=constants.DEFAULT_EMBEDDING_MODEL) + parser.add_argument("--db_collection_name", type=str, + default=constants.CI_LOGS_COLLECTION_NAME) + parser.add_argument("--date_cutoff", type=datetime.fromisoformat, + default=datetime.fromisoformat(constants.DEFAULT_DATE_CUTOFF), + help=( + "No issues from before this date will be used. 
" + "Date must follow ISO format 'YYYY-MM-DD'" + ) + ) + parser.add_argument("--recreate_collection", type=bool, default=True, + help="Recreate database collection from scratch.") + parser.add_argument("--pipelines", nargs='+', type=str, + default=constants.DEFAULT_ZULL_PIPELINES) + parser.add_argument("--tenants", nargs='+', type=str, + default=constants.DEFAULT_ZULL_TENANTS) + parser.add_argument("--populate_db_from_json", type=bool, default=False, + help="Used from Zuul jobs that create json file at the end of their runs.") + args = parser.parse_args() + + config_args = { + "database_client_url": args.database_client_url, + "llm_server_url": args.llm_server_url, + "llm_api_key": args.llm_api_key, + "database_api_key": args.database_api_key, + "chunk_size": args.chunk_size, + "embedding_model": args.embedding_model, + "db_collection_name": args.db_collection_name, + "zuul_url": args.zuul_url, + "date_cutoff": args.date_cutoff, + "recreate_collection": args.recreate_collection, + "pipelines": args.pipelines, + "tenants": args.tenants, + "tracebacks_json": "/tmp/tracebacks.json" + } + + + + tenant = list(constants.DEFAULT_ZULL_TENANTS)[0] # just one tenenat is testsed ATM + pipelines = list(constants.DEFAULT_ZULL_PIPELINES) # Just RHOSO pipelines are tested ATM + + if not args.populate_db_from_json: + # Get test operator reports, save tracebacks and create json + provider = TestOperatorReportsProvider(args.zuul_url, + tenant, + pipelines, + config_args["tracebacks_json"]) + provider.run() + + # when json is ready, proceed with tracebacks and store them to QdrantDB + scraper = CILogsScraper(config_args) + scraper.run() diff --git a/data_scraper/processors/ci_logs_provider.py b/data_scraper/processors/ci_logs_provider.py new file mode 100644 index 0000000..7ec16d0 --- /dev/null +++ b/data_scraper/processors/ci_logs_provider.py @@ -0,0 +1,679 @@ +"""Code for test operator logs data provisioning""" +# Standard library imports +import asyncio +import json +import logging +import os +import re +import urllib.parse +import warnings +from datetime import datetime, timedelta + +import browser_cookie3 +import httpx +import requests +from bs4 import BeautifulSoup +from httpx_gssapi import HTTPSPNEGOAuth +from httpx_gssapi.gssapi_ import OPTIONAL + +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) +LOG = logging.getLogger(__name__) + +# API endpoint constants +API_BASE = "zuul/api" +API_TENANT = API_BASE + "/tenant/{tenant}" +API_BUILDS = API_TENANT + "/builds" + +# Filter constants +FILTER_RESULT_FAILURE = "FAILURE" +FILTER_LIMIT = 20000 + +# Time filter constant - builds older than this will be excluded +CUTOFF_TIME = datetime.now() - timedelta(days=14) # 2 weeks ago + +# Test path constants +TEST_OPERATOR_PATH = "logs/controller-0/ci-framework-data/tests/test_operator" +TEMPEST_TEST_PATTERN = "tempest-tests" +TOBIKO_TEST_PATTERN = "tobiko-tests" + +async def fetch_with_gssapi(url, params=None, timeout=30.0): + """ + Fetch content using Kerberos authentication. 
+ + Args: + url: URL to fetch + params: Optional query parameters + timeout: Request timeout in seconds + + Returns: + Response text content + """ + async with httpx.AsyncClient( + verify=False, + follow_redirects=True, + timeout=timeout + ) as session: + response = await session.get( + url, + params=params, + auth=HTTPSPNEGOAuth(mutual_authentication=OPTIONAL) + ) + response.raise_for_status() + return response.text + +def make_authenticated_request(url, params=None, timeout=30.0): + """ + Make an authenticated request to a URL using multiple authentication methods. + + Args: + url: URL to fetch + params: Optional query parameters + timeout: Request timeout in seconds + + Returns: + Response content as text, or None if an error occurs + For JSON responses, the caller will need to parse the text + """ + parsed_url = urllib.parse.urlparse(url) + domain = parsed_url.netloc + + verify = False + warnings.filterwarnings('ignore', message='Unverified HTTPS request') + + # First try with Kerberos + try: + LOG.info("Attempting to authenticate using Kerberos...") + content = asyncio.run(fetch_with_gssapi(url, params, timeout)) + LOG.info("Kerberos authentication successful") + return content + + except (httpx.HTTPError, httpx.RequestError, httpx.TimeoutException) as e: + LOG.warning("Kerberos authentication failed due to HTTP error: %s", e) + LOG.info("Falling back to browser cookies authentication...") + + # Second try with cookies from browsers + cookies = None + if 'redhat.com' in domain: + try: + cookies = browser_cookie3.chrome(domain_name=domain) + LOG.info("Using Chrome cookies for domain: %s", domain) + except (ImportError, RuntimeError, FileNotFoundError) as chrome_error: + LOG.warning("Could not get Chrome cookies: %s", chrome_error) + + try: + cookies = browser_cookie3.firefox(domain_name=domain) + LOG.info("Using Firefox cookies for domain: %s", domain) + except (ImportError, RuntimeError, FileNotFoundError) as firefox_error: + LOG.warning("Could not get Firefox cookies: %s", firefox_error) + + try: + response = requests.get( + url, + params=params, + cookies=cookies, + verify=verify, + timeout=timeout + ) + response.raise_for_status() + return response.text + + except (requests.RequestException, requests.HTTPError, requests.ConnectionError, + requests.Timeout, requests.TooManyRedirects) as request_error: + LOG.error("Error fetching from %s: %s", url, request_error) + return None + +class TempestResultsParser: + """Parser for tempest test HTML reports.""" + + def __init__(self, source=None): + self.source = source + self.html_content = None + self.soup = None + self.failed_tests = [] + + if source: + self.load_content(source) + + def load_content(self, source): + """Load content from a URL or file.""" + self.source = source + self.html_content = self._get_content(source) + if self.html_content: + self.soup = BeautifulSoup(self.html_content, 'html.parser') + return self + + def _get_content(self, source): + """Retrieve content from a URL or file.""" + if source.startswith('http://') or source.startswith('https://'): + LOG.info("Downloading from URL: %s", source) + return make_authenticated_request(source, timeout=30.0) + + LOG.info("Reading local file: %s", source) + try: + with open(source, 'r', encoding='utf-8') as f: + content = f.read() + return content + except IOError as e: + LOG.error("IO error reading file %s: %s", source, e) + return None + + def parse(self): + """Parse the HTML content and extract failed tests.""" + if not self.soup: + LOG.warning("No content loaded or parsing 
failed. Nothing to parse.") + return self.failed_tests + + self.failed_tests = [] + self._parse_html_format() + return self.failed_tests + + + def _extract_test_name(self, test_name_part: str) -> str: + """Extract the test name from the text before the traceback.""" + # Extract the test name using a regex pattern + test_name_match = re.search(r'ft\d+\.\d+:\s*(.*?)\)?testtools', test_name_part) + if test_name_match: + test_name = test_name_match.group(1).strip() + if test_name.endswith('('): + test_name = test_name[:-1].strip() + else: + # Try alternative pattern for different formats + test_name_match = re.search(r'ft\d+\.\d+:\s*(.*?)$', test_name_part) + if test_name_match: + test_name = test_name_match.group(1).strip() + else: + test_name = "Unknown Test Name" + + # Remove any content within square brackets + # e.g. test_tagged_boot_devices[id-a2e65a6c,image,network,slow,volume] + # becomes test_tagged_boot_devices + test_name = re.sub(r'\[.*?\]', '', test_name).strip() + + # Remove any content within parentheses + test_name = re.sub(r'\(.*?\)', '', test_name).strip() + + return test_name + + def _parse_html_format(self): + """Parse HTML formatted tempest test results.""" + + soup = self.soup + + # The code is copy/past-ed from src/api.py#L101-L141 + # https://github.com/RCAccelerator/chatbot/. Get rid + # of duplication at some stage. ATM it's a QnD solution to have exaclty the same + # parsing of tempest test reports both at the endpoint and in scraper. + failed_test_rows = soup.find_all('tr', id=re.compile(r'^ft\d+\.\d+')) + + for row in failed_test_rows: + row_text = row.get_text().strip() + + traceback_start_marker = "Traceback (most recent call last):" + traceback_start_index = row_text.find(traceback_start_marker) + + if traceback_start_index != -1: + test_name_part = row_text[:traceback_start_index].strip() + test_name = self._extract_test_name(test_name_part) + + traceback_text = row_text[traceback_start_index:] + end_marker_index = traceback_text.find("}}}") + if end_marker_index != -1: + traceback_text = traceback_text[:end_marker_index].strip() + else: + traceback_text = traceback_text.strip() + + self.failed_tests.append((test_name, traceback_text or "No traceback found")) + + + def has_failures(self): + """Check if any failed tests were found.""" + return len(self.failed_tests) > 0 + + def get_failure_summary(self): + """Get a summary of the failed tests.""" + if not self.failed_tests: + return "No failed tests found." + + summary = f"Found {len(self.failed_tests)} failed tests:\n" + for test_name, _ in self.failed_tests: + summary += f"- {test_name}\n" + + return summary + + +class ZuulClient: + """Client for fetching Zuul build data and analyzing test reports.""" + + def __init__(self, base_url): + """ + Initialize the client with the base URL. + + Args: + base_url: Base URL for the Zuul server + """ + self.base_url = base_url.rstrip('/') + + def retrive_failed_builds(self, tenant, pipeline, limit=FILTER_LIMIT): + """ + Get failed builds for a specific pipeline as JSON. 
+ + Args: + tenant: Tenant name + pipeline: Pipeline name + limit: Maximum number of builds to return + + Returns: + JSON response as dictionary with builds filtered by cutoff time + """ + endpoint = API_BUILDS.format(tenant=tenant) + url = f"{self.base_url}/{endpoint}" + + params = { + "limit": limit, + "result": FILTER_RESULT_FAILURE, + "pipeline": pipeline + } + + LOG.info("Fetching data from: %s with params: %s", url, params) + + try: + content_text = make_authenticated_request(url, params=params, timeout=60.0) + if not content_text: + return [] + + result = json.loads(content_text) + LOG.info("Successfully fetched %d builds", len(result)) + + filtered_builds = self._filter_builds_by_date(result) + return filtered_builds + + except (requests.RequestException, requests.HTTPError, requests.ConnectionError, + requests.Timeout, requests.TooManyRedirects) as e: + LOG.error("Error fetching data: %s", e) + return [] + + def _filter_builds_by_date(self, builds): + """ + Filter builds based on the cutoff time. + + Args: + builds: List of build dictionaries + + Returns: + Filtered list of build dictionaries + """ + filtered_builds = [] + + for build in builds: + end_time = build.get("end_time") + if not end_time: + end_time = build.get("start_time") + + # Skip if no timestamp available + if not end_time: + continue + + # Parse the timestamp + try: + # Handle different timestamp formats + if 'Z' in end_time: + # ISO format with Z for UTC + build_time = datetime.fromisoformat(end_time.replace('Z', '+00:00')) + elif 'T' in end_time and '+' in end_time: + # ISO format with timezone offset + build_time = datetime.fromisoformat(end_time) + else: + # Assume simple format + build_time = datetime.fromisoformat(end_time) + + # Include only builds newer than the cutoff time + if build_time >= CUTOFF_TIME: + filtered_builds.append(build) + except ValueError: + LOG.warning("%s", "Could not parse timestamp") + # Include builds with unparseable timestamps by default + filtered_builds.append(build) + + LOG.info("Filtered to %d builds after cutoff time", len(filtered_builds)) + return filtered_builds + + def fetch_html_content(self, url): + """ + Fetch HTML content from a URL. + + Args: + url: URL to fetch + + Returns: + HTML content as string, or None if an error occurs + """ + LOG.info("Fetching HTML from: %s", url) + return make_authenticated_request(url) + + def find_test_reports(self, builds_json): + """ + Find test reports in the build logs. + + Args: + builds_json: JSON dictionary of builds + + Returns: + Dictionary mapping build UUIDs to test report information + """ + results = {} + + for build in builds_json: + build_uuid = build.get("uuid", "unknown") + log_url = build.get("log_url") + + if not log_url: + LOG.info("Build %s has no log_url", build_uuid) + continue + + LOG.info("Processing build %s with log URL %s", build_uuid, log_url) + + results[build_uuid] = self._process_build_reports(log_url) + + return results + + def _process_build_reports(self, log_url): + """ + Process test reports for a single build. 
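One caveat in the date filtering above: CUTOFF_TIME is a naive datetime (datetime.now()), while a timestamp containing 'Z' or an offset parses to a timezone-aware one, and Python raises TypeError (not the ValueError being caught) when the two are compared. If the Zuul API ever returns aware timestamps, normalising both sides avoids that; a possible sketch:

```python
from datetime import datetime, timezone

def parse_build_time(raw: str) -> datetime:
    """Parse a Zuul timestamp and return a naive UTC datetime for comparison."""
    ts = datetime.fromisoformat(raw.replace('Z', '+00:00'))
    if ts.tzinfo is not None:
        # Convert aware timestamps to UTC and drop tzinfo so they
        # compare cleanly against the naive CUTOFF_TIME.
        ts = ts.astimezone(timezone.utc).replace(tzinfo=None)
    return ts

# parse_build_time("2025-05-01T12:00:00Z") and parse_build_time("2025-05-01T12:00:00")
# can now both be compared with CUTOFF_TIME without raising.
```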
+ + Args: + log_url: URL to the build logs + + Returns: + Dictionary with test report information + """ + test_operator_path = f"{log_url.rstrip('/')}/{TEST_OPERATOR_PATH}" + operator_html = self.fetch_html_content(test_operator_path) + + if not operator_html: + LOG.info("Log URL %s contains no test_operator directory", log_url) + return { + "log_url": log_url, + "status": "no_test_operator_dir", + "tempest_reports": [], + "tobiko_reports": [] + } + + test_dirs = self._find_test_directories(operator_html) + + if not test_dirs: + LOG.info("No test directories found in %s", test_operator_path) + return { + "log_url": log_url, + "status": "no_test_directories", + "tempest_reports": [], + "tobiko_reports": [] + } + + tempest_reports = [] + tobiko_reports = [] + + for test_dir in test_dirs: + test_dir_url = f"{test_operator_path}/{test_dir}" + + if TEMPEST_TEST_PATTERN in test_dir: + tempest_reports.extend(self._process_tempest_directory(test_dir_url, test_dir)) + elif TOBIKO_TEST_PATTERN in test_dir: + tobiko_reports.extend(self._process_tobiko_directory(test_dir_url, test_dir)) + + status = "test_reports_found" if (tempest_reports or tobiko_reports) else "no_test_reports" + + return { + "log_url": log_url, + "status": status, + "tempest_reports": tempest_reports, + "tobiko_reports": tobiko_reports + } + + def _find_test_directories(self, html_content): + """Find test directories in HTML content.""" + soup = BeautifulSoup(html_content, "html.parser") + test_dirs = [] + + for link in soup.find_all("a", href=True): + href = link["href"] + # Look for directories (ends with /) + if href.endswith("/") and not href.startswith("/") and href != "../": + test_dirs.append(href) + + return test_dirs + + def _process_tempest_directory(self, dir_url, dir_name): + """Process a tempest test directory.""" + LOG.info("Checking tempest test directory: %s", dir_url) + reports = [] + html_files = self._find_html_files(dir_url) + + for html_file in html_files: + parser = TempestResultsParser(html_file) + parser.parse() + has_failures = parser.has_failures() + + report = { + "url": html_file, + "directory": dir_name, + "has_failures": has_failures, + "failed_tests": [] + } + + if has_failures: + for test_name, traceback in parser.failed_tests: + report["failed_tests"].append({ + "name": test_name, + "traceback": traceback + }) + + reports.append(report) + + return reports + + def _process_tobiko_directory(self, dir_url, dir_name): + """Process a tobiko test directory.""" + LOG.info("Checking tobiko test directory: %s", dir_url) + reports = [] + html_files = self._find_html_files(dir_url) + + for html_file in html_files: + # For now, we're not checking tobiko reports for failures + reports.append({ + "url": html_file, + "directory": dir_name, + "has_failures": False, # Placeholder + "failed_tests": [] # Placeholder + }) + + return reports + + def _find_html_files(self, directory_url): + """ + Find HTML files in a directory. 
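To make the per-build structure explicit (this is what find_test_reports stores under each build UUID), a sketch with placeholder values:

```python
# Result shape produced by _process_build_reports (values are illustrative).
build_report = {
    "log_url": "https://logserver.example.com/run/42/",
    # one of: test_reports_found / no_test_reports / no_test_directories / no_test_operator_dir
    "status": "test_reports_found",
    "tempest_reports": [
        {
            "url": ("https://logserver.example.com/run/42/logs/controller-0/"
                    "ci-framework-data/tests/test_operator/tempest-tests/stestr_results.html"),
            "directory": "tempest-tests/",
            "has_failures": True,
            "failed_tests": [
                {"name": "test_volume_boot_pattern",
                 "traceback": "Traceback (most recent call last): ..."},
            ],
        },
    ],
    "tobiko_reports": [],  # collected but not yet scanned for failures
}
```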
+ + Args: + directory_url: URL of the directory to search + + Returns: + List of HTML file URLs + """ + html_files = [] + + dir_html = self.fetch_html_content(directory_url) + + if not dir_html: + LOG.info("Could not access directory: %s", directory_url) + return html_files + + dir_soup = BeautifulSoup(dir_html, "html.parser") + + for link in dir_soup.find_all("a", href=True): + href = link["href"] + if href.endswith(".html") and not href.startswith("/"): + html_url = f"{directory_url}/{href}" + html_files.append(html_url) + LOG.info("Found HTML file: %s", html_url) + + return html_files + + +def save_failed_tempest_paths(report_results): + """ + Save only the failed tempest report paths to a file. + + Args: + report_results: Dictionary mapping build UUIDs to test report information + """ + failed_paths = [] + + for _, result in report_results.items(): + if result["status"] == "test_reports_found": + tempest_reports = result["tempest_reports"] + for report in tempest_reports: + if report["has_failures"]: + failed_paths.append(report["url"]) + + with open("failed_tempest_paths.txt", "w", encoding='utf-8') as f: + for path in failed_paths: + f.write(f"{path}\n") + + return failed_paths + + +def create_tempest_failures_json(report_results, tracebacks_json): + """ + Create a JSON file containing tempest report URLs and their failed tests. + + Args: + report_results: Dictionary mapping build UUIDs to test report information + """ + tempest_failures = [] + + for _, result in report_results.items(): + if result["status"] == "test_reports_found": + tempest_reports = result["tempest_reports"] + for report in tempest_reports: + if report["has_failures"]: + tempest_failures.append({ + "url": report["url"], + "failed_tests": report["failed_tests"] + }) + + with open(tracebacks_json, "w", encoding='utf-8') as f: + json.dump(tempest_failures, f, indent=2) + + return tempest_failures + + +# # pylint: disable=too-few-public-methods + +# class TestOperatorReportsProvider: +# """Class responsible for retrieving and processing test operator reports.""" + +# def __init__(self, server_url, tenant, pipeline, tracebacks_json): +# """Initialize the TestOperatorReportsProvider with a server URL. + +# Args: +# server_url (str): The URL of the Zuul server. +# """ +# self.server_url = server_url +# self.tenant = tenant +# self.pipeline = pipeline +# self.tracebacks_json = tracebacks_json + +# def run(self): +# """Entry point of TestOperatorReportsProvider. +# """ +# server_url = self.server_url +# tenant = self.tenant +# pipeline = self.pipeline + +# client = ZuulClient(server_url) + + +# builds = client.get_failed_builds_json(tenant, pipeline) + +# if not builds: +# LOG.error("%s", "No builds found or authentication failed") +# return + +# LOG.info("Found %d failed builds within the last 2 weeks (after %s)", +# len(builds), CUTOFF_TIME.isoformat()) + +# with open("failed_builds.json", "w", encoding='utf-8') as f: +# json.dump(builds, f, indent=2) + +# report_results = client.find_test_reports(builds) + +# failed_paths = save_failed_tempest_paths(report_results) + +# for path in failed_paths: +# print(path) + +# create_tempest_failures_json(report_results,self.tracebacks_json ) + + +class TestOperatorReportsProvider: + """Class responsible for retrieving and processing test operator reports.""" + + def __init__(self, server_url, tenant, pipelines, tracebacks_json): + """Initialize the TestOperatorReportsProvider with a server URL. + + Args: + server_url (str): The URL of the Zuul server. 
+ tenant (str): The tenant name. + pipelines (list): A list of pipeline names. + tracebacks_json (str): Path to the tracebacks JSON file. + """ + self.server_url = server_url + self.tenant = tenant + self.pipelines = pipelines + self.tracebacks_json = tracebacks_json + + if os.path.exists("failed_builds.txt"): + os.remove("failed_builds.txt") + LOG.info("Deleted existing failed_builds.txt file") + + if os.path.exists(self.tracebacks_json): + os.remove(self.tracebacks_json) + LOG.info("Deleted existing %s file", self.tracebacks_json) + + def run(self): + """Entry point of TestOperatorReportsProvider. + """ + server_url = self.server_url + tenant = self.tenant + + client = ZuulClient(server_url) + + all_builds = [] + + for pipeline in self.pipelines: + LOG.info("Processing pipeline: %s", pipeline) + builds = client.retrive_failed_builds(tenant, pipeline) + + if not builds: + LOG.warning("No builds found for pipeline %s", pipeline) + continue + + LOG.info("Found %d failed builds in pipeline %s within the last 2 weeks (after %s)", + len(builds), pipeline, CUTOFF_TIME.isoformat()) + + all_builds.extend(builds) + + with open("failed_builds.txt", "a", encoding='utf-8') as f: + for build in builds: + f.write(f"{build}\n") + + report_results = client.find_test_reports(all_builds) + + failed_paths = save_failed_tempest_paths(report_results) + + for path in failed_paths: + print(path) + + create_tempest_failures_json(report_results,self.tracebacks_json) diff --git a/data_scraper/processors/vector_store.py b/data_scraper/processors/vector_store.py index 5370f48..919cb2e 100644 --- a/data_scraper/processors/vector_store.py +++ b/data_scraper/processors/vector_store.py @@ -35,8 +35,10 @@ def __init__(self, client_url: str, api_key: str, timeout=100): self.client = QdrantClient(client_url, api_key=api_key, timeout=timeout) def recreate_collection(self, collection_name: str, vector_size: int): - """Recreate the collection with specified parameters.""" + """Recreate the collection with specified parameters, preserving a snapshot first.""" + if self.check_collection(collection_name): + self.client.create_snapshot(collection_name=collection_name) self.client.delete_collection(collection_name) self.client.create_collection( diff --git a/pdm.lock b/pdm.lock index ba2a914..7316ebb 100644 --- a/pdm.lock +++ b/pdm.lock @@ -5,7 +5,7 @@ groups = ["default", "dev"] strategy = ["inherit_metadata"] lock_version = "4.5.0" -content_hash = "sha256:88441772af1a0d4a2337bc05a1d8b8176519cac6addfde716906daf61bd3bfbe" +content_hash = "sha256:df11e293dd0f226387cc80c9feb53cf4c65c3b8f59486e6dd80dab479ea53c2b" [[metadata.targets]] requires_python = "==3.12.*" @@ -70,6 +70,23 @@ files = [ {file = "beautifulsoup4-4.13.4.tar.gz", hash = "sha256:dbb3c4e1ceae6aefebdaf2423247260cd062430a410e38c66f2baa50a8437195"}, ] +[[package]] +name = "browser-cookie3" +version = "0.20.1" +summary = "Loads cookies from your browser into a cookiejar object so can download with urllib and other libraries the same content you see in the web browser." 
+groups = ["default"] +dependencies = [ + "dbus-python; \"bsd\" in sys_platform and python_version < \"3.7\" or sys_platform == \"linux\" and python_version < \"3.7\"", + "jeepney; \"bsd\" in sys_platform and python_version >= \"3.7\" or sys_platform == \"linux\" and python_version >= \"3.7\"", + "lz4", + "pycryptodomex", + "shadowcopy; python_version >= \"3.7\" and platform_system == \"Windows\"", +] +files = [ + {file = "browser_cookie3-0.20.1-py3-none-any.whl", hash = "sha256:4b38bf669d386250733c8339f0036e1cf09c3d8e4d326fd507b9afb84def13d6"}, + {file = "browser_cookie3-0.20.1.tar.gz", hash = "sha256:6d8d0744bf42a5327c951bdbcf77741db3455b8b4e840e18bab266d598368a12"}, +] + [[package]] name = "bs4" version = "0.0.2" @@ -218,7 +235,6 @@ version = "5.2.1" requires_python = ">=3.8" summary = "Decorators for Humans" groups = ["default"] -marker = "sys_platform != \"win32\"" files = [ {file = "decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a"}, {file = "decorator-5.2.1.tar.gz", hash = "sha256:65f266143752f734b0a7cc83c46f4618af75b8c5911b00ccb61d0ac9b6da0360"}, @@ -316,7 +332,7 @@ files = [ [[package]] name = "google-api-python-client" -version = "2.167.0" +version = "2.169.0" requires_python = ">=3.7" summary = "Google API Client Library for Python" groups = ["default"] @@ -328,13 +344,13 @@ dependencies = [ "uritemplate<5,>=3.0.1", ] files = [ - {file = "google_api_python_client-2.167.0-py2.py3-none-any.whl", hash = "sha256:ce25290cc229505d770ca5c8d03850e0ae87d8e998fc6dd743ecece018baa396"}, - {file = "google_api_python_client-2.167.0.tar.gz", hash = "sha256:a458d402572e1c2caf9db090d8e7b270f43ff326bd9349c731a86b19910e3995"}, + {file = "google_api_python_client-2.169.0-py3-none-any.whl", hash = "sha256:dae3e882dc0e6f28e60cf09c1f13fedfd881db84f824dd418aa9e44def2fe00d"}, + {file = "google_api_python_client-2.169.0.tar.gz", hash = "sha256:0585bb97bd5f5bf3ed8d4bf624593e4c5a14d06c811d1952b07a1f94b4d12c51"}, ] [[package]] name = "google-auth" -version = "2.39.0" +version = "2.40.1" requires_python = ">=3.7" summary = "Google Authentication Library" groups = ["default"] @@ -344,8 +360,8 @@ dependencies = [ "rsa<5,>=3.1.4", ] files = [ - {file = "google_auth-2.39.0-py2.py3-none-any.whl", hash = "sha256:0150b6711e97fb9f52fe599f55648950cc4540015565d8fbb31be2ad6e1548a2"}, - {file = "google_auth-2.39.0.tar.gz", hash = "sha256:73222d43cdc35a3aeacbfdcaf73142a97839f10de930550d89ebfe1d0a00cde7"}, + {file = "google_auth-2.40.1-py2.py3-none-any.whl", hash = "sha256:ed4cae4f5c46b41bae1d19c036e06f6c371926e97b19e816fc854eff811974ee"}, + {file = "google_auth-2.40.1.tar.gz", hash = "sha256:58f0e8416a9814c1d86c9b7f6acf6816b51aba167b2c76821965271bac275540"}, ] [[package]] @@ -462,7 +478,6 @@ version = "1.9.0" requires_python = ">=3.8" summary = "Python GSSAPI Wrapper" groups = ["default"] -marker = "sys_platform != \"win32\"" dependencies = [ "decorator", ] @@ -561,6 +576,21 @@ files = [ {file = "httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc"}, ] +[[package]] +name = "httpx-gssapi" +version = "0.4" +requires_python = ">=3.9" +summary = "A Python GSSAPI authentication handler for HTTPX" +groups = ["default"] +dependencies = [ + "gssapi", + "httpx<0.29,>=0.16", +] +files = [ + {file = "httpx_gssapi-0.4-py3-none-any.whl", hash = "sha256:ea390c62015d3b58e594936af75b3fbf6421e53e334d587db76623f2f07c93e8"}, + {file = "httpx_gssapi-0.4.tar.gz", hash = 
"sha256:8c9a4fa526484c1323c0a4436ef1198f34c0f1d6dde0973ea38f960d51e16ef0"}, +] + [[package]] name = "httpx" version = "0.28.1" @@ -630,6 +660,18 @@ files = [ {file = "isort-5.13.2.tar.gz", hash = "sha256:48fdfcb9face5d58a4f6dde2e72a1fb8dcaf8ab26f95ab49fab84c2ddefb0109"}, ] +[[package]] +name = "jeepney" +version = "0.9.0" +requires_python = ">=3.7" +summary = "Low-level, pure Python DBus protocol wrapper." +groups = ["default"] +marker = "\"bsd\" in sys_platform and python_version >= \"3.7\" or sys_platform == \"linux\" and python_version >= \"3.7\"" +files = [ + {file = "jeepney-0.9.0-py3-none-any.whl", hash = "sha256:97e5714520c16fc0a45695e5365a2e11b81ea79bba796e26f9f1d178cb182683"}, + {file = "jeepney-0.9.0.tar.gz", hash = "sha256:cf0e9e845622b81e4a28df94c40345400256ec608d0e55bb8a3feaa9163f5732"}, +] + [[package]] name = "jiter" version = "0.9.0" @@ -746,6 +788,24 @@ files = [ {file = "langsmith-0.3.33.tar.gz", hash = "sha256:0f439e945528c6d14140137b918cc048aea04c6a987525926dbfda2560002924"}, ] +[[package]] +name = "lz4" +version = "4.4.4" +requires_python = ">=3.9" +summary = "LZ4 Bindings for Python" +groups = ["default"] +files = [ + {file = "lz4-4.4.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:23ae267494fdd80f0d2a131beff890cf857f1b812ee72dbb96c3204aab725553"}, + {file = "lz4-4.4.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fff9f3a1ed63d45cb6514bfb8293005dc4141341ce3500abdfeb76124c0b9b2e"}, + {file = "lz4-4.4.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1ea7f07329f85a8eda4d8cf937b87f27f0ac392c6400f18bea2c667c8b7f8ecc"}, + {file = "lz4-4.4.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8ccab8f7f7b82f9fa9fc3b0ba584d353bd5aa818d5821d77d5b9447faad2aaad"}, + {file = "lz4-4.4.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e43e9d48b2daf80e486213128b0763deed35bbb7a59b66d1681e205e1702d735"}, + {file = "lz4-4.4.4-cp312-cp312-win32.whl", hash = "sha256:33e01e18e4561b0381b2c33d58e77ceee850a5067f0ece945064cbaac2176962"}, + {file = "lz4-4.4.4-cp312-cp312-win_amd64.whl", hash = "sha256:d21d1a2892a2dcc193163dd13eaadabb2c1b803807a5117d8f8588b22eaf9f12"}, + {file = "lz4-4.4.4-cp312-cp312-win_arm64.whl", hash = "sha256:2f4f2965c98ab254feddf6b5072854a6935adab7bc81412ec4fe238f07b85f62"}, + {file = "lz4-4.4.4.tar.gz", hash = "sha256:070fd0627ec4393011251a094e08ed9fdcc78cb4e7ab28f507638eee4e39abda"}, +] + [[package]] name = "mccabe" version = "0.7.0" @@ -994,6 +1054,26 @@ files = [ {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"}, ] +[[package]] +name = "pycryptodomex" +version = "3.22.0" +requires_python = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +summary = "Cryptographic library for Python" +groups = ["default"] +files = [ + {file = "pycryptodomex-3.22.0-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:aef4590263b9f2f6283469e998574d0bd45c14fb262241c27055b82727426157"}, + {file = "pycryptodomex-3.22.0-cp37-abi3-macosx_10_9_x86_64.whl", hash = "sha256:5ac608a6dce9418d4f300fab7ba2f7d499a96b462f2b9b5c90d8d994cd36dcad"}, + {file = "pycryptodomex-3.22.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7a24f681365ec9757ccd69b85868bbd7216ba451d0f86f6ea0eed75eeb6975db"}, + {file = "pycryptodomex-3.22.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:259664c4803a1fa260d5afb322972813c5fe30ea8b43e54b03b7e3a27b30856b"}, + {file = 
"pycryptodomex-3.22.0-cp37-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7127d9de3c7ce20339e06bcd4f16f1a1a77f1471bcf04e3b704306dde101b719"}, + {file = "pycryptodomex-3.22.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ee75067b35c93cc18b38af47b7c0664998d8815174cfc66dd00ea1e244eb27e6"}, + {file = "pycryptodomex-3.22.0-cp37-abi3-musllinux_1_2_i686.whl", hash = "sha256:1a8b0c5ba061ace4bcd03496d42702c3927003db805b8ec619ea6506080b381d"}, + {file = "pycryptodomex-3.22.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:bfe4fe3233ef3e58028a3ad8f28473653b78c6d56e088ea04fe7550c63d4d16b"}, + {file = "pycryptodomex-3.22.0-cp37-abi3-win32.whl", hash = "sha256:2cac9ed5c343bb3d0075db6e797e6112514764d08d667c74cb89b931aac9dddd"}, + {file = "pycryptodomex-3.22.0-cp37-abi3-win_amd64.whl", hash = "sha256:ff46212fda7ee86ec2f4a64016c994e8ad80f11ef748131753adb67e9b722ebd"}, + {file = "pycryptodomex-3.22.0.tar.gz", hash = "sha256:a1da61bacc22f93a91cbe690e3eb2022a03ab4123690ab16c46abb693a9df63d"}, +] + [[package]] name = "pydantic" version = "2.11.3" @@ -1362,6 +1442,21 @@ files = [ {file = "setuptools-79.0.1.tar.gz", hash = "sha256:128ce7b8f33c3079fd1b067ecbb4051a66e8526e7b65f6cec075dfc650ddfa88"}, ] +[[package]] +name = "shadowcopy" +version = "0.0.4" +requires_python = ">=3.7" +summary = "A project for shadowcopy" +groups = ["default"] +marker = "python_version >= \"3.7\" and platform_system == \"Windows\"" +dependencies = [ + "wmi", +] +files = [ + {file = "shadowcopy-0.0.4-py3-none-any.whl", hash = "sha256:fc51e59a639dc6a5a3a7a9b4e3ecadc71989e339f2d995d90aaa491acd4ba4eb"}, + {file = "shadowcopy-0.0.4.tar.gz", hash = "sha256:ed89817dda065f893607a04c0b7d6b3b34c3507a4711f441111a4bcb1b1826c0"}, +] + [[package]] name = "six" version = "1.17.0" @@ -1636,6 +1731,20 @@ files = [ {file = "virtualenv-20.30.0.tar.gz", hash = "sha256:800863162bcaa5450a6e4d721049730e7f2dae07720e0902b0e4040bd6f9ada8"}, ] +[[package]] +name = "wmi" +version = "1.5.1" +summary = "Windows Management Instrumentation" +groups = ["default"] +marker = "python_version >= \"3.7\" and platform_system == \"Windows\"" +dependencies = [ + "pywin32", +] +files = [ + {file = "WMI-1.5.1-py2.py3-none-any.whl", hash = "sha256:1d6b085e5c445141c475476000b661f60fff1aaa19f76bf82b7abb92e0ff4942"}, + {file = "WMI-1.5.1.tar.gz", hash = "sha256:b6a6be5711b1b6c8d55bda7a8befd75c48c12b770b9d227d31c1737dbf0d40a6"}, +] + [[package]] name = "zstandard" version = "0.23.0" diff --git a/pyproject.toml b/pyproject.toml index fc9b009..f4f5a14 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,8 @@ dependencies = [ "urllib3>=2.4.0", "bs4>=0.0.2", "requests-kerberos>=0.15.0", + "browser-cookie3>=0.19.1", + "httpx-gssapi>=0.2.0", ] requires-python = "==3.12.*" @@ -35,6 +37,7 @@ py-modules = [] [project.scripts] jira_scraper = "data_scraper.main:jira_scraper" errata_scraper = "data_scraper.main:errata_scraper" +ci_logs_scraper = "data_scraper.main:ci_logs_scraper" feedback_exporter = "feedback_exporter.export_feedback:main" evaluation = "evaluation.evaluation:main" osp_doc_scraper = "data_scraper.main:osp_doc_scraper"