From 8e69c78e4d4302958df03e3c309b619aebdfa5ad Mon Sep 17 00:00:00 2001 From: Samk Date: Mon, 8 Dec 2025 20:14:04 +0530 Subject: [PATCH 1/5] Add EUVD mirror pipeline Signed-off-by: Samk --- .github/workflows/sync.yml | 37 +++++++ .gitignore | 21 ++++ requirements.txt | 2 + sync_catalog.py | 216 +++++++++++++++++++++++++++++++++++++ 4 files changed, 276 insertions(+) create mode 100644 .github/workflows/sync.yml create mode 100644 .gitignore create mode 100644 requirements.txt create mode 100644 sync_catalog.py diff --git a/.github/workflows/sync.yml b/.github/workflows/sync.yml new file mode 100644 index 0000000..d25eeca --- /dev/null +++ b/.github/workflows/sync.yml @@ -0,0 +1,37 @@ +name: Daily sync of EUVD catalog + +on: + workflow_dispatch: + schedule: + - cron: '0 0 * * *' + +permissions: + contents: write + +jobs: + scheduled: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install required packages + run: pip install requests==2.32.5 aboutcode.pipeline==0.2.1 + + - name: Run sync (daily) + run: python sync_catalog.py daily + + - name: Commit and push if it changed + run: |- + git config user.name "AboutCode Automation" + git config user.email "automation@aboutcode.org" + git add -A + timestamp=$(date -u) + git commit -m "$(echo -e "Sync EUVD catalog: $timestamp\n\nSigned-off-by: AboutCode Automation ")" || exit 0 + git push diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c993167 --- /dev/null +++ b/.gitignore @@ -0,0 +1,21 @@ +# Various junk and temp files +.DS_Store +*~ +.*.sw[po] +.build +.ve +*.bak +var +share +selenium +local +/dist/ +/.*cache/ +/.venv/ +/.python-version +/.pytest_cache/ +/scancodeio.egg-info/ +*.rdb +*.aof +.vscode +.ipynb_checkpoints \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..b80ce4b --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +requests==2.32.5 +aboutcode.pipeline==0.2.1 diff --git a/sync_catalog.py b/sync_catalog.py new file mode 100644 index 0000000..b92296c --- /dev/null +++ b/sync_catalog.py @@ -0,0 +1,216 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +import json +import sys +from datetime import date, datetime, timedelta, timezone +from pathlib import Path +from typing import Any, Dict + +import requests + +from aboutcode.pipeline import BasePipeline, LoopProgress + +ROOT_PATH = Path(__file__).parent +CATALOG_PATH = ROOT_PATH / "catalog" +PAGE_DIRECTORY = CATALOG_PATH / "pages" + +API_URL = "https://euvdservices.enisa.europa.eu/api/search" + +HEADERS = { + "User-Agent": "Vulnerablecode", + "Accept": "application/json", +} + +PAGE_SIZE = 100 +DEFAULT_START_YEAR = 2010 +REQUEST_TIMEOUT = 10 + + +class EuvdCatalogMirror(BasePipeline): + def __init__(self) -> None: + super().__init__() + self.session = requests.Session() + + @classmethod + def steps(cls): + return (cls.collect_catalog,) + + def collect_catalog(self) -> None: + mode = getattr(self, "mode", "backfill") + if mode == "daily": + self.sync_yesterday() + else: + self.backfill_from_year(DEFAULT_START_YEAR) + + def backfill_from_year(self, start_year: int) -> None: + today = date.today() + backfill_start = date(start_year, 1, 1) + backfill_end = today + + months: list[tuple[int, int]] = [] + current = backfill_start + + while current <= backfill_end: + months.append((current.year, current.month)) + if current.month == 12: + current = date(current.year + 1, 1, 1) + else: + current = date(current.year, current.month + 1, 1) + + progress = LoopProgress(total_iterations=len(months), logger=self.log) + + for year, month in progress.iter(months): + if year == backfill_end.year and month == backfill_end.month: + month_start = date(year, month, 1) + day_token = month_start.isoformat() + self.log(f"backfill {year}-{month:02d}: {month_start} to {backfill_end}") + self._collect_paginated( + start=month_start, + end=backfill_end, + year=year, + month=month, + day_token=day_token, + ) + else: + self.collect_month(year, month) + + def sync_yesterday(self) -> None: + target_date = date.today() - timedelta(days=1) + self.collect_single_day(target_date) + + def collect_month(self, year: int, month: int) -> None: + month_start = date(year, month, 1) + if month == 12: + next_month = date(year + 1, 1, 1) + else: + next_month = date(year, month + 1, 1) + month_end = next_month - timedelta(days=1) + + self.log(f"month {year}-{month:02d}: {month_start} to {month_end}") + day_token = month_start.isoformat() + + self._collect_paginated( + start=month_start, + end=month_end, + year=year, + month=month, + day_token=day_token, + ) + + def collect_single_day(self, target_date: date) -> None: + self.log(f"day {target_date}: updated_on={target_date}") + day_token = target_date.isoformat() + + self._collect_paginated( + start=target_date, + end=target_date, + year=target_date.year, + month=target_date.month, + day_token=day_token, + ) + + def _collect_paginated( + self, + start: date, + end: date, + year: int, + month: int, + day_token: str, + ) -> None: + page = 0 + + while True: + params = { + "fromUpdatedDate": start.isoformat(), + "toUpdatedDate": end.isoformat(), + "size": PAGE_SIZE, + "page": page, + } + + data = self.fetch_page(params) + items = data.get("items") or [] + count = len(items) + + if count == 0: + self.log(f"no results for {start}–{end} on page {page}, stopping") + break + + page_number = page + 1 + self.write_page_file( + year=year, + month=month, + day_token=day_token, + page_number=page_number, + payload=data, + ) + + if count < PAGE_SIZE: + self.log(f"finished {start}–{end} at page {page} ({count} items)") + break + + page += 1 + + def write_page_file( + self, + year: int, + month: int, + day_token: str, + page_number: int, + payload: Dict[str, Any], + ) -> None: + year_str = f"{year:04d}" + month_str = f"{month:02d}" + page_str = f"{page_number:04d}" + + dir_path = PAGE_DIRECTORY / year_str / month_str + dir_path.mkdir(parents=True, exist_ok=True) + + filename = f"page{day_token}-{page_str}.json" + path = dir_path / filename + + if path.exists(): + self.log(f"skip existing file: {path}") + return + + with path.open("w", encoding="utf-8") as output: + json.dump(payload, output, indent=2) + + self.log(f"saved {path}") + + def fetch_page(self, params: Dict[str, Any]) -> Dict[str, Any]: + self.log(f"GET {API_URL} {params}") + response = self.session.get( + API_URL, + params=params, + headers=HEADERS, + timeout=REQUEST_TIMEOUT, + ) + response.raise_for_status() + data: Any = response.json() + if not isinstance(data, dict): + return {} + return data + + def log(self, message: str) -> None: + now = datetime.now(timezone.utc).astimezone() + stamp = now.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] + print(f"{stamp} {message}") + + +if __name__ == "__main__": + mode = "backfill" + if len(sys.argv) >= 2: + mode = sys.argv[1] + + mirror = EuvdCatalogMirror() + mirror.mode = mode + + status_code, error_message = mirror.execute() + if error_message: + print(error_message) + sys.exit(status_code) From f75cf19ff4adfd113751aab1ab93ff527b72ec8f Mon Sep 17 00:00:00 2001 From: Samk Date: Tue, 9 Dec 2025 21:28:47 +0530 Subject: [PATCH 2/5] Update DEFAULT_START_YEAR to Unix Epoch Signed-off-by: Samk --- sync_catalog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sync_catalog.py b/sync_catalog.py index b92296c..078bc30 100644 --- a/sync_catalog.py +++ b/sync_catalog.py @@ -27,7 +27,7 @@ } PAGE_SIZE = 100 -DEFAULT_START_YEAR = 2010 +DEFAULT_START_YEAR = 1970 REQUEST_TIMEOUT = 10 From 91cbede853b5ac6a1f247444df6c9ab92d7d7117 Mon Sep 17 00:00:00 2001 From: Samk Date: Fri, 19 Dec 2025 18:46:32 +0530 Subject: [PATCH 3/5] Refactor EUVD sync as per suggestions Signed-off-by: Samk --- sync_catalog.py | 100 +++++++++++++++++++++++++----------------------- 1 file changed, 53 insertions(+), 47 deletions(-) diff --git a/sync_catalog.py b/sync_catalog.py index 078bc30..2399d53 100644 --- a/sync_catalog.py +++ b/sync_catalog.py @@ -5,6 +5,7 @@ # See https://aboutcode.org for more information about nexB OSS projects. # +import argparse import json import sys from datetime import date, datetime, timedelta, timezone @@ -41,11 +42,9 @@ def steps(cls): return (cls.collect_catalog,) def collect_catalog(self) -> None: - mode = getattr(self, "mode", "backfill") - if mode == "daily": - self.sync_yesterday() - else: - self.backfill_from_year(DEFAULT_START_YEAR) + if getattr(self, "mode", "backfill") == "daily": + return self.sync_yesterday() + self.backfill_from_year(DEFAULT_START_YEAR) def backfill_from_year(self, start_year: int) -> None: today = date.today() @@ -59,15 +58,17 @@ def backfill_from_year(self, start_year: int) -> None: months.append((current.year, current.month)) if current.month == 12: current = date(current.year + 1, 1, 1) - else: - current = date(current.year, current.month + 1, 1) + continue + + current = date(current.year, current.month + 1, 1) progress = LoopProgress(total_iterations=len(months), logger=self.log) for year, month in progress.iter(months): + month_start = date(year, month, 1) + day_token = month_start.isoformat() + if year == backfill_end.year and month == backfill_end.month: - month_start = date(year, month, 1) - day_token = month_start.isoformat() self.log(f"backfill {year}-{month:02d}: {month_start} to {backfill_end}") self._collect_paginated( start=month_start, @@ -76,33 +77,25 @@ def backfill_from_year(self, start_year: int) -> None: month=month, day_token=day_token, ) + continue + + if month == 12: + next_month = date(year + 1, 1, 1) else: - self.collect_month(year, month) + next_month = date(year, month + 1, 1) + month_end = next_month - timedelta(days=1) + + self.log(f"month {year}-{month:02d}: {month_start} to {month_end}") + self._collect_paginated( + start=month_start, + end=month_end, + year=year, + month=month, + day_token=day_token, + ) def sync_yesterday(self) -> None: target_date = date.today() - timedelta(days=1) - self.collect_single_day(target_date) - - def collect_month(self, year: int, month: int) -> None: - month_start = date(year, month, 1) - if month == 12: - next_month = date(year + 1, 1, 1) - else: - next_month = date(year, month + 1, 1) - month_end = next_month - timedelta(days=1) - - self.log(f"month {year}-{month:02d}: {month_start} to {month_end}") - day_token = month_start.isoformat() - - self._collect_paginated( - start=month_start, - end=month_end, - year=year, - month=month, - day_token=day_token, - ) - - def collect_single_day(self, target_date: date) -> None: self.log(f"day {target_date}: updated_on={target_date}") day_token = target_date.isoformat() @@ -122,6 +115,9 @@ def _collect_paginated( month: int, day_token: str, ) -> None: + """ + Fetches all results from the EUVD API for the given date range, handling pagination.Each page of results is saved as a separate JSON file under the appropriate year/month directory. Fetching stops when a page returns fewer results than PAGE_SIZE, meaning there are no more results to fetch. + """ page = 0 while True: @@ -184,17 +180,21 @@ def write_page_file( def fetch_page(self, params: Dict[str, Any]) -> Dict[str, Any]: self.log(f"GET {API_URL} {params}") - response = self.session.get( - API_URL, - params=params, - headers=HEADERS, - timeout=REQUEST_TIMEOUT, - ) - response.raise_for_status() - data: Any = response.json() - if not isinstance(data, dict): - return {} - return data + try: + response = self.session.get( + API_URL, + params=params, + headers=HEADERS, + timeout=REQUEST_TIMEOUT, + ) + response.raise_for_status() + data: Any = response.json() + if not isinstance(data, dict): + return {} + return data + except requests.exceptions.RequestException as e: + self.log(f"Request failed: {e}") + raise def log(self, message: str) -> None: now = datetime.now(timezone.utc).astimezone() @@ -203,12 +203,18 @@ def log(self, message: str) -> None: if __name__ == "__main__": - mode = "backfill" - if len(sys.argv) >= 2: - mode = sys.argv[1] + parser = argparse.ArgumentParser(description="EUVD Catalog Mirror") + parser.add_argument( + "mode", + nargs="?", + default="daily", + choices=["backfill", "daily"], + help="Sync mode: 'backfill' for full history, 'daily' for yesterday only", + ) + args = parser.parse_args() mirror = EuvdCatalogMirror() - mirror.mode = mode + mirror.mode = args.mode status_code, error_message = mirror.execute() if error_message: From ecee45ce37ee9a0c9c87b5a71d552512734231ff Mon Sep 17 00:00:00 2001 From: Samk Date: Mon, 22 Dec 2025 01:49:53 +0530 Subject: [PATCH 4/5] Add logging to unexpected results from API Signed-off-by: Samk --- sync_catalog.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sync_catalog.py b/sync_catalog.py index 2399d53..de4fadd 100644 --- a/sync_catalog.py +++ b/sync_catalog.py @@ -190,6 +190,7 @@ def fetch_page(self, params: Dict[str, Any]) -> Dict[str, Any]: response.raise_for_status() data: Any = response.json() if not isinstance(data, dict): + self.log(f"Unexpected API response type: {type(data).__name__}, expected dict") return {} return data except requests.exceptions.RequestException as e: From 217d3aeda59722e942ad828cd2af5dd07d24cd5b Mon Sep 17 00:00:00 2001 From: Samk Date: Tue, 6 Jan 2026 20:31:28 +0530 Subject: [PATCH 5/5] Address review feedback for EUVD catalog mirror pipeline Signed-off-by: Samk --- .github/workflows/sync.yml | 4 +- sync_catalog.py | 128 ++++++++++++++++++++----------------- 2 files changed, 73 insertions(+), 59 deletions(-) diff --git a/.github/workflows/sync.yml b/.github/workflows/sync.yml index d25eeca..8df59f1 100644 --- a/.github/workflows/sync.yml +++ b/.github/workflows/sync.yml @@ -19,10 +19,10 @@ jobs: - name: Set up Python uses: actions/setup-python@v5 with: - python-version: '3.10' + python-version: '3.10' - name: Install required packages - run: pip install requests==2.32.5 aboutcode.pipeline==0.2.1 + run: pip install -r requirements.txt - name: Run sync (daily) run: python sync_catalog.py daily diff --git a/sync_catalog.py b/sync_catalog.py index de4fadd..4071964 100644 --- a/sync_catalog.py +++ b/sync_catalog.py @@ -32,11 +32,7 @@ REQUEST_TIMEOUT = 10 -class EuvdCatalogMirror(BasePipeline): - def __init__(self) -> None: - super().__init__() - self.session = requests.Session() - +class EUVDCatalogMirror(BasePipeline): @classmethod def steps(cls): return (cls.collect_catalog,) @@ -59,9 +55,10 @@ def backfill_from_year(self, start_year: int) -> None: if current.month == 12: current = date(current.year + 1, 1, 1) continue - current = date(current.year, current.month + 1, 1) + self.log(f"Starting backfill for {len(months)} months") + progress = LoopProgress(total_iterations=len(months), logger=self.log) for year, month in progress.iter(months): @@ -69,7 +66,6 @@ def backfill_from_year(self, start_year: int) -> None: day_token = month_start.isoformat() if year == backfill_end.year and month == backfill_end.month: - self.log(f"backfill {year}-{month:02d}: {month_start} to {backfill_end}") self._collect_paginated( start=month_start, end=backfill_end, @@ -85,7 +81,6 @@ def backfill_from_year(self, start_year: int) -> None: next_month = date(year, month + 1, 1) month_end = next_month - timedelta(days=1) - self.log(f"month {year}-{month:02d}: {month_start} to {month_end}") self._collect_paginated( start=month_start, end=month_end, @@ -94,19 +89,46 @@ def backfill_from_year(self, start_year: int) -> None: day_token=day_token, ) + self.log("Backfill completed") + def sync_yesterday(self) -> None: target_date = date.today() - timedelta(days=1) - self.log(f"day {target_date}: updated_on={target_date}") - day_token = target_date.isoformat() - - self._collect_paginated( - start=target_date, - end=target_date, - year=target_date.year, - month=target_date.month, - day_token=day_token, + + first_page = self.fetch_page( + { + "fromUpdatedDate": target_date.isoformat(), + "toUpdatedDate": target_date.isoformat(), + "size": PAGE_SIZE, + "page": 0, + } ) + total = first_page.get("total", 0) + total_pages = (total + PAGE_SIZE - 1) // PAGE_SIZE + + progress = LoopProgress(total_iterations=total_pages, logger=self.log) + + for page in progress.iter(range(total_pages)): + if page == 0: + data = first_page + else: + data = self.fetch_page( + { + "fromUpdatedDate": target_date.isoformat(), + "toUpdatedDate": target_date.isoformat(), + "size": PAGE_SIZE, + "page": page, + } + ) + + self.write_page_file( + year=target_date.year, + month=target_date.month, + day_token=target_date.isoformat(), + page_number=page + 1, + payload=data, + ) + def _collect_paginated( self, start: date, @@ -116,41 +138,42 @@ def _collect_paginated( day_token: str, ) -> None: """ - Fetches all results from the EUVD API for the given date range, handling pagination.Each page of results is saved as a separate JSON file under the appropriate year/month directory. Fetching stops when a page returns fewer results than PAGE_SIZE, meaning there are no more results to fetch. + Fetch all EUVD results for the given date range using paginated requests.The total number of results is read from the API response and used to determine how many pages to fetch. Each page is written as a separate JSON file under the corresponding year/month directory. """ - page = 0 - while True: - params = { + first_page = self.fetch_page( + { "fromUpdatedDate": start.isoformat(), "toUpdatedDate": end.isoformat(), "size": PAGE_SIZE, - "page": page, + "page": 0, } + ) - data = self.fetch_page(params) - items = data.get("items") or [] - count = len(items) + total = first_page.get("total", 0) + total_pages = (total + PAGE_SIZE - 1) // PAGE_SIZE - if count == 0: - self.log(f"no results for {start}–{end} on page {page}, stopping") - break + for page in range(total_pages): + if page == 0: + data = first_page + else: + data = self.fetch_page( + { + "fromUpdatedDate": start.isoformat(), + "toUpdatedDate": end.isoformat(), + "size": PAGE_SIZE, + "page": page, + } + ) - page_number = page + 1 self.write_page_file( year=year, month=month, day_token=day_token, - page_number=page_number, + page_number=page + 1, payload=data, ) - if count < PAGE_SIZE: - self.log(f"finished {start}–{end} at page {page} ({count} items)") - break - - page += 1 - def write_page_file( self, year: int, @@ -170,32 +193,23 @@ def write_page_file( path = dir_path / filename if path.exists(): - self.log(f"skip existing file: {path}") return with path.open("w", encoding="utf-8") as output: json.dump(payload, output, indent=2) - self.log(f"saved {path}") - def fetch_page(self, params: Dict[str, Any]) -> Dict[str, Any]: - self.log(f"GET {API_URL} {params}") - try: - response = self.session.get( - API_URL, - params=params, - headers=HEADERS, - timeout=REQUEST_TIMEOUT, - ) - response.raise_for_status() - data: Any = response.json() - if not isinstance(data, dict): - self.log(f"Unexpected API response type: {type(data).__name__}, expected dict") - return {} - return data - except requests.exceptions.RequestException as e: - self.log(f"Request failed: {e}") - raise + response = requests.get( + API_URL, + params=params, + headers=HEADERS, + timeout=REQUEST_TIMEOUT, + ) + response.raise_for_status() + data: Any = response.json() + if not isinstance(data, dict): + return {} + return data def log(self, message: str) -> None: now = datetime.now(timezone.utc).astimezone() @@ -214,7 +228,7 @@ def log(self, message: str) -> None: ) args = parser.parse_args() - mirror = EuvdCatalogMirror() + mirror = EUVDCatalogMirror() mirror.mode = args.mode status_code, error_message = mirror.execute()