From 99dc37a615bd7f0ced5500f8933b53f1f4f0f201 Mon Sep 17 00:00:00 2001
From: Laurent Savaete
Date: Wed, 27 Oct 2021 18:45:03 +0100
Subject: [PATCH] Add dependents count fields on repositories stream

This is WIP, as this commit requires changes in the SDK which are in a
separate PR on that repo.
---
 tap_github/streams.py        | 39 +++++++++++++++++++++++++++++++++++-
 tap_github/tests/test_tap.py | 39 +++++++++++++++++++++++++++++++++++-
 2 files changed, 76 insertions(+), 2 deletions(-)

diff --git a/tap_github/streams.py b/tap_github/streams.py
index 0f351cc0..4857d208 100644
--- a/tap_github/streams.py
+++ b/tap_github/streams.py
@@ -1,6 +1,8 @@
 """Stream type classes for tap-github."""
 
-from typing import Any, Dict, Iterable, List, Optional
+import re
+import requests
+from typing import Any, Dict, Iterable, List, Optional, Tuple
 from singer_sdk import typing as th  # JSON Schema typing helpers
 
 from tap_github.client import GitHubStream
@@ -68,6 +70,39 @@ def get_child_context(self, record: dict, context: Optional[dict]) -> dict:
             "repo": record["name"],
         }
 
+    def fetch_dependents_counts_repos_and_packages(self, repo_full_name: str) -> Tuple[Optional[int], Optional[int]]:
+        """
+        Fetch additional data about the number of dependents on this repository.
+
+        This parses HTML as the corresponding data is not available on any github
+        API endpoint (REST or graphQL). Returns (None, None) if fetching or parsing fails.
+        """
+        url = f"https://github.com/{repo_full_name}/network/dependents"
+        response = requests.get(url, timeout=30)
+        if response.status_code != 200:
+            self.logger.info(f"Failed to get dependents page for {repo_full_name} (HTTP {response.status_code}), setting values to None")
+            return None, None
+        content = response.text
+        match = re.search(r'([0-9][0-9,]*)\s+Repositories.+?([0-9][0-9,]*)\s+Packages', content, re.DOTALL)
+        if match is None:
+            return None, None
+        return int(match.group(1).replace(',', '')), int(match.group(2).replace(',', ''))
+
+    def post_process(self, row: dict, context: Optional[dict] = None) -> dict:
+        """
+        Optionally add 2 extra fields on the repository record. These are deselected by default
+        as they are quite resource heavy to get.
+        """
+        fetch_dependents_count_repos: bool = self.mask[('properties', 'dependents_count_repositories')]
+        fetch_dependents_count_pkgs: bool = self.mask[('properties', 'dependents_count_packages')]
+        if fetch_dependents_count_repos or fetch_dependents_count_pkgs:
+            r, p = self.fetch_dependents_counts_repos_and_packages(row["full_name"])
+            if fetch_dependents_count_repos:
+                row["dependents_count_repositories"] = r
+            if fetch_dependents_count_pkgs:
+                row["dependents_count_packages"] = p
+        return row
+
     schema = th.PropertiesList(
         th.Property("search_name", th.StringType),
         th.Property("search_query", th.StringType),
@@ -144,6 +179,8 @@ def get_child_context(self, record: dict, context: Optional[dict]) -> dict:
                 th.Property("site_admin", th.BooleanType),
             ),
         ),
+        th.Property("dependents_count_repositories", th.IntegerType, selected_by_default=False),
+        th.Property("dependents_count_packages", th.IntegerType, selected_by_default=False),
     ).to_dict()
 
 
diff --git a/tap_github/tests/test_tap.py b/tap_github/tests/test_tap.py
index 78de0619..17bc56ee 100644
--- a/tap_github/tests/test_tap.py
+++ b/tap_github/tests/test_tap.py
@@ -1,4 +1,4 @@
-import datetime
+import json
 import pytest
 
 from tap_github.tap import TapGitHub
@@ -48,3 +48,40 @@ def test_get_a_repository_in_repo_list_mode(capsys, repo_list_config):
     assert captured.out.count('{"type": "RECORD", "stream": "repositories"') == len(
         repo_list_2
     )
+
+
+@pytest.mark.repo_list(["facebook/react"])
+def test_get_a_repository_with_dependents_count(capsys, repo_list_config):
+    """
+    Discover the catalog, request a repo with the dependent count and
+    check that the value is what we expect
+    """
+    tap1 = TapGitHub(config=repo_list_config)
+    tap1.run_discovery()
+    catalog = tap1._singer_catalog
+    # disable child streams
+    deselect_all_streams(catalog)
+    set_catalog_stream_selected(
+        catalog=catalog, stream_name="repositories", selected=True
+    )
+    set_catalog_stream_selected(
+        catalog=catalog,
+        stream_name="repositories",
+        selected=True,
+        breadcrumb=("properties", "dependents_count_repositories"),
+    )
+    # discard previous output to stdout (potentially from other tests)
+    capsys.readouterr()
+    tap2 = TapGitHub(config=repo_list_config, catalog=catalog.to_dict())
+    tap2.sync_all()
+    captured = capsys.readouterr()
+    # Verify we got the right number of records (one per repo in the list)
+    record_marker = '{"type": "RECORD", "stream": "repositories"'
+    assert captured.out.count(record_marker) == 1
+    record = json.loads(
+        [line for line in captured.out.splitlines() if line.startswith(record_marker)][
+            0
+        ]
+    )
+    assert "dependents_count_repositories" in record["record"]
+    assert record["record"]["dependents_count_repositories"] > 7000000