MeltanoLabs · laurentS · Oct 27, 2021
@@ -1,6 +1,8 @@
 """Stream type classes for tap-github."""
 
-from typing import Any, Dict, Iterable, List, Optional
+import re
+import requests
+from typing import Any, Dict, Iterable, List, Optional, Tuple
 from singer_sdk import typing as th  # JSON Schema typing helpers
 
 from tap_github.client import GitHubStream
@@ -68,6 +70,39 @@ def get_child_context(self, record: dict, context: Optional[dict]) -> dict:
             "repo": record["name"],
         }
 
+    def fetch_dependents_counts_repos_and_packages(self, repo_full_name: str) -> Tuple[Optional[int], Optional[int]]:
+        """
+        Fetch the number of repositories and packages that depend on this repository.
+
+        This parses the HTML dependents page, as the data is not available on any github API
+        endpoint (REST or graphQL). Returns (None, None) on a non-200 status or unparseable page.
+        """
+        url = f"https://github.com/{repo_full_name}/network/dependents"
+        response = requests.get(url, timeout=30)  # bounded wait: a stalled connection must not hang the sync
+        if response.status_code != 200:
+            self.logger.info(f"Failed to get dependents page {url} (HTTP {response.status_code}), setting values to None")
+            return None, None
+        match = re.search(r'([0-9,]+)\s+Repositories.+?([0-9,]+)\s+Packages', response.text, re.DOTALL)
+        if match is None:
+            return None, None
+        repos, packages = (int(count.replace(',', '')) for count in match.groups())
+        return repos, packages
+
+    def post_process(self, row: dict, context: Optional[dict] = None) -> dict:
+        """
+        Add the dependents counts to the record if selected (off by default: costly to fetch).
+        """
+        want_repos: bool = self.mask[('properties', 'dependents_count_repositories')]
+        want_pkgs: bool = self.mask[('properties', 'dependents_count_packages')]
+        if not (want_repos or want_pkgs):
+            return row
+        repos_count, pkgs_count = self.fetch_dependents_counts_repos_and_packages(row["full_name"])
+        if want_repos:
+            row["dependents_count_repositories"] = repos_count
+        if want_pkgs:
+            row["dependents_count_packages"] = pkgs_count
+        return row
+
     schema = th.PropertiesList(
         th.Property("search_name", th.StringType),
         th.Property("search_query", th.StringType),
@@ -144,6 +179,8 @@ def get_child_context(self, record: dict, context: Optional[dict]) -> dict:
                 th.Property("site_admin", th.BooleanType),
             ),
         ),
+        th.Property("dependents_count_repositories", th.IntegerType, selected_by_default=False),
+        th.Property("dependents_count_packages", th.IntegerType, selected_by_default=False),
     ).to_dict()
 
 

@@ -1,4 +1,4 @@
-import datetime
+import json
 import pytest
 
 from tap_github.tap import TapGitHub
@@ -48,3 +48,40 @@ def test_get_a_repository_in_repo_list_mode(capsys, repo_list_config):
     assert captured.out.count('{"type": "RECORD", "stream": "repositories"') == len(
         repo_list_2
     )
+
+
+@pytest.mark.repo_list(["facebook/react"])
+def test_get_a_repository_with_dependents_count(capsys, repo_list_config):
+    """
+    Sync facebook/react with `dependents_count_repositories` selected and
+    check that the emitted record carries a plausible dependents count.
+    """
+    discovery_tap = TapGitHub(config=repo_list_config)
+    discovery_tap.run_discovery()
+    catalog = discovery_tap._singer_catalog
+    # only the repositories stream should be synced, not its child streams
+    deselect_all_streams(catalog)
+    set_catalog_stream_selected(
+        catalog=catalog, stream_name="repositories", selected=True
+    )
+    # the dependents count is deselected by default, so opt in explicitly
+    set_catalog_stream_selected(
+        catalog=catalog,
+        stream_name="repositories",
+        selected=True,
+        breadcrumb=("properties", "dependents_count_repositories"),
+    )
+    # drain stdout left over from earlier tests before syncing
+    capsys.readouterr()
+    sync_tap = TapGitHub(config=repo_list_config, catalog=catalog.to_dict())
+    sync_tap.sync_all()
+    output = capsys.readouterr().out
+    # exactly one record is expected, as the repo list holds a single repo
+    record_marker = '{"type": "RECORD", "stream": "repositories"'
+    assert output.count(record_marker) == 1
+    record_lines = [
+        line for line in output.splitlines() if line.startswith(record_marker)
+    ]
+    payload = json.loads(record_lines[0])["record"]
+    assert "dependents_count_repositories" in payload
+    assert payload["dependents_count_repositories"] > 7000000