Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 38 additions & 1 deletion tap_github/streams.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
"""Stream type classes for tap-github."""

from typing import Any, Dict, Iterable, List, Optional
import re
import requests
from typing import Any, Dict, Iterable, List, Optional, Tuple
from singer_sdk import typing as th # JSON Schema typing helpers

from tap_github.client import GitHubStream
Expand Down Expand Up @@ -68,6 +70,39 @@ def get_child_context(self, record: dict, context: Optional[dict]) -> dict:
"repo": record["name"],
}

def fetch_dependents_counts_repos_and_packages(
    self, repo_full_name: str
) -> Tuple[Optional[int], Optional[int]]:
    """Fetch the number of repositories and packages that depend on a repository.

    The counts are scraped from the HTML of the repository's
    ``/network/dependents`` page, as the corresponding data is not available
    on any GitHub API endpoint (REST or GraphQL).

    This is a best-effort enrichment: any failure to fetch or parse the page
    is logged and reported as ``(None, None)`` instead of being raised.

    Args:
        repo_full_name: Repository name as interpolated into the URL
            ``https://github.com/{repo_full_name}/network/dependents``.

    Returns:
        A ``(repositories, packages)`` tuple of integer counts, or
        ``(None, None)`` when the page could not be retrieved or the counts
        could not be located in it.
    """
    url = f"https://github.com/{repo_full_name}/network/dependents"
    try:
        # Always bound the request: without a timeout, a stalled connection
        # would block the whole sync indefinitely.
        response = requests.get(url, timeout=30)
    except requests.RequestException as err:
        self.logger.warning(
            "Failed to get dependents page for %s (%s), setting values to None",
            repo_full_name,
            err,
        )
        return None, None
    if response.status_code != 200:
        self.logger.info(
            "Failed to get dependents page for %s (HTTP %s), setting values to None",
            repo_full_name,
            response.status_code,
        )
        return None, None
    # Each count must start with a digit so that a stray comma in the markup
    # can never be captured as an (unparseable) number. DOTALL lets the lazy
    # gap between the two counters span several lines of HTML.
    match = re.search(
        r"([0-9][0-9,]*)\s+Repositories.+?([0-9][0-9,]*)\s+Packages",
        response.text,
        re.DOTALL,
    )
    if match is None:
        self.logger.info(
            "Could not find dependents counts for %s, setting values to None",
            repo_full_name,
        )
        return None, None
    # Counts are rendered with thousands separators, e.g. "1,234,567".
    repositories, packages = (
        int(group.replace(",", "")) for group in match.groups()
    )
    return repositories, packages

def post_process(self, row: dict, context: Optional[dict] = None) -> dict:
    """Optionally enrich a repository record with its dependents counts.

    The two extra fields, ``dependents_count_repositories`` and
    ``dependents_count_packages``, are deselected by default as they are
    quite resource heavy to get. The selection mask decides, per field,
    whether it is written to the row.

    Args:
        row: The repository record; it is updated in place. Its
            ``full_name`` entry is read when at least one field is selected.
        context: Stream context; not used here.

    Returns:
        The same ``row`` object, with the selected extra fields added.
    """
    want_repositories: bool = self.mask[
        ("properties", "dependents_count_repositories")
    ]
    want_packages: bool = self.mask[("properties", "dependents_count_packages")]

    # Nothing selected: skip the lookup entirely.
    if not (want_repositories or want_packages):
        return row

    # A single lookup yields both counts; store only the selected ones.
    repositories, packages = self.fetch_dependents_counts_repos_and_packages(
        row["full_name"]
    )
    if want_repositories:
        row["dependents_count_repositories"] = repositories
    if want_packages:
        row["dependents_count_packages"] = packages
    return row

schema = th.PropertiesList(
th.Property("search_name", th.StringType),
th.Property("search_query", th.StringType),
Expand Down Expand Up @@ -144,6 +179,8 @@ def get_child_context(self, record: dict, context: Optional[dict]) -> dict:
th.Property("site_admin", th.BooleanType),
),
),
th.Property("dependents_count_repositories", th.IntegerType, selected_by_default=False),
th.Property("dependents_count_packages", th.IntegerType, selected_by_default=False),
).to_dict()


Expand Down
39 changes: 38 additions & 1 deletion tap_github/tests/test_tap.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import datetime
import json
import pytest

from tap_github.tap import TapGitHub
Expand Down Expand Up @@ -48,3 +48,40 @@ def test_get_a_repository_in_repo_list_mode(capsys, repo_list_config):
assert captured.out.count('{"type": "RECORD", "stream": "repositories"') == len(
repo_list_2
)


@pytest.mark.repo_list(["facebook/react"])
def test_get_a_repository_with_dependents_count(capsys, repo_list_config):
    """Sync one repository with the dependents count selected and check it.

    Runs discovery, selects only the ``repositories`` stream together with its
    ``dependents_count_repositories`` property, syncs, then verifies that
    exactly one record was emitted and that it carries a plausible count.
    """
    discovery_tap = TapGitHub(config=repo_list_config)
    discovery_tap.run_discovery()
    catalog = discovery_tap._singer_catalog

    # Start from nothing selected so that child streams stay disabled.
    deselect_all_streams(catalog)
    set_catalog_stream_selected(
        catalog=catalog,
        stream_name="repositories",
        selected=True,
    )
    # The dependents count is not selected by default: opt in explicitly.
    set_catalog_stream_selected(
        catalog=catalog,
        stream_name="repositories",
        selected=True,
        breadcrumb=("properties", "dependents_count_repositories"),
    )

    # Drain whatever is already on stdout (potentially from other tests).
    capsys.readouterr()

    sync_tap = TapGitHub(config=repo_list_config, catalog=catalog.to_dict())
    sync_tap.sync_all()
    output = capsys.readouterr().out

    # Exactly one record is expected: one per repo in the list.
    record_marker = '{"type": "RECORD", "stream": "repositories"'
    assert output.count(record_marker) == 1

    record_lines = [
        line for line in output.splitlines() if line.startswith(record_marker)
    ]
    payload = json.loads(record_lines[0])["record"]
    assert "dependents_count_repositories" in payload
    assert payload["dependents_count_repositories"] > 7000000