From f427b61d3ea922ce9dec5aa0a85ca53b0d342fa6 Mon Sep 17 00:00:00 2001 From: David Mayer Date: Thu, 4 Dec 2025 15:44:25 -0500 Subject: [PATCH 01/15] initial commit for basic GA summary info --- NAMESPACE | 2 ++ R/ga_summary.R | 31 +++++++++++++++++++++++++++++++ man/get_ga_summary.Rd | 23 +++++++++++++++++++++++ 3 files changed, 56 insertions(+) create mode 100644 R/ga_summary.R create mode 100644 man/get_ga_summary.Rd diff --git a/NAMESPACE b/NAMESPACE index e44315d..1d9ae96 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -9,6 +9,7 @@ export(ga_query_explorer) export(get_core_project_info) export(get_ga_basic) export(get_ga_meta_by_id) +export(get_ga_summary) export(get_github_by_topic) export(get_github_by_topic_graphql) export(icite) @@ -28,6 +29,7 @@ importFrom(glue,glue) importFrom(glue,glue_collapse) importFrom(googleAnalyticsR,ga_account_list) importFrom(googleAnalyticsR,ga_auth) +importFrom(googleAnalyticsR,ga_data) importFrom(httr2,req_auth_bearer_token) importFrom(httr2,req_body_json) importFrom(httr2,req_error) diff --git a/R/ga_summary.R b/R/ga_summary.R new file mode 100644 index 0000000..cc7af93 --- /dev/null +++ b/R/ga_summary.R @@ -0,0 +1,31 @@ +#' Get summary metrics from Google Analytics +#' +#' This function retrieves summary metrics from Google Analytics, +#' including total users, new users, engaged sessions, engagement rate, +#' event count, and screen page views. 
+#' +#' @param propertyId The Google Analytics property ID +#' @param start_date The start date for the data retrieval (e.g., "30daysAgo") +#' @param end_date The end date for the data retrieval (e.g., "yesterday") +#' +#' @importFrom googleAnalyticsR ga_data +#' @importFrom dplyr mutate +#' +#' @return A tibble containing the summary metrics +#' @export +get_ga_summary <- function(propertyId, start_date = "30daysAgo", end_date = "yesterday") { + ga_data( + propertyId, + metrics = c( + "totalUsers", + "newUsers", + "engagedSessions", + "engagementRate", + "eventCount", + "screenPageViews" + ), + dimensions = "date", + date_range = c(start_date, end_date) + ) |> + mutate(propertyId = propertyId) +} diff --git a/man/get_ga_summary.Rd b/man/get_ga_summary.Rd new file mode 100644 index 0000000..623daa0 --- /dev/null +++ b/man/get_ga_summary.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ga_summary.R +\name{get_ga_summary} +\alias{get_ga_summary} +\title{Get summary metrics from Google Analytics} +\usage{ +get_ga_summary(propertyId, start_date = "30daysAgo", end_date = "yesterday") +} +\arguments{ +\item{propertyId}{The Google Analytics property ID} + +\item{start_date}{The start date for the data retrieval (e.g., "30daysAgo")} + +\item{end_date}{The end date for the data retrieval (e.g., "yesterday")} +} +\value{ +A tibble containing the summary metrics +} +\description{ +This function retrieves summary metrics from Google Analytics, +including total users, new users, engaged sessions, engagement rate, +event count, and screen page views. 
+} From ad1836cd91432d64b8f787e2ef86c64315392297 Mon Sep 17 00:00:00 2001 From: David Mayer Date: Fri, 5 Dec 2025 17:52:39 -0500 Subject: [PATCH 02/15] add to graphql query for GitHub - re-org --- R/get_github_by_topic_graphQL.R | 76 +++++++++++++++++++-------------- 1 file changed, 44 insertions(+), 32 deletions(-) diff --git a/R/get_github_by_topic_graphQL.R b/R/get_github_by_topic_graphQL.R index 32d52bb..7d84370 100644 --- a/R/get_github_by_topic_graphQL.R +++ b/R/get_github_by_topic_graphQL.R @@ -30,13 +30,13 @@ get_github_by_topic_graphql <- function(topics, token, limit = 30) { } topics <- tolower(topics) - + # Set up client cli <- ghql::GraphqlClient$new( url = "https://api.github.com/graphql", headers = list(Authorization = paste0("bearer ", token)) ) - + # GraphQL query template # Fetches up to `limit` repos per topic, along with key metadata query_template <- ' @@ -64,38 +64,37 @@ get_github_by_topic_graphql <- function(topics, token, limit = 30) { defaultBranchRef { target { ... 
on Commit { - history { - totalCount - } + history { totalCount } } } } - mentionableUsers { - totalCount - } + mentionableUsers { totalCount } repositoryTopics(first: 20) { - nodes { - topic { name } - } + nodes { topic { name } } + } + readme: object(expression: "HEAD:README.md") { id } + coc: object(expression: "HEAD:CODE_OF_CONDUCT.md") { id } + languages(first: 20) { + edges { size node { name } } } } } } }' - - # Compile query once + qry <- ghql::Query$new() qry$query("repoQuery", query_template) - - # Helper: fetch repos for one topic + fetch_topic <- function(topic) { - res <- cli$exec(qry$queries$repoQuery, - variables = list(queryString = paste0("topic:", topic), - limit = limit)) + res <- cli$exec( + qry$queries$repoQuery, + variables = list(queryString = paste0("topic:", topic), + limit = limit) + ) dat <- jsonlite::fromJSON(res, flatten = TRUE) repos <- dat$data$search$nodes if (length(repos) == 0) return(NULL) - # browser() + tibble::tibble( name = repos$name, owner = repos$owner.login, @@ -111,31 +110,45 @@ get_github_by_topic_graphql <- function(topics, token, limit = 30) { repos$defaultBranchRef.target.history.totalCount, ~ if (is.null(.x)) NA_real_ else .x ), - contributors = repos$mentionableUsers.totalCount %||% 0, + mentionable_users = repos$mentionableUsers.totalCount %||% 0, + has_readme = !purrr::map_lgl(repos$readme.id, is.na), + code_of_conduct = !purrr::map_lgl(repos$coc.id, is.na), tags = purrr::map_chr( - repos$repositoryTopics.nodes, - ~ if (length(.x$topic.name) == 0) NA_character_ - else paste(.x$topic.name, collapse = ", ") - ), + repos$repositoryTopics.nodes, + ~ if (is.null(.x) || length(.x$topic.name) == 0) NA_character_ + else paste(.x$topic.name, collapse = ", ") + ), language = repos$primaryLanguage.name, - license = repos$licenseInfo.name, + language_loc = purrr::map( + repos$languages.edges, + ~ if (is.null(.x)) { + tibble::tibble(language = NA_character_, bytes = NA_real_, loc = NA_real_) + } else { + tibble::tibble( + 
language = .x$node.name, + bytes = .x$size, + loc = round(.x$size / 50) + ) + } + ), + license = repos$licenseInfo.name %||% NA_character_, created_at = repos$createdAt, pushed_at = repos$pushedAt, updated_at = repos$updatedAt, html_url = repos$url, queried_topic = topic ) - } - + # Fetch across all topics and combine df <- purrr::map_dfr(topics, fetch_topic) - if (nrow(df) == 0) { + if (nrow(df) == 0) { expected_cols <- c( "name", "owner", "description", "stars", "watchers", "forks", "open_issues", "open_prs", - "closed_issues", "closed_prs", "commits", "contributors", "tags", "language", "license", - "created_at", "pushed_at", "updated_at", "html_url", "queried_topic" + "closed_issues", "closed_prs", "commits", "mentionable_users", "has_readme", "code_of_conduct", + "tags", "language", "language_loc", "license", "created_at", "pushed_at", "updated_at", "html_url", + "queried_topic" ) df <- tibble::tibble( !!!setNames(rep(list(NA), length(expected_cols)), expected_cols) @@ -148,7 +161,6 @@ get_github_by_topic_graphql <- function(topics, token, limit = 30) { dplyr::relocate('closed_issues', .after = 'open_prs') |> dplyr::relocate('closed_prs', .after = 'closed_issues') |> dplyr::relocate('commits', .after = 'closed_prs') |> - dplyr::relocate('contributors', .after = 'commits') - + dplyr::relocate('mentionable_users', .after = 'commits') return(df) } From 8231310b6613ea8396d3cc54a2576852c979fde0 Mon Sep 17 00:00:00 2001 From: David Mayer Date: Fri, 5 Dec 2025 18:06:58 -0500 Subject: [PATCH 03/15] add contributors from REST as there is no graphql equivalent. 
borrow from get_github_by_topic() - TODO: refactor helper function/add docs --- R/get_github_by_topic_graphQL.R | 36 ++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/R/get_github_by_topic_graphQL.R b/R/get_github_by_topic_graphQL.R index 7d84370..7452d05 100644 --- a/R/get_github_by_topic_graphQL.R +++ b/R/get_github_by_topic_graphQL.R @@ -146,7 +146,7 @@ get_github_by_topic_graphql <- function(topics, token, limit = 30) { if (nrow(df) == 0) { expected_cols <- c( "name", "owner", "description", "stars", "watchers", "forks", "open_issues", "open_prs", - "closed_issues", "closed_prs", "commits", "mentionable_users", "has_readme", "code_of_conduct", + "closed_issues", "closed_prs", "commits", "mentionable_users", "contributors","has_readme", "code_of_conduct", "tags", "language", "language_loc", "license", "created_at", "pushed_at", "updated_at", "html_url", "queried_topic" ) @@ -154,13 +154,43 @@ get_github_by_topic_graphql <- function(topics, token, limit = 30) { !!!setNames(rep(list(NA), length(expected_cols)), expected_cols) ) %>% filter(!is.na(.data$name)) - } + } else { + df <- df |> + mutate(contributors = map2_dbl(owner, name, ~ get_contributor_count(.x, .y, token = token))) + } + # Organize columns like before df <- df |> dplyr::relocate('open_prs', .after = 'open_issues') |> dplyr::relocate('closed_issues', .after = 'open_prs') |> dplyr::relocate('closed_prs', .after = 'closed_issues') |> dplyr::relocate('commits', .after = 'closed_prs') |> - dplyr::relocate('mentionable_users', .after = 'commits') + dplyr::relocate('mentionable_users', .after = 'commits') |> + dplyr::relocate('contributors', .after = 'mentionable_users') return(df) } + + # Helper to count contributors + get_contributor_count <- function(owner, repo, token = NULL) { + base_url <- glue("https://api.github.com/repos/{owner}/{repo}/contributors") + req <- request(base_url) |> + req_url_query(per_page = 1, anon = "false") |> # anon=TRUE counts 
contributors without accounts + req_headers("User-Agent" = "httr2") + if (!is.null(token)) { + req <- req |> req_auth_bearer_token(token) + } + + resp <- tryCatch(req_perform(req), error = function(e) NULL) + if (is.null(resp) || resp_status(resp) != 200) return(NA_real_) + + link <- resp_headers(resp)[["link"]] + if (!is.null(link) && grepl("rel=\"last\"", link)) { + matches <- regmatches(link, regexpr("page=\\d+>; rel=\\\"last\\\"", link)) + count <- as.numeric(sub("page=", "", sub(">; rel=\"last\"", "", matches))) + return(count) + } else { + body <- resp_body_json(resp) + return(length(body)) # if only a few contributors + } + } + From cea01c4a1a191cd44f5f04f1ae524ff2b270682e Mon Sep 17 00:00:00 2001 From: David Mayer Date: Mon, 8 Dec 2025 08:42:16 -0500 Subject: [PATCH 04/15] formatting --- R/get_github_by_topic_graphQL.R | 84 ++++++++++++++++----------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/R/get_github_by_topic_graphQL.R b/R/get_github_by_topic_graphQL.R index 7452d05..99040dc 100644 --- a/R/get_github_by_topic_graphQL.R +++ b/R/get_github_by_topic_graphQL.R @@ -96,47 +96,47 @@ get_github_by_topic_graphql <- function(topics, token, limit = 30) { if (length(repos) == 0) return(NULL) tibble::tibble( - name = repos$name, - owner = repos$owner.login, - description = repos$description, - stars = repos$stargazerCount, - watchers = repos$watchers.totalCount, - forks = repos$forkCount, - open_issues = repos$issues.totalCount, - closed_issues = repos$closedIssues.totalCount, - open_prs = repos$openPRs.totalCount, - closed_prs = repos$closedPRs.totalCount, - commits = purrr::map_dbl( - repos$defaultBranchRef.target.history.totalCount, - ~ if (is.null(.x)) NA_real_ else .x - ), - mentionable_users = repos$mentionableUsers.totalCount %||% 0, - has_readme = !purrr::map_lgl(repos$readme.id, is.na), - code_of_conduct = !purrr::map_lgl(repos$coc.id, is.na), - tags = purrr::map_chr( - repos$repositoryTopics.nodes, - ~ if (is.null(.x) || 
length(.x$topic.name) == 0) NA_character_ - else paste(.x$topic.name, collapse = ", ") - ), - language = repos$primaryLanguage.name, - language_loc = purrr::map( - repos$languages.edges, - ~ if (is.null(.x)) { - tibble::tibble(language = NA_character_, bytes = NA_real_, loc = NA_real_) - } else { - tibble::tibble( - language = .x$node.name, - bytes = .x$size, - loc = round(.x$size / 50) - ) - } - ), - license = repos$licenseInfo.name %||% NA_character_, - created_at = repos$createdAt, - pushed_at = repos$pushedAt, - updated_at = repos$updatedAt, - html_url = repos$url, - queried_topic = topic + name = repos$name, + owner = repos$owner.login, + description = repos$description, + stars = repos$stargazerCount, + watchers = repos$watchers.totalCount, + forks = repos$forkCount, + open_issues = repos$issues.totalCount, + closed_issues = repos$closedIssues.totalCount, + open_prs = repos$openPRs.totalCount, + closed_prs = repos$closedPRs.totalCount, + commits = purrr::map_dbl( + repos$defaultBranchRef.target.history.totalCount, + ~ if (is.null(.x)) NA_real_ else .x + ), + mentionable_users = repos$mentionableUsers.totalCount %||% 0, + has_readme = !purrr::map_lgl(repos$readme.id, is.na), + code_of_conduct = !purrr::map_lgl(repos$coc.id, is.na), + tags = purrr::map_chr( + repos$repositoryTopics.nodes, + ~ if (is.null(.x) || length(.x$topic.name) == 0) NA_character_ + else paste(.x$topic.name, collapse = ", ") + ), + language = repos$primaryLanguage.name, + language_loc = purrr::map( + repos$languages.edges, + ~ if (is.null(.x)) { + tibble::tibble(language = NA_character_, bytes = NA_real_, loc = NA_real_) + } else { + tibble::tibble( + language = .x$node.name, + bytes = .x$size, + loc = round(.x$size / 50) + ) + } + ), + license = repos$licenseInfo.name %||% NA_character_, + created_at = repos$createdAt, + pushed_at = repos$pushedAt, + updated_at = repos$updatedAt, + html_url = repos$url, + queried_topic = topic ) } @@ -156,7 +156,7 @@ get_github_by_topic_graphql <- 
function(topics, token, limit = 30) { filter(!is.na(.data$name)) } else { df <- df |> - mutate(contributors = map2_dbl(owner, name, ~ get_contributor_count(.x, .y, token = token))) + mutate(contributors = map2_dbl(.data$owner, .data$name, ~ get_contributor_count(.x, .y, token = token))) } # Organize columns like before From 3ecfe00dfd11f51d23ba7d90c41b4560d5ed51c7 Mon Sep 17 00:00:00 2001 From: David Mayer Date: Mon, 8 Dec 2025 08:43:00 -0500 Subject: [PATCH 05/15] logical readability --- R/get_github_by_topic_graphQL.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/get_github_by_topic_graphQL.R b/R/get_github_by_topic_graphQL.R index 99040dc..d1ad76b 100644 --- a/R/get_github_by_topic_graphQL.R +++ b/R/get_github_by_topic_graphQL.R @@ -111,8 +111,8 @@ get_github_by_topic_graphql <- function(topics, token, limit = 30) { ~ if (is.null(.x)) NA_real_ else .x ), mentionable_users = repos$mentionableUsers.totalCount %||% 0, - has_readme = !purrr::map_lgl(repos$readme.id, is.na), - code_of_conduct = !purrr::map_lgl(repos$coc.id, is.na), + has_readme = purrr::map_lgl(repos$readme.id, !is.na), + code_of_conduct = purrr::map_lgl(repos$coc.id, !is.na), tags = purrr::map_chr( repos$repositoryTopics.nodes, ~ if (is.null(.x) || length(.x$topic.name) == 0) NA_character_ From f0725aad4d9ce63767193b67470d3809f074cb96 Mon Sep 17 00:00:00 2001 From: David Mayer Date: Mon, 8 Dec 2025 08:54:48 -0500 Subject: [PATCH 06/15] additional logical considerations --- R/get_github_by_topic_graphQL.R | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/R/get_github_by_topic_graphQL.R b/R/get_github_by_topic_graphQL.R index d1ad76b..74322b9 100644 --- a/R/get_github_by_topic_graphQL.R +++ b/R/get_github_by_topic_graphQL.R @@ -111,8 +111,14 @@ get_github_by_topic_graphql <- function(topics, token, limit = 30) { ~ if (is.null(.x)) NA_real_ else .x ), mentionable_users = repos$mentionableUsers.totalCount %||% 0, - has_readme = 
purrr::map_lgl(repos$readme.id, !is.na), - code_of_conduct = purrr::map_lgl(repos$coc.id, !is.na), + has_readme = purrr::map_lgl( + repos$readme.id, + ~ if (is.na(.x) || is.null(.x) || length(.x) == 0) FALSE else TRUE + ), + code_of_conduct = purrr::map_lgl( + repos$coc.id, + ~ if (is.na(.x) || is.null(.x) || length(.x) == 0) FALSE else TRUE + ), tags = purrr::map_chr( repos$repositoryTopics.nodes, ~ if (is.null(.x) || length(.x$topic.name) == 0) NA_character_ From 8c879018e88c567f3c268ef600b174d05bd24cdc Mon Sep 17 00:00:00 2001 From: David Mayer Date: Mon, 8 Dec 2025 09:15:49 -0500 Subject: [PATCH 07/15] if no repositories return results, the json field for `id` never gets populated. check to make sure it is available before evaluating logical --- R/get_github_by_topic_graphQL.R | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/R/get_github_by_topic_graphQL.R b/R/get_github_by_topic_graphQL.R index 74322b9..b6a9abf 100644 --- a/R/get_github_by_topic_graphQL.R +++ b/R/get_github_by_topic_graphQL.R @@ -94,7 +94,7 @@ get_github_by_topic_graphql <- function(topics, token, limit = 30) { dat <- jsonlite::fromJSON(res, flatten = TRUE) repos <- dat$data$search$nodes if (length(repos) == 0) return(NULL) - + # browser() tibble::tibble( name = repos$name, owner = repos$owner.login, @@ -111,14 +111,20 @@ get_github_by_topic_graphql <- function(topics, token, limit = 30) { ~ if (is.null(.x)) NA_real_ else .x ), mentionable_users = repos$mentionableUsers.totalCount %||% 0, - has_readme = purrr::map_lgl( - repos$readme.id, - ~ if (is.na(.x) || is.null(.x) || length(.x) == 0) FALSE else TRUE - ), - code_of_conduct = purrr::map_lgl( - repos$coc.id, - ~ if (is.na(.x) || is.null(.x) || length(.x) == 0) FALSE else TRUE - ), + has_readme = if( 'readme.id' %in% names(repos)) { + purrr::map_lgl( + repos$readme.id, + ~ if (is.na(.x) || is.null(.x) || length(.x) == 0) FALSE else TRUE + ) } else { + FALSE + }, + code_of_conduct = if( 'coc.id' %in% 
names(repos)) { + purrr::map_lgl( + repos$coc, + ~ if (is.na(.x) || is.null(.x) || length(.x) == 0) FALSE else TRUE + ) } else { + FALSE + }, tags = purrr::map_chr( repos$repositoryTopics.nodes, ~ if (is.null(.x) || length(.x$topic.name) == 0) NA_character_ From 4c7ff8ec4262139e42edcfa4705b68a2ca9674a8 Mon Sep 17 00:00:00 2001 From: David Mayer Date: Mon, 8 Dec 2025 10:28:56 -0500 Subject: [PATCH 08/15] relocate helper function to facilitate re-use. keep with other github REST functions --- R/get_github_by_topic.R | 53 ++++++++++++++++++--------------- R/get_github_by_topic_graphQL.R | 25 ---------------- 2 files changed, 29 insertions(+), 49 deletions(-) diff --git a/R/get_github_by_topic.R b/R/get_github_by_topic.R index 82cfe54..c746cdf 100644 --- a/R/get_github_by_topic.R +++ b/R/get_github_by_topic.R @@ -1,3 +1,32 @@ +# Helper to count contributors + get_contributor_count <- function(owner, repo, token = NULL) { + base_url <- glue("https://api.github.com/repos/{owner}/{repo}/contributors") + req <- request(base_url) |> + req_url_query(per_page = 1, anon = "false") |> # anon=TRUE counts contributors without accounts + req_headers("User-Agent" = "httr2") + if (!is.null(token)) { + req <- req |> + req_auth_bearer_token(token) |> + req_throttle(capacity = 5000, fill_time_s = 3600) + } else { + req <- req |> + req_throttle(capacity = 60, fill_time_s = 3600) + } + + resp <- tryCatch(req_perform(req), error = function(e) NULL) + if (is.null(resp) || resp_status(resp) != 200) return(NA_real_) + + link <- resp_headers(resp)[["link"]] + if (!is.null(link) && grepl("rel=\"last\"", link)) { + matches <- regmatches(link, regexpr("page=\\d+>; rel=\\\"last\\\"", link)) + count <- as.numeric(sub("page=", "", sub(">; rel=\"last\"", "", matches))) + return(count) + } else { + body <- resp_body_json(resp) + return(length(body)) # if only a few contributors + } + } + #' Get GitHub Repositories by Topic #' #' @param topics A vector of GitHub topics to search for. 
@@ -168,30 +197,6 @@ get_github_by_topic <- function(topics, token = NULL, limit = 30) { commit_counts <- map2_dbl(df$owner, df$name, ~ get_commit_count(.x, .y, token = token)) df$commits <- commit_counts - # Helper to count contributors - get_contributor_count <- function(owner, repo, token = NULL) { - base_url <- glue("https://api.github.com/repos/{owner}/{repo}/contributors") - req <- request(base_url) |> - req_url_query(per_page = 1, anon = "false") |> # anon=TRUE counts contributors without accounts - req_headers("User-Agent" = "httr2") - if (!is.null(token)) { - req <- req |> req_auth_bearer_token(token) - } - - resp <- tryCatch(req_perform(req), error = function(e) NULL) - if (is.null(resp) || resp_status(resp) != 200) return(NA_real_) - - link <- resp_headers(resp)[["link"]] - if (!is.null(link) && grepl("rel=\"last\"", link)) { - matches <- regmatches(link, regexpr("page=\\d+>; rel=\\\"last\\\"", link)) - count <- as.numeric(sub("page=", "", sub(">; rel=\"last\"", "", matches))) - return(count) - } else { - body <- resp_body_json(resp) - return(length(body)) # if only a few contributors - } - } - contributor_counts <- map2_dbl(df$owner, df$name, ~ get_contributor_count(.x, .y, token = token)) df$contributors <- contributor_counts diff --git a/R/get_github_by_topic_graphQL.R b/R/get_github_by_topic_graphQL.R index b6a9abf..359b714 100644 --- a/R/get_github_by_topic_graphQL.R +++ b/R/get_github_by_topic_graphQL.R @@ -181,28 +181,3 @@ get_github_by_topic_graphql <- function(topics, token, limit = 30) { dplyr::relocate('contributors', .after = 'mentionable_users') return(df) } - - # Helper to count contributors - get_contributor_count <- function(owner, repo, token = NULL) { - base_url <- glue("https://api.github.com/repos/{owner}/{repo}/contributors") - req <- request(base_url) |> - req_url_query(per_page = 1, anon = "false") |> # anon=TRUE counts contributors without accounts - req_headers("User-Agent" = "httr2") - if (!is.null(token)) { - req <- req |> 
req_auth_bearer_token(token) - } - - resp <- tryCatch(req_perform(req), error = function(e) NULL) - if (is.null(resp) || resp_status(resp) != 200) return(NA_real_) - - link <- resp_headers(resp)[["link"]] - if (!is.null(link) && grepl("rel=\"last\"", link)) { - matches <- regmatches(link, regexpr("page=\\d+>; rel=\\\"last\\\"", link)) - count <- as.numeric(sub("page=", "", sub(">; rel=\"last\"", "", matches))) - return(count) - } else { - body <- resp_body_json(resp) - return(length(body)) # if only a few contributors - } - } - From c9aef02b1f607cde14c861a3aa2f277b1547891a Mon Sep 17 00:00:00 2001 From: David Mayer Date: Mon, 8 Dec 2025 10:34:55 -0500 Subject: [PATCH 09/15] re-org, add dep --- NAMESPACE | 1 + R/get_github_by_topic.R | 14 ++++++++++++-- man/get_contributor_count.Rd | 21 +++++++++++++++++++++ 3 files changed, 34 insertions(+), 2 deletions(-) create mode 100644 man/get_contributor_count.Rd diff --git a/NAMESPACE b/NAMESPACE index 1d9ae96..62d217b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -38,6 +38,7 @@ importFrom(httr2,req_method) importFrom(httr2,req_perform) importFrom(httr2,req_perform_parallel) importFrom(httr2,req_retry) +importFrom(httr2,req_throttle) importFrom(httr2,req_url_query) importFrom(httr2,request) importFrom(httr2,resp_body_json) diff --git a/R/get_github_by_topic.R b/R/get_github_by_topic.R index c746cdf..06ebe10 100644 --- a/R/get_github_by_topic.R +++ b/R/get_github_by_topic.R @@ -1,5 +1,14 @@ -# Helper to count contributors - get_contributor_count <- function(owner, repo, token = NULL) { +# REST API Helper Functions +#' Get the number of contributors for a GitHub repository +#' +#' @param owner The owner of the GitHub repository +#' @param repo The name of the GitHub repository +#' @param token A GitHub personal access token (optional) +#' +#' @importFrom httr2 req_throttle +#' +#' @return The number of contributors to the repository +get_contributor_count <- function(owner, repo, token = NULL) { base_url <- 
glue("https://api.github.com/repos/{owner}/{repo}/contributors") req <- request(base_url) |> req_url_query(per_page = 1, anon = "false") |> # anon=TRUE counts contributors without accounts @@ -27,6 +36,7 @@ } } +# REST Functions #' Get GitHub Repositories by Topic #' #' @param topics A vector of GitHub topics to search for. diff --git a/man/get_contributor_count.Rd b/man/get_contributor_count.Rd new file mode 100644 index 0000000..559dfa4 --- /dev/null +++ b/man/get_contributor_count.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_github_by_topic.R +\name{get_contributor_count} +\alias{get_contributor_count} +\title{Get the number of contributors for a GitHub repository} +\usage{ +get_contributor_count(owner, repo, token = NULL) +} +\arguments{ +\item{owner}{The owner of the GitHub repository} + +\item{repo}{The name of the GitHub repository} + +\item{token}{A GitHub personal access token (optional)} +} +\value{ +The number of contributors to the repository +} +\description{ +Get the number of contributors for a GitHub repository +} From 71f822035ecd082ff687fcdadaa04922f0db6756 Mon Sep 17 00:00:00 2001 From: David Mayer Date: Mon, 8 Dec 2025 10:41:58 -0500 Subject: [PATCH 10/15] add REST API limits for GitHub Search --- R/get_github_by_topic.R | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/R/get_github_by_topic.R b/R/get_github_by_topic.R index 06ebe10..5bbfe5e 100644 --- a/R/get_github_by_topic.R +++ b/R/get_github_by_topic.R @@ -84,7 +84,12 @@ get_github_by_topic <- function(topics, token = NULL, limit = 30) { req_url_query(q = q_topic, per_page = limit) if (!is.null(token)) { - req_topic <- req_topic |> req_auth_bearer_token(token) + req_topic <- req_topic |> + req_auth_bearer_token(token) |> + req_throttle(capacity = 30, fill_time_s = 60) + } else { + req_topic <- req_topic |> + req_throttle(capacity = 10, fill_time_s = 60) } resp_topic <- req_perform(req_topic) From 
26faf4c72a22f9e55f2af7136952a6cb6c9f204c Mon Sep 17 00:00:00 2001 From: David Mayer Date: Mon, 8 Dec 2025 10:59:51 -0500 Subject: [PATCH 11/15] share rate limit pools across similar api calls - this likely would have occurred as when realm is unset, it defaults to hostname. but this calms my ocd --- R/get_github_by_topic.R | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/R/get_github_by_topic.R b/R/get_github_by_topic.R index 5bbfe5e..5892230 100644 --- a/R/get_github_by_topic.R +++ b/R/get_github_by_topic.R @@ -16,10 +16,10 @@ get_contributor_count <- function(owner, repo, token = NULL) { if (!is.null(token)) { req <- req |> req_auth_bearer_token(token) |> - req_throttle(capacity = 5000, fill_time_s = 3600) + req_throttle(capacity = 5000, fill_time_s = 3600, realm = "github_authenticated") } else { req <- req |> - req_throttle(capacity = 60, fill_time_s = 3600) + req_throttle(capacity = 60, fill_time_s = 3600, realm = "github_anonymous") } resp <- tryCatch(req_perform(req), error = function(e) NULL) @@ -86,10 +86,10 @@ get_github_by_topic <- function(topics, token = NULL, limit = 30) { if (!is.null(token)) { req_topic <- req_topic |> req_auth_bearer_token(token) |> - req_throttle(capacity = 30, fill_time_s = 60) + req_throttle(capacity = 30, fill_time_s = 60, realm = "github_search_authenticated") } else { req_topic <- req_topic |> - req_throttle(capacity = 10, fill_time_s = 60) + req_throttle(capacity = 10, fill_time_s = 60, realm = "github_search_anonymous") } resp_topic <- req_perform(req_topic) @@ -108,7 +108,14 @@ get_github_by_topic <- function(topics, token = NULL, limit = 30) { req <- httr2::request(url) |> httr2::req_headers("User-Agent" = "httr2", "X-GitHub-Api-Version" = "2022-11-28") - if (!is.null(token)) req <- req |> httr2::req_auth_bearer_token(token) + if (!is.null(token)) { + req <- req |> + httr2::req_auth_bearer_token(token) |> + req_throttle(capacity = 5000, fill_time_s = 3600, realm = 
"github_authenticated") + } else { + req <- req |> + req_throttle(capacity = 60, fill_time_s = 3600, realm = "github_anonymous") + } resp <- tryCatch(httr2::req_perform(req), error = function(e) NULL) if (is.null(resp) || httr2::resp_status(resp) != 200) return(NA_real_) @@ -141,7 +148,12 @@ get_github_by_topic <- function(topics, token = NULL, limit = 30) { req_url_query(state = state, per_page = 1) |> req_headers("User-Agent" = "httr2") if (!is.null(token)) { - req <- req |> req_auth_bearer_token(token) + req <- req |> + req_auth_bearer_token(token) |> + req_throttle(capacity = 5000, fill_time_s = 3600, realm = "github_authenticated") + } else { + req <- req |> + req_throttle(capacity = 60, fill_time_s = 3600, realm = "github_anonymous") } resp <- tryCatch(req_perform(req), error = function(e) NULL) @@ -165,8 +177,13 @@ get_github_by_topic <- function(topics, token = NULL, limit = 30) { req_url_query(q = q, per_page = 1) |> req_headers("User-Agent" = "httr2") if (!is.null(token)) { - req <- req |> req_auth_bearer_token(token) - } + req <- req |> + req_auth_bearer_token(token) |> + req_throttle(capacity = 30, fill_time_s = 60, realm = "github_search_authenticated") + } else { + req <- req |> + req_throttle(capacity = 10, fill_time_s = 60, realm = "github_search_anonymous") + } resp <- tryCatch(req_perform(req), error = function(e) NULL) if (!is.null(resp) && resp_status(resp) == 200) { return(resp_body_json(resp)$total_count) @@ -192,7 +209,12 @@ get_github_by_topic <- function(topics, token = NULL, limit = 30) { req_url_query(sha = branch, per_page = 1) |> req_headers("User-Agent" = "httr2") if (!is.null(token)) { - req <- req |> req_auth_bearer_token(token) + req <- req |> + req_auth_bearer_token(token) |> + req_throttle(capacity = 5000, fill_time_s = 3600, realm = "github_authenticated") + } else { + req <- req |> + req_throttle(capacity = 60, fill_time_s = 3600, realm = "github_anonymous") } resp <- tryCatch(req_perform(req), error = function(e) NULL) From 
ecaa55838ab4537784b46937223eb7d00a43ee34 Mon Sep 17 00:00:00 2001 From: David Mayer Date: Mon, 8 Dec 2025 18:07:23 -0500 Subject: [PATCH 12/15] some groups have LOTS of repos... - utilize pagination in graphQL requests - implement progress bar - update test --- DESCRIPTION | 3 +- NAMESPACE | 3 + R/get_github_by_topic_graphQL.R | 102 ++++++++++++------ .../test-get_github_by_topic_graphQL.R | 11 +- 4 files changed, 81 insertions(+), 38 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index b436e46..9951a41 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -33,7 +33,8 @@ Imports: openxlsx, tidyr, rvest, - readr + readr, + cli Suggests: gargle, gitcreds, diff --git a/NAMESPACE b/NAMESPACE index 62d217b..60420b6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -16,6 +16,9 @@ export(icite) import(dplyr) import(httr2) import(jsonlite) +importFrom(cli,cli_progress_bar) +importFrom(cli,cli_progress_done) +importFrom(cli,cli_progress_update) importFrom(dplyr,arrange) importFrom(dplyr,as_tibble) importFrom(dplyr,bind_rows) diff --git a/R/get_github_by_topic_graphQL.R b/R/get_github_by_topic_graphQL.R index 359b714..89cf6ef 100644 --- a/R/get_github_by_topic_graphQL.R +++ b/R/get_github_by_topic_graphQL.R @@ -4,6 +4,8 @@ #' @param token A GitHub personal access token. Required for GraphQL API. #' @param limit The maximum number of repositories to return per topic (max 1000 by GitHub). 
#' +#' @importFrom cli cli_progress_bar cli_progress_update cli_progress_done +#' @importFrom dplyr bind_rows #' @importFrom ghql GraphqlClient #' @importFrom jsonlite fromJSON #' @importFrom tibble tibble @@ -24,25 +26,27 @@ get_github_by_topic_graphql <- function(topics, token, limit = 30) { if (missing(token) || is.null(token)) { stop("A GitHub personal access token is required for GraphQL API.") } - if (length(topics) == 0) { stop("At least one topic must be provided.") } topics <- tolower(topics) - # Set up client + # Setup client cli <- ghql::GraphqlClient$new( url = "https://api.github.com/graphql", headers = list(Authorization = paste0("bearer ", token)) ) - # GraphQL query template - # Fetches up to `limit` repos per topic, along with key metadata + # GraphQL query template with pagination query_template <- ' - query($queryString: String!, $limit: Int!) { - search(query: $queryString, type: REPOSITORY, first: $limit) { + query($queryString: String!, $limit: Int!, $cursor: String) { + search(query: $queryString, type: REPOSITORY, first: $limit, after: $cursor) { repositoryCount + pageInfo { + hasNextPage + endCursor + } nodes { ... 
on Repository { name @@ -85,16 +89,37 @@ get_github_by_topic_graphql <- function(topics, token, limit = 30) { qry <- ghql::Query$new() qry$query("repoQuery", query_template) + # Pagination-enabled requester fetch_topic <- function(topic) { - res <- cli$exec( - qry$queries$repoQuery, - variables = list(queryString = paste0("topic:", topic), - limit = limit) - ) - dat <- jsonlite::fromJSON(res, flatten = TRUE) - repos <- dat$data$search$nodes - if (length(repos) == 0) return(NULL) - # browser() + cursor <- NULL + all_repos <- list() + + repeat { + res <- cli$exec( + qry$queries$repoQuery, + variables = list( + queryString = paste0("topic:", topic), + limit = limit, + cursor = cursor + ) + ) + + dat <- jsonlite::fromJSON(res, flatten = TRUE) + search <- dat$data$search + repos <- search$nodes + + if (length(repos) > 0) { + all_repos <- append(all_repos, list(repos)) + } + + if (!isTRUE(search$pageInfo$hasNextPage)) break + cursor <- search$pageInfo$endCursor + } + + if (length(all_repos) == 0) return(NULL) + + repos <- dplyr::bind_rows(all_repos) + tibble::tibble( name = repos$name, owner = repos$owner.login, @@ -152,32 +177,45 @@ get_github_by_topic_graphql <- function(topics, token, limit = 30) { ) } - # Fetch across all topics and combine - df <- purrr::map_dfr(topics, fetch_topic) + results_list <- vector("list", length(topics)) ## initialize results + cli::cli_progress_bar("Requesting repositories by topic", total = length(topics)) + + for (i in seq_along(topics)) { + results_list[[i]] <- fetch_topic(topics[i]) + cli::cli_progress_update() + } + + cli::cli_progress_done() + results <- dplyr::bind_rows(results_list) - if (nrow(df) == 0) { + # If nothing is found + if (nrow(results) == 0) { expected_cols <- c( "name", "owner", "description", "stars", "watchers", "forks", "open_issues", "open_prs", - "closed_issues", "closed_prs", "commits", "mentionable_users", "contributors","has_readme", "code_of_conduct", - "tags", "language", "language_loc", "license", 
"created_at", "pushed_at", "updated_at", "html_url", - "queried_topic" + "closed_issues", "closed_prs", "commits", "mentionable_users", "contributors", + "has_readme", "code_of_conduct", "tags", "language", "language_loc", + "license", "created_at", "pushed_at", "updated_at", "html_url", "queried_topic" ) - df <- tibble::tibble( + + return( + tibble::tibble( !!!setNames(rep(list(NA), length(expected_cols)), expected_cols) - ) %>% - filter(!is.na(.data$name)) - } else { - df <- df |> - mutate(contributors = map2_dbl(.data$owner, .data$name, ~ get_contributor_count(.x, .y, token = token))) - } - + ) |> filter(!is.na(.data$name)) + ) + } + + # Contributor count + results <- results |> + dplyr::mutate(contributors = map2_dbl(.data$owner, .data$name, ~ + get_contributor_count(.x, .y, token = token) + )) + # Organize columns like before - df <- df |> + results |> dplyr::relocate('open_prs', .after = 'open_issues') |> dplyr::relocate('closed_issues', .after = 'open_prs') |> dplyr::relocate('closed_prs', .after = 'closed_issues') |> dplyr::relocate('commits', .after = 'closed_prs') |> - dplyr::relocate('mentionable_users', .after = 'commits') |> + dplyr::relocate('mentionable_users', .after = 'commits') |> dplyr::relocate('contributors', .after = 'mentionable_users') - return(df) } diff --git a/tests/testthat/test-get_github_by_topic_graphQL.R b/tests/testthat/test-get_github_by_topic_graphQL.R index 0d5d6e0..409a8ee 100644 --- a/tests/testthat/test-get_github_by_topic_graphQL.R +++ b/tests/testthat/test-get_github_by_topic_graphQL.R @@ -22,12 +22,13 @@ test_that("Returned data frame contains expected columns", { skip_if_offline() skip_on_ci() - result <- get_github_by_topic_graphql(c("bioinformatics"), token = gitcreds_get()$password, limit = 3) + result <- get_github_by_topic_graphql(c("u24ca289073"), token = gitcreds_get()$password, limit = 3) expected_cols <- c( - "name", "owner", "description", "stars", "watchers", "forks", "open_issues", "open_prs", - 
"closed_issues", "closed_prs", "commits", "contributors", "tags", "language", "license", - "created_at", "pushed_at", "updated_at", "html_url" - ) + "name", "owner", "description", "stars", "watchers", "forks", "open_issues", "open_prs", + "closed_issues", "closed_prs", "commits", "mentionable_users", "contributors", + "has_readme", "code_of_conduct", "tags", "language", "language_loc", + "license", "created_at", "pushed_at", "updated_at", "html_url", "queried_topic" + ) expect_true(all(expected_cols %in% names(result))) }) From da480d6c8f947018ee1d78732e81f9889df80f01 Mon Sep 17 00:00:00 2001 From: David Mayer Date: Mon, 8 Dec 2025 18:21:25 -0500 Subject: [PATCH 13/15] nope --- R/get_github_by_topic_graphQL.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/R/get_github_by_topic_graphQL.R b/R/get_github_by_topic_graphQL.R index 89cf6ef..db3f141 100644 --- a/R/get_github_by_topic_graphQL.R +++ b/R/get_github_by_topic_graphQL.R @@ -29,6 +29,9 @@ get_github_by_topic_graphql <- function(topics, token, limit = 30) { if (length(topics) == 0) { stop("At least one topic must be provided.") } + if (limit > 30) { + stop("limit must be 30 or less") ## results are good. 
more than this and GitHub will nope out + } topics <- tolower(topics) From 03a676790d7b876c90863bd577a02a1a4963368e Mon Sep 17 00:00:00 2001 From: David Mayer Date: Tue, 9 Dec 2025 07:55:22 -0500 Subject: [PATCH 14/15] fix warning --- R/get_github_by_topic_graphQL.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/get_github_by_topic_graphQL.R b/R/get_github_by_topic_graphQL.R index db3f141..0edfec6 100644 --- a/R/get_github_by_topic_graphQL.R +++ b/R/get_github_by_topic_graphQL.R @@ -148,7 +148,7 @@ get_github_by_topic_graphql <- function(topics, token, limit = 30) { }, code_of_conduct = if( 'coc.id' %in% names(repos)) { purrr::map_lgl( - repos$coc, + repos$coc.id, ~ if (is.na(.x) || is.null(.x) || length(.x) == 0) FALSE else TRUE ) } else { FALSE From 7b8aaf3c5b2473bc9872f3b370b18b300aa33a89 Mon Sep 17 00:00:00 2001 From: David Mayer Date: Tue, 9 Dec 2025 08:45:33 -0500 Subject: [PATCH 15/15] add contributing --- R/get_github_by_topic_graphQL.R | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/R/get_github_by_topic_graphQL.R b/R/get_github_by_topic_graphQL.R index 0edfec6..27cc59e 100644 --- a/R/get_github_by_topic_graphQL.R +++ b/R/get_github_by_topic_graphQL.R @@ -81,6 +81,7 @@ get_github_by_topic_graphql <- function(topics, token, limit = 30) { } readme: object(expression: "HEAD:README.md") { id } coc: object(expression: "HEAD:CODE_OF_CONDUCT.md") { id } + contributing: object(expression: "HEAD:CONTRIBUTING.md") { id } languages(first: 20) { edges { size node { name } } } @@ -153,6 +154,13 @@ get_github_by_topic_graphql <- function(topics, token, limit = 30) { ) } else { FALSE }, + contributing = if( 'contributing.id' %in% names(repos)) { + purrr::map_lgl( + repos$contributing.id, + ~ if (is.na(.x) || is.null(.x) || length(.x) == 0) FALSE else TRUE + ) } else { + FALSE + }, tags = purrr::map_chr( repos$repositoryTopics.nodes, ~ if (is.null(.x) || length(.x$topic.name) == 0) NA_character_