diff --git a/DESCRIPTION b/DESCRIPTION
index b436e46..9951a41 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -33,7 +33,8 @@ Imports:
     openxlsx,
     tidyr,
     rvest,
-    readr
+    readr,
+    cli
 Suggests:
     gargle,
     gitcreds,
diff --git a/NAMESPACE b/NAMESPACE
index e44315d..60420b6 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -9,12 +9,16 @@ export(ga_query_explorer)
 export(get_core_project_info)
 export(get_ga_basic)
 export(get_ga_meta_by_id)
+export(get_ga_summary)
 export(get_github_by_topic)
 export(get_github_by_topic_graphql)
 export(icite)
 import(dplyr)
 import(httr2)
 import(jsonlite)
+importFrom(cli,cli_progress_bar)
+importFrom(cli,cli_progress_done)
+importFrom(cli,cli_progress_update)
 importFrom(dplyr,arrange)
 importFrom(dplyr,as_tibble)
 importFrom(dplyr,bind_rows)
@@ -28,6 +32,7 @@ importFrom(glue,glue)
 importFrom(glue,glue_collapse)
 importFrom(googleAnalyticsR,ga_account_list)
 importFrom(googleAnalyticsR,ga_auth)
+importFrom(googleAnalyticsR,ga_data)
 importFrom(httr2,req_auth_bearer_token)
 importFrom(httr2,req_body_json)
 importFrom(httr2,req_error)
@@ -36,6 +41,7 @@ importFrom(httr2,req_method)
 importFrom(httr2,req_perform)
 importFrom(httr2,req_perform_parallel)
 importFrom(httr2,req_retry)
+importFrom(httr2,req_throttle)
 importFrom(httr2,req_url_query)
 importFrom(httr2,request)
 importFrom(httr2,resp_body_json)
diff --git a/R/ga_summary.R b/R/ga_summary.R
new file mode 100644
index 0000000..cc7af93
--- /dev/null
+++ b/R/ga_summary.R
@@ -0,0 +1,31 @@
+#' Get summary metrics from Google Analytics
+#'
+#' This function retrieves summary metrics from Google Analytics,
+#' including total users, new users, engaged sessions, engagement rate,
+#' event count, and screen page views.
+#'
+#' @param propertyId The Google Analytics property ID
+#' @param start_date The start date for the data retrieval (e.g., "30daysAgo")
+#' @param end_date The end date for the data retrieval (e.g., "yesterday")
+#'
+#' @importFrom googleAnalyticsR ga_data
+#' @importFrom dplyr mutate
+#'
+#' @return A tibble containing the summary metrics
+#' @export
+get_ga_summary <- function(propertyId, start_date = "30daysAgo", end_date = "yesterday") {
+  ga_data(
+    propertyId,
+    metrics = c(
+      "totalUsers",
+      "newUsers",
+      "engagedSessions",
+      "engagementRate",
+      "eventCount",
+      "screenPageViews"
+    ),
+    dimensions = "date",
+    date_range = c(start_date, end_date)
+  ) |>
+    mutate(propertyId = propertyId)
+}
diff --git a/R/get_github_by_topic.R b/R/get_github_by_topic.R
index 82cfe54..5892230 100644
--- a/R/get_github_by_topic.R
+++ b/R/get_github_by_topic.R
@@ -1,3 +1,42 @@
+# REST API Helper Functions
+#' Get the number of contributors for a GitHub repository
+#'
+#' @param owner The owner of the GitHub repository
+#' @param repo The name of the GitHub repository
+#' @param token A GitHub personal access token (optional)
+#'
+#' @importFrom httr2 req_throttle
+#'
+#' @return The number of contributors to the repository, or NA if the request fails
+get_contributor_count <- function(owner, repo, token = NULL) {
+  base_url <- glue("https://api.github.com/repos/{owner}/{repo}/contributors")
+  req <- request(base_url) |>
+    req_url_query(per_page = 1, anon = "false") |> # anon=TRUE counts contributors without accounts
+    req_headers("User-Agent" = "httr2")
+  if (!is.null(token)) {
+    req <- req |>
+      req_auth_bearer_token(token) |>
+      req_throttle(capacity = 5000, fill_time_s = 3600, realm = "github_authenticated")
+  } else {
+    req <- req |>
+      req_throttle(capacity = 60, fill_time_s = 3600, realm = "github_anonymous")
+  }
+
+  resp <- tryCatch(req_perform(req), error = function(e) NULL)
+  if (is.null(resp) || resp_status(resp) != 200) return(NA_real_)
+
+  # With per_page = 1 the page number of the rel="last" link equals the count
+  link <- resp_headers(resp)[["link"]]
+  last_page <- character(0)
+  if (!is.null(link)) {
+    last_page <- regmatches(link, regexec("[?&]page=([0-9]+)>; rel=\"last\"", link))[[1]]
+  }
+  if (length(last_page) == 2) return(as.numeric(last_page[2]))
+
+  length(resp_body_json(resp)) # no usable "last" link: count the entries returned
+}
+
+# REST Functions
 #' Get GitHub Repositories by Topic
 #'
 #' @param topics A vector of GitHub topics to search for.
@@ -45,7 +84,12 @@ get_github_by_topic <- function(topics, token = NULL, limit = 30) {
       req_url_query(q = q_topic, per_page = limit)
 
     if (!is.null(token)) {
-      req_topic <- req_topic |> req_auth_bearer_token(token)
+      req_topic <- req_topic |>
+        req_auth_bearer_token(token) |>
+        req_throttle(capacity = 30, fill_time_s = 60, realm = "github_search_authenticated")
+    } else {
+      req_topic <- req_topic |>
+        req_throttle(capacity = 10, fill_time_s = 60, realm = "github_search_anonymous")
     }
 
     resp_topic <- req_perform(req_topic)
@@ -64,7 +108,14 @@ get_github_by_topic <- function(topics, token = NULL, limit = 30) {
     req <- httr2::request(url) |>
       httr2::req_headers("User-Agent" = "httr2", "X-GitHub-Api-Version" = "2022-11-28")
 
-    if (!is.null(token)) req <- req |> httr2::req_auth_bearer_token(token)
+    if (!is.null(token)) {
+      req <- req |>
+        httr2::req_auth_bearer_token(token) |>
+        httr2::req_throttle(capacity = 5000, fill_time_s = 3600, realm = "github_authenticated")
+    } else {
+      req <- req |>
+        httr2::req_throttle(capacity = 60, fill_time_s = 3600, realm = "github_anonymous")
+    }
 
     resp <- tryCatch(httr2::req_perform(req), error = function(e) NULL)
     if (is.null(resp) || httr2::resp_status(resp) != 200) return(NA_real_)
@@ -97,7 +148,12 @@ get_github_by_topic <- function(topics, token = NULL, limit = 30) {
       req_url_query(state = state, per_page = 1) |>
       req_headers("User-Agent" = "httr2")
     if (!is.null(token)) {
-      req <- req |> req_auth_bearer_token(token)
+      req <- req |>
+        req_auth_bearer_token(token) |>
+        req_throttle(capacity = 5000, fill_time_s = 3600, realm = "github_authenticated")
+    } else {
+      req <- req |>
+        req_throttle(capacity = 60, fill_time_s = 3600, realm = "github_anonymous")
     }
 
     resp <- tryCatch(req_perform(req), error = function(e) NULL)
@@ -121,8 +177,13 @@ get_github_by_topic <- function(topics, token = NULL, limit = 30) {
       req_url_query(q = q, per_page = 1) |>
       req_headers("User-Agent" = "httr2")
     if (!is.null(token)) {
-      req <- req |> req_auth_bearer_token(token)
-      }
+      req <- req |>
+        req_auth_bearer_token(token) |>
+        req_throttle(capacity = 30, fill_time_s = 60, realm = "github_search_authenticated")
+    } else {
+      req <- req |>
+        req_throttle(capacity = 10, fill_time_s = 60, realm = "github_search_anonymous")
+    }
     resp <- tryCatch(req_perform(req), error = function(e) NULL)
     if (!is.null(resp) && resp_status(resp) == 200) {
       return(resp_body_json(resp)$total_count)
@@ -148,7 +209,12 @@ get_github_by_topic <- function(topics, token = NULL, limit = 30) {
       req_url_query(sha = branch, per_page = 1) |>
       req_headers("User-Agent" = "httr2")
     if (!is.null(token)) {
-      req <- req |> req_auth_bearer_token(token)
+      req <- req |>
+        req_auth_bearer_token(token) |>
+        req_throttle(capacity = 5000, fill_time_s = 3600, realm = "github_authenticated")
+    } else {
+      req <- req |>
+        req_throttle(capacity = 60, fill_time_s = 3600, realm = "github_anonymous")
     }
 
     resp <- tryCatch(req_perform(req), error = function(e) NULL)
@@ -168,30 +234,6 @@ get_github_by_topic <- function(topics, token = NULL, limit = 30) {
   commit_counts <- map2_dbl(df$owner, df$name, ~ get_commit_count(.x, .y, token = token))
   df$commits <- commit_counts
 
-  # Helper to count contributors
-  get_contributor_count <- function(owner, repo, token = NULL) {
-    base_url <- glue("https://api.github.com/repos/{owner}/{repo}/contributors")
-    req <- request(base_url) |>
-      req_url_query(per_page = 1, anon = "false") |> # anon=TRUE counts contributors without accounts
-      req_headers("User-Agent" = "httr2")
-    if (!is.null(token)) {
-      req <- req |> req_auth_bearer_token(token)
-    }
-
-    resp <- tryCatch(req_perform(req), error = function(e) NULL)
-    if (is.null(resp) || resp_status(resp) != 200) return(NA_real_)
-
-    link <- resp_headers(resp)[["link"]]
-    if (!is.null(link) && grepl("rel=\"last\"", link)) {
-      matches <- regmatches(link, regexpr("page=\\d+>; rel=\\\"last\\\"", link))
-      count <- as.numeric(sub("page=", "", sub(">; rel=\"last\"", "", matches)))
-      return(count)
-    } else {
-      body <- resp_body_json(resp)
-      return(length(body)) # if only a few contributors
-    }
-  }
-
   contributor_counts <- map2_dbl(df$owner, df$name, ~ get_contributor_count(.x, .y, token = token))
   df$contributors <- contributor_counts
 
diff --git a/R/get_github_by_topic_graphQL.R b/R/get_github_by_topic_graphQL.R
index 32d52bb..27cc59e 100644
--- a/R/get_github_by_topic_graphQL.R
+++ b/R/get_github_by_topic_graphQL.R
@@ -4,6 +4,8 @@
 #' @param token A GitHub personal access token. Required for GraphQL API.
 #' @param limit The maximum number of repositories to return per topic (max 1000 by GitHub).
 #'
+#' @importFrom cli cli_progress_bar cli_progress_update cli_progress_done
+#' @importFrom dplyr bind_rows
 #' @importFrom ghql GraphqlClient
 #' @importFrom jsonlite fromJSON
 #' @importFrom tibble tibble
@@ -24,25 +26,30 @@ get_github_by_topic_graphql <- function(topics, token, limit = 30) {
   if (missing(token) || is.null(token)) {
     stop("A GitHub personal access token is required for GraphQL API.")
   }
-
   if (length(topics) == 0) {
     stop("At least one topic must be provided.")
   }
+  if (limit > 30) {
+    stop("limit must be 30 or less") # limit is the page size; larger pages make GitHub fail this query
+  }
 
   topics <- tolower(topics)
-
-  # Set up client
+
+  # Setup client
   cli <- ghql::GraphqlClient$new(
     url = "https://api.github.com/graphql",
     headers = list(Authorization = paste0("bearer ", token))
   )
-
-  # GraphQL query template
-  # Fetches up to `limit` repos per topic, along with key metadata
+
+  # GraphQL query template with pagination
   query_template <- '
-  query($queryString: String!, $limit: Int!) {
-    search(query: $queryString, type: REPOSITORY, first: $limit) {
+  query($queryString: String!, $limit: Int!, $cursor: String) {
+    search(query: $queryString, type: REPOSITORY, first: $limit, after: $cursor) {
       repositoryCount
+      pageInfo {
+        hasNextPage
+        endCursor
+      }
       nodes {
         ... on Repository {
           name
@@ -64,91 +71,144 @@ get_github_by_topic_graphql <- function(topics, token, limit = 30) {
           defaultBranchRef {
             target {
               ... on Commit {
-                history {
-                  totalCount
-                }
+                history { totalCount }
               }
             }
           }
-          mentionableUsers {
-            totalCount
-          }
+          mentionableUsers { totalCount }
           repositoryTopics(first: 20) {
-            nodes {
-              topic { name }
-            }
+            nodes { topic { name } }
+          }
+          readme: object(expression: "HEAD:README.md") { id }
+          coc: object(expression: "HEAD:CODE_OF_CONDUCT.md") { id }
+          contributing: object(expression: "HEAD:CONTRIBUTING.md") { id }
+          languages(first: 20) {
+            edges { size node { name } }
           }
         }
       }
     }
   }'
-
-  # Compile query once
+
   qry <- ghql::Query$new()
   qry$query("repoQuery", query_template)
-
-  # Helper: fetch repos for one topic
+
+  # Pagination-enabled requester
   fetch_topic <- function(topic) {
-    res <- cli$exec(qry$queries$repoQuery,
-                    variables = list(queryString = paste0("topic:", topic),
-                                     limit = limit))
-    dat <- jsonlite::fromJSON(res, flatten = TRUE)
-    repos <- dat$data$search$nodes
-    if (length(repos) == 0) return(NULL)
-    # browser()
+    cursor <- NULL
+    all_repos <- list()
+
+    repeat {
+      # Leave `cursor` out on the first page rather than sending a NULL list element
+      vars <- list(queryString = paste0("topic:", topic), limit = limit)
+      if (!is.null(cursor)) vars$cursor <- cursor
+      res <- cli$exec(qry$queries$repoQuery, variables = vars)
+
+      dat <- jsonlite::fromJSON(res, flatten = TRUE)
+      search <- dat$data$search
+      repos <- search$nodes
+
+      if (length(repos) > 0) {
+        all_repos <- append(all_repos, list(repos))
+      }
+
+      if (!isTRUE(search$pageInfo$hasNextPage)) break
+      cursor <- search$pageInfo$endCursor
+    }
+
+    if (length(all_repos) == 0) return(NULL)
+
+    repos <- dplyr::bind_rows(all_repos)
+
+    # TRUE where the aliased object() lookup returned an id; FALSE if the column is absent
+    has_file <- function(col) {
+      if (col %in% names(repos)) !is.na(repos[[col]]) else FALSE
+    }
+
     tibble::tibble(
-      name = repos$name,
-      owner = repos$owner.login,
-      description = repos$description,
-      stars = repos$stargazerCount,
-      watchers = repos$watchers.totalCount,
-      forks = repos$forkCount,
-      open_issues = repos$issues.totalCount,
-      closed_issues = repos$closedIssues.totalCount,
-      open_prs = repos$openPRs.totalCount,
-      closed_prs = repos$closedPRs.totalCount,
-      commits = purrr::map_dbl(
-        repos$defaultBranchRef.target.history.totalCount,
-        ~ if (is.null(.x)) NA_real_ else .x
-      ),
-      contributors = repos$mentionableUsers.totalCount %||% 0,
-      tags = purrr::map_chr(
-        repos$repositoryTopics.nodes,
-        ~ if (length(.x$topic.name) == 0) NA_character_
-        else paste(.x$topic.name, collapse = ", ")
-      ),
-      language = repos$primaryLanguage.name,
-      license = repos$licenseInfo.name,
-      created_at = repos$createdAt,
-      pushed_at = repos$pushedAt,
-      updated_at = repos$updatedAt,
-      html_url = repos$url,
-      queried_topic = topic
+      name = repos$name,
+      owner = repos$owner.login,
+      description = repos$description,
+      stars = repos$stargazerCount,
+      watchers = repos$watchers.totalCount,
+      forks = repos$forkCount,
+      open_issues = repos$issues.totalCount,
+      closed_issues = repos$closedIssues.totalCount,
+      open_prs = repos$openPRs.totalCount,
+      closed_prs = repos$closedPRs.totalCount,
+      commits = purrr::map_dbl(
+        repos$defaultBranchRef.target.history.totalCount,
+        ~ if (is.null(.x)) NA_real_ else .x
+      ),
+      mentionable_users = repos$mentionableUsers.totalCount %||% 0,
+      has_readme = has_file("readme.id"),
+      code_of_conduct = has_file("coc.id"),
+      contributing = has_file("contributing.id"),
+      tags = purrr::map_chr(
+        repos$repositoryTopics.nodes,
+        ~ if (is.null(.x) || length(.x$topic.name) == 0) NA_character_
+        else paste(.x$topic.name, collapse = ", ")
+      ),
+      language = repos$primaryLanguage.name,
+      language_loc = purrr::map(
+        repos$languages.edges,
+        ~ if (is.null(.x)) {
+          tibble::tibble(language = NA_character_, bytes = NA_real_, loc = NA_real_)
+        } else {
+          tibble::tibble(
+            language = .x$node.name,
+            bytes = .x$size,
+            loc = round(.x$size / 50) # rough estimate: treats every 50 bytes as one line
+          )
+        }
+      ),
+      license = repos$licenseInfo.name %||% NA_character_,
+      created_at = repos$createdAt,
+      pushed_at = repos$pushedAt,
+      updated_at = repos$updatedAt,
+      html_url = repos$url,
+      queried_topic = topic
     )
+  }
+  results_list <- vector("list", length(topics)) ## initialize results
+  cli::cli_progress_bar("Requesting repositories by topic", total = length(topics))
+
+  for (i in seq_along(topics)) {
+    results_list[[i]] <- fetch_topic(topics[i])
+    cli::cli_progress_update()
   }
 
-  # Fetch across all topics and combine
-  df <- purrr::map_dfr(topics, fetch_topic)
+  cli::cli_progress_done()
+  results <- dplyr::bind_rows(results_list)
 
-  if (nrow(df) == 0) {
+  # If nothing is found
+  if (nrow(results) == 0) {
     expected_cols <- c(
       "name", "owner", "description", "stars", "watchers", "forks", "open_issues", "open_prs",
-      "closed_issues", "closed_prs", "commits", "contributors", "tags", "language", "license",
-      "created_at", "pushed_at", "updated_at", "html_url", "queried_topic"
+      "closed_issues", "closed_prs", "commits", "mentionable_users", "contributors",
+      "has_readme", "code_of_conduct", "contributing", "tags", "language", "language_loc",
+      "license", "created_at", "pushed_at", "updated_at", "html_url", "queried_topic"
     )
-    df <- tibble::tibble(
+
+    return(
+      tibble::tibble(
       !!!setNames(rep(list(NA), length(expected_cols)), expected_cols)
-    ) %>%
-      filter(!is.na(.data$name))
+    ) |> dplyr::filter(!is.na(.data$name))
+    )
   }
 
+  # Contributor count
+  results <- results |>
+    dplyr::mutate(contributors = purrr::map2_dbl(.data$owner, .data$name, ~
+      get_contributor_count(.x, .y, token = token)
+    ))
+
   # Organize columns like before
-  df <- df |>
+  results |>
     dplyr::relocate('open_prs', .after = 'open_issues') |>
     dplyr::relocate('closed_issues', .after = 'open_prs') |>
     dplyr::relocate('closed_prs', .after = 'closed_issues') |>
     dplyr::relocate('commits', .after = 'closed_prs') |>
-    dplyr::relocate('contributors', .after = 'commits')
-
-  return(df)
+    dplyr::relocate('mentionable_users', .after = 'commits') |>
+    dplyr::relocate('contributors', .after = 'mentionable_users')
 }
diff --git a/man/get_contributor_count.Rd b/man/get_contributor_count.Rd
new file mode 100644
index 0000000..559dfa4
--- /dev/null
+++ b/man/get_contributor_count.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_github_by_topic.R
+\name{get_contributor_count}
+\alias{get_contributor_count}
+\title{Get the number of contributors for a GitHub repository}
+\usage{
+get_contributor_count(owner, repo, token = NULL)
+}
+\arguments{
+\item{owner}{The owner of the GitHub repository}
+
+\item{repo}{The name of the GitHub repository}
+
+\item{token}{A GitHub personal access token (optional)}
+}
+\value{
+The number of contributors to the repository, or NA if the request fails
+}
+\description{
+Get the number of contributors for a GitHub repository
+}
diff --git a/man/get_ga_summary.Rd b/man/get_ga_summary.Rd
new file mode 100644
index 0000000..623daa0
--- /dev/null
+++ b/man/get_ga_summary.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ga_summary.R
+\name{get_ga_summary}
+\alias{get_ga_summary}
+\title{Get summary metrics from Google Analytics}
+\usage{
+get_ga_summary(propertyId, start_date = "30daysAgo", end_date = "yesterday")
+}
+\arguments{
+\item{propertyId}{The Google Analytics property ID}
+
+\item{start_date}{The start date for the data retrieval (e.g., "30daysAgo")}
+
+\item{end_date}{The end date for the data retrieval (e.g., "yesterday")}
+}
+\value{
+A tibble containing the summary metrics
+}
+\description{
+This function retrieves summary metrics from Google Analytics,
+including total users, new users, engaged sessions, engagement rate,
+event count, and screen page views.
+}
diff --git a/tests/testthat/test-get_github_by_topic_graphQL.R b/tests/testthat/test-get_github_by_topic_graphQL.R
index 0d5d6e0..409a8ee 100644
--- a/tests/testthat/test-get_github_by_topic_graphQL.R
+++ b/tests/testthat/test-get_github_by_topic_graphQL.R
@@ -22,12 +22,13 @@ test_that("Returned data frame contains expected columns", {
   skip_if_offline()
   skip_on_ci()
 
-  result <- get_github_by_topic_graphql(c("bioinformatics"), token = gitcreds_get()$password, limit = 3)
+  result <- get_github_by_topic_graphql(c("u24ca289073"), token = gitcreds_get()$password, limit = 3)
   expected_cols <- c(
-    "name", "owner", "description", "stars", "watchers", "forks", "open_issues", "open_prs",
-    "closed_issues", "closed_prs", "commits", "contributors", "tags", "language", "license",
-    "created_at", "pushed_at", "updated_at", "html_url"
-  )
+    "name", "owner", "description", "stars", "watchers", "forks", "open_issues", "open_prs",
+    "closed_issues", "closed_prs", "commits", "mentionable_users", "contributors",
+    "has_readme", "code_of_conduct", "contributing", "tags", "language", "language_loc",
+    "license", "created_at", "pushed_at", "updated_at", "html_url", "queried_topic"
+  )
 
   expect_true(all(expected_cols %in% names(result)))
 })