def ignored_user(username):
    """Decide whether ``username`` should be left out of contributor lists.

    The helper reads two names from its enclosing scope: ``bot_users`` (the
    set of logins flagged as bots by the GraphQL query) and
    ``ignored_contributors`` (optional ``fnmatch`` patterns).

    Parameters
    ----------
    username : str or None
        Login to test. Empty or missing logins are never ignored.

    Returns
    -------
    bool
        True if the login is a detected bot, looks like a bot by name, or
        matches one of the ignore patterns; False otherwise.
    """
    if not username:
        return False

    # A bot commonly shows up as `name` in the GitHub data and as `name[bot]`
    # in commits, so compare both spellings against the detected bot logins.
    # Only a trailing marker is stripped; `str.replace` would also remove it
    # from the middle of a name.
    bot_suffix = "[bot]"
    lowered = username.lower()
    has_bot_suffix = lowered.endswith(bot_suffix)
    base_name = username[: -len(bot_suffix)] if has_bot_suffix else username
    if username in bot_users or base_name in bot_users:
        return True

    # Name-based fallback for bots the GraphQL query did not flag, e.g.
    # `github-actions[bot]`, `codecov[bot]`, `renovate-bot`, `release-bot`.
    # The markers are anchored at the end of the login: a substring test
    # would also discard people such as `jane-botha`.
    if has_bot_suffix or lowered.endswith("-bot"):
        return True

    # Finally honour the user-specified ignore patterns.
    return bool(ignored_contributors) and any(
        fnmatch.fnmatch(username, pattern) for pattern in ignored_contributors
    )


def is_bot(user_dict):
    """Check if a GraphQL user object represents a bot account.

    Parameters
    ----------
    user_dict : dict or None
        User object from the query result; may be missing or empty.

    Returns
    -------
    bool
        True only when the object's ``__typename`` is ``"Bot"``.
    """
    # Guard explicitly: `user_dict and ...` would hand back the falsy operand
    # itself (None or {}) instead of a real boolean.
    if not user_dict:
        return False
    return user_dict.get("__typename") == "Bot"
@mark.integration
def test_bot_filtering(file_regression):
    """Test that bot users are detected and filtered from output.

    Queries a real release range, checks that the detected bot logins are
    kept in ``DataFrame.attrs``, and verifies that none of them is credited
    in the generated markdown.
    """
    from github_activity.github_activity import generate_activity_md, get_activity

    # Use jupyter-book/mystmd because it's a small release, and we know
    # there's bot activity in it.
    release = {
        "target": "jupyter-book/mystmd",
        "since": "mystmd@1.6.5",
        "until": "mystmd@1.6.6",
    }
    data = get_activity(**release)

    # Verify bot_users attrs exists and was preserved (catches the concat bug)
    assert "bot_users" in data.attrs, "bot_users should be in DataFrame attrs"

    # Verify we actually detected some bots
    bot_users = data.attrs["bot_users"]
    assert len(bot_users) > 0, "Should have detected bot users in this release"

    md = generate_activity_md(**release)

    # Check directly that no detected bot is credited, instead of relying
    # only on the regression file (which can be regenerated with bots in it).
    # Logins appear as `[@login](...)` in the entries of the expected output
    # and as `@login (` in its contributor footer.
    leaked = sorted(
        bot for bot in bot_users if f"[@{bot}](" in md or f"@{bot} (" in md
    )
    assert not leaked, f"Bot accounts leaked into the changelog: {leaked}"

    # Keep the regression baseline so formatting changes are still caught
    file_regression.check(md, extension=".md")
[#2456](https://github.com/jupyter-book/mystmd/pull/2456) ([@stefanv](https://github.com/stefanv)) +- 🚀 Release [#2455](https://github.com/jupyter-book/mystmd/pull/2455) ([@stefanv](https://github.com/stefanv)) +- 🚀 Release [#2416](https://github.com/jupyter-book/mystmd/pull/2416) ([@bsipocz](https://github.com/bsipocz), [@choldgraf](https://github.com/choldgraf), [@stefanv](https://github.com/stefanv)) + +## Contributors to this release + +The following people contributed discussions, new ideas, code and documentation contributions, and review. +See [our definition of contributors](https://github-activity.readthedocs.io/en/latest/#how-does-this-tool-define-contributions-in-the-reports). + +([GitHub contributors page for this release](https://github.com/jupyter-book/mystmd/graphs/contributors?from=2025-11-18&to=2025-11-19&type=c)) + +@bsipocz ([activity](https://github.com/search?q=repo%3Ajupyter-book%2Fmystmd+involves%3Absipocz+updated%3A2025-11-18..2025-11-19&type=Issues)) | @choldgraf ([activity](https://github.com/search?q=repo%3Ajupyter-book%2Fmystmd+involves%3Acholdgraf+updated%3A2025-11-18..2025-11-19&type=Issues)) | @jukent ([activity](https://github.com/search?q=repo%3Ajupyter-book%2Fmystmd+involves%3Ajukent+updated%3A2025-11-18..2025-11-19&type=Issues)) | @stefanv ([activity](https://github.com/search?q=repo%3Ajupyter-book%2Fmystmd+involves%3Astefanv+updated%3A2025-11-18..2025-11-19&type=Issues))