From 3c34fcf0c55924b55fbceb73bf9ea8209bf7d76d Mon Sep 17 00:00:00 2001 From: Pavel Tsakalidis Date: Sat, 6 Jul 2024 21:30:05 +0100 Subject: [PATCH 1/6] Implement support for --repo-name to scan a specific repo --- src/cmdline.py | 17 + src/config/config.py | 3 + src/downloader/download.py | 23 + tests/integration/integration_consts.py | 15 + .../structures_json/demo-1-index.json | 415 ++++++++++++++++++ .../integration/test_single_repo_download.py | 28 ++ tests/tests_init.py | 9 +- 7 files changed, 509 insertions(+), 1 deletion(-) create mode 100644 tests/integration/structures_json/demo-1-index.json create mode 100644 tests/integration/test_single_repo_download.py diff --git a/src/cmdline.py b/src/cmdline.py index aed8c03..2c642eb 100644 --- a/src/cmdline.py +++ b/src/cmdline.py @@ -4,6 +4,7 @@ from src.downloader.download import ( download_all_workflows_and_actions, download_account_workflows_and_actions, + download_repo_workflows_and_actions, ) from src.indexer.index import index_downloaded_workflows_and_actions from src.reporter.report import generate @@ -24,6 +25,7 @@ REDIS_CLEAN_DEFAULT, DOWNLOAD_COMMAND, DOWNLOAD_ACCOUNT_COMMAND, + DOWNLOAD_REPO_COMMAND, DOWNLOAD_CRAWL_COMMAND, INDEX_COMMAND, REPORT_COMMAND, @@ -39,6 +41,7 @@ DOWNLOAD_COMMAND: { DOWNLOAD_CRAWL_COMMAND: download_all_workflows_and_actions, DOWNLOAD_ACCOUNT_COMMAND: download_account_workflows_and_actions, + DOWNLOAD_REPO_COMMAND: download_repo_workflows_and_actions, }, INDEX_COMMAND: index_downloaded_workflows_and_actions, REPORT_COMMAND: generate, @@ -165,6 +168,20 @@ def raven() -> None: help="Download repositories owned by the authenticated user", ) + repo_download_parser = download_sub_parser.add_parser( + "repo", + help="Download specific repository", + parents=[download_parser_options, redis_parser], + ) + + repo_download_parser.add_argument( + "--repo-name", + required=True, + action="append", + type=str, + help="Repository to download" + ) + crawl_download_parser.add_argument( "--max-stars", type=int, help="Maximum number of stars for a repository" ) diff --git a/src/config/config.py b/src/config/config.py index 9a47471..38b881f 100644 --- a/src/config/config.py +++ b/src/config/config.py @@ -38,6 +38,7 @@ # CLI commands DOWNLOAD_COMMAND = "download" DOWNLOAD_ACCOUNT_COMMAND = "account" +DOWNLOAD_REPO_COMMAND = "repo" DOWNLOAD_CRAWL_COMMAND = "crawl" INDEX_COMMAND = "index" REPORT_COMMAND = "report" @@ -71,6 +72,7 @@ def load_downloader_config(args) -> None: Config.min_stars = args.get("min_stars", MIN_STARS_DEFAULT) Config.max_stars = args.get("max_stars") Config.account_name = args.get("account_name") + Config.repo_name = args.get("repo_name") Config.personal = args.get("personal") Config.clean_redis = args.get("clean_redis", REDIS_CLEAN_DEFAULT) @@ -143,6 +145,7 @@ class Config: min_stars: int = None max_stars: int = None account_name: list[str] = [] + repo_name: list[str] = [] personal: bool = None # Indexer Configs diff --git a/src/downloader/download.py b/src/downloader/download.py index 796f66c..b95fc6d 100644 --- a/src/downloader/download.py +++ b/src/downloader/download.py @@ -229,3 +229,26 @@ def download_action_or_reusable_workflow(uses_string: str, repo: str) -> None: ) # In the future, ref will be with commit sha add_ref_pointer_to_redis(full_path, full_path) + + +def download_repo_workflows_and_actions() -> None: + """Download single repository + + We are enumerating the .github/workflows directory, and downloading all the workflows. + In addition if the repository contains action.yml file, it means it is a composite action, + so we download it as well. + + For each such workflow we also scan if it uses additional external actions. + If so, we download these as well. + + We are trying to cache the downloads as much as we can to reduce redundant download attempts. + """ + log.info(f"[+] Scanning single repository") + + for repo in Config.repo_name: + # Ensure it's of the "org/repo" format. + if repo.count("/") != 1: + log.error(f"[-] Repository '{repo}' is not a repository") + log.fail_exit() + continue + download_workflows_and_actions(repo) diff --git a/tests/integration/integration_consts.py b/tests/integration/integration_consts.py index 47d370e..102505d 100644 --- a/tests/integration/integration_consts.py +++ b/tests/integration/integration_consts.py @@ -65,4 +65,19 @@ }, }, }, + { + "test_name": "test_demo_index_single_repo", + "json_path": "tests/integration/structures_json/demo-1-index.json", + "description": "Tests Demo-1's graph structures combined. It has a workflow that uses the checkout action.", + "queries": { + "nodes_query": GET_NODES_BY_PATH_QUERY, + "relationships_query": GET_RELATIONSHIPS_BY_PATH_QUERY, + "to_format": { + "paths_list": [ + "RavenIntegrationTests/Demo-1/.github/workflows/demo-workflow.yml", + "actions/checkout", + ] + }, + }, + } ] diff --git a/tests/integration/structures_json/demo-1-index.json b/tests/integration/structures_json/demo-1-index.json new file mode 100644 index 0000000..564b5ff --- /dev/null +++ b/tests/integration/structures_json/demo-1-index.json @@ -0,0 +1,415 @@ +{ + "nodes": [ + { + "path": "actions/checkout", + "using": "node20", + "name": "Checkout", + "is_public": true, + "_id": "d35e7df441120da9624b8c11e36151be", + "url": "https://github.com/actions/checkout/tree/main/action.yml", + "labels": [ + "CompositeAction" + ] + }, + { + "path": "actions/checkout", + "default": "${{ github.repository }}", + "name": "repository", + "description": "Repository name with owner. For example, actions/checkout", + "_id": "775c9f7b8b404a9df37b16c9d4d8f336", + "url": "https://github.com/actions/checkout/tree/main/action.yml", + "required": false, + "labels": [ + "CompositeActionInput" + ] + }, + { + "path": "actions/checkout", + "name": "ref", + "description": "The branch, tag or SHA to checkout. When checking out the repository that triggered a workflow, this defaults to the reference or SHA for that event. Otherwise, uses the default branch.\n", + "_id": "6bf31cd9bee2b2c9a0fd9a8d3c578903", + "url": "https://github.com/actions/checkout/tree/main/action.yml", + "required": false, + "labels": [ + "CompositeActionInput" + ] + }, + { + "path": "actions/checkout", + "default": "${{ github.token }}", + "name": "token", + "description": "Personal access token (PAT) used to fetch the repository. The PAT is configured with the local git config, which enables your scripts to run authenticated git commands. The post-job step removes the PAT.\n\nWe recommend using a service account with the least permissions necessary. Also when generating a new PAT, select the least scopes necessary.\n\n[Learn more about creating and using encrypted secrets](https://help.github.com/en/actions/automating-your-workflow-with-github-actions/creating-and-using-encrypted-secrets)\n", + "_id": "5f4fa35f28767a9155a5813b2cc934d5", + "url": "https://github.com/actions/checkout/tree/main/action.yml", + "required": false, + "labels": [ + "CompositeActionInput" + ] + }, + { + "path": "actions/checkout", + "name": "ssh-key", + "description": "SSH key used to fetch the repository. The SSH key is configured with the local git config, which enables your scripts to run authenticated git commands. The post-job step removes the SSH key.\n\nWe recommend using a service account with the least permissions necessary.\n\n[Learn more about creating and using encrypted secrets](https://help.github.com/en/actions/automating-your-workflow-with-github-actions/creating-and-using-encrypted-secrets)\n", + "_id": "c53d6fe7a19a576c8bfefe6cfa80870b", + "url": "https://github.com/actions/checkout/tree/main/action.yml", + "required": false, + "labels": [ + "CompositeActionInput" + ] + }, + { + "path": "actions/checkout", + "name": "ssh-known-hosts", + "description": "Known hosts in addition to the user and global host key database. The public SSH keys for a host may be obtained using the utility `ssh-keyscan`. For example, `ssh-keyscan github.com`. The public key for github.com is always implicitly added.\n", + "_id": "145d5c9a78c4d7564f7b79ec495b6ff2", + "url": "https://github.com/actions/checkout/tree/main/action.yml", + "required": false, + "labels": [ + "CompositeActionInput" + ] + }, + { + "path": "actions/checkout", + "default": "true", + "name": "ssh-strict", + "description": "Whether to perform strict host key checking. When true, adds the options `StrictHostKeyChecking=yes` and `CheckHostIP=no` to the SSH command line. Use the input `ssh-known-hosts` to configure additional hosts.\n", + "_id": "ec56ff9d7ca004ba974a2fca1b01f2c5", + "required": false, + "url": "https://github.com/actions/checkout/tree/main/action.yml", + "labels": [ + "CompositeActionInput" + ] + }, + { + "path": "actions/checkout", + "default": "git", + "name": "ssh-user", + "description": "The user to use when connecting to the remote SSH host. By default 'git' is used.\n", + "_id": "c647109fbdf1f6f88205b9d01b7e84cb", + "required": false, + "url": "https://github.com/actions/checkout/tree/main/action.yml", + "labels": [ + "CompositeActionInput" + ] + }, + { + "path": "actions/checkout", + "default": "true", + "name": "persist-credentials", + "description": "Whether to configure the token or SSH key with the local git config", + "_id": "fe19be86972043c79c7ad0f618559ade", + "required": false, + "url": "https://github.com/actions/checkout/tree/main/action.yml", + "labels": [ + "CompositeActionInput" + ] + }, + { + "path": "actions/checkout", + "name": "path", + "description": "Relative path under $GITHUB_WORKSPACE to place the repository", + "_id": "3470907d9833db0f15032d1aadd1dbc2", + "url": "https://github.com/actions/checkout/tree/main/action.yml", + "required": false, + "labels": [ + "CompositeActionInput" + ] + }, + { + "path": "actions/checkout", + "default": "true", + "name": "clean", + "description": "Whether to execute `git clean -ffdx && git reset --hard HEAD` before fetching", + "_id": "a8d8ef152d8986d297b84bc2faf66d1e", + "url": "https://github.com/actions/checkout/tree/main/action.yml", + "required": false, + "labels": [ + "CompositeActionInput" + ] + }, + { + "path": "actions/checkout", + "name": "filter", + "description": "Partially clone against a given filter. Overrides sparse-checkout if set.\n", + "_id": "974202e17cde42b352e662c9c55df9ff", + "url": "https://github.com/actions/checkout/tree/main/action.yml", + "required": false, + "labels": [ + "CompositeActionInput" + ] + }, + { + "path": "actions/checkout", + "name": "sparse-checkout", + "description": "Do a sparse checkout on given patterns. Each pattern should be separated with new lines.\n", + "_id": "c626f387aeac44aa82b5e27f4cbaca54", + "url": "https://github.com/actions/checkout/tree/main/action.yml", + "required": false, + "labels": [ + "CompositeActionInput" + ] + }, + { + "path": "actions/checkout", + "default": "true", + "name": "sparse-checkout-cone-mode", + "description": "Specifies whether to use cone-mode when doing a sparse checkout.\n", + "_id": "90aa405b4e4278ac32892e1e1756b807", + "required": false, + "url": "https://github.com/actions/checkout/tree/main/action.yml", + "labels": [ + "CompositeActionInput" + ] + }, + { + "path": "actions/checkout", + "default": 1, + "name": "fetch-depth", + "description": "Number of commits to fetch. 0 indicates all history for all branches and tags.", + "_id": "00fb2bfc353337457e5be8ea01e27984", + "required": false, + "url": "https://github.com/actions/checkout/tree/main/action.yml", + "labels": [ + "CompositeActionInput" + ] + }, + { + "path": "actions/checkout", + "default": "false", + "name": "fetch-tags", + "description": "Whether to fetch tags, even if fetch-depth > 0.", + "_id": "a32f4c2bbd5fb33b56c7b2634b343e62", + "required": false, + "url": "https://github.com/actions/checkout/tree/main/action.yml", + "labels": [ + "CompositeActionInput" + ] + }, + { + "path": "actions/checkout", + "default": "true", + "name": "show-progress", + "description": "Whether to show progress status output when fetching.", + "_id": "c871387fa99df881290dbd906bae83a6", + "required": false, + "url": "https://github.com/actions/checkout/tree/main/action.yml", + "labels": [ + "CompositeActionInput" + ] + }, + { + "path": "actions/checkout", + "default": "false", + "name": "lfs", + "description": "Whether to download Git-LFS files", + "_id": "99f82cfda3b119fcf2b38ebc651dc0a8", + "url": "https://github.com/actions/checkout/tree/main/action.yml", + "required": false, + "labels": [ + "CompositeActionInput" + ] + }, + { + "path": "actions/checkout", + "default": "false", + "name": "submodules", + "description": "Whether to checkout submodules: `true` to checkout submodules or `recursive` to recursively checkout submodules.\n\nWhen the `ssh-key` input is not provided, SSH URLs beginning with `git@github.com:` are converted to HTTPS.\n", + "_id": "7cdf94b14af56d70417e7b6d92ff316e", + "required": false, + "url": "https://github.com/actions/checkout/tree/main/action.yml", + "labels": [ + "CompositeActionInput" + ] + }, + { + "path": "actions/checkout", + "default": "true", + "name": "set-safe-directory", + "description": "Add repository path as safe.directory for Git global config by running `git config --global --add safe.directory `", + "_id": "98aaa1fa6903184fa592300457546c9e", + "required": false, + "url": "https://github.com/actions/checkout/tree/main/action.yml", + "labels": [ + "CompositeActionInput" + ] + }, + { + "path": "actions/checkout", + "name": "github-server-url", + "description": "The base URL for the GitHub instance that you are trying to clone from, will use environment defaults to fetch from the same instance that the workflow is running from unless specified. Example URLs are https://github.com or https://my-ghes-server.example.com", + "_id": "e8f0d587842e43432e7c96ddcf2ef5e8", + "url": "https://github.com/actions/checkout/tree/main/action.yml", + "required": false, + "labels": [ + "CompositeActionInput" + ] + }, + { + "path": "RavenIntegrationTests/Demo-1/.github/workflows/demo-workflow.yml", + "name": "run", + "is_public": true, + "_id": "22137cee99760506f369934d49f719f8", + "trigger": [ + "workflow_dispatch" + ], + "url": "https://github.com/RavenIntegrationTests/Demo-1/tree/main/.github/workflows/demo-workflow.yml", + "labels": [ + "Workflow" + ] + }, + { + "path": "RavenIntegrationTests/Demo-1/.github/workflows/demo-workflow.yml", + "machine": [ + "ubuntu-latest" + ], + "name": "demo_test", + "_id": "ec3b1d0fd5ec71b9f43547f026f579fd", + "url": "https://github.com/RavenIntegrationTests/Demo-1/tree/main/.github/workflows/demo-workflow.yml", + "labels": [ + "Job" + ] + }, + { + "path": "RavenIntegrationTests/Demo-1/.github/workflows/demo-workflow.yml", + "ref": "v4", + "uses": "actions/checkout@v4", + "_id": "c1306c77e3af9cdedcedee69c59c96c1", + "url": "https://github.com/RavenIntegrationTests/Demo-1/tree/main/.github/workflows/demo-workflow.yml", + "labels": [ + "Step" + ] + }, + { + "path": "RavenIntegrationTests/Demo-1/.github/workflows/demo-workflow.yml", + "name": "PrintEnv", + "run": "printenv", + "_id": "6f76bec5bf9d4e683ef736d3ffe8b842", + "url": "https://github.com/RavenIntegrationTests/Demo-1/tree/main/.github/workflows/demo-workflow.yml", + "labels": [ + "Step" + ] + } + ], + "relationships": [ + { + "start_node": "d35e7df441120da9624b8c11e36151be", + "type": "COMPOSITE_ACTION_INPUT", + "end_node": "775c9f7b8b404a9df37b16c9d4d8f336" + }, + { + "start_node": "d35e7df441120da9624b8c11e36151be", + "type": "COMPOSITE_ACTION_INPUT", + "end_node": "6bf31cd9bee2b2c9a0fd9a8d3c578903" + }, + { + "start_node": "d35e7df441120da9624b8c11e36151be", + "type": "COMPOSITE_ACTION_INPUT", + "end_node": "5f4fa35f28767a9155a5813b2cc934d5" + }, + { + "start_node": "d35e7df441120da9624b8c11e36151be", + "type": "COMPOSITE_ACTION_INPUT", + "end_node": "c53d6fe7a19a576c8bfefe6cfa80870b" + }, + { + "start_node": "d35e7df441120da9624b8c11e36151be", + "type": "COMPOSITE_ACTION_INPUT", + "end_node": "145d5c9a78c4d7564f7b79ec495b6ff2" + }, + { + "start_node": "d35e7df441120da9624b8c11e36151be", + "type": "COMPOSITE_ACTION_INPUT", + "end_node": "ec56ff9d7ca004ba974a2fca1b01f2c5" + }, + { + "start_node": "d35e7df441120da9624b8c11e36151be", + "type": "COMPOSITE_ACTION_INPUT", + "end_node": "c647109fbdf1f6f88205b9d01b7e84cb" + }, + { + "start_node": "d35e7df441120da9624b8c11e36151be", + "type": "COMPOSITE_ACTION_INPUT", + "end_node": "fe19be86972043c79c7ad0f618559ade" + }, + { + "start_node": "d35e7df441120da9624b8c11e36151be", + "type": "COMPOSITE_ACTION_INPUT", + "end_node": "3470907d9833db0f15032d1aadd1dbc2" + }, + { + "start_node": "d35e7df441120da9624b8c11e36151be", + "type": "COMPOSITE_ACTION_INPUT", + "end_node": "a8d8ef152d8986d297b84bc2faf66d1e" + }, + { + "start_node": "d35e7df441120da9624b8c11e36151be", + "type": "COMPOSITE_ACTION_INPUT", + "end_node": "974202e17cde42b352e662c9c55df9ff" + }, + { + "start_node": "d35e7df441120da9624b8c11e36151be", + "type": "COMPOSITE_ACTION_INPUT", + "end_node": "c626f387aeac44aa82b5e27f4cbaca54" + }, + { + "start_node": "d35e7df441120da9624b8c11e36151be", + "type": "COMPOSITE_ACTION_INPUT", + "end_node": "90aa405b4e4278ac32892e1e1756b807" + }, + { + "start_node": "d35e7df441120da9624b8c11e36151be", + "type": "COMPOSITE_ACTION_INPUT", + "end_node": "00fb2bfc353337457e5be8ea01e27984" + }, + { + "start_node": "d35e7df441120da9624b8c11e36151be", + "type": "COMPOSITE_ACTION_INPUT", + "end_node": "a32f4c2bbd5fb33b56c7b2634b343e62" + }, + { + "start_node": "d35e7df441120da9624b8c11e36151be", + "type": "COMPOSITE_ACTION_INPUT", + "end_node": "c871387fa99df881290dbd906bae83a6" + }, + { + "start_node": "d35e7df441120da9624b8c11e36151be", + "type": "COMPOSITE_ACTION_INPUT", + "end_node": "99f82cfda3b119fcf2b38ebc651dc0a8" + }, + { + "start_node": "d35e7df441120da9624b8c11e36151be", + "type": "COMPOSITE_ACTION_INPUT", + "end_node": "7cdf94b14af56d70417e7b6d92ff316e" + }, + { + "start_node": "d35e7df441120da9624b8c11e36151be", + "type": "COMPOSITE_ACTION_INPUT", + "end_node": "98aaa1fa6903184fa592300457546c9e" + }, + { + "start_node": "d35e7df441120da9624b8c11e36151be", + "type": "COMPOSITE_ACTION_INPUT", + "end_node": "e8f0d587842e43432e7c96ddcf2ef5e8" + }, + { + "start_node": "c1306c77e3af9cdedcedee69c59c96c1", + "type": "ACTION", + "end_node": "d35e7df441120da9624b8c11e36151be" + }, + { + "start_node": "ec3b1d0fd5ec71b9f43547f026f579fd", + "type": "STEPS", + "end_node": "c1306c77e3af9cdedcedee69c59c96c1" + }, + { + "start_node": "ec3b1d0fd5ec71b9f43547f026f579fd", + "type": "STEPS", + "end_node": "6f76bec5bf9d4e683ef736d3ffe8b842" + }, + { + "start_node": "22137cee99760506f369934d49f719f8", + "type": "JOBS", + "end_node": "ec3b1d0fd5ec71b9f43547f026f579fd" + } + ] +} \ No newline at end of file diff --git a/tests/integration/test_single_repo_download.py b/tests/integration/test_single_repo_download.py new file mode 100644 index 0000000..ac9088a --- /dev/null +++ b/tests/integration/test_single_repo_download.py @@ -0,0 +1,28 @@ +from colorama import Fore, Style +from tests.utils import ( + get_graph_structure, + assert_graph_structures, +) +from tests.integration.integration_consts import TESTS_CONFIGS +from tests.tests_init import init_integration_single_env + + +def test_single_repo_download() -> None: + init_integration_single_env() + + test_config = next((item for item in TESTS_CONFIGS if item["test_name"] == "test_demo_index_single_repo"), None) + + print( + f"{Fore.CYAN}Running integration test: {test_config['test_name']}.{Style.RESET_ALL}" + ) + + # Get the queries from the test config + query_config = test_config["queries"] + nodes_query = query_config["nodes_query"].format(**query_config["to_format"]) + relationships_query = query_config["relationships_query"].format( + **query_config["to_format"] + ) + + # Get the graph structure from the queries and assert it + graph_structure = get_graph_structure(nodes_query, relationships_query) + assert_graph_structures(graph_structure, test_config["json_path"]) diff --git a/tests/tests_init.py b/tests/tests_init.py index b864e5e..356d543 100644 --- a/tests/tests_init.py +++ b/tests/tests_init.py @@ -1,6 +1,6 @@ from os import getenv from src.config.config import load_downloader_config, load_indexer_config -from src.downloader.download import download_account_workflows_and_actions +from src.downloader.download import download_account_workflows_and_actions, download_repo_workflows_and_actions from src.indexer.index import index_downloaded_workflows_and_actions @@ -10,12 +10,19 @@ def init_integration_env(): index_downloaded_workflows_and_actions() +def init_integration_single_env(): + load_integration_tests_config() + download_repo_workflows_and_actions() + index_downloaded_workflows_and_actions() + + def load_integration_tests_config() -> None: load_downloader_config( { "debug": False, "token": getenv("GITHUB_TOKEN"), "account_name": ["RavenIntegrationTests"], + "repo_name": ["RavenIntegrationTests/Demo-1"], "redis_host": "raven-redis-test", "redis_port": 6379, "clean_redis": True, From 4e622d6539be88dbd0f2d2bbec8e2cbbfe43a93a Mon Sep 17 00:00:00 2001 From: Pavel Tsakalidis Date: Sat, 6 Jul 2024 21:57:49 +0100 Subject: [PATCH 2/6] Added support to target specific branch, using the org/repo@branch format --- src/downloader/gh_api.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/downloader/gh_api.py b/src/downloader/gh_api.py index 6009907..d79c35f 100644 --- a/src/downloader/gh_api.py +++ b/src/downloader/gh_api.py @@ -266,8 +266,15 @@ def get_repository_workflows(repo: str) -> Dict[str, str]: headers["Authorization"] = f"Token {Config.github_token}" + repo_name = repo + params = {} + if '@' in repo: + # The repo has the format of "org/repo@branch". + repo_name, repo_branch = repo.split('@') + params['ref'] = repo_branch + file_path = ".github/workflows" - r = get(CONTENTS_URL.format(repo_path=repo, file_path=file_path), headers=headers) + r = get(CONTENTS_URL.format(repo_path=repo_name, file_path=file_path), headers=headers, params=params) if r.status_code == 404: return {} if r.status_code == 403 and int(r.headers["X-RateLimit-Remaining"]) == 0: From 5a418b2fe068b597ac9b1955f74baa4b79ccb068 Mon Sep 17 00:00:00 2001 From: Pavel Tsakalidis Date: Sat, 6 Jul 2024 22:52:54 +0100 Subject: [PATCH 3/6] Added support for exporting findings as SARIF --- src/cmdline.py | 3 +- src/config/config.py | 1 + src/queries/__init__.py | 5 +++ src/reporter/report.py | 83 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 91 insertions(+), 1 deletion(-) diff --git a/src/cmdline.py b/src/cmdline.py index 2c642eb..af609a0 100644 --- a/src/cmdline.py +++ b/src/cmdline.py @@ -32,6 +32,7 @@ QUERIES_PATH_DEFAULT, REPORT_RAW_FORMAT, REPORT_JSON_FORMAT, + REPORT_SARIF_FORMAT, SEVERITY_LEVELS, QUERY_TAGS, QUERY_IDS, @@ -247,7 +248,7 @@ def raven() -> None: "--format", "-f", default=REPORT_RAW_FORMAT, - choices=[REPORT_RAW_FORMAT, REPORT_JSON_FORMAT], + choices=[REPORT_RAW_FORMAT, REPORT_JSON_FORMAT, REPORT_SARIF_FORMAT], help="Report format (default: raw)", ) diff --git a/src/config/config.py b/src/config/config.py index 38b881f..2c285fa 100644 --- a/src/config/config.py +++ b/src/config/config.py @@ -8,6 +8,7 @@ QUERIES_PATH_DEFAULT = "library" REPORT_RAW_FORMAT = "raw" REPORT_JSON_FORMAT = "json" +REPORT_SARIF_FORMAT = "sarif" SLACK_REPORTER = "slack" NEO4J_URI_DEFAULT = "neo4j://localhost:7687" diff --git a/src/queries/__init__.py b/src/queries/__init__.py index aeb234f..ff78a32 100644 --- a/src/queries/__init__.py +++ b/src/queries/__init__.py @@ -12,6 +12,7 @@ def __init__( id: str, name: str, description: str, + full_description: str, tags: list, severity: str, query: list, @@ -19,6 +20,7 @@ def __init__( self.id = id self.name = name self.description = description + self.full_description = full_description self.tags = tags self.severity = severity self.query = query @@ -81,7 +83,9 @@ def to_raw(self) -> str: report += f"{Fore.CYAN}Severity:{Style.RESET_ALL} {self.severity}\n" wrapped_description = textwrap.fill(self.description, width=description_length) + wrapped_full_description = textwrap.fill(self.full_description, width=description_length) report += f"{Fore.CYAN}Description:{Style.RESET_ALL} {wrapped_description}\n" + report += f"{Fore.CYAN}Full Description:{Style.RESET_ALL} {wrapped_full_description}\n" report += f"{Fore.CYAN}Tags:{Style.RESET_ALL} {self.tags}\n" report += f"{Fore.CYAN}Workflow URLS:{Style.RESET_ALL}\n" @@ -98,6 +102,7 @@ def _to_dict(self) -> dict: "id": self.id, "name": self.name, "description": self.description, + "full_description": self.full_description, "tags": self.tags, "severity": self.severity, "result": self.result, diff --git a/src/reporter/report.py b/src/reporter/report.py index 2dbe09e..eeebb4c 100644 --- a/src/reporter/report.py +++ b/src/reporter/report.py @@ -2,6 +2,7 @@ Config, REPORT_RAW_FORMAT, REPORT_JSON_FORMAT, + REPORT_SARIF_FORMAT, SLACK_REPORTER, ) from src.reporter import slack_reporter @@ -27,6 +28,12 @@ def json_reporter(queries: List[Query]) -> str: return json.dumps([query.to_json() for query in queries], indent=4) +def sarif_reporter(queries: List[Query]) -> str: + # Get the JSON first as it's easier to parse, then create the sarif result. + json_report = json.loads(json_reporter(queries)) + return json.dumps(convert_json_to_sarif(json_report), indent=4) + + def get_queries() -> List[Query]: queries = [] for query_file in listdir(Config.queries_path): @@ -38,6 +45,7 @@ def get_queries() -> List[Query]: id=yml_query.get("id"), name=detection_info.get("name"), description=detection_info.get("description"), + full_description=detection_info.get("full-description"), tags=detection_info.get("tags"), severity=detection_info.get("severity"), query=yml_query.get("query"), @@ -60,6 +68,8 @@ def generate() -> None: report = raw_reporter(filtered_queries) elif Config.format == REPORT_JSON_FORMAT: report = json_reporter(filtered_queries) + elif Config.format == REPORT_SARIF_FORMAT: + report = sarif_reporter(filtered_queries) if Config.reporter == SLACK_REPORTER: if Config.slack_token and Config.channel_id: @@ -76,3 +86,76 @@ def generate() -> None: print(report) success_exit() + + +def convert_json_to_sarif(findings: dict) -> dict: + all_rules = [] + all_results = [] + + # Match severity to CVSS score. + scores = { + 'critical': 10, + 'high': 8, + 'medium': 6, + 'low': 3, + 'info': 0 + } + + for finding in findings: + # First add the rules. + rule = { + "id": finding["id"], + "name": finding["name"], + "shortDescription": { + "text": finding["description"] + }, + "fullDescription": { + "text": finding["full_description"] + }, + "properties": { + "security-severity": scores[finding["severity"]], + "tags": [ + "security" + ] + } + } + + all_rules.append(rule) + + # Now for each file mentioned in the "result" list, add a finding. + for result in finding["result"]: + item = { + "ruleId": finding["id"], + "message": { + "text": finding["description"] + }, + "locations": [ + { + "physicalLocation": { + "artifactLocation": { + "uri": result + } + } + } + ] + } + + all_results.append(item) + + return { + "version": "2.1.0", + "$schema": "http://json.schemastore.org/sarif-2.1.0.json", + "runs": [ + { + "tool": { + "driver": { + "name": "Raven Security Analyzer", + "version": "1.0.0", + "informationUri": "https://github.com/CycodeLabs/raven/", + "rules": all_rules + } + }, + "results": all_results + } + ] + } From 26f34d78b3180a853e5d6633b4defcc838a55ebc Mon Sep 17 00:00:00 2001 From: Pavel Tsakalidis Date: Sat, 6 Jul 2024 23:29:48 +0100 Subject: [PATCH 4/6] Added support to save report output using --output, and added tests for sarif format --- src/cmdline.py | 9 +++++ src/config/config.py | 3 ++ src/reporter/report.py | 8 +++-- tests/integration/test_report_sarif_export.py | 34 +++++++++++++++++++ tests/tests_init.py | 26 ++++++++++---- 5 files changed, 71 insertions(+), 9 deletions(-) create mode 100644 tests/integration/test_report_sarif_export.py diff --git a/src/cmdline.py b/src/cmdline.py index af609a0..36d8be1 100644 --- a/src/cmdline.py +++ b/src/cmdline.py @@ -33,6 +33,7 @@ REPORT_RAW_FORMAT, REPORT_JSON_FORMAT, REPORT_SARIF_FORMAT, + REPORT_OUTPUT, SEVERITY_LEVELS, QUERY_TAGS, QUERY_IDS, @@ -251,6 +252,12 @@ def raven() -> None: choices=[REPORT_RAW_FORMAT, REPORT_JSON_FORMAT, REPORT_SARIF_FORMAT], help="Report format (default: raw)", ) + report_parser.add_argument( + "--output", + "-o", + default="", + help="Location to save report output" + ) format_sub_parser = report_parser.add_subparsers( dest="report_command", @@ -289,5 +296,7 @@ def raven() -> None: elif args.command == REPORT_COMMAND: load_reporter_config(vars(args)) COMMAND_FUNCTIONS[args.command]() + # Moved this function outside of report.py, otherwise pytest was failing + log.success_exit() else: parser.print_help() diff --git a/src/config/config.py b/src/config/config.py index 2c285fa..468fba8 100644 --- a/src/config/config.py +++ b/src/config/config.py @@ -9,6 +9,7 @@ REPORT_RAW_FORMAT = "raw" REPORT_JSON_FORMAT = "json" REPORT_SARIF_FORMAT = "sarif" +REPORT_OUTPUT = "" SLACK_REPORTER = "slack" NEO4J_URI_DEFAULT = "neo4j://localhost:7687" @@ -132,6 +133,7 @@ def load_reporter_config(args): Config.reporter = args.get("report_command") Config.slack_token = args.get("slack_token") Config.channel_id = args.get("channel_id") + Config.output = args.get("output") load_redis_config(args) load_neo4j_config(args) @@ -179,6 +181,7 @@ class Config: reporter: str = None slack_token: str = None channel_id: str = None + output: str = "" # Neo4j Config neo4j_uri: str = None diff --git a/src/reporter/report.py b/src/reporter/report.py index eeebb4c..96c6dbb 100644 --- a/src/reporter/report.py +++ b/src/reporter/report.py @@ -83,9 +83,11 @@ def generate() -> None: ) else: - print(report) - - success_exit() + if len(Config.output) > 0: + with open(Config.output, "w") as output_file: + output_file.write(report) + else: + print(report) def convert_json_to_sarif(findings: dict) -> dict: diff --git a/tests/integration/test_report_sarif_export.py b/tests/integration/test_report_sarif_export.py new file mode 100644 index 0000000..c74e26f --- /dev/null +++ b/tests/integration/test_report_sarif_export.py @@ -0,0 +1,34 @@ +import os +import json +from tests.tests_init import init_integration_env +from src.reporter.report import generate as report_generate +from src.config.config import Config + + +def test_report_sarif_export(tmp_path) -> None: + init_integration_env() + + assert os.path.isfile(Config.output) is False + report_generate() + assert os.path.isfile(Config.output) is True + + with open(Config.output, "r") as file: + content = file.read() + + os.remove(Config.output) + + sarif = json.loads(content) + + driver = sarif["runs"][0]["tool"]["driver"] + rules = driver["rules"] + results = sarif["runs"][0]["results"] + + assert driver["name"] == "Raven Security Analyzer" + assert len(rules) == 1 + assert rules[0]["id"] == "RQ-11" + assert rules[0]["name"] == "Title Context Injection" + + assert len(results) == 1 + assert results[0]["ruleId"] == "RQ-11" + assert len(results[0]["locations"]) == 1 + assert results[0]["locations"][0]["physicalLocation"]["artifactLocation"]["uri"] == "https://github.com/RavenIntegrationTests/Integration-1/tree/main/.github/workflows/integration-workflow.yml" \ No newline at end of file diff --git a/tests/tests_init.py b/tests/tests_init.py index 356d543..4dab629 100644 --- a/tests/tests_init.py +++ b/tests/tests_init.py @@ -1,7 +1,10 @@ -from os import getenv -from src.config.config import load_downloader_config, load_indexer_config +from pathlib import Path +import tempfile +import os +from src.config.config import load_downloader_config, load_indexer_config, load_reporter_config from src.downloader.download import download_account_workflows_and_actions, download_repo_workflows_and_actions from src.indexer.index import index_downloaded_workflows_and_actions +from src.config.config import LAST_QUERY_ID, QUERIES_PATH_DEFAULT def init_integration_env(): @@ -20,10 +23,11 @@ def load_integration_tests_config() -> None: load_downloader_config( { "debug": False, - "token": getenv("GITHUB_TOKEN"), + "token": os.getenv("GITHUB_TOKEN"), "account_name": ["RavenIntegrationTests"], "repo_name": ["RavenIntegrationTests/Demo-1"], - "redis_host": "raven-redis-test", + # "redis_host": "raven-redis-test", + "redis_host": "localhost", "redis_port": 6379, "clean_redis": True, } @@ -32,13 +36,23 @@ def load_integration_tests_config() -> None: load_indexer_config( { "debug": False, - "redis_host": "raven-redis-test", + # "redis_host": "raven-redis-test", + "redis_host": "localhost", "redis_port": 6379, "clean_redis": True, - "neo4j_uri": "neo4j://raven-neo4j-test:7687", + # "neo4j_uri": "neo4j://raven-neo4j-test:7687", + "neo4j_uri": "neo4j://localhost:7687", "neo4j_user": "neo4j", "neo4j_pass": "123456789", "threads": 1, "clean_neo4j": True, } ) + + load_reporter_config( + { + "format": "sarif", + "queries_path": Path(__file__).parent.parent / QUERIES_PATH_DEFAULT, + "output": os.path.join(tempfile.gettempdir(), next(tempfile._get_candidate_names())) + } + ) From d8deee65b260b460ce86e51a81c8d3eb5b71834c Mon Sep 17 00:00:00 2001 From: Pavel Tsakalidis Date: Sat, 6 Jul 2024 23:58:08 +0100 Subject: [PATCH 5/6] Added support to download individual workflows using --workflow --- src/cmdline.py | 8 ++++++ src/config/config.py | 2 ++ src/downloader/download.py | 15 ++++++++-- .../test_single_workflow_download.py | 28 +++++++++++++++++++ tests/tests_init.py | 14 ++++------ 5 files changed, 57 insertions(+), 10 deletions(-) create mode 100644 tests/integration/test_single_workflow_download.py diff --git a/src/cmdline.py b/src/cmdline.py index 36d8be1..9a45974 100644 --- a/src/cmdline.py +++ b/src/cmdline.py @@ -184,6 +184,14 @@ def raven() -> None: help="Repository to download" ) + repo_download_parser.add_argument( + "--workflow", + required=False, + action="append", + type=str, + help="Workflow to download" + ) + crawl_download_parser.add_argument( "--max-stars", type=int, help="Maximum number of stars for a repository" ) diff --git a/src/config/config.py b/src/config/config.py index 468fba8..85085b5 100644 --- a/src/config/config.py +++ b/src/config/config.py @@ -75,6 +75,7 @@ def load_downloader_config(args) -> None: Config.max_stars = args.get("max_stars") Config.account_name = args.get("account_name") Config.repo_name = args.get("repo_name") + Config.workflow = args.get("workflow") Config.personal = args.get("personal") Config.clean_redis = args.get("clean_redis", REDIS_CLEAN_DEFAULT) @@ -149,6 +150,7 @@ class Config: max_stars: int = None account_name: list[str] = [] repo_name: list[str] = [] + workflow: list[str] = [] personal: bool = None # Indexer Configs diff --git a/src/downloader/download.py b/src/downloader/download.py index b95fc6d..34745d7 100644 --- a/src/downloader/download.py +++ b/src/downloader/download.py @@ -77,12 +77,14 @@ def download_all_workflows_and_actions() -> None: download_workflows_and_actions(repo) -def download_workflows_and_actions(repo: str) -> None: +def download_workflows_and_actions(repo: str, only_workflows: list = []) -> None: """The flow is the following: - First we enumerate .github/workflows directory for workflows - For each such workflow we download it - If that workflow contains uses:..., we analyze the string, and download the action or the reusable workflow. + + We can also filter specific workflows if we only want to test a specific one. """ with RedisConnection(Config.redis_objects_ops_db) as ops_db: if ops_db.exists_in_set(Config.workflow_download_history_set, repo): @@ -94,6 +96,10 @@ def download_workflows_and_actions(repo: str) -> None: log.debug(f"[+] Found {len(workflows)} workflows for {repo}") for name, url in workflows.items(): + if len(only_workflows) > 0 and name.lower() not in only_workflows: + log.debug(f"[+] Skipping {name}") + continue + if is_url_contains_a_token(url): """ If the URL contains a token, it means it is a private repository. @@ -245,10 +251,15 @@ def download_repo_workflows_and_actions() -> None: """ log.info(f"[+] Scanning single repository") + only_workflows = [] + if Config.workflow is not None and len(Config.workflow) > 0: + only_workflows = list(map(str.lower, Config.workflow)) + log.info(f"[+] Will only download the following workflows: {', '.join(only_workflows)}") + for repo in Config.repo_name: # Ensure it's of the "org/repo" format. if repo.count("/") != 1: log.error(f"[-] Repository '{repo}' is not a repository") log.fail_exit() continue - download_workflows_and_actions(repo) + download_workflows_and_actions(repo, only_workflows=only_workflows) diff --git a/tests/integration/test_single_workflow_download.py b/tests/integration/test_single_workflow_download.py new file mode 100644 index 0000000..95cb881 --- /dev/null +++ b/tests/integration/test_single_workflow_download.py @@ -0,0 +1,28 @@ +from colorama import Fore, Style +from tests.utils import ( + get_graph_structure, + assert_graph_structures, +) +from tests.integration.integration_consts import TESTS_CONFIGS +from tests.tests_init import init_integration_single_env + + +def test_single_workflow_download() -> None: + init_integration_single_env(only_workflows=["demo-workflow.yml"]) + + test_config = next((item for item in TESTS_CONFIGS if item["test_name"] == "test_demo_index_single_repo"), None) + + print( + f"{Fore.CYAN}Running integration test: {test_config['test_name']}.{Style.RESET_ALL}" + ) + + # Get the queries from the test config + query_config = test_config["queries"] + nodes_query = query_config["nodes_query"].format(**query_config["to_format"]) + relationships_query = query_config["relationships_query"].format( + **query_config["to_format"] + ) + + # Get the graph structure from the queries and assert it + graph_structure = get_graph_structure(nodes_query, relationships_query) + assert_graph_structures(graph_structure, test_config["json_path"]) diff --git a/tests/tests_init.py b/tests/tests_init.py index 4dab629..e6edd45 100644 --- a/tests/tests_init.py +++ b/tests/tests_init.py @@ -4,7 +4,7 @@ from src.config.config import load_downloader_config, load_indexer_config, load_reporter_config from src.downloader.download import download_account_workflows_and_actions, download_repo_workflows_and_actions from src.indexer.index import index_downloaded_workflows_and_actions -from src.config.config import LAST_QUERY_ID, QUERIES_PATH_DEFAULT +from src.config.config import LAST_QUERY_ID, QUERIES_PATH_DEFAULT, Config def init_integration_env(): @@ -13,8 +13,9 @@ def init_integration_env(): index_downloaded_workflows_and_actions() -def init_integration_single_env(): +def init_integration_single_env(only_workflows: list = []): load_integration_tests_config() + Config.workflow = only_workflows download_repo_workflows_and_actions() index_downloaded_workflows_and_actions() @@ -26,8 +27,7 @@ def load_integration_tests_config() -> None: "token": os.getenv("GITHUB_TOKEN"), "account_name": ["RavenIntegrationTests"], "repo_name": ["RavenIntegrationTests/Demo-1"], - # "redis_host": "raven-redis-test", - "redis_host": "localhost", + "redis_host": "raven-redis-test", "redis_port": 6379, "clean_redis": True, } @@ -36,12 +36,10 @@ def load_integration_tests_config() -> None: load_indexer_config( { "debug": False, - # "redis_host": "raven-redis-test", - "redis_host": "localhost", + "redis_host": "raven-redis-test", "redis_port": 6379, "clean_redis": True, - # "neo4j_uri": "neo4j://raven-neo4j-test:7687", - "neo4j_uri": "neo4j://localhost:7687", + "neo4j_uri": "neo4j://raven-neo4j-test:7687", "neo4j_user": "neo4j", "neo4j_pass": "123456789", "threads": 1, From 74f7e4bd7084185773a3f147a8d2902450de8f22 Mon Sep 17 00:00:00 2001 From: Pavel Tsakalidis Date: Sun, 7 Jul 2024 12:02:17 +0100 Subject: [PATCH 6/6] Added some error checking when parsing workflow files --- src/indexer/index.py | 4 ++++ src/workflow_components/workflow.py | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/indexer/index.py b/src/indexer/index.py index 4864b90..74ba0c0 100644 --- a/src/indexer/index.py +++ b/src/indexer/index.py @@ -150,6 +150,10 @@ def index_workflow_file(workflow: str) -> None: obj["url"] = url obj["is_public"] = is_public + if 'on' not in obj: + # Could be some invalid/config file like: + # https://github.com/spotify/druid/blob/master/.github/workflows/codeql-config.yml + return Config.graph.push_object(Workflow.from_dict(obj)) ops_db.insert_to_set(Config.workflow_index_history_set, workflow_full_name) diff --git a/src/workflow_components/workflow.py b/src/workflow_components/workflow.py index 4eb2b49..f6750d8 100644 --- a/src/workflow_components/workflow.py +++ b/src/workflow_components/workflow.py @@ -207,7 +207,9 @@ def from_dict(obj_dict: Dict[str, Any]) -> "Workflow": # When we meet it, we want to create a special relation to inputs of the reusable workflow. # We continue to treat the workflow as a regular workflow, and not as a reusable workflow. # But the difference is that we connected the different inputs to the workflow. - if "workflow_call" in w.trigger: + # However, a workflow_call can be empty like in: + # https://github.com/python/cpython/blob/68e279b37aae3019979a05ca55f462b11aac14be/.github/workflows/reusable-docs.yml#L4 + if "workflow_call" in w.trigger and obj_dict["on"]["workflow_call"] is not None: wokrflow_call = obj_dict["on"]["workflow_call"] inputs = wokrflow_call["inputs"] for input_name, input in inputs.items():