37 changes: 36 additions & 1 deletion src/cmdline.py
@@ -4,6 +4,7 @@
from src.downloader.download import (
download_all_workflows_and_actions,
download_account_workflows_and_actions,
download_repo_workflows_and_actions,
)
from src.indexer.index import index_downloaded_workflows_and_actions
from src.reporter.report import generate
@@ -24,12 +25,15 @@
REDIS_CLEAN_DEFAULT,
DOWNLOAD_COMMAND,
DOWNLOAD_ACCOUNT_COMMAND,
DOWNLOAD_REPO_COMMAND,
DOWNLOAD_CRAWL_COMMAND,
INDEX_COMMAND,
REPORT_COMMAND,
QUERIES_PATH_DEFAULT,
REPORT_RAW_FORMAT,
REPORT_JSON_FORMAT,
REPORT_SARIF_FORMAT,
REPORT_OUTPUT,
SEVERITY_LEVELS,
QUERY_TAGS,
QUERY_IDS,
@@ -39,6 +43,7 @@
DOWNLOAD_COMMAND: {
DOWNLOAD_CRAWL_COMMAND: download_all_workflows_and_actions,
DOWNLOAD_ACCOUNT_COMMAND: download_account_workflows_and_actions,
DOWNLOAD_REPO_COMMAND: download_repo_workflows_and_actions,
},
INDEX_COMMAND: index_downloaded_workflows_and_actions,
REPORT_COMMAND: generate,
@@ -165,6 +170,28 @@ def raven() -> None:
help="Download repositories owned by the authenticated user",
)

repo_download_parser = download_sub_parser.add_parser(
"repo",
help="Download specific repository",
parents=[download_parser_options, redis_parser],
)

repo_download_parser.add_argument(
"--repo-name",
required=True,
action="append",
type=str,
help="Repository to download"
)

repo_download_parser.add_argument(
"--workflow",
required=False,
action="append",
type=str,
help="Workflow to download"
)

crawl_download_parser.add_argument(
"--max-stars", type=int, help="Maximum number of stars for a repository"
)
@@ -230,9 +257,15 @@ def raven() -> None:
"--format",
"-f",
default=REPORT_RAW_FORMAT,
-        choices=[REPORT_RAW_FORMAT, REPORT_JSON_FORMAT],
choices=[REPORT_RAW_FORMAT, REPORT_JSON_FORMAT, REPORT_SARIF_FORMAT],
help="Report format (default: raw)",
)
report_parser.add_argument(
"--output",
"-o",
default="",
help="Location to save report output"
)

format_sub_parser = report_parser.add_subparsers(
dest="report_command",
@@ -271,5 +304,7 @@ def raven() -> None:
elif args.command == REPORT_COMMAND:
load_reporter_config(vars(args))
COMMAND_FUNCTIONS[args.command]()
        # success_exit() was moved here from report.py; calling it there made pytest fail.
log.success_exit()
else:
parser.print_help()
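
Taken together, the new subcommand and report flags can be exercised along these lines (a hypothetical invocation: the repository and workflow names are illustrative, and any token/Redis flags from the shared download options are omitted):

    raven download repo --repo-name CycodeLabs/raven --repo-name octo-org/octo-repo@dev --workflow benchmark.yml
    raven report --format sarif --output raven.sarif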
9 changes: 9 additions & 0 deletions src/config/config.py
@@ -8,6 +8,8 @@
QUERIES_PATH_DEFAULT = "library"
REPORT_RAW_FORMAT = "raw"
REPORT_JSON_FORMAT = "json"
REPORT_SARIF_FORMAT = "sarif"
REPORT_OUTPUT = ""
SLACK_REPORTER = "slack"

NEO4J_URI_DEFAULT = "neo4j://localhost:7687"
@@ -38,6 +40,7 @@
# CLI commands
DOWNLOAD_COMMAND = "download"
DOWNLOAD_ACCOUNT_COMMAND = "account"
DOWNLOAD_REPO_COMMAND = "repo"
DOWNLOAD_CRAWL_COMMAND = "crawl"
INDEX_COMMAND = "index"
REPORT_COMMAND = "report"
@@ -71,6 +74,8 @@ def load_downloader_config(args) -> None:
Config.min_stars = args.get("min_stars", MIN_STARS_DEFAULT)
Config.max_stars = args.get("max_stars")
Config.account_name = args.get("account_name")
Config.repo_name = args.get("repo_name")
Config.workflow = args.get("workflow")
Config.personal = args.get("personal")
Config.clean_redis = args.get("clean_redis", REDIS_CLEAN_DEFAULT)

@@ -129,6 +134,7 @@ def load_reporter_config(args):
Config.reporter = args.get("report_command")
Config.slack_token = args.get("slack_token")
Config.channel_id = args.get("channel_id")
Config.output = args.get("output")

load_redis_config(args)
load_neo4j_config(args)
@@ -143,6 +149,8 @@ class Config:
min_stars: int = None
max_stars: int = None
account_name: list[str] = []
repo_name: list[str] = []
workflow: list[str] = []
personal: bool = None

# Indexer Configs
@@ -175,6 +183,7 @@ class Config:
reporter: str = None
slack_token: str = None
channel_id: str = None
output: str = ""

# Neo4j Config
neo4j_uri: str = None
36 changes: 35 additions & 1 deletion src/downloader/download.py
@@ -77,12 +77,14 @@ def download_all_workflows_and_actions() -> None:
download_workflows_and_actions(repo)


-def download_workflows_and_actions(repo: str) -> None:
def download_workflows_and_actions(repo: str, only_workflows: list = None) -> None:
"""The flow is the following:

- First we enumerate .github/workflows directory for workflows
- For each such workflow we download it
- If that workflow contains uses:..., we analyze the string, and download the action or the reusable workflow.

    Optionally, only_workflows restricts the download to the given workflow names.
"""
with RedisConnection(Config.redis_objects_ops_db) as ops_db:
if ops_db.exists_in_set(Config.workflow_download_history_set, repo):
@@ -94,6 +96,10 @@ def download_workflows_and_actions(repo: str) -> None:

log.debug(f"[+] Found {len(workflows)} workflows for {repo}")
for name, url in workflows.items():
            if only_workflows and name.lower() not in only_workflows:
log.debug(f"[+] Skipping {name}")
continue

if is_url_contains_a_token(url):
"""
If the URL contains a token, it means it is a private repository.
@@ -229,3 +235,31 @@ def download_action_or_reusable_workflow(uses_string: str, repo: str) -> None:
)
# In the future, ref will be with commit sha
add_ref_pointer_to_redis(full_path, full_path)


def download_repo_workflows_and_actions() -> None:
"""Download single repository

We are enumerating the .github/workflows directory, and downloading all the workflows.
In addition if the repository contains action.yml file, it means it is a composite action,
so we download it as well.

For each such workflow we also scan if it uses additional external actions.
If so, we download these as well.

We are trying to cache the downloads as much as we can to reduce redundant download attempts.
"""
log.info(f"[+] Scanning single repository")

only_workflows = []
if Config.workflow is not None and len(Config.workflow) > 0:
only_workflows = list(map(str.lower, Config.workflow))
log.info(f"[+] Will only download the following workflows: {', '.join(only_workflows)}")

for repo in Config.repo_name:
        # Ensure it's of the "org/repo" (optionally "org/repo@branch") format.
        if repo.count("/") != 1:
            log.error(f"[-] Repository '{repo}' is not in the expected 'org/repo' format")
            log.fail_exit()
download_workflows_and_actions(repo, only_workflows=only_workflows)
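
For instance (hypothetical values), a run with Config.repo_name = ["CycodeLabs/raven"] and Config.workflow = ["Benchmark.yml"] lowercases the filter and effectively calls:

    download_workflows_and_actions("CycodeLabs/raven", only_workflows=["benchmark.yml"])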
9 changes: 8 additions & 1 deletion src/downloader/gh_api.py
@@ -266,8 +266,15 @@ def get_repository_workflows(repo: str) -> Dict[str, str]:

headers["Authorization"] = f"Token {Config.github_token}"

repo_name = repo
params = {}
if '@' in repo:
# The repo has the format of "org/repo@branch".
repo_name, repo_branch = repo.split('@')
params['ref'] = repo_branch

file_path = ".github/workflows"
-    r = get(CONTENTS_URL.format(repo_path=repo, file_path=file_path), headers=headers)
r = get(CONTENTS_URL.format(repo_path=repo_name, file_path=file_path), headers=headers, params=params)
if r.status_code == 404:
return {}
if r.status_code == 403 and int(r.headers["X-RateLimit-Remaining"]) == 0:
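
A minimal sketch of the branch-pinning path added above (values are hypothetical; ref is the standard parameter of GitHub's contents API):

    repo = "CycodeLabs/raven@dev"
    repo_name, repo_branch = repo.split('@')   # -> "CycodeLabs/raven", "dev"
    params = {'ref': repo_branch}              # workflows are then listed from the "dev" branch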
4 changes: 4 additions & 0 deletions src/indexer/index.py
@@ -150,6 +150,10 @@ def index_workflow_file(workflow: str) -> None:
obj["url"] = url
obj["is_public"] = is_public

if 'on' not in obj:
# Could be some invalid/config file like:
# https://github.com/spotify/druid/blob/master/.github/workflows/codeql-config.yml
return
Config.graph.push_object(Workflow.from_dict(obj))
ops_db.insert_to_set(Config.workflow_index_history_set, workflow_full_name)

5 changes: 5 additions & 0 deletions src/queries/__init__.py
@@ -12,13 +12,15 @@ def __init__(
id: str,
name: str,
description: str,
full_description: str,
tags: list,
severity: str,
query: list,
) -> None:
self.id = id
self.name = name
self.description = description
self.full_description = full_description
self.tags = tags
self.severity = severity
self.query = query
@@ -81,7 +83,9 @@ def to_raw(self) -> str:
report += f"{Fore.CYAN}Severity:{Style.RESET_ALL} {self.severity}\n"

wrapped_description = textwrap.fill(self.description, width=description_length)
wrapped_full_description = textwrap.fill(self.full_description, width=description_length)
report += f"{Fore.CYAN}Description:{Style.RESET_ALL} {wrapped_description}\n"
report += f"{Fore.CYAN}Full Description:{Style.RESET_ALL} {wrapped_full_description}\n"
report += f"{Fore.CYAN}Tags:{Style.RESET_ALL} {self.tags}\n"

report += f"{Fore.CYAN}Workflow URLS:{Style.RESET_ALL}\n"
@@ -98,6 +102,7 @@ def _to_dict(self) -> dict:
"id": self.id,
"name": self.name,
"description": self.description,
"full_description": self.full_description,
"tags": self.tags,
"severity": self.severity,
"result": self.result,
91 changes: 88 additions & 3 deletions src/reporter/report.py
@@ -2,6 +2,7 @@
Config,
REPORT_RAW_FORMAT,
REPORT_JSON_FORMAT,
REPORT_SARIF_FORMAT,
SLACK_REPORTER,
)
from src.reporter import slack_reporter
@@ -27,6 +28,12 @@ def json_reporter(queries: List[Query]) -> str:
return json.dumps([query.to_json() for query in queries], indent=4)


def sarif_reporter(queries: List[Query]) -> str:
    # Get the JSON first as it's easier to work with, then build the SARIF result.
json_report = json.loads(json_reporter(queries))
return json.dumps(convert_json_to_sarif(json_report), indent=4)


def get_queries() -> List[Query]:
queries = []
for query_file in listdir(Config.queries_path):
@@ -38,6 +45,7 @@ def get_queries() -> List[Query]:
id=yml_query.get("id"),
name=detection_info.get("name"),
description=detection_info.get("description"),
full_description=detection_info.get("full-description"),
tags=detection_info.get("tags"),
severity=detection_info.get("severity"),
query=yml_query.get("query"),
@@ -60,6 +68,8 @@ def generate() -> None:
report = raw_reporter(filtered_queries)
elif Config.format == REPORT_JSON_FORMAT:
report = json_reporter(filtered_queries)
elif Config.format == REPORT_SARIF_FORMAT:
report = sarif_reporter(filtered_queries)

if Config.reporter == SLACK_REPORTER:
if Config.slack_token and Config.channel_id:
@@ -73,6 +83,81 @@
)

else:
-        print(report)

-    success_exit()
if len(Config.output) > 0:
with open(Config.output, "w") as output_file:
output_file.write(report)
else:
print(report)


def convert_json_to_sarif(findings: list) -> dict:
all_rules = []
all_results = []

# Match severity to CVSS score.
scores = {
'critical': 10,
'high': 8,
'medium': 6,
'low': 3,
'info': 0
}
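    # (For reference, GitHub's code-scanning docs bucket security-severity as:
    # critical >= 9.0, high 7.0-8.9, medium 4.0-6.9, low 0.1-3.9.)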

for finding in findings:
# First add the rules.
rule = {
"id": finding["id"],
"name": finding["name"],
"shortDescription": {
"text": finding["description"]
},
"fullDescription": {
"text": finding["full_description"]
},
"properties": {
"security-severity": scores[finding["severity"]],
"tags": [
"security"
]
}
}

all_rules.append(rule)

# Now for each file mentioned in the "result" list, add a finding.
for result in finding["result"]:
item = {
"ruleId": finding["id"],
"message": {
"text": finding["description"]
},
"locations": [
{
"physicalLocation": {
"artifactLocation": {
"uri": result
}
}
}
]
}

all_results.append(item)

return {
"version": "2.1.0",
"$schema": "http://json.schemastore.org/sarif-2.1.0.json",
"runs": [
{
"tool": {
"driver": {
"name": "Raven Security Analyzer",
"version": "1.0.0",
"informationUri": "https://github.com/CycodeLabs/raven/",
"rules": all_rules
}
},
"results": all_results
}
]
}
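
For reference, each entry in a finding's "result" list becomes one SARIF result object; with hypothetical values, a single converted result looks roughly like:

    {
        "ruleId": "injection-1",
        "message": { "text": "Command injection via untrusted workflow inputs" },
        "locations": [
            { "physicalLocation": { "artifactLocation": {
                "uri": "https://github.com/octo-org/octo-repo/blob/main/.github/workflows/build.yml" } } }
        ]
    }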
4 changes: 3 additions & 1 deletion src/workflow_components/workflow.py
@@ -207,7 +207,9 @@ def from_dict(obj_dict: Dict[str, Any]) -> "Workflow":
# When we meet it, we want to create a special relation to inputs of the reusable workflow.
# We continue to treat the workflow as a regular workflow, and not as a reusable workflow.
        # But the difference is that we connect the different inputs to the workflow.
if "workflow_call" in w.trigger:
# However, a workflow_call can be empty like in:
# https://github.com/python/cpython/blob/68e279b37aae3019979a05ca55f462b11aac14be/.github/workflows/reusable-docs.yml#L4
if "workflow_call" in w.trigger and obj_dict["on"]["workflow_call"] is not None:
wokrflow_call = obj_dict["on"]["workflow_call"]
inputs = wokrflow_call["inputs"]
for input_name, input in inputs.items():
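
The cpython workflow referenced in the comment above declares a bare trigger, roughly:

    on:
      workflow_call:

YAML loads the empty workflow_call value as null, so without the added is-not-None check the inputs lookup below it would raise a TypeError.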