Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -138,3 +138,4 @@ dmypy.json

# Pyre type checker
.pyre/
.idea/
25 changes: 25 additions & 0 deletions src/cmdline.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from src.downloader.download import (
download_all_workflows_and_actions,
download_account_workflows_and_actions,
download_repo_workflows_and_actions,
)
from src.indexer.index import index_downloaded_workflows_and_actions
from src.reporter.report import generate
Expand All @@ -24,6 +25,7 @@
REDIS_CLEAN_DEFAULT,
DOWNLOAD_COMMAND,
DOWNLOAD_ACCOUNT_COMMAND,
DOWNLOAD_REPO_COMMAND,
DOWNLOAD_CRAWL_COMMAND,
INDEX_COMMAND,
REPORT_COMMAND,
Expand All @@ -39,6 +41,7 @@
DOWNLOAD_COMMAND: {
DOWNLOAD_CRAWL_COMMAND: download_all_workflows_and_actions,
DOWNLOAD_ACCOUNT_COMMAND: download_account_workflows_and_actions,
DOWNLOAD_REPO_COMMAND: download_repo_workflows_and_actions,
},
INDEX_COMMAND: index_downloaded_workflows_and_actions,
REPORT_COMMAND: generate,
Expand Down Expand Up @@ -165,6 +168,28 @@ def raven() -> None:
help="Download repositories owned by the authenticated user",
)

repo_download_parser = download_sub_parser.add_parser(
"repo",
help="Download specific repository",
parents=[download_parser_options, redis_parser],
)

repo_download_parser.add_argument(
"--repo-name",
required=True,
action="append",
type=str,
help="Repository to download"
)

repo_download_parser.add_argument(
"--workflow",
required=False,
action="append",
type=str,
help="Workflow to download"
)

crawl_download_parser.add_argument(
"--max-stars", type=int, help="Maximum number of stars for a repository"
)
Expand Down
5 changes: 5 additions & 0 deletions src/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
# CLI commands
DOWNLOAD_COMMAND = "download"
DOWNLOAD_ACCOUNT_COMMAND = "account"
DOWNLOAD_REPO_COMMAND = "repo"
DOWNLOAD_CRAWL_COMMAND = "crawl"
INDEX_COMMAND = "index"
REPORT_COMMAND = "report"
Expand Down Expand Up @@ -71,6 +72,8 @@ def load_downloader_config(args) -> None:
Config.min_stars = args.get("min_stars", MIN_STARS_DEFAULT)
Config.max_stars = args.get("max_stars")
Config.account_name = args.get("account_name")
Config.repo_name = args.get("repo_name")
Config.workflow = args.get("workflow")
Config.personal = args.get("personal")
Config.clean_redis = args.get("clean_redis", REDIS_CLEAN_DEFAULT)

Expand Down Expand Up @@ -143,6 +146,8 @@ class Config:
min_stars: int = None
max_stars: int = None
account_name: list[str] = []
repo_name: list[str] = []
workflow: list[str] = []
personal: bool = None

# Indexer Configs
Expand Down
47 changes: 41 additions & 6 deletions src/downloader/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,23 +77,29 @@ def download_all_workflows_and_actions() -> None:
download_workflows_and_actions(repo)


def download_workflows_and_actions(repo: str) -> None:
def download_workflows_and_actions(repo: str, only_workflows: list = [], branch: str = '') -> None:
"""The flow is the following:

- First we enumerate .github/workflows directory for workflows
- For each such workflow we download it
- If that workflow contains uses:..., we analyze the string, and download the action or the reusable workflow.

We can also filter specific workflows if we only want to test a specific one.
"""
with RedisConnection(Config.redis_objects_ops_db) as ops_db:
if ops_db.exists_in_set(Config.workflow_download_history_set, repo):
log.debug(f"[!] Repo {repo} already scanned, skipping.")
return

workflows = get_repository_workflows(repo)
workflows = get_repository_workflows(repo, branch=branch)
is_public = 1

log.debug(f"[+] Found {len(workflows)} workflows for {repo}")
for name, url in workflows.items():
if len(only_workflows) > 0 and name.lower() not in only_workflows:
log.debug(f"[+] Skipping {name}")
continue

if is_url_contains_a_token(url):
"""
If the URL contains a token, it means it is a private repository.
Expand All @@ -112,7 +118,7 @@ def download_workflows_and_actions(repo: str) -> None:
# We look for dependent external actions.
uses_strings = find_uses_strings(resp.text)
for uses_string in uses_strings:
download_action_or_reusable_workflow(uses_string=uses_string, repo=repo)
download_action_or_reusable_workflow(uses_string=uses_string, repo=repo, branch=branch)

# Save workflow to redis
workflow_unix_path = convert_workflow_to_unix_path(repo, name)
Expand All @@ -131,7 +137,7 @@ def download_workflows_and_actions(repo: str) -> None:
ops_db.insert_to_set(Config.workflow_download_history_set, repo)


def download_action_or_reusable_workflow(uses_string: str, repo: str) -> None:
def download_action_or_reusable_workflow(uses_string: str, repo: str, branch: str = '') -> None:
"""Whenever we find that workflow is using a "uses:" string,
it means we are referencing a composite action or reusable workflow, we try to fetch it.

Expand All @@ -156,9 +162,9 @@ def download_action_or_reusable_workflow(uses_string: str, repo: str) -> None:
return

if uses_string_obj.type == UsesStringType.REUSABLE_WORKFLOW:
url = get_repository_reusable_workflow(full_path)
url = get_repository_reusable_workflow(full_path, branch=branch, same_repo=uses_string.startswith('./'))
elif uses_string_obj.type == UsesStringType.ACTION:
url = get_repository_composite_action(full_path)
url = get_repository_composite_action(full_path, branch=branch, same_repo=uses_string.startswith('./'))
else:
# Can happen with docker references.
return
Expand Down Expand Up @@ -229,3 +235,32 @@ def download_action_or_reusable_workflow(uses_string: str, repo: str) -> None:
)
# In the future, ref will be with commit sha
add_ref_pointer_to_redis(full_path, full_path)


def download_repo_workflows_and_actions() -> None:
    """Download the workflows of every repository listed in ``Config.repo_name``.

    Each entry has the form ``owner/repo`` or ``owner/repo@branch``. When a
    branch is given it is forwarded to ``download_workflows_and_actions`` so
    the workflows are read from that ref; otherwise an empty branch string is
    passed.

    If ``Config.workflow`` is set, only the workflow files whose lower-cased
    name appears in that list are downloaded.

    An entry whose repository part is not exactly ``owner/repo`` is reported
    through ``log.error`` followed by ``log.fail_exit()``.
    """
    log.info("[+] Scanning single repository")

    # The downloader compares against the lower-cased workflow file name,
    # so the filter is normalised to lower case once, up front.
    only_workflows = [workflow.lower() for workflow in (Config.workflow or [])]
    if only_workflows:
        log.info(f"[+] Will only download the following workflows: {', '.join(only_workflows)}")

    # ``Config.repo_name`` is loaded with ``args.get`` and may therefore be None.
    for repo_spec in Config.repo_name or []:
        # Strip the optional "@branch" suffix BEFORE validating the repository
        # part: branch names may legitimately contain "/" (e.g. "feature/x").
        # ``partition`` splits on the first "@" only, so it never raises on
        # specs that contain more than one "@".
        repo, _, branch = repo_spec.partition("@")

        # Ensure the repository part is of the "org/repo" format with both
        # components present.
        owner, _, repo_short_name = repo.partition("/")
        if repo.count("/") != 1 or not owner or not repo_short_name:
            log.error(f"[-] Repository '{repo_spec}' is not a repository")
            # NOTE(review): presumably terminates the process - confirm in the logger.
            log.fail_exit()

        download_workflows_and_actions(repo, only_workflows=only_workflows, branch=branch)
15 changes: 10 additions & 5 deletions src/downloader/gh_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ def get_repository_search(query: str, page: int = 1) -> Dict[str, Any]:
return r.json()["items"]


def get_repository_workflows(repo: str) -> Dict[str, str]:
def get_repository_workflows(repo: str, branch: str = '') -> Dict[str, str]:
"""Returns list of workflows for the specified repository.
Returns a dictionary that maps each workflow file name to its downloadable URL.

Expand All @@ -265,9 +265,10 @@ def get_repository_workflows(repo: str) -> Dict[str, str]:
"""

headers["Authorization"] = f"Token {Config.github_token}"
params = {} if len(branch) == 0 else {'ref': branch}

file_path = ".github/workflows"
r = get(CONTENTS_URL.format(repo_path=repo, file_path=file_path), headers=headers)
r = get(CONTENTS_URL.format(repo_path=repo, file_path=file_path), headers=headers, params=params)
if r.status_code == 404:
return {}
if r.status_code == 403 and int(r.headers["X-RateLimit-Remaining"]) == 0:
Expand All @@ -278,7 +279,7 @@ def get_repository_workflows(repo: str) -> Dict[str, str]:
f"[*] Ratelimit for for contents API depleted. Sleeping {time_to_sleep} seconds"
)
time.sleep(time_to_sleep)
return get_repository_workflows(repo)
return get_repository_workflows(repo, branch=branch)
if r.status_code != 200:
log.error(f"status code: {r.status_code}. Response: {r.text}")
return {}
Expand All @@ -298,7 +299,7 @@ def get_repository_workflows(repo: str) -> Dict[str, str]:
return workflows


def get_repository_composite_action(path: str) -> str:
def get_repository_composite_action(path: str, branch: str = '', same_repo: bool = False) -> str:
Returns the downloadable URL for a composite action in the specified path.

Receives 'path_in_repo', a path relative to the repository root in which to search for the action.yml.
Expand All @@ -311,12 +312,14 @@ def get_repository_composite_action(path: str) -> str:
relative_path = "/".join(path_splitted[2:])

headers["Authorization"] = f"Token {Config.github_token}"
params = {'ref': branch} if len(branch) > 0 and same_repo else {}

for suffix in ["action.yml", "action.yaml"]:
file_path = os.path.join(relative_path, suffix)
r = get(
CONTENTS_URL.format(repo_path=repo, file_path=file_path),
headers=headers,
params=params
)
if r.status_code == 404:
# can be both yml and yaml
Expand All @@ -329,7 +332,7 @@ def get_repository_composite_action(path: str) -> str:
return r.json()["download_url"]


def get_repository_reusable_workflow(path: str) -> str:
def get_repository_reusable_workflow(path: str, branch: str = '', same_repo: bool = False) -> str:
Returns the downloadable URL for a reusable workflow in the specified path.

Raises an exception if a network error occurred.
Expand All @@ -339,10 +342,12 @@ def get_repository_reusable_workflow(path: str) -> str:
relative_path = "/".join(path_splitted[2:])

headers["Authorization"] = f"Token {Config.github_token}"
params = {'ref': branch} if len(branch) > 0 and same_repo else {}

r = get(
CONTENTS_URL.format(repo_path=repo, file_path=relative_path),
headers=headers,
params=params
)
if r.status_code == 404:
return
Expand Down
5 changes: 5 additions & 0 deletions src/indexer/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,11 @@ def index_workflow_file(workflow: str) -> None:
obj["url"] = url
obj["is_public"] = is_public

if 'on' not in obj:
# Could be some invalid/config file like:
# https://github.com/spotify/druid/blob/master/.github/workflows/codeql-config.yml
return

Config.graph.push_object(Workflow.from_dict(obj))
ops_db.insert_to_set(Config.workflow_index_history_set, workflow_full_name)

Expand Down
4 changes: 3 additions & 1 deletion src/workflow_components/workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,9 @@ def from_dict(obj_dict: Dict[str, Any]) -> "Workflow":
# When we meet it, we want to create a special relation to inputs of the reusable workflow.
# We continue to treat the workflow as a regular workflow, and not as a reusable workflow.
# But the difference is that we connected the different inputs to the workflow.
if "workflow_call" in w.trigger:
# However, a workflow_call can be empty like in:
# https://github.com/python/cpython/blob/68e279b37aae3019979a05ca55f462b11aac14be/.github/workflows/reusable-docs.yml#L4
if "workflow_call" in w.trigger and obj_dict["on"]["workflow_call"] is not None:
wokrflow_call = obj_dict["on"]["workflow_call"]
inputs = wokrflow_call["inputs"]
for input_name, input in inputs.items():
Expand Down
15 changes: 15 additions & 0 deletions tests/integration/integration_consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,4 +65,19 @@
},
},
},
{
"test_name": "test_demo_index_single_repo",
"json_path": "tests/integration/structures_json/demo-1-index.json",
"description": "Tests Demo-1's graph structures combined. It has a workflow that uses the checkout action.",
"queries": {
"nodes_query": GET_NODES_BY_PATH_QUERY,
"relationships_query": GET_RELATIONSHIPS_BY_PATH_QUERY,
"to_format": {
"paths_list": [
"RavenIntegrationTests/Demo-1/.github/workflows/demo-workflow.yml",
"actions/checkout",
]
},
},
}
]
Loading