From cfdb5c5669789fbb60bc0906bc6f690e13d6278d Mon Sep 17 00:00:00 2001
From: Jonathan Brockett <79410516+Jbrocket@users.noreply.github.com>
Date: Wed, 11 Dec 2024 11:45:18 -0800
Subject: [PATCH 1/4] Users/jbrockett/auto integrate (#5)

* added automatic PR for integration of awesome azd
* changing the script for specific types of generation
* adding output option
* more changes to generate_fields
* git ignore updates to ignore pycache
* modified scripts and generated fields for exec docs
* removing stray file
* removing pycache
* workflow dispatch
* changes to prompts
* only decent keyfeatures
* added exec docs workloads
* delete unsupported workloads
* Updated title (#162)
* Users/jbrockett/auto integrate (#2)
* added automatic PR for integration of awesome azd
* some more testing
* testing with new PAT
* Successful PR
* Auto Integrate Workloads.json
* dev

---------

Co-authored-by: Abhishek Bhombore <65986598+abhishekbhombore@users.noreply.github.com>
---
 .../workflows/Autointegrate_awesomeAzd.yml    | 65 ++++++++++++++
 .gitignore                                    |  1 +
 scripts/add_awesome_azd/add_azd.py            | 62 -------------
 scripts/add_workloads/add_azd.py              | 80 +++++++++++++++++
 scripts/add_workloads/add_exec_docs.py        | 86 +++++++++++++++++++
 scripts/add_workloads/delete_workloads.py     | 28 ++++++
 scripts/generate_fields/generate_fields.py    | 71 +++++++++++++++
 scripts/generate_fields/requirements.txt      |  3 +
 .../{ => generate_fields}/utils/__init__.py   |  0
 .../generate_fields/utils/external_data.py    | 42 +++++++++
 scripts/generate_fields/utils/fields.py       | 17 ++++
 .../utils/file_functions.py                   |  0
 .../utils/get_responses.py                    | 26 +++---
 scripts/{ => generate_fields}/utils/prompt.py | 38 +++++++-
 scripts/generate_workloads/requirements.txt   |  1 -
 scripts/generate_workloads/workloads.py       | 30 -------
 scripts/utils/external_data.py                | 12 ---
 17 files changed, 446 insertions(+), 116 deletions(-)
 create mode 100644 .github/workflows/Autointegrate_awesomeAzd.yml
 delete mode 100644 scripts/add_awesome_azd/add_azd.py
 create mode 100644 scripts/add_workloads/add_azd.py
 create mode 100644 scripts/add_workloads/add_exec_docs.py
 create mode 100644 scripts/add_workloads/delete_workloads.py
 create mode 100644 scripts/generate_fields/generate_fields.py
 create mode 100644 scripts/generate_fields/requirements.txt
 rename scripts/{ => generate_fields}/utils/__init__.py (100%)
 create mode 100644 scripts/generate_fields/utils/external_data.py
 create mode 100644 scripts/generate_fields/utils/fields.py
 rename scripts/{ => generate_fields}/utils/file_functions.py (100%)
 rename scripts/{ => generate_fields}/utils/get_responses.py (74%)
 rename scripts/{ => generate_fields}/utils/prompt.py (71%)
 delete mode 100644 scripts/generate_workloads/requirements.txt
 delete mode 100644 scripts/generate_workloads/workloads.py
 delete mode 100644 scripts/utils/external_data.py

diff --git a/.github/workflows/Autointegrate_awesomeAzd.yml b/.github/workflows/Autointegrate_awesomeAzd.yml
new file mode 100644
index 00000000..43a0cae0
--- /dev/null
+++ b/.github/workflows/Autointegrate_awesomeAzd.yml
@@ -0,0 +1,65 @@
+name: Awesome-azd and Exec Docs Sync
+permissions:
+  actions: write
+  contents: write
+on:
+  workflow_dispatch:
+
+jobs:
+  Workloads-PR:
+    runs-on: ubuntu-latest
+    environment: "AzD Integration"
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+        with:
+          ref: dev
+
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.x'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r ${{ 
github.workspace }}/scripts/generate_fields/requirements.txt + + - name: Fetch workloads from azd and exec docs scenarios + run: | + curl -o ${{ github.workspace }}/templates.json https://raw.githubusercontent.com/Azure/awesome-azd/main/website/static/templates.json + curl -o ${{ github.workspace }}/exec_metadata.json https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/metadata.json + + - name: Updating Workloads + run: | + echo "Running script" + python ${{ github.workspace }}/scripts/add_workloads/add_azd.py --root ${{ github.workspace }} --input_file ${{ github.workspace }}/templates.json + python ${{ github.workspace }}/scripts/add_workloads/add_exec_docs.py --root ${{ github.workspace }} --input_file ${{ github.workspace }}/exec_metadata.json + rm ${{ github.workspace }}/templates.json + rm ${{ github.workspace }}/exec_metadata.json + + - name: Generating Fields + env: + AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }} + AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }} + run: | + echo "Generating New Fields" + python ${{ github.workspace }}/scripts/generate_fields/generate_fields.py --root ${{ github.workspace }} + + - name: Configure Git + run: | + git config --global user.name 'github-actions[bot]' + git config --global user.email 'github-actions[bot]@users.noreply.github.com' + + - name: Raise PR + uses: peter-evans/create-pull-request@v4 + with: + token: ${{ secrets.GIT_PAT }} + branch: "auto-pr-branch-${{ github.run_number }}" + commit-message: "Triggered update of workloads.json" + title: "Triggered PR: workloads.json update by ${{ github.actor}}" + body: "Triggered update of workloads.json by ${{ github.actor}}" + author: "github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>" + labels: "automated-pr" + delete-branch: true \ No newline at end of file diff --git a/.gitignore b/.gitignore index 3492bbde..d70b4cf1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ .DS_Store helper_functions.sh +**/__pycache__/** \ No newline at end of file diff --git a/scripts/add_awesome_azd/add_azd.py b/scripts/add_awesome_azd/add_azd.py deleted file mode 100644 index 33c950d4..00000000 --- a/scripts/add_awesome_azd/add_azd.py +++ /dev/null @@ -1,62 +0,0 @@ -import json -import uuid - -def main(): - with open("./workloads/azd_templates.json", "r") as f: - data = json.load(f) - - new_workloads = [] - - ## Get azd workloads - for workload in data: - if "msft" in workload["tags"]: - new_workloads.append(workload) - - ## Add correct fields - with open("./workloads/workloads.json", "r") as f: - workloads = json.load(f) - - correct_keys = [] - unique_azd = {} - for key in workloads[0].keys(): - correct_keys.append(key) - - for workload in workloads: - unique_azd[workload["source"]] = workload - - ## Add correct keys for new_workloads - for workload in new_workloads: - if workload["source"] in unique_azd: - keys = unique_azd[workload["source"]].keys() - for key in keys: - if key in workload: - unique_azd[workload["source"]][key] = workload[key] - else: - new_workload = {} - for key in correct_keys: - if key in workload: - new_workload[key] = workload[key] - else: - match key: - case "tags": - new_workload[key] = [] - case "products": - new_workload[key] = [] - case "sampleQueries": - new_workload[key] = [] - case "deploymentOptions": - new_workload[key] = ["AzD"] - case "sourceType": - new_workload[key] = "Azd" - case "deploymentConfig": - new_workload[key] = {} - case "id": - new_workload[key] = str(uuid.uuid4()) - 
workloads.append(new_workload) - - ## Write to file - with open("./workloads/new_workloads.json", "w") as f: - json.dump(workloads, f, indent=4) - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/scripts/add_workloads/add_azd.py b/scripts/add_workloads/add_azd.py new file mode 100644 index 00000000..7e6d6d19 --- /dev/null +++ b/scripts/add_workloads/add_azd.py @@ -0,0 +1,80 @@ +import json, uuid, argparse, logging + +logging.basicConfig(level=logging.INFO, format='[%(asctime)s] - %(levelname)s - %(message)s') + +def main(root: str, input_file: str, check_quality: bool = False): + workloads_file = f"{root}/workloads/workloads.json" + + with open(input_file, "r") as f: + data = json.load(f) + + new_workloads = [] + + ## Get azd workloads + for workload in data: + if "msft" in workload["tags"]: + new_workloads.append(workload) + elif check_quality: + if workload["quality"]: + new_workloads.append(workload) + + ## Add correct fields + with open(workloads_file, "r") as f: + workloads = json.load(f) + + correct_keys = workloads[0].keys() + unique_azd = {} + for workload in workloads: + unique_azd[workload["source"]] = workload + + ## Add correct keys for new_workloads + for azd_workload in new_workloads: + logging.info(f"Processing workload: {workload['title']}") + if azd_workload["source"] in unique_azd: + for key in azd_workload.keys(): + if key in correct_keys: + unique_azd[azd_workload["source"]][key] = azd_workload[key] + else: + logging.info(f"Adding new workload: {workload['title']}") + new_workload = {} + for key in correct_keys: + if key in azd_workload: + new_workload[key] = azd_workload[key] + else: + match key: + case "tags": + new_workload[key] = [] + case "products": + new_workload[key] = [] + case "sampleQueries": + new_workload[key] = [] + case "deploymentOptions": + new_workload[key] = ["AzD"] + case "sourceType": + new_workload[key] = "Azd" + case "deploymentConfig": + new_workload[key] = {} + case "id": + new_workload[key] = str(uuid.uuid4()) + case "tech": + new_workload[key] = [] + case "keyFeatures": + new_workload[key] = [] + case _: + new_workload[key] = [] + workloads.append(new_workload) + + ## Write to file + with open(workloads_file, "w") as f: + json.dump(workloads, f, indent=4) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Process workloads and add necessary fields.", + usage="%(prog)s --root ROOT --input_file INPUT_FILE [--check-quality]") + parser.add_argument("-r", "--root", help="Path to the root directory.", required=True) + parser.add_argument("-i", "--input_file", help="Path to the input JSON file.", required=True) + parser.add_argument("--check-quality", action="store_true", help="Whether to check the quality field in the workloads.") + + args = parser.parse_args() + + main(args.root, args.input_file, args.check_quality) \ No newline at end of file diff --git a/scripts/add_workloads/add_exec_docs.py b/scripts/add_workloads/add_exec_docs.py new file mode 100644 index 00000000..ed96d09e --- /dev/null +++ b/scripts/add_workloads/add_exec_docs.py @@ -0,0 +1,86 @@ +import json, uuid, argparse, logging + +logging.basicConfig(level=logging.INFO, format='[%(asctime)s] - %(levelname)s - %(message)s') + +def main(root: str, input_file: str, check_quality: bool = False): + workloads_file = f"{root}/workloads/workloads.json" + + with open(input_file, "r") as f: + data = json.load(f) + + new_workloads = [] + + ## Get azd workloads + for workload in data: + if workload["status"] == "active": + 
new_workloads.append(workload) + elif check_quality: + if workload.get("quality", None): + new_workloads.append(workload) + + ## Add correct fields + with open(workloads_file, "r") as f: + workloads = json.load(f) + + correct_keys = workloads[0].keys() + unique_exec = {} + for workload in workloads: + unique_exec[workload["source"]] = workload + + ## Add correct keys for new_workloads + for exec_workload in new_workloads: + logging.info(f"Processing workload: {workload['title']}") + if exec_workload["sourceUrl"] in unique_exec: + for key in exec_workload.keys(): + if key in correct_keys: + unique_exec[exec_workload["sourceUrl"]][key] = exec_workload[key] + else: + logging.info(f"Adding new workload: {workload['title']}") + new_workload = {} + for key in correct_keys: + if key in exec_workload: + new_workload[key] = exec_workload[key] + else: + match key: + case "source": + new_workload[key] = exec_workload["sourceUrl"] + case "tags": + new_workload[key] = [] + case "products": + new_workload[key] = [] + case "sampleQueries": + new_workload[key] = [] + case "deploymentOptions": + new_workload[key] = ["azcli"] + case "sourceType": + new_workload[key] = "ExecDocs" + case "deploymentConfig": + new_workload[key] = { + "execDocs": { + "path": exec_workload["key"].replace("/", "%2F") + } + } + case "id": + new_workload[key] = str(uuid.uuid4()) + case "tech": + new_workload[key] = [] + case "keyFeatures": + new_workload[key] = [] + case _: + new_workload[key] = [] + workloads.append(new_workload) + + ## Write to file + with open(workloads_file, "w") as f: + json.dump(workloads, f, indent=4) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Process workloads and add necessary fields.", + usage="%(prog)s --root ROOT --input_file INPUT_FILE [--check-quality]") + parser.add_argument("-r", "--root", help="Path to the root directory.", required=True) + parser.add_argument("-i", "--input_file", help="Path to the input JSON file.", required=True) + parser.add_argument("--check-quality", action="store_true", help="Whether to check the quality field in the workloads.") + + args = parser.parse_args() + + main(args.root, args.input_file, args.check_quality) \ No newline at end of file diff --git a/scripts/add_workloads/delete_workloads.py b/scripts/add_workloads/delete_workloads.py new file mode 100644 index 00000000..b991f4ff --- /dev/null +++ b/scripts/add_workloads/delete_workloads.py @@ -0,0 +1,28 @@ +import argparse, json, requests, logging + +logging.basicConfig(level=logging.INFO, format='[%(asctime)s] - %(levelname)s - %(message)s') + +def main(root="."): + workloads = json.load(open(f"{root}/workloads/workloads.json", "r")) + keep_workloads = [] + + for workload in workloads: + res = requests.get(workload["source"]) + + print(res.status_code, workload["source"]) + if res.status_code == 200: + logging.info(f"Keeping workload {workload['title']}") + keep_workloads.append(workload) + else: + logging.info(f"Removing workload {workload['title']}") + + json.dump(keep_workloads, open(f"{root}/workloads/workloads.json", "w"), indent=4) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Delete unsupported workloads in workloads.json. 
Does so by looking at the source url.", + usage="%(prog)s --root ROOT") + parser.add_argument("-r", "--root", help="Path to the root directory.", required=True) + + args = parser.parse_args() + + main(args.root) \ No newline at end of file diff --git a/scripts/generate_fields/generate_fields.py b/scripts/generate_fields/generate_fields.py new file mode 100644 index 00000000..b8b43e6e --- /dev/null +++ b/scripts/generate_fields/generate_fields.py @@ -0,0 +1,71 @@ +import os +import argparse +from openai import AzureOpenAI + +from utils.file_functions import read_file, write_file +from utils.get_responses import get_responses +from utils.fields import get_fields + +FILE_PATH = "workloads/workloads.json" +DEPLOYMENT_MODEL = "gpt-4o-mini" +FAILED_WORKLOADS = "failed_workloads.json" +SUCCESSFUL_WORKLOADS = "workloads/workloads.json" + + +def main(all_workloads: bool = False, fields: str = "", root: str = ".", output: str = ""): + """ + Process workloads with Azure OpenAI. + + Arguments: + all_workloads -- Boolean indicating whether to process all workloads. + fields -- String of fields to turn on, represented by letters. + + Usage examples: + 1. To process all workloads: + python script_name.py --all-workloads + + 2. To process specific fields: + python script_name.py --fields abc + """ + + fields_that_need_responses = get_fields(fields) + + if not root: root = "." + workloads = read_file(f"{root}/{FILE_PATH}") + client = AzureOpenAI( + api_key = os.getenv("AZURE_OPENAI_API_KEY"), + api_version = "2024-02-01", + azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT") + ) + + finished_workloads = [] + + for workload in workloads: + finished_workloads.append(get_responses(workload, client, DEPLOYMENT_MODEL, fields_that_need_responses, all_workloads)) + + if not output: + write_file(f"{root}/{SUCCESSFUL_WORKLOADS}", finished_workloads) + else: + write_file(output, finished_workloads) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description=( + 'Process workloads with Azure OpenAI.\n\n' + 'Usage examples:\n' + '1. To process all workloads:\n' + ' python script_name.py --all-workloads\n' + '\n' + '2. 
To process specific fields:\n' + ' python script_name.py --fields ' + ), + formatter_class=argparse.RawTextHelpFormatter + ) + parser.add_argument('-r', '--root', type=str, help='Path to the root directory') + parser.add_argument('-a', '--all-workloads', action='store_true', help='Process all workloads') + parser.add_argument('-f', '--fields', type=str, default='', help='String of fields to turn on, letters to indicate fields') + parser.add_argument('-o', '--output', type=str, help='Path to the output file') + + args = parser.parse_args() + + main(all_workloads=args.all_workloads, fields=args.fields, root=args.root, output=args.output) \ No newline at end of file diff --git a/scripts/generate_fields/requirements.txt b/scripts/generate_fields/requirements.txt new file mode 100644 index 00000000..957ed9ea --- /dev/null +++ b/scripts/generate_fields/requirements.txt @@ -0,0 +1,3 @@ +openai===1.44.1 +requests===2.25.1 +httpx==0.27.2 # Required for OpenAI API BUG: TypeError: AsyncClient.__init__() got an unexpected keyword argument 'proxies' \ No newline at end of file diff --git a/scripts/utils/__init__.py b/scripts/generate_fields/utils/__init__.py similarity index 100% rename from scripts/utils/__init__.py rename to scripts/generate_fields/utils/__init__.py diff --git a/scripts/generate_fields/utils/external_data.py b/scripts/generate_fields/utils/external_data.py new file mode 100644 index 00000000..0cd892a1 --- /dev/null +++ b/scripts/generate_fields/utils/external_data.py @@ -0,0 +1,42 @@ +import requests, os, logging +from utils.prompt import README_PROMPT + +PAT = os.getenv("GITHUB_EMU_PAT") +HEADERS = {'Authorization': f'token {PAT}'} if PAT else {} + +def get_readme(github_url: str) -> str: + if github_url.startswith("https://github.com"): + raw_github = "https://raw.githubusercontent.com" + github_url = github_url.replace("https://github.com", raw_github) + readme_url = f"{github_url}/refs/heads/main/README.md" + response = requests.get(readme_url) + if response.status_code == 404: + try: + github_api = github_url.replace(raw_github, "https://api.github.com/repos") + res = requests.get(github_api, headers=HEADERS) + repo_info = res.json() + main_branch = repo_info.get("default_branch", "main") + + res = requests.get(f"{github_api}/contents", headers=HEADERS) + contents = res.json() + readme_file = [file['name'] for file in contents if 'readme' in file['name'].lower()][0] + + raw_github = "https://raw.githubusercontent.com" + github_url = github_url.replace("https://github.com", raw_github) + readme_url = f"{github_url}/refs/heads/{main_branch}/{readme_file}" + + response = requests.get(readme_url) + except Exception as e: + logging.error(f"Error getting readme file: {e}") + raw_github = "https://raw.githubusercontent.com" + github_url = github_url.replace("https://github.com", raw_github) + readme_url = f"{github_url}/refs/heads/main/README.md" + response = requests.get(readme_url) + else: + readme_url = github_url + response = requests.get(readme_url, headers=HEADERS) + + return format_readme_request(response.text) + +def format_readme_request(response) -> str: + return f"{README_PROMPT}:\n```\n{response}\n```{README_PROMPT}\n" \ No newline at end of file diff --git a/scripts/generate_fields/utils/fields.py b/scripts/generate_fields/utils/fields.py new file mode 100644 index 00000000..c706877b --- /dev/null +++ b/scripts/generate_fields/utils/fields.py @@ -0,0 +1,17 @@ +ALL_FIELDS = { + "G": "tags", + "F": "keyFeatures", + "P": "products", + "T": "tech", + "Q": "sampleQueries", +} + 
+def get_fields(field_string: str) -> list: + if not field_string: + return ALL_FIELDS.values() + + fields = [] + for char in field_string: + if char not in ALL_FIELDS: raise ValueError(f"Invalid field: {char}") + fields.append(ALL_FIELDS[char]) + return fields \ No newline at end of file diff --git a/scripts/utils/file_functions.py b/scripts/generate_fields/utils/file_functions.py similarity index 100% rename from scripts/utils/file_functions.py rename to scripts/generate_fields/utils/file_functions.py diff --git a/scripts/utils/get_responses.py b/scripts/generate_fields/utils/get_responses.py similarity index 74% rename from scripts/utils/get_responses.py rename to scripts/generate_fields/utils/get_responses.py index 987c9090..6d80a589 100644 --- a/scripts/utils/get_responses.py +++ b/scripts/generate_fields/utils/get_responses.py @@ -1,31 +1,36 @@ import json - +import uuid +import time from openai import AzureOpenAI + from utils.external_data import get_readme -from utils.prompt import KEY_FEATURES_PROMPT, DISCLAIMER_PROMPT, SAMPLE_NEGATIVE_MATCH_PROMPT, SAMPLE_PRODUCTS_PROMPT, SAMPLE_TECH_PROMPT, SAMPLE_QUERIES_PROMPT, WORKLOAD_TYPE_PROMPT +from utils.prompt import KEY_FEATURES_PROMPT, DISCLAIMER_PROMPT, SAMPLE_NEGATIVE_MATCH_PROMPT, SAMPLE_PRODUCTS_PROMPT, SAMPLE_TECH_PROMPT, SAMPLE_QUERIES_PROMPT, WORKLOAD_TYPE_PROMPT, TAGS_PROMPT import logging -FIELDS_THAT_NEED_RESPONSES = ["keyFeatures"] +FIELDS_THAT_NEED_RESPONSES = ["tags", "keyFeatures", "sampleQueries", "tech", "products"] # Configure logging logging.basicConfig(level=logging.INFO, format='[%(asctime)s] - %(levelname)s - %(message)s') -def get_responses(workload: dict, client: AzureOpenAI, deployment_model: str) -> dict | str: +def get_responses(workload: dict, client: AzureOpenAI, deployment_model: str, fields_that_need_responses = FIELDS_THAT_NEED_RESPONSES, all_workloads = False) -> dict | str: logging.info(f"Processing workload: {workload['title']}") - if workload["sourceType"] != "ExecDocs": external_data = get_readme(workload['source']) - else: external_data = "" - for field in FIELDS_THAT_NEED_RESPONSES: + external_data = get_readme(workload['source']) + if workload["id"] == "": workload["id"] = str(uuid.uuid4()) + for field in fields_that_need_responses: + if (not all_workloads and workload.get(field, None)): continue logging.info(f"Getting response for {field}") + try: - workload[field] = get_field_response(client, workload, field, deployment_model) + workload[field] = get_field_response(client, workload, field, deployment_model, external_data) logging.info(f"Successfully got response for {field}") logging.info(f"Field {field} - Response: {workload[field]}") except Exception as e: logging.error(f"Error getting response for {field}: {e}") workload[field] = str(e) + time.sleep(5) return workload -def get_field_response(client: AzureOpenAI, workload: dict, field: str, deployment_model: str) -> str: +def get_field_response(client: AzureOpenAI, workload: dict, field: str, deployment_model: str, external_data: str) -> str: ask_prompt = "" match field: @@ -40,6 +45,8 @@ def get_field_response(client: AzureOpenAI, workload: dict, field: str, deployme ask_prompt = SAMPLE_NEGATIVE_MATCH_PROMPT case "keyFeatures": ask_prompt = KEY_FEATURES_PROMPT + case "tags": + ask_prompt = TAGS_PROMPT prompt = f"""title: {workload["title"]}\ndescription: {workload["description"]}\ntags:{workload['tags']}\n\n{external_data}\n\n{DISCLAIMER_PROMPT}\n\nYour Response:\n""" @@ -53,7 +60,6 @@ def get_field_response(client: AzureOpenAI, workload: 
dict, field: str, deployme } ], ) - # print(f"{field}: {response.choices[0].message.content}") return json.loads(response.choices[0].message.content) except Exception as e: return str(response.choices[0].message.content) \ No newline at end of file diff --git a/scripts/utils/prompt.py b/scripts/generate_fields/utils/prompt.py similarity index 71% rename from scripts/utils/prompt.py rename to scripts/generate_fields/utils/prompt.py index 7c7c8377..4a54d14b 100644 --- a/scripts/utils/prompt.py +++ b/scripts/generate_fields/utils/prompt.py @@ -1,5 +1,30 @@ PROMPT = """Add more sample queries to this json: """ +TAGS_PROMPT = """ +Tags are anything that you can associate with the workload. This can be languages, frameworks, APIs, technology, author, or key words (like "tutorial", "ai", "cli", or "application"). Here are some example +tags based on some workloads: + +``` +"title": "Kubernetes React Web App with Node.js API and MongoDB", +"description": "A blueprint for getting a React.js web app with a Node.js API and a MongoDB database on Azure. The blueprint includes sample application code (a ToDo web app) which can be removed and replaced with your own application code. Add your own source code and leverage the Infrastructure as Code assets (written in Bicep) to get up and running quickly. This architecture is for running Kubernetes clusters without setting up the control plane.", +"author": "Azure Dev", +"source": "https://github.com/Azure-Samples/todo-nodejs-mongo-aks", +"tags": ["bicep", "nodejs", "typescript", "javascript", "mongodb", "monitor", "keyvault", "reactjs", "appservice", "cosmosdb", "aks", "msft"] + +"title": "Quickstart: Deploy an Azure Kubernetes Service (AKS) cluster using Azure CLI", +"description": "Learn how to quickly deploy a Kubernetes cluster and deploy an application in Azure Kubernetes Service (AKS) using Azure CLI.", +"author": "Microsoft", +"source": "https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/azure-docs/articles/aks/learn/quick-kubernetes-deploy-cli.md", +"tags": ["kubernetes", "azure", "CLI", "aks", "deployment", "cluster", "application", "setup", "tutorial", "provision", "msft"] +``` + +A lot of these keywords are gotten from the README of the file, which I will also provide you with. +Make a json list object that can be read with json.loads() in python that contains all tags that you think are necessary for the workload. DONT GET TOO CRAZY AND LIST TOO MANY. +Here's the data from the README of the workload (if it's an exec doc it actually deploys using Innovation Engine and some provided steps). After looking at all the information and exmaples, Make a json list readable by json.loads(): + + +""" + README_PROMPT = """ This is data straight from the github readme of the workload. You can also use this in your decisions for creating the correct responses for the field you are creating. IF THE TECHNOLOGY IN THE README IS POINTING TO OR REFERENCING ANOTHER WEBSITE OR REPOSITORY THEN DON'T USE THAT TECHNOLOGY. FOR EXAMPLE: @@ -20,6 +45,8 @@ Your Response: "["How to use Azure AI Search to power ChatGPT-style and Q&A experiences", "Can I use Azure OpenAI to enhance ChatGPT responses with my company's proprietary data?", "How to integrate Azure AI Search with ChatGPT for custom knowledge bases?"]" +Avoid using verbiage like "what is the best". The questions should be more geared toward broader usage context, specific tech stack mentions, or deployment. 
+ WORKLOAD DETAILS: """ @@ -37,6 +64,7 @@ IT IS IMPORTANT YOU ONLY USE LANGUAGES AND FRAMWORKS AND DATABASES AND APIS IN THIS, NO OTHER AZURE PRODUCTS, SERVICES, OR PLATFORMS. LETS BE CONSISTENT ESPECIALLY WITH LANGUAGES OF ALL FLAVORS, WHETHER IT'S A PROGRAMMING LANGUAGE OR A MARKUP LANGUAGE, IT BELONGS HERE. THIS CAN BE LEFT BLANK IF THERE ARE NO LANGUAGES OR FRAMEWORKS MENTIONED. DON'T INCLUDE PLATFORMS (HINT: ChatGPT is a platform, OpenAI is an API). AND START EVERY STRING WITH A CAPITAL LETTER AND ITS FORMAL NAME LIKE dotnetsharp=.NET OR C# AND NO SPACES! DONT FORGET BICEP OR TERRAFORM! +Operating systems can also be mentioned, or web server like Apache, Nginx, or IIS. WORKLOAD DETAILS: @@ -89,4 +117,12 @@ KEY_FEATURES_PROMPT = """You are an expert at reading all the provided content and understanding the key features of the workload. Please list the key features of the workload in a json list object that can be read with json.loads() in python. THIS MEANS YOU ONLY RETURN THE LIST OBJECT WITH NO OTHER WORDS, AND STRING ARE DOUBLE QUOTED ALWAYS. -Key features are the main capabilities of the workload that make it unique and valuable. These are the features that users would be most interested in when deciding whether to use the workload. Include anything important.""" \ No newline at end of file +Key features are the main capabilities of the workload that make it unique and valuable. These are the features that users would be most interested in when deciding whether to use the workload. Include anything important. +Your important information that you are including from the README.md should be longer than a word or two, but not overly verbose either. Short sentence length or three word bullet point key feature. +THESE SHOULD BE OVERARCHING THEMES, NOT SOMETHING SMALL AND SPECIFIC THAT DOESN"T ENCAPSULATE GENERAL IDEAS. "CREATE A RESOURCE GROUP" DOES NOT ENCAPSUATE GENERAL IDEAS. If you want to mention that you're creating resources, explain more specific resources and why it's important to the workload. + +GOOD Examples: +"React application for Azure AI Search", "Web interface for visualizing retrieval modes", and "Supports image search using text-to-image and image-to-image functionalities" are all great examples of a multi-dimensional answer. + +DON"T PUT SHORT BAD EXAMPLES LIKE "create a resource group" THIS IS BAD. 
+""" \ No newline at end of file diff --git a/scripts/generate_workloads/requirements.txt b/scripts/generate_workloads/requirements.txt deleted file mode 100644 index e63b219f..00000000 --- a/scripts/generate_workloads/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -openai===1.44.1 \ No newline at end of file diff --git a/scripts/generate_workloads/workloads.py b/scripts/generate_workloads/workloads.py deleted file mode 100644 index 7d166a85..00000000 --- a/scripts/generate_workloads/workloads.py +++ /dev/null @@ -1,30 +0,0 @@ -import os -from utils.file_functions import read_file, write_file -from utils.get_responses import get_responses -from openai import AzureOpenAI - -FILE_PATH = "workloads/workloads.json" -DEPLOYMENT_MODEL = "gpt4o" -FAILED_WORKLOADS = "failed_workloads.json" -SUCCESSFUL_WORKLOADS = "new_workloads.json" - - -def main(): - workloads = read_file(FILE_PATH) - - client = AzureOpenAI( - api_key = os.getenv("AZURE_OPENAI_API_KEY"), - api_version = "2024-02-01", - azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT") - ) - - finished_workloads = [] - - for workload in workloads: - finished_workloads.append(get_responses(workload, client, DEPLOYMENT_MODEL)) - - write_file(SUCCESSFUL_WORKLOADS, finished_workloads) - - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/scripts/utils/external_data.py b/scripts/utils/external_data.py deleted file mode 100644 index 5f5a96ba..00000000 --- a/scripts/utils/external_data.py +++ /dev/null @@ -1,12 +0,0 @@ -import requests -from utils.prompt import README_PROMPT - -def get_readme(github_url: str) -> str: - raw_github = "https://raw.githubusercontent.com" - github_url = github_url.replace("https://github.com", raw_github) - readme_url = f"{github_url}/refs/heads/main/README.md" - response = requests.get(readme_url) - return format_readme_request(response.text) - -def format_readme_request(response) -> str: - return f"{README_PROMPT}:\n```\n{response}\n```{README_PROMPT}\n" \ No newline at end of file From e8a29233f381fa0f6bebfa778af925e7a1634ef0 Mon Sep 17 00:00:00 2001 From: Jonathan Brockett Date: Tue, 17 Dec 2024 08:52:24 -0800 Subject: [PATCH 2/4] fixed bug related to not being able to access azure samples workloads via api --- scripts/add_workloads/add_exec_docs.py | 11 +++++++++ .../generate_fields/utils/external_data.py | 23 ++++++++++--------- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/scripts/add_workloads/add_exec_docs.py b/scripts/add_workloads/add_exec_docs.py index ed96d09e..01494e32 100644 --- a/scripts/add_workloads/add_exec_docs.py +++ b/scripts/add_workloads/add_exec_docs.py @@ -9,11 +9,13 @@ def main(root: str, input_file: str, check_quality: bool = False): data = json.load(f) new_workloads = [] + active_workload_sources = set() ## Get azd workloads for workload in data: if workload["status"] == "active": new_workloads.append(workload) + active_workload_sources.add(workload["sourceUrl"]) elif check_quality: if workload.get("quality", None): new_workloads.append(workload) @@ -21,6 +23,15 @@ def main(root: str, input_file: str, check_quality: bool = False): ## Add correct fields with open(workloads_file, "r") as f: workloads = json.load(f) + + non_active_exec_docs = [] + for workload in workloads: + if workload["sourceType"] == "ExecDocs" and workload["source"] not in active_workload_sources: + non_active_exec_docs.append(workload) + + for workload in non_active_exec_docs: + logging.info(f"Removing workload: {workload['title']}") + workloads.remove(workload) correct_keys = 
workloads[0].keys() unique_exec = {} diff --git a/scripts/generate_fields/utils/external_data.py b/scripts/generate_fields/utils/external_data.py index 0cd892a1..0c8682da 100644 --- a/scripts/generate_fields/utils/external_data.py +++ b/scripts/generate_fields/utils/external_data.py @@ -2,7 +2,10 @@ from utils.prompt import README_PROMPT PAT = os.getenv("GITHUB_EMU_PAT") +EMU_PAT = os.getenv("GITHUB_EMU_PAT") + HEADERS = {'Authorization': f'token {PAT}'} if PAT else {} +EMU_HEADERS = {'Authorization': f'token {EMU_PAT}'} if EMU_PAT else {} def get_readme(github_url: str) -> str: if github_url.startswith("https://github.com"): @@ -12,20 +15,17 @@ def get_readme(github_url: str) -> str: response = requests.get(readme_url) if response.status_code == 404: try: + logging.info(f"Getting correct branch and readme from {github_url}") github_api = github_url.replace(raw_github, "https://api.github.com/repos") - res = requests.get(github_api, headers=HEADERS) - repo_info = res.json() - main_branch = repo_info.get("default_branch", "main") res = requests.get(f"{github_api}/contents", headers=HEADERS) + if res.status_code != 200: + res = requests.get(f"{github_api}/contents", headers=EMU_HEADERS) contents = res.json() - readme_file = [file['name'] for file in contents if 'readme' in file['name'].lower()][0] - - raw_github = "https://raw.githubusercontent.com" - github_url = github_url.replace("https://github.com", raw_github) - readme_url = f"{github_url}/refs/heads/{main_branch}/{readme_file}" + readme_file = [file['download_url'] for file in contents if 'readme' in file['name'].lower()][0] + logging.info(f"Getting data from {readme_file}") - response = requests.get(readme_url) + response = requests.get(readme_file) except Exception as e: logging.error(f"Error getting readme file: {e}") raw_github = "https://raw.githubusercontent.com" @@ -33,10 +33,11 @@ def get_readme(github_url: str) -> str: readme_url = f"{github_url}/refs/heads/main/README.md" response = requests.get(readme_url) else: + logging.info(f"Getting data from {github_url}") readme_url = github_url - response = requests.get(readme_url, headers=HEADERS) + response = requests.get(readme_url) - return format_readme_request(response.text) + return format_readme_request(response.text), response def format_readme_request(response) -> str: return f"{README_PROMPT}:\n```\n{response}\n```{README_PROMPT}\n" \ No newline at end of file From b9451704f55edc947ac78ac89a10dca5e4d9923e Mon Sep 17 00:00:00 2001 From: Jonathan Brockett Date: Fri, 20 Dec 2024 13:15:57 -0800 Subject: [PATCH 3/4] made sure env variable referenced in yml --- .github/workflows/Autointegrate_awesomeAzd.yml | 2 ++ scripts/generate_fields/utils/external_data.py | 8 ++------ 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/Autointegrate_awesomeAzd.yml b/.github/workflows/Autointegrate_awesomeAzd.yml index 43a0cae0..90e7bd11 100644 --- a/.github/workflows/Autointegrate_awesomeAzd.yml +++ b/.github/workflows/Autointegrate_awesomeAzd.yml @@ -43,6 +43,8 @@ jobs: env: AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }} AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }} + GIT_EMU_PAT: ${{ secrets.GIT_EMU_PAT }} + run: | echo "Generating New Fields" python ${{ github.workspace }}/scripts/generate_fields/generate_fields.py --root ${{ github.workspace }} diff --git a/scripts/generate_fields/utils/external_data.py b/scripts/generate_fields/utils/external_data.py index 0c8682da..bfa1fc0a 100644 --- a/scripts/generate_fields/utils/external_data.py 
+++ b/scripts/generate_fields/utils/external_data.py @@ -1,10 +1,8 @@ import requests, os, logging from utils.prompt import README_PROMPT -PAT = os.getenv("GITHUB_EMU_PAT") -EMU_PAT = os.getenv("GITHUB_EMU_PAT") +EMU_PAT = os.getenv("GIT_EMU_PAT") -HEADERS = {'Authorization': f'token {PAT}'} if PAT else {} EMU_HEADERS = {'Authorization': f'token {EMU_PAT}'} if EMU_PAT else {} def get_readme(github_url: str) -> str: @@ -18,9 +16,7 @@ def get_readme(github_url: str) -> str: logging.info(f"Getting correct branch and readme from {github_url}") github_api = github_url.replace(raw_github, "https://api.github.com/repos") - res = requests.get(f"{github_api}/contents", headers=HEADERS) - if res.status_code != 200: - res = requests.get(f"{github_api}/contents", headers=EMU_HEADERS) + res = requests.get(f"{github_api}/contents", headers=EMU_HEADERS) contents = res.json() readme_file = [file['download_url'] for file in contents if 'readme' in file['name'].lower()][0] logging.info(f"Getting data from {readme_file}") From da6c386e22cef95b75d93c38e85c87e98764611f Mon Sep 17 00:00:00 2001 From: Jonathan Brockett Date: Thu, 26 Dec 2024 09:05:07 -0800 Subject: [PATCH 4/4] remove duplicates that already exist in workloads --- scripts/add_workloads/delete_workloads.py | 4 +- .../workloads_vector_only.json | 64 +++++++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) create mode 100644 scripts/test_embeddings/workloads_vector_only.json diff --git a/scripts/add_workloads/delete_workloads.py b/scripts/add_workloads/delete_workloads.py index b991f4ff..c184a79b 100644 --- a/scripts/add_workloads/delete_workloads.py +++ b/scripts/add_workloads/delete_workloads.py @@ -5,14 +5,16 @@ def main(root="."): workloads = json.load(open(f"{root}/workloads/workloads.json", "r")) keep_workloads = [] + sources = set() for workload in workloads: res = requests.get(workload["source"]) print(res.status_code, workload["source"]) - if res.status_code == 200: + if res.status_code == 200 and workload["source"] not in sources: logging.info(f"Keeping workload {workload['title']}") keep_workloads.append(workload) + sources.add(workload["source"]) else: logging.info(f"Removing workload {workload['title']}") diff --git a/scripts/test_embeddings/workloads_vector_only.json b/scripts/test_embeddings/workloads_vector_only.json new file mode 100644 index 00000000..ca8b331a --- /dev/null +++ b/scripts/test_embeddings/workloads_vector_only.json @@ -0,0 +1,64 @@ +{ + "I want to create a website based on python": [ + { + "title": "Python (Django) Web App with PostgreSQL in VNet", + "source": "https://github.com/Azure-Samples/msdocs-django-postgresql-sample-app", + "search_score": 0.83078945, + "search_rerankerScore": null + }, + { + "title": "Python (Flask) Web App with PostgreSQL in VNet", + "source": "https://github.com/Azure-Samples/msdocs-flask-postgresql-sample-app", + "search_score": 0.82961214, + "search_rerankerScore": null + }, + { + "title": "Serverless Azure OpenAI Quick Start with LlamaIndex (Python)", + "source": "https://github.com/Azure-Samples/llama-index-python", + "search_score": 0.828326, + "search_rerankerScore": null + }, + { + "title": "React Web App with Python API and MongoDB - Terraform", + "source": "https://github.com/Azure-Samples/todo-python-mongo-terraform", + "search_score": 0.8278469, + "search_rerankerScore": null + }, + { + "title": "React Web App with Python API and MongoDB", + "source": "https://github.com/Azure-Samples/todo-python-mongo", + "search_score": 0.8262746, + "search_rerankerScore": null + 
},
+        {
+            "title": "Static React Web App + Functions with Python API and MongoDB",
+            "source": "https://github.com/Azure-Samples/todo-python-mongo-swa-func",
+            "search_score": 0.826117,
+            "search_rerankerScore": null
+        },
+        {
+            "title": "Python (Django) Web App with PostgreSQL via Azure Container Apps",
+            "source": "https://github.com/Azure-Samples/azure-django-postgres-aca",
+            "search_score": 0.8220776,
+            "search_rerankerScore": null
+        },
+        {
+            "title": "Containerized React Web App with Python API and MongoDB",
+            "source": "https://github.com/Azure-Samples/todo-python-mongo-aca",
+            "search_score": 0.82124877,
+            "search_rerankerScore": null
+        },
+        {
+            "title": "Creative Writing Assistant: Working with Agents using Promptflow (Python Implementation)",
+            "source": "https://github.com/Azure-Samples/agent-openai-python-prompty",
+            "search_score": 0.8212195,
+            "search_rerankerScore": null
+        },
+        {
+            "title": "FastAPI on Azure Functions",
+            "source": "https://github.com/Azure-Samples/fastapi-on-azure-functions",
+            "search_score": 0.8145797,
+            "search_rerankerScore": null
+        }
+    ]
+}
\ No newline at end of file
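
For local testing, the steps that the new Autointegrate_awesomeAzd.yml workflow runs can be approximated from the repository root roughly as follows. This is a sketch, not part of the patches themselves: it assumes the repository root is the working directory and that AZURE_OPENAI_API_KEY, AZURE_OPENAI_ENDPOINT, and (after PATCH 3/4) GIT_EMU_PAT are exported with real values.

  # Install dependencies for the field-generation step
  pip install -r scripts/generate_fields/requirements.txt

  # Fetch the upstream workload lists (same URLs the workflow curls)
  curl -o templates.json https://raw.githubusercontent.com/Azure/awesome-azd/main/website/static/templates.json
  curl -o exec_metadata.json https://raw.githubusercontent.com/MicrosoftDocs/executable-docs/main/scenarios/metadata.json

  # Merge them into workloads/workloads.json
  python scripts/add_workloads/add_azd.py --root . --input_file templates.json
  python scripts/add_workloads/add_exec_docs.py --root . --input_file exec_metadata.json
  rm templates.json exec_metadata.json

  # Optional: drop workloads whose source URL is no longer reachable
  python scripts/add_workloads/delete_workloads.py --root .

  # Fill in the AI-generated fields (tags, keyFeatures, products, tech, sampleQueries)
  python scripts/generate_fields/generate_fields.py --root .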