diff --git a/.github/workflows/profile.yml b/.github/workflows/profile.yml index ff6c82a..c1070c4 100644 --- a/.github/workflows/profile.yml +++ b/.github/workflows/profile.yml @@ -1,26 +1,94 @@ name: profiling-online-endpoints on: workflow_dispatch: + inputs: + SKU_LIST: + description: 'Define the list of skus in the format of ["sku:num_concurrent_requests", "sku:num_concurrent_requests"]' + required: true + default: '["Standard_F2s_v2:1", "Standard_F4s_v2:2"]' jobs: - build: + create_profiler_compute: runs-on: ubuntu-latest + outputs: + PROFILER_COMPUTE_NAME: ${{ steps.set_profiler_compute_info.outputs.PROFILER_COMPUTE_NAME }} steps: - - name: check out repo + - name: Azure login + uses: azure/login@v1 + with: + creds: ${{secrets.AZURE_CREDENTIALS}} + - name: Install dependencies + run: | + az extension add -n ml -y + sudo apt-get update -y && sudo apt-get install jq + - name: Setup az environment + run: | + az config set defaults.workspace=${{secrets.AML_WORKSPACE}} + az config set defaults.group=${{secrets.RESOURCE_GROUP}} + az account set -s ${{secrets.SUBSCRIPTION_ID}} + - name: Set profiler compute info + id: set_profiler_compute_info + run: | + export PROFILER_COMPUTE_NAME=profilingTest + echo PROFILER_COMPUTE_NAME=$PROFILER_COMPUTE_NAME >> $GITHUB_ENV + echo PROFILER_COMPUTE_SIZE=Standard_F4s_v2 >> $GITHUB_ENV + echo "::set-output name=PROFILER_COMPUTE_NAME::$PROFILER_COMPUTE_NAME" + - name: Check out repo uses: actions/checkout@v2 - - name: azure login + - name: Create profiling compute + run: bash -x profiling/create-profiling-compute.sh + env: + PROFILER_COMPUTE_NAME: ${{ env.PROFILER_COMPUTE_NAME }} + PROFILER_COMPUTE_SIZE: ${{ env.PROFILER_COMPUTE_SIZE }} + working-directory: code + + profiling: + runs-on: ubuntu-latest + needs: create_profiler_compute + strategy: + fail-fast: false + matrix: + sku_connection_pair: ${{ fromJson(github.event.inputs.SKU_LIST) }} + steps: + - name: Check out repo + uses: actions/checkout@v2 + - name: Azure login uses: azure/login@v1 with: creds: ${{secrets.AZURE_CREDENTIALS}} - - name: install dependencies + - name: Install dependencies run: | - az extension add -n ml -y - apt-get update -y && apt-get install jq - - name: setup + az extension add -n ml -y + sudo apt-get update -y && sudo apt-get install jq + - name: Setup az environment run: | - az config set defaults.workspace=${{secrets.AML_WORKSPACE}} - az config set defaults.group=${{secrets.RESOURCE_GROUP}} - az account set -s ${{secrets.SUBSCRIPTION_ID}} - - name: run job + az config set defaults.workspace=${{secrets.AML_WORKSPACE}} + az config set defaults.group=${{secrets.RESOURCE_GROUP}} + az account set -s ${{secrets.SUBSCRIPTION_ID}} + - name: Generate unique online-endpoint name and online-deployment name + run: | + export ENDPOINT_NAME=endpt-`echo $RANDOM` + echo ENDPOINT_NAME=$ENDPOINT_NAME >> $GITHUB_ENV + echo DEPLOYMENT_NAME=$ENDPOINT_NAME-dep >> $GITHUB_ENV + - name: Create online-endpoint and online-deployment + run: bash -x profiling/create-online-endpoint.sh + env: + ENDPOINT_NAME: ${{ env.ENDPOINT_NAME }} + DEPLOYMENT_NAME: ${{ env.DEPLOYMENT_NAME }} + SKU_CONNECTION_PAIR: ${{ matrix.sku_connection_pair }} + working-directory: code + - name: Run profiling job run: bash -x profiling/how-to-profile-online-endpoint.sh + env: + ENDPOINT_NAME: ${{ env.ENDPOINT_NAME }} + DEPLOYMENT_NAME: ${{ env.DEPLOYMENT_NAME }} + SKU_CONNECTION_PAIR: ${{ matrix.sku_connection_pair }} + PROFILER_COMPUTE_NAME: ${{ needs.create_profiler_compute.outputs.PROFILER_COMPUTE_NAME }} + working-directory: code + - name: Delete online-endpoint and online-deployment + run: bash -x profiling/delete-online-endpoint.sh + env: + ENDPOINT_NAME: ${{ env.ENDPOINT_NAME }} working-directory: code + + \ No newline at end of file diff --git a/code/online-endpoint/blue-deployment.yml b/code/online-endpoint/blue-deployment-tmpl.yml similarity index 64% rename from code/online-endpoint/blue-deployment.yml rename to code/online-endpoint/blue-deployment-tmpl.yml index 874aebb..44622aa 100644 --- a/code/online-endpoint/blue-deployment.yml +++ b/code/online-endpoint/blue-deployment-tmpl.yml @@ -2,13 +2,15 @@ $schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.sch name: blue endpoint_name: my-endpoint model: - local_path: model-1/model/sklearn_regression_model.pkl + path: model-1/model/ code_configuration: - code: - local_path: model-1/onlinescoring/ + code: model-1/onlinescoring/ scoring_script: score.py environment: conda_file: model-1/environment/conda.yml image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210727.v1 -instance_type: Standard_F2s_v2 +instance_type: <% COMPUTER_SIZE %> instance_count: 1 +request_settings: + request_timeout_ms: 3000 + max_concurrent_requests_per_instance: 1024 diff --git a/code/online-endpoint/model-1/environment/conda.yml b/code/online-endpoint/model-1/environment/conda.yml index c23bd70..97f5beb 100644 --- a/code/online-endpoint/model-1/environment/conda.yml +++ b/code/online-endpoint/model-1/environment/conda.yml @@ -8,6 +8,5 @@ dependencies: - scikit-learn=0.24.2 - scipy=1.7.1 - pip: - - azureml-defaults==1.33.0 - - inference-schema[numpy-support]==1.3.0 + - azureml-defaults==1.38.0 - joblib==1.0.1 diff --git a/code/online-endpoint/model-1/onlinescoring/score.py b/code/online-endpoint/model-1/onlinescoring/score.py index ac565f7..5d5c3a7 100644 --- a/code/online-endpoint/model-1/onlinescoring/score.py +++ b/code/online-endpoint/model-1/onlinescoring/score.py @@ -13,8 +13,9 @@ def init(): global model # AZUREML_MODEL_DIR is an environment variable created during deployment. # It is the path to the model folder (./azureml-models/$MODEL_NAME/$VERSION) + # Please provide your model's folder name if there is one model_path = os.path.join( - os.getenv("AZUREML_MODEL_DIR"), "sklearn_regression_model.pkl" + os.getenv("AZUREML_MODEL_DIR"), "model/sklearn_regression_model.pkl" ) # deserialize the model file back into a sklearn model model = joblib.load(model_path) @@ -27,7 +28,7 @@ def run(raw_data): In the example we extract the data from the json input and call the scikit-learn model's predict() method and return the result back """ - logging.info("Request received") + logging.info("model 1: request received") data = json.loads(raw_data)["data"] data = numpy.array(data) result = model.predict(data) diff --git a/code/online-endpoint/model-1/onlinescoring/score_managedidentity.py b/code/online-endpoint/model-1/onlinescoring/score_managedidentity.py index c1704fa..18eea27 100644 --- a/code/online-endpoint/model-1/onlinescoring/score_managedidentity.py +++ b/code/online-endpoint/model-1/onlinescoring/score_managedidentity.py @@ -56,8 +56,9 @@ def init(): # AZUREML_MODEL_DIR is an environment variable created during deployment. # It is the path to the model folder (./azureml-models/$MODEL_NAME/$VERSION) # For multiple models, it points to the folder containing all deployed models (./azureml-models) + # Please provide your model's folder name if there is one model_path = os.path.join( - os.getenv("AZUREML_MODEL_DIR"), "sklearn_regression_model.pkl" + os.getenv("AZUREML_MODEL_DIR"), "model/sklearn_regression_model.pkl" ) # deserialize the model file back into a sklearn model model = joblib.load(model_path) diff --git a/code/online-endpoint/model-2/environment/conda.yml b/code/online-endpoint/model-2/environment/conda.yml index 30ba157..87af935 100644 --- a/code/online-endpoint/model-2/environment/conda.yml +++ b/code/online-endpoint/model-2/environment/conda.yml @@ -8,6 +8,5 @@ dependencies: - scikit-learn=0.24.2 - scipy=1.7.1 - pip: - - azureml-defaults==1.33.0 - - inference-schema[numpy-support]==1.3.0 + - azureml-defaults==1.38.0 - joblib==1.0.1 \ No newline at end of file diff --git a/code/online-endpoint/model-2/onlinescoring/score.py b/code/online-endpoint/model-2/onlinescoring/score.py index ac565f7..e248af3 100644 --- a/code/online-endpoint/model-2/onlinescoring/score.py +++ b/code/online-endpoint/model-2/onlinescoring/score.py @@ -13,8 +13,9 @@ def init(): global model # AZUREML_MODEL_DIR is an environment variable created during deployment. # It is the path to the model folder (./azureml-models/$MODEL_NAME/$VERSION) + # Please provide your model's folder name if there is one model_path = os.path.join( - os.getenv("AZUREML_MODEL_DIR"), "sklearn_regression_model.pkl" + os.getenv("AZUREML_MODEL_DIR"), "model/sklearn_regression_model.pkl" ) # deserialize the model file back into a sklearn model model = joblib.load(model_path) @@ -27,9 +28,14 @@ def run(raw_data): In the example we extract the data from the json input and call the scikit-learn model's predict() method and return the result back """ - logging.info("Request received") - data = json.loads(raw_data)["data"] - data = numpy.array(data) - result = model.predict(data) + logging.info("model 2: request received") + result = [0.5, 0.5] logging.info("Request processed") - return result.tolist() + # return hardcoded result so that it is easy to validate safe rollout scenario: https://docs.microsoft.com/en-us/azure/machine-learning/how-to-safely-rollout-managed-endpoints + return result + + # actual scoring logic for reference: + # data = json.loads(raw_data)["data"] + # data = numpy.array(data) + # result = model.predict(data) + # return result.tolist() diff --git a/code/profiling/create-online-endpoint.sh b/code/profiling/create-online-endpoint.sh new file mode 100644 index 0000000..9313e33 --- /dev/null +++ b/code/profiling/create-online-endpoint.sh @@ -0,0 +1,34 @@ +# +export SKU_CONNECTION_PAIR=${SKU_CONNECTION_PAIR} +export ENDPOINT_NAME=${ENDPOINT_NAME} +export DEPLOYMENT_NAME=${DEPLOYMENT_NAME} +export DEPLOYMENT_COMPUTER_SIZE=`echo $SKU_CONNECTION_PAIR | awk -F: '{print $1}'` +# the computer size for the online-deployment +# + +# +echo "Creating Endpoint $ENDPOINT_NAME of size $DEPLOYMENT_COMPUTER_SIZE..." +sed -e "s/<% COMPUTER_SIZE %>/$DEPLOYMENT_COMPUTER_SIZE/g" online-endpoint/blue-deployment-tmpl.yml > online-endpoint/${DEPLOYMENT_NAME}.yml +az ml online-endpoint create --name $ENDPOINT_NAME -f online-endpoint/endpoint.yml +az ml online-deployment create --name $DEPLOYMENT_NAME --endpoint $ENDPOINT_NAME -f online-endpoint/${DEPLOYMENT_NAME}.yml --all-traffic +# + +# +endpoint_status=`az ml online-endpoint show -n $ENDPOINT_NAME --query "provisioning_state" -o tsv` +echo $endpoint_status +if [[ $endpoint_status == "Succeeded" ]]; then + echo "Endpoint $ENDPOINT_NAME created successfully" +else + echo "Endpoint $ENDPOINT_NAME creation failed" + exit 1 +fi + +deploy_status=`az ml online-deployment show --name $DEPLOYMENT_NAME --endpoint-name $ENDPOINT_NAME --query "provisioning_state" -o tsv` +echo $deploy_status +if [[ $deploy_status == "Succeeded" ]]; then + echo "Deployment $DEPLOYMENT_NAME completed successfully" +else + echo "Deployment $DEPLOYMENT_NAME failed" + exit 1 +fi +# \ No newline at end of file diff --git a/code/profiling/create-profiling-compute.sh b/code/profiling/create-profiling-compute.sh new file mode 100644 index 0000000..b3f8c9f --- /dev/null +++ b/code/profiling/create-profiling-compute.sh @@ -0,0 +1,30 @@ +# +export PROFILER_COMPUTE_NAME="${PROFILER_COMPUTE_NAME}" # the compute name for hosting the profiler +export PROFILER_COMPUTE_SIZE="${PROFILER_COMPUTE_SIZE}" # the compute size for hosting the profiler +# + +# +# skip compute creation if compute exists already +az ml compute show --name $PROFILER_COMPUTE_NAME +if [[ $? -eq 0 ]]; then echo "compute $PROFILER_COMPUTE_NAME exists already, will skip creation and role assignment." && exit 0; fi + +echo "Creating Compute $PROFILER_COMPUTE_NAME ..." +az ml compute create --name $PROFILER_COMPUTE_NAME --size $PROFILER_COMPUTE_SIZE --identity-type SystemAssigned --type amlcompute --max-instances 3 + +# check compute status +compute_status=`az ml compute show --name $PROFILER_COMPUTE_NAME --query "provisioning_state" -o tsv` +echo $compute_status +if [[ $compute_status == "Succeeded" ]]; then + echo "Compute $PROFILER_COMPUTE_NAME created successfully" +else + echo "Compute $PROFILER_COMPUTE_NAME creation failed" + exit 1 +fi + +# create role assignment for acessing workspace resources +compute_info=`az ml compute show --name $PROFILER_COMPUTE_NAME --query '{"id": id, "identity_object_id": identity.principal_id}' -o json` +workspace_resource_id=`echo $compute_info | jq -r '.id' | sed 's/\(.*\)\/computes\/.*/\1/'` +identity_object_id=`echo $compute_info | jq -r '.identity_object_id'` +az role assignment create --role Contributor --assignee-object-id $identity_object_id --scope $workspace_resource_id +if [[ $? -ne 0 ]]; then echo "Failed to create role assignment for compute $PROFILER_COMPUTE_NAME" && exit 1; fi +# \ No newline at end of file diff --git a/code/profiling/delete-online-endpoint.sh b/code/profiling/delete-online-endpoint.sh new file mode 100644 index 0000000..c07fa3b --- /dev/null +++ b/code/profiling/delete-online-endpoint.sh @@ -0,0 +1,7 @@ +# +export ENDPOINT_NAME=${ENDPOINT_NAME} +# + +# +az ml online-endpoint delete --name $ENDPOINT_NAME -y +# \ No newline at end of file diff --git a/code/profiling/how-to-profile-online-endpoint.sh b/code/profiling/how-to-profile-online-endpoint.sh index b459257..06e6945 100644 --- a/code/profiling/how-to-profile-online-endpoint.sh +++ b/code/profiling/how-to-profile-online-endpoint.sh @@ -11,83 +11,26 @@ ## 7. az configure --defaults group= workspace= # -export ENDPOINT_NAME="" -export DEPLOYMENT_NAME="" -export PROFILING_TOOL="" # allowed values: wrk, wrk2 and labench -export PROFILER_COMPUTE_NAME="" -export PROFILER_COMPUTE_SIZE="" # required only when compute does not exist already +export ENDPOINT_NAME="${ENDPOINT_NAME}" +export DEPLOYMENT_NAME="${DEPLOYMENT_NAME}" +export SKU_CONNECTION_PAIR=${SKU_CONNECTION_PAIR} +export PROFILING_TOOL=wrk # allowed values: wrk, wrk2 and labench +export PROFILER_COMPUTE_NAME="${PROFILER_COMPUTE_NAME}" # the compute name for hosting the profiler export DURATION="" # time for running the profiling tool (duration for each wrk call or labench call), default value is 300s -export CONNECTIONS="" # for wrk and wrk2 only, no. of connections for the profiling tool, default value is set to be the same as the no. of workers, or 1 if no. of workers is not set +export CONNECTIONS=`echo $SKU_CONNECTION_PAIR | awk -F: '{print $2}'` # for wrk and wrk2 only, no. of connections for the profiling tool, default value is set to be the same as the no. of workers, or 1 if no. of workers is not set export THREAD="" # for wrk and wrk2 only, no. of threads allocated for the profiling tool, default value is 1 export TARGET_RPS="" # for labench and wrk2 only, target rps for the profiling tool, default value is 50 export CLIENTS="" # for labench only, no. of clients for the profiling tool, default value is set to be the same as the no. of workers, or 1 if no. of workers is not set export TIMEOUT="" # for labench only, timeout for each request, default value is 10s # -export ENDPOINT_NAME=endpt-`echo $RANDOM` -export DEPLOYMENT_NAME=blue -export PROFILING_TOOL=wrk -export PROFILER_COMPUTE_NAME=profilingTest # the compute name for hosting the profiler -export PROFILER_COMPUTE_SIZE=Standard_F4s_v2 # the compute size for hosting the profiler - -# -echo "Creating Endpoint $ENDPOINT_NAME ..." -az ml online-endpoint create --name $ENDPOINT_NAME -f online-endpoint/endpoint.yml -az ml online-deployment create --name $DEPLOYMENT_NAME --endpoint $ENDPOINT_NAME -f online-endpoint/blue-deployment.yml --all-traffic -# - -# -endpoint_status=`az ml online-endpoint show -n $ENDPOINT_NAME --query "provisioning_state" -o tsv` -echo $endpoint_status -if [[ $endpoint_status == "Succeeded" ]]; then - echo "Endpoint $ENDPOINT_NAME created successfully" -else - echo "Endpoint $ENDPOINT_NAME creation failed" - exit 1 -fi - -deploy_status=`az ml online-deployment show --name $DEPLOYMENT_NAME --endpoint-name $ENDPOINT_NAME --query "provisioning_state" -o tsv` -echo $deploy_status -if [[ $deploy_status == "Succeeded" ]]; then - echo "Deployment $DEPLOYMENT_NAME completed successfully" -else - echo "Deployment $DEPLOYMENT_NAME failed" - exit 1 -fi -# - -# -echo "Creating Compute $PROFILER_COMPUTE_NAME ..." -az ml compute create --name $PROFILER_COMPUTE_NAME --size $PROFILER_COMPUTE_SIZE --identity-type SystemAssigned --type amlcompute - -# check compute status -compute_status=`az ml compute show --name $PROFILER_COMPUTE_NAME --query "provisioning_state" -o tsv` -echo $compute_status -if [[ $compute_status == "Succeeded" ]]; then - echo "Compute $PROFILER_COMPUTE_NAME created successfully" -else - echo "Compute $PROFILER_COMPUTE_NAME creation failed" - exit 1 -fi - -# create role assignment for acessing workspace resources -compute_resource_id=`az ml compute show --name $PROFILER_COMPUTE_NAME --query id -o tsv` -workspace_resource_id=`echo $compute_resource_id | sed 's/\(.*\)\/computes\/.*/\1/'` -access_token=`az account get-access-token --query accessToken -o tsv` -compute_info=`curl https://management.azure.com$compute_resource_id?api-version=2021-03-01-preview -H "Content-Type: application/json" -H "Authorization: Bearer $access_token"` -if [[ $? -ne 0 ]]; then echo "Failed to get info for compute $PROFILER_COMPUTE_NAME" && exit 1; fi -identity_object_id=`echo $compute_info | jq '.identity.principalId' | sed "s/\"//g"` -az role assignment create --role Contributor --assignee-object-id $identity_object_id --scope $workspace_resource_id -if [[ $? -ne 0 ]]; then echo "Failed to create role assignment for compute $PROFILER_COMPUTE_NAME" && exit 1; fi -# - -# +# default_datastore_info=`az ml datastore show --name workspaceblobstore -o json` account_name=`echo $default_datastore_info | jq '.account_name' | sed "s/\"//g"` container_name=`echo $default_datastore_info | jq '.container_name' | sed "s/\"//g"` connection_string=`az storage account show-connection-string --name $account_name -o tsv` -az storage blob upload --container-name $container_name/profiling_payloads --name payload.txt --file endpoints/online/profiling/payload.txt --connection-string $connection_string -# +az storage blob upload --container-name $container_name/profiling_payloads --name ${ENDPOINT_NAME}_payload.txt --file profiling/payload.txt --connection-string $connection_string +# # # please specify environment variable "IDENTITY_ACCESS_TOKEN" when working with ml compute with no appropriate MSI attached @@ -102,11 +45,12 @@ sed \ -e "s/<% TIMEOUT %>/$TIMEOUT/g" \ -e "s/<% THREAD %>/$THREAD/g" \ -e "s/<% COMPUTE_NAME %>/$PROFILER_COMPUTE_NAME/g" \ - profiling/profiling_job_tmpl.yml > profiling_job.yml + -e "s/<% SKU_CONNECTION_PAIR %>/$SKU_CONNECTION_PAIR/g" \ + profiling/profiling_job_tmpl.yml > ${ENDPOINT_NAME}_profiling_job.yml # # -run_id=$(az ml job create -f profiling_job.yml --query name -o tsv) +run_id=$(az ml job create -f ${ENDPOINT_NAME}_profiling_job.yml --query name -o tsv) # # @@ -121,8 +65,4 @@ sleep 10 # az ml job download --name $run_id --download-path report_$run_id echo "Job result has been downloaded to dir report_$run_id" -# - -# -az ml online-endpoint delete --name $ENDPOINT_NAME -y -# \ No newline at end of file +# \ No newline at end of file diff --git a/code/profiling/profiling_job_tmpl.yml b/code/profiling/profiling_job_tmpl.yml index db738bf..45eb576 100644 --- a/code/profiling/profiling_job_tmpl.yml +++ b/code/profiling/profiling_job_tmpl.yml @@ -1,9 +1,10 @@ $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json command: > - profiling.sh -p {inputs.payload} + python -m online_endpoints_model_profiler --payload_path ${{inputs.payload}} experiment_name: profiling-job +display_name: <% SKU_CONNECTION_PAIR %> environment: - image: docker.io/rachyong/profilers:latest + image: mcr.microsoft.com/azureml/online-endpoints-model-profiler:latest environment_variables: ONLINE_ENDPOINT: "<% ENDPOINT_NAME %>" DEPLOYMENT: "<% DEPLOYMENT_NAME %>" @@ -17,4 +18,5 @@ environment_variables: compute: "azureml:<% COMPUTE_NAME %>" inputs: payload: - file: azureml://datastores/workspaceblobstore/paths/profiling_payloads/payload.txt + type: uri_file + path: azureml://datastores/workspaceblobstore/paths/profiling_payloads/<% ENDPOINT_NAME %>_payload.txt \ No newline at end of file