Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/workflows/executor_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,10 @@ jobs:
- name: Build backenddevelopersltd/compute-horde-streaming-job-test:v0-latest image
run: docker build . -t backenddevelopersltd/compute-horde-streaming-job-test:v0-latest
working-directory: ./executor/app/src/compute_horde_executor/executor/tests/integration/docker_image_for_streaming_job_tests
- name: Pull us-central1-docker.pkg.dev/twistlock-secresearch/public/can-ctr-escape-cve-2022-0492:latest
run: docker pull us-central1-docker.pkg.dev/twistlock-secresearch/public/can-ctr-escape-cve-2022-0492:latest
- name: Pull alpine image
run: docker pull alpine:3.19
- name: Run unit tests
run: nox -vs test
- name: Executor Test Results
Expand Down
4 changes: 4 additions & 0 deletions .github/workflows/integration_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ jobs:
with:
version: "0.6.x"
enable-cache: true
- name: Pull us-central1-docker.pkg.dev/twistlock-secresearch/public/can-ctr-escape-cve-2022-0492:latest
run: docker pull us-central1-docker.pkg.dev/twistlock-secresearch/public/can-ctr-escape-cve-2022-0492:latest
- name: Pull alpine image
run: docker pull alpine:3.19
- name: Start all services
run: local_stack/run_and_await_readiness.sh /tmp/integration_test_logs/

Expand Down
76 changes: 35 additions & 41 deletions executor/app/src/compute_horde_executor/executor/job_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,11 @@

from compute_horde_executor.executor.job_runner import BaseJobRunner
from compute_horde_executor.executor.miner_client import JobError, MinerClient
from compute_horde_executor.executor.utils import get_machine_specs, temporary_process
from compute_horde_executor.executor.utils import (
docker_container_wrapper,
get_docker_container_outputs,
get_machine_specs,
)

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -149,7 +153,7 @@ async def execute(self):
logger.error(f"Job cleanup failed: {e}")

async def _startup_stage(self) -> V0InitialJobRequest:
self.specs = get_machine_specs()
self.specs = await get_machine_specs()
await self.run_security_checks_or_fail()
initial_job_request = await self.miner_client.initial_msg
await self.runner.prepare_initial(initial_job_request)
Expand Down Expand Up @@ -185,66 +189,56 @@ async def run_security_checks_or_fail(self):
await self.run_nvidia_toolkit_version_check_or_fail()

async def run_cve_2022_0492_check_or_fail(self):
# TODO: TIMEOUTS - This doesn't kill the docker container, just the docker process that communicates with it.
async with temporary_process(
"docker",
"run",
"--rm",
CVE_2022_0492_IMAGE,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
) as docker_process:
stdout, stderr = await docker_process.communicate()
return_code = docker_process.returncode
async with docker_container_wrapper(
image=CVE_2022_0492_IMAGE, auto_remove=True
) as docker_container:
results = await docker_container.wait()
return_code = results["StatusCode"]
stdout, stderr = await get_docker_container_outputs(docker_container)

if return_code != 0:
raise JobError(
"CVE-2022-0492 check failed",
error_detail=f'stdout="{stdout.decode()}"\nstderr="{stderr.decode()}"',
error_detail=f'stdout="{stdout}"\nstderr="{stderr}"',
)

expected_output = "Contained: cannot escape via CVE-2022-0492"
if expected_output not in stdout.decode():
if expected_output not in stdout:
raise JobError(
f'CVE-2022-0492 check failed: "{expected_output}" not in stdout.',
V0JobFailedRequest.ErrorType.SECURITY_CHECK,
f'stdout="{stdout.decode()}"\nstderr="{stderr.decode()}"',
f'stdout="{stdout}"\nstderr="{stderr}"',
)

async def run_nvidia_toolkit_version_check_or_fail(self):
# TODO: TIMEOUTS - This doesn't kill the docker container, just the docker process that communicates with it.
async with temporary_process(
"docker",
"run",
"--rm",
"--privileged",
"-v",
"/:/host:ro",
"-v",
"/usr/bin:/usr/bin",
"-v",
"/usr/lib:/usr/lib",
"ubuntu:latest",
"bash",
"-c",
"nvidia-container-toolkit --version",
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
) as docker_process:
stdout, stderr = await docker_process.communicate()
return_code = docker_process.returncode
async with docker_container_wrapper(
image="ubuntu:latest",
command=["bash", "-c", "nvidia-container-toolkit --version"],
auto_remove=True,
HostConfig={
"Privileged": True,
"Binds": [
"/:/host:ro",
"/usr/bin:/usr/bin",
"/usr/lib:/usr/lib",
],
},
) as docker_container:
results = await docker_container.wait()
return_code = results["StatusCode"]
stdout, stderr = await get_docker_container_outputs(docker_container)

if return_code != 0:
raise JobError(
f"nvidia-container-toolkit check failed: exit code {return_code}",
error_detail=f'stdout="{stdout.decode()}"\nstderr="{stderr.decode()}"',
error_detail=f'stdout="{stdout}"\nstderr="{stderr}"',
)

lines = stdout.decode().splitlines()
lines = stdout.splitlines()
if not lines:
raise JobError(
"nvidia-container-toolkit check failed: no output from nvidia-container-toolkit",
error_detail=f'stdout="{stdout.decode()}"\nstderr="{stderr.decode()}"',
error_detail=f'stdout="{stdout}"\nstderr="{stderr}"',
)

version = lines[0].rpartition(" ")[2]
Expand All @@ -256,7 +250,7 @@ async def run_nvidia_toolkit_version_check_or_fail(self):
f"Outdated NVIDIA Container Toolkit detected:"
f'{version}" not >= {NVIDIA_CONTAINER_TOOLKIT_MINIMUM_SAFE_VERSION}',
V0JobFailedRequest.ErrorType.SECURITY_CHECK,
f'stdout="{stdout.decode()}"\nstderr="{stderr.decode()}',
f'stdout="{stdout}"\nstderr="{stderr}',
)

async def fail_if_execution_unsuccessful(self):
Expand Down
Loading
Loading