From 90ed323e07c8ef2eaba3578280878a9cbb902472 Mon Sep 17 00:00:00 2001 From: Stefan Heimersheim Date: Fri, 1 Aug 2025 23:59:59 +0100 Subject: [PATCH 01/16] [Breaking] Rename lesser used kwargs & remove deprecated kwargs --- src/runpod_cli/cli.py | 75 ++++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 47 deletions(-) diff --git a/src/runpod_cli/cli.py b/src/runpod_cli/cli.py index 4a066ef..b30064d 100644 --- a/src/runpod_cli/cli.py +++ b/src/runpod_cli/cli.py @@ -41,11 +41,9 @@ class PodSpec: """Specification for RunPod infrastructure parameters.""" - image_name: str = DEFAULT_IMAGE_NAME gpu_type: str = "RTX A4000" - cloud_type: str = "SECURE" + image_name: str = DEFAULT_IMAGE_NAME gpu_count: int = 1 - volume_in_gb: int = 10 min_vcpu_count: int = 1 min_memory_in_gb: int = 1 container_disk_in_gb: int = 30 @@ -167,9 +165,8 @@ def create_pod(self, name: Optional[str], spec: PodSpec, runtime: int) -> Dict: name=name, image_name=spec.image_name, gpu_type_id=gpu_id, - cloud_type=spec.cloud_type, + cloud_type="SECURE", gpu_count=spec.gpu_count, - volume_in_gb=spec.volume_in_gb, container_disk_in_gb=spec.container_disk_in_gb, min_vcpu_count=spec.min_vcpu_count, min_memory_in_gb=spec.min_memory_in_gb, @@ -276,70 +273,55 @@ def create( self, name: Optional[str] = None, runtime: int = 60, - spec: Optional[PodSpec] = None, gpu_type: Optional[str] = None, - image_name: Optional[str] = None, - cloud_type: Optional[str] = None, - gpu_count: Optional[int] = None, - volume_in_gb: Optional[int] = None, - min_vcpu_count: Optional[int] = None, - min_memory_in_gb: Optional[int] = None, - container_disk_in_gb: Optional[int] = None, - volume_mount_path: Optional[str] = None, - runpodcli_dir: Optional[str] = None, + cpus: Optional[int] = None, + disk: Optional[int] = None, env: Optional[Dict[str, str]] = None, - update_ssh_config: Optional[bool] = None, forward_agent: Optional[bool] = None, + image_name: Optional[str] = None, + memory: Optional[int] = None, + 
num_gpus: Optional[int] = None, update_known_hosts: Optional[bool] = None, + update_ssh_config: Optional[bool] = None, + volume_mount_path: Optional[str] = None, ) -> None: """Create a new RunPod instance with the specified parameters. Args: - name: Name for the pod (default: "$USER-$GPU_TYPE") runtime: Time in minutes for pod to run (default: 60) gpu_type: GPU type (default: "RTX A4000") - image_name: Docker image (default: PyTorch 2.8.0 with CUDA 12.8.1) - cloud_type: "SECURE" or "COMMUNITY" (default: "SECURE") - gpu_count: Number of GPUs (default: 1) - volume_in_gb: Ephemeral storage volume size in GB (default: 10) - min_vcpu_count: Minimum CPU count (default: 1) - min_memory_in_gb: Minimum RAM in GB (default: 1) - container_disk_in_gb: Container disk size in GB (default: 30) - volume_mount_path: Volume mount path (default: "/network") - runpodcli_dir: Directory name for runpodcli scripts (default: ".tmp_$name") - env: Environment variables to set in the container - update_ssh_config: Whether to update SSH config (default: True) + num_gpus: Number of GPUs (default: 1) + name: Name for the pod (default: "$USER-$GPU_TYPE") + env: File to load RunPod credentials from (default: .env and ~/.config/runpod_cli/.env) + disk: Container disk size in GB (default: 30) + cpus: Minimum CPU count (default: 1) + memory: Minimum RAM in GB (default: 1) forward_agent: Whether to forward SSH agent (default: False) update_known_hosts: Whether to update known hosts (default: True) + update_ssh_config: Whether to update SSH config (default: True) + image_name: Docker image (default: "PyTorch 2.8.0 with CUDA 12.8.1") Examples: - rpc create --gpu_type="A100 PCIe" --runtime=60 - rpc create --name="my-pod" --gpu_count=2 --runtime=240 + rpc create -r 60 -g "A100 PCIe" + rpc create --runtime=240 --gpu_type="RTX A4000" --num_gpus=2 --name="dual-gpu-pod" """ - if spec is None: - spec = PodSpec() + spec = PodSpec() # Override spec with individual parameters if provided if gpu_type is not None: 
spec.gpu_type = gpu_type if image_name is not None: spec.image_name = image_name - if cloud_type is not None: - spec.cloud_type = cloud_type - if gpu_count is not None: - spec.gpu_count = gpu_count - if volume_in_gb is not None: - spec.volume_in_gb = volume_in_gb - if min_vcpu_count is not None: - spec.min_vcpu_count = min_vcpu_count - if min_memory_in_gb is not None: - spec.min_memory_in_gb = min_memory_in_gb - if container_disk_in_gb is not None: - spec.container_disk_in_gb = container_disk_in_gb + if num_gpus is not None: + spec.gpu_count = num_gpus + if cpus is not None: + spec.min_vcpu_count = cpus + if memory is not None: + spec.min_memory_in_gb = memory + if disk is not None: + spec.container_disk_in_gb = disk if volume_mount_path is not None: spec.volume_mount_path = volume_mount_path - if runpodcli_dir is not None: - spec.runpodcli_dir = runpodcli_dir if env is not None: spec.env = env if update_ssh_config is not None: @@ -356,7 +338,6 @@ def create( logging.info(f" Region: {self._client.region}") logging.info(f" S3 endpoint: {self._client.s3_endpoint}") logging.info(f" GPU Type: {spec.gpu_type}") - logging.info(f" Cloud Type: {spec.cloud_type}") logging.info(f" GPU Count: {spec.gpu_count}") logging.info(f" runpodcli directory: {spec.runpodcli_dir}") logging.info(f" Time limit: {runtime} minutes") From db15faecb7467e35cf09c97bbe8f07273192c86e Mon Sep 17 00:00:00 2001 From: Stefan Heimersheim Date: Thu, 7 Aug 2025 10:36:33 +0100 Subject: [PATCH 02/16] Add time remaining to pod list --- src/runpod_cli/cli.py | 56 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 48 insertions(+), 8 deletions(-) diff --git a/src/runpod_cli/cli.py b/src/runpod_cli/cli.py index b30064d..bc71355 100644 --- a/src/runpod_cli/cli.py +++ b/src/runpod_cli/cli.py @@ -1,8 +1,10 @@ import logging import os +import re import textwrap import time from dataclasses import dataclass +from datetime import datetime, timedelta from typing import Dict, List, Optional, Tuple import 
boto3 @@ -249,7 +251,40 @@ def __init__(self, env: Optional[str] = None) -> None: self._client = RunPodClient() - def list(self) -> None: + def _parse_public_ip(self, pod: Dict) -> str: + """Parse public IP and port from pod runtime info.""" + public_ips = [i for i in pod["runtime"]["ports"] if i["isIpPublic"]] + if len(public_ips) != 1: + raise ValueError(f"Expected 1 public IP, got {public_ips}") + ip = public_ips[0].get("ip") + port = public_ips[0].get("publicPort") + return ip, port + + def _parse_time_remaining(self, pod: Dict) -> str: + """Parse time remaining before pod shutdown from pod runtime info.""" + _sleep_re = re.compile(r"\bsleep\s+(\d+)\b") + _date_re = re.compile(r":\s*(\w{3}\s+\w{3}\s+\d{2}\s+\d{4}\s+\d{2}:\d{2}:\d{2})\s+GMT") + start_dt = None + sleep_secs = None + last_status_change = pod.get("lastStatusChange", "") + if isinstance(last_status_change, str): + match = _date_re.search(last_status_change) + if match: + start_dt = datetime.strptime(match.group(1), "%a %b %d %Y %H:%M:%S") + docker_args = pod.get("dockerArgs", "") + if isinstance(docker_args, str): + match = _sleep_re.search(docker_args) + if match: + sleep_secs = int(match.group(1)) + if start_dt is not None and sleep_secs is not None: + shutdown_dt = start_dt + timedelta(seconds=sleep_secs) + remaining = shutdown_dt - datetime.now() + remaining_str = f"{remaining.seconds // 3600}h {remaining.seconds % 3600 // 60}m" + return remaining_str if remaining.total_seconds() > 0 else "Unknown" + else: + return "Unknown" + + def list(self, verbose: bool = False) -> None: """List all pods in your RunPod account. Displays information about each pod including ID, name, GPU type, status, and connection details. 
@@ -260,13 +295,15 @@ def list(self) -> None: logging.info(f"Pod {i + 1}:") logging.info(f" ID: {pod.get('id')}") logging.info(f" Name: {pod.get('name')}") - logging.info(f" Machine Type: {pod.get('machine', {}).get('gpuDisplayName')}") - logging.info(f" Status: {pod.get('desiredStatus')}") - public_ip = [i for i in pod.get("runtime", {}).get("ports", []) if i["isIpPublic"]] - if len(public_ip) != 1: - raise ValueError(f"Expected 1 public IP, got {len(public_ip)}") - logging.info(f" Public IP: {public_ip[0].get('ip')}") - logging.info(f" Public port: {public_ip[0].get('publicPort')}") + time_remaining = self._parse_time_remaining(pod) + logging.info(f" Time remaining (est.): {time_remaining}") + if verbose: + public_ip, public_port = self._parse_public_ip(pod) + logging.info(f" Public IP: {public_ip}") + logging.info(f" Public port: {public_port}") + logging.info(f" GPUs: {pod.get('gpuCount')} x {pod.get('machine', {}).get('gpuDisplayName')}") + for key in ["memoryInGb", "vcpuCount", "containerDiskInGb", "volumeMountPath", "costPerHr"]: + logging.info(f" {key}: {pod.get(key)}") logging.info("") def create( @@ -339,6 +376,9 @@ def create( logging.info(f" S3 endpoint: {self._client.s3_endpoint}") logging.info(f" GPU Type: {spec.gpu_type}") logging.info(f" GPU Count: {spec.gpu_count}") + logging.info(f" Disk: {spec.container_disk_in_gb} GB") + logging.info(f" Min CPU: {spec.min_vcpu_count}") + logging.info(f" Min Memory: {spec.min_memory_in_gb} GB") logging.info(f" runpodcli directory: {spec.runpodcli_dir}") logging.info(f" Time limit: {runtime} minutes") From 19ac5e7a485d26e5db4e79330f4b0ccc547f83ab Mon Sep 17 00:00:00 2001 From: Stefan Heimersheim Date: Thu, 7 Aug 2025 13:18:50 +0100 Subject: [PATCH 03/16] Fixed timezones --- src/runpod_cli/cli.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/runpod_cli/cli.py b/src/runpod_cli/cli.py index bc71355..d02118f 100644 --- a/src/runpod_cli/cli.py +++ b/src/runpod_cli/cli.py @@ -4,7 +4,7 
@@ import textwrap import time from dataclasses import dataclass -from datetime import datetime, timedelta +from datetime import datetime, timedelta, timezone from typing import Dict, List, Optional, Tuple import boto3 @@ -270,15 +270,16 @@ def _parse_time_remaining(self, pod: Dict) -> str: if isinstance(last_status_change, str): match = _date_re.search(last_status_change) if match: - start_dt = datetime.strptime(match.group(1), "%a %b %d %Y %H:%M:%S") + start_dt = datetime.strptime(match.group(1), "%a %b %d %Y %H:%M:%S").replace(tzinfo=timezone.utc) docker_args = pod.get("dockerArgs", "") if isinstance(docker_args, str): match = _sleep_re.search(docker_args) if match: sleep_secs = int(match.group(1)) if start_dt is not None and sleep_secs is not None: + now_dt = datetime.now(timezone.utc) shutdown_dt = start_dt + timedelta(seconds=sleep_secs) - remaining = shutdown_dt - datetime.now() + remaining = shutdown_dt - now_dt remaining_str = f"{remaining.seconds // 3600}h {remaining.seconds % 3600 // 60}m" return remaining_str if remaining.total_seconds() > 0 else "Unknown" else: @@ -304,6 +305,9 @@ def list(self, verbose: bool = False) -> None: logging.info(f" GPUs: {pod.get('gpuCount')} x {pod.get('machine', {}).get('gpuDisplayName')}") for key in ["memoryInGb", "vcpuCount", "containerDiskInGb", "volumeMountPath", "costPerHr"]: logging.info(f" {key}: {pod.get(key)}") + # For debugging + # for key in pod.keys(): + # logging.info(f" {key}: {pod.get(key)}") logging.info("") def create( From a1cc89615ae857615150c13b127c0d699a7c19b3 Mon Sep 17 00:00:00 2001 From: Stefan Heimersheim Date: Sun, 10 Aug 2025 07:37:30 +0100 Subject: [PATCH 04/16] Add locales for git --- src/runpod_cli/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runpod_cli/utils.py b/src/runpod_cli/utils.py index e7dcaa1..16390a7 100644 --- a/src/runpod_cli/utils.py +++ b/src/runpod_cli/utils.py @@ -70,7 +70,7 @@ def get_setup_root(runpodcli_path: str, volume_mount_path: str) -> 
Tuple[str, st apt-get update apt-get upgrade -y - apt-get install -y sudo git vim ssh net-tools htop curl zip unzip tmux rsync libopenmpi-dev iputils-ping make fzf restic ripgrep wget pandoc poppler-utils pigz bzip2 nano + apt-get install -y sudo git vim ssh net-tools htop curl zip unzip tmux rsync libopenmpi-dev iputils-ping make fzf restic ripgrep wget pandoc poppler-utils pigz bzip2 nano locales echo 'user ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers echo "export HF_HOME=/workspace/hf_home/" >> /home/user/.bashrc echo 'export PATH="$HOME/.local/bin:$PATH"' >> /home/user/.bashrc From 02ec64d291e67d1f712ea4782e99d6686123eb12 Mon Sep 17 00:00:00 2001 From: Stefan Heimersheim Date: Sun, 10 Aug 2025 07:50:01 +0100 Subject: [PATCH 05/16] Install plotly dependencies + move TransformerLens to system packages --- src/runpod_cli/utils.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/runpod_cli/utils.py b/src/runpod_cli/utils.py index 16390a7..5f8c83d 100644 --- a/src/runpod_cli/utils.py +++ b/src/runpod_cli/utils.py @@ -120,13 +120,15 @@ def get_setup_user(runpodcli_path: str, git_email: str, git_name: str) -> Tuple[ # Install Python packages using uv sudo pip install uv - sudo uv pip install ipykernel kaleido nbformat numpy scipy scikit-learn transformers datasets torchvision pandas matplotlib seaborn plotly jaxtyping einops tqdm ruff basedpyright umap-learn ipywidgets virtualenv pytest git+https://github.com/callummcdougall/eindex.git --system - - # Create a virtual environment for the user - uv venv ~/.venv --python 3.11 --system-site-packages + sudo uv pip install --system ipykernel kaleido nbformat numpy scipy scikit-learn transformers datasets torchvision pandas matplotlib seaborn plotly jaxtyping einops tqdm ruff basedpyright umap-learn ipywidgets virtualenv pytest git+https://github.com/callummcdougall/eindex.git transformer_lens + # For plotly (kaleido) png export + sudo apt-get install -y libnss3 libatk-bridge2.0-0 libcups2 
libxcomposite1 libxdamage1 libxfixes3 libxrandr2 libgbm1 libxkbcommon0 libpango-1.0-0 libcairo2 libasound2 + sudo plotly_get_chrome -y + # Create a virtual environment for the user, install nnsight locally due to https://github.com/ndif-team/nnsight/issues/495 + python_version=$(python --version | cut -d' ' -f2 | cut -d'.' -f1-2) + uv venv ~/.venv --python $python_version --system-site-packages source ~/.venv/bin/activate - pip install nnsight transformer_lens - + python -m pip install nnsight echo "...user setup completed!" """.replace("RUNPODCLI_PATH", runpodcli_path) .replace("GIT_EMAIL", git_email) From 542d9247a38a6a063f20ce37d21ba13adcc2f99b Mon Sep 17 00:00:00 2001 From: Stefan Heimersheim Date: Sun, 10 Aug 2025 08:33:47 +0100 Subject: [PATCH 06/16] Switch to virtualenv as it sets pip correctly
--- src/runpod_cli/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/runpod_cli/utils.py b/src/runpod_cli/utils.py index 5f8c83d..c4f6907 100644 --- a/src/runpod_cli/utils.py +++ b/src/runpod_cli/utils.py @@ -126,7 +126,8 @@ def get_setup_user(runpodcli_path: str, git_email: str, git_name: str) -> Tuple[ sudo plotly_get_chrome -y # Create a virtual environment for the user, install nnsight locally due to https://github.com/ndif-team/nnsight/issues/495 python_version=$(python --version | cut -d' ' -f2 | cut -d'.' -f1-2) - uv venv ~/.venv --python $python_version --system-site-packages + # uv venv ~/.venv --python $python_version --system-site-packages + virtualenv ~/.venv --system-site-packages source ~/.venv/bin/activate python -m pip install nnsight echo "...user setup completed!" From 099f2cc54db6f2ddcc1176c237021e2b29f9d65a Mon Sep 17 00:00:00 2001 From: Stefan Heimersheim Date: Sun, 10 Aug 2025 09:01:18 +0100 Subject: [PATCH 07/16] Install nnsight to system too --- src/runpod_cli/utils.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/runpod_cli/utils.py b/src/runpod_cli/utils.py index c4f6907..a6cfff9 100644 --- a/src/runpod_cli/utils.py +++ b/src/runpod_cli/utils.py @@ -58,10 +58,13 @@ def get_setup_root(runpodcli_path: str, volume_mount_path: str) -> Tuple[str, st echo "Setting up system environment..." 
useradd --uid 1000 --shell /bin/bash user --groups sudo --create-home + # Set NNSIGHT_LOG_PATH to avoid https://github.com/ndif-team/nnsight/issues/495 + echo "export NNSIGHT_LOG_PATH=/root/.local/state/nnsight" >> /root/.profile + echo "export NNSIGHT_LOG_PATH=/home/user/.local/state/nnsight" >> /home/user/.profile + chown user /home/user/.profile mkdir -p /home/user/.ssh/ - touch /home/user/.ssh/authorized_keys - chown user /home/user/.ssh/authorized_keys cat /root/.ssh/authorized_keys >> /home/user/.ssh/authorized_keys + chown user /home/user/.ssh/authorized_keys if [[ VOLUME_MOUNT_PATH != "/workspace" ]]; then rmdir /workspace @@ -120,16 +123,13 @@ def get_setup_user(runpodcli_path: str, git_email: str, git_name: str) -> Tuple[ # Install Python packages using uv sudo pip install uv - sudo uv pip install --system ipykernel kaleido nbformat numpy scipy scikit-learn transformers datasets torchvision pandas matplotlib seaborn plotly jaxtyping einops tqdm ruff basedpyright umap-learn ipywidgets virtualenv pytest git+https://github.com/callummcdougall/eindex.git transformer_lens + sudo uv pip install --system ipykernel kaleido nbformat numpy scipy scikit-learn transformers datasets torchvision pandas matplotlib seaborn plotly jaxtyping einops tqdm ruff basedpyright umap-learn ipywidgets virtualenv pytest git+https://github.com/callummcdougall/eindex.git transformer_lens nnsight # For plotly (kaleido) png export sudo apt-get install -y libnss3 libatk-bridge2.0-0 libcups2 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 libgbm1 libxkbcommon0 libpango-1.0-0 libcairo2 libasound2 sudo plotly_get_chrome -y - # Create a virtual environment for the user, install nnsight locally due to https://github.com/ndif-team/nnsight/issues/495 + # Create a virtual environment for the user python_version=$(python --version | cut -d' ' -f2 | cut -d'.' 
-f1-2) - # uv venv ~/.venv --python $python_version --system-site-packages - virtualenv ~/.venv --system-site-packages - source ~/.venv/bin/activate - python -m pip install nnsight + uv venv ~/.venv --python $python_version --system-site-packages echo "...user setup completed!" """.replace("RUNPODCLI_PATH", runpodcli_path) .replace("GIT_EMAIL", git_email) From c2eb6ac615f728b20091c555022ed29d26df7f43 Mon Sep 17 00:00:00 2001 From: Stefan Heimersheim Date: Sun, 10 Aug 2025 09:06:30 +0100 Subject: [PATCH 08/16] Refactor cli.py to remove extra class [by GPT-5] Remove RunPodClient, centralize logic in RunPodManager with PodSpec + tiny helpers; keep Fire CLI unchanged Co-authored-by: GPT-5 Thinking --- src/runpod_cli/cli.py | 259 ++++++++++++++++-------------------------- 1 file changed, 95 insertions(+), 164 deletions(-) diff --git a/src/runpod_cli/cli.py b/src/runpod_cli/cli.py index d02118f..96b7b27 100644 --- a/src/runpod_cli/cli.py +++ b/src/runpod_cli/cli.py @@ -23,7 +23,6 @@ get_terminate, ) except ImportError: - # Allow running cli.py directly from the repository from utils import ( # type: ignore DEFAULT_IMAGE_NAME, GPU_DISPLAY_NAME_TO_ID, @@ -41,8 +40,6 @@ @dataclass class PodSpec: - """Specification for RunPod infrastructure parameters.""" - gpu_type: str = "RTX A4000" image_name: str = DEFAULT_IMAGE_NAME gpu_count: int = 1 @@ -74,138 +71,6 @@ def get_s3_endpoint_from_volume_id(volume_id: str) -> str: return s3_endpoint -class RunPodClient: - """Pure logic client for RunPod and S3 operations without user I/O.""" - - def __init__(self) -> None: - self.api_key = os.getenv("RUNPOD_API_KEY") - if not self.api_key: - raise ValueError("RUNPOD_API_KEY not found in environment. Set it in your .env file.") - - network_volume_id = os.getenv("RUNPOD_NETWORK_VOLUME_ID") - if not network_volume_id: - raise ValueError("RUNPOD_NETWORK_VOLUME_ID not found in environment. 
Set it in your .env file.") - self.network_volume_id: str = network_volume_id - - self.s3_access_key_id = os.getenv("RUNPOD_S3_ACCESS_KEY_ID") - self.s3_secret_key = os.getenv("RUNPOD_S3_SECRET_KEY") - if not self.s3_access_key_id or not self.s3_secret_key: - raise ValueError("RUNPOD_S3_ACCESS_KEY_ID or RUNPOD_S3_SECRET_KEY not found in environment. Set it in your .env file.") - - runpod.api_key = self.api_key - self.region = get_region_from_volume_id(self.network_volume_id) - self.s3_endpoint = get_s3_endpoint_from_volume_id(self.network_volume_id) - - def _make_s3_client(self): - return boto3.client( - "s3", - aws_access_key_id=self.s3_access_key_id, - aws_secret_access_key=self.s3_secret_key, - endpoint_url=self.s3_endpoint, - region_name=self.region, - ) - - def get_pods(self) -> List[Dict]: - return runpod.get_pods() # type: ignore - - def get_pod(self, pod_id: str) -> Dict: - return runpod.get_pod(pod_id) - - def terminate_pod(self, pod_id: str) -> Optional[Dict]: - return runpod.terminate_pod(pod_id) - - def upload_script_content(self, content: str, target: str) -> None: - s3 = self._make_s3_client() - s3.put_object(Bucket=self.network_volume_id, Key=target, Body=content.encode("utf-8")) - - def download_file_content(self, remote_path: str) -> str: - s3 = self._make_s3_client() - response = s3.get_object(Bucket=self.network_volume_id, Key=remote_path) - return response["Body"].read().decode("utf-8") - - def _build_docker_args(self, volume_mount_path: str, runpodcli_dir: str, runtime: int) -> str: - runpodcli_path = f"{volume_mount_path}/{runpodcli_dir}" - return ( - "/bin/bash -c '" - + (f"mkdir -p {runpodcli_path}; bash {runpodcli_path}/start_pod.sh; sleep {max(runtime * 60, 20)}; bash {runpodcli_path}/terminate_pod.sh") - + "'" - ) - - def _provision_and_wait(self, pod_id: str, n_attempts: int = 60) -> Dict: - for i in range(n_attempts): - pod = runpod.get_pod(pod_id) - pod_runtime = pod.get("runtime") - if pod_runtime is None or not 
pod_runtime.get("ports"): - time.sleep(5) - else: - return pod - raise RuntimeError("Pod provisioning failed") - - def create_pod(self, name: Optional[str], spec: PodSpec, runtime: int) -> Dict: - gpu_id = GPU_DISPLAY_NAME_TO_ID[spec.gpu_type] if spec.gpu_type in GPU_DISPLAY_NAME_TO_ID else spec.gpu_type - - name = name or f"{os.getenv('USER')}-{GPU_ID_TO_DISPLAY_NAME[gpu_id]}" - spec.runpodcli_dir = spec.runpodcli_dir or f".tmp_{name.replace(' ', '_')}" - - # Get scripts and upload to network volume - git_email = os.getenv("GIT_EMAIL", "") - git_name = os.getenv("GIT_NAME", "") - rpc_path = f"{spec.volume_mount_path}/{spec.runpodcli_dir}" - scripts = [ - get_setup_root(rpc_path, spec.volume_mount_path), - get_setup_user(rpc_path, git_email, git_name), - get_start(rpc_path), - get_terminate(rpc_path), - ] - for script_name, script_content in scripts: - self.upload_script_content(content=script_content, target=f"{spec.runpodcli_dir}/{script_name}") - - docker_args = self._build_docker_args(volume_mount_path=spec.volume_mount_path, runpodcli_dir=spec.runpodcli_dir, runtime=runtime) - name = name or f"{os.getenv('USER')}-{GPU_ID_TO_DISPLAY_NAME[gpu_id]}" - - pod = runpod.create_pod( - name=name, - image_name=spec.image_name, - gpu_type_id=gpu_id, - cloud_type="SECURE", - gpu_count=spec.gpu_count, - container_disk_in_gb=spec.container_disk_in_gb, - min_vcpu_count=spec.min_vcpu_count, - min_memory_in_gb=spec.min_memory_in_gb, - docker_args=docker_args, - env=spec.env, - ports="8888/http,22/tcp", - volume_mount_path=spec.volume_mount_path, - network_volume_id=self.network_volume_id, - ) - - pod_id: str = pod.get("id") # type: ignore - pod = self._provision_and_wait(pod_id) - - return pod - - def get_pod_public_ip_and_port(self, pod: Dict) -> Tuple[str, int]: - """Extract public IP and port from pod runtime info.""" - public_ips = [i for i in pod["runtime"]["ports"] if i["isIpPublic"]] - if len(public_ips) != 1: - raise ValueError(f"Expected 1 public IP, got 
{public_ips}") - ip = public_ips[0].get("ip") - port = public_ips[0].get("publicPort") - return ip, port - - def get_host_keys(self, remote_path: str) -> List[Tuple[str, str]]: - """Download host keys from S3 and return list of (algorithm, key) pairs.""" - host_keys = [] - for file in ["ssh_ed25519_host_key", "ssh_ecdsa_host_key", "ssh_rsa_host_key", "ssh_dsa_host_key"]: - try: - host_key_text = self.download_file_content(f"{remote_path}/{file}").strip() - alg, key, _ = host_key_text.split(" ") - host_keys.append((alg, key)) - except Exception: - continue - return host_keys - - class RunPodManager: """RunPod Management CLI - A command-line tool for managing RunPod instances via the RunPod API. @@ -225,21 +90,13 @@ class RunPodManager: """ def __init__(self, env: Optional[str] = None) -> None: - """Initialize the RunPod manager. - - Args: - env: Path to the .env file (optional). If not provided, will search for .env files in default locations. - """ - # Load environment variables if env: logging.info(f"Using .env file: {env}") - # Use the specified .env file env_path = os.path.expanduser(env) if not os.path.exists(env_path): raise FileNotFoundError(f"Specified .env file not found: {env_path}") load_dotenv(override=True, dotenv_path=env_path) else: - # Use default .env file search logic xdg_config_dir = os.environ.get("XDG_CONFIG_HOME", os.path.expanduser("~/.config")) env_paths = [".env", os.path.join(xdg_config_dir, "runpod_cli/.env")] env_exists = [os.path.exists(os.path.expanduser(path)) for path in env_paths] @@ -249,10 +106,49 @@ def __init__(self, env: Optional[str] = None) -> None: raise FileExistsError(f"Multiple .env files found in {env_paths}") load_dotenv(override=True, dotenv_path=os.path.expanduser(env_paths[env_exists.index(True)])) - self._client = RunPodClient() + api_key = os.getenv("RUNPOD_API_KEY") + if not api_key: + raise ValueError("RUNPOD_API_KEY not found in environment. 
Set it in your .env file.") + runpod.api_key = api_key + + network_volume_id = os.getenv("RUNPOD_NETWORK_VOLUME_ID") + if not network_volume_id: + raise ValueError("RUNPOD_NETWORK_VOLUME_ID not found in environment. Set it in your .env file.") + self.network_volume_id: str = network_volume_id - def _parse_public_ip(self, pod: Dict) -> str: - """Parse public IP and port from pod runtime info.""" + s3_access_key_id = os.getenv("RUNPOD_S3_ACCESS_KEY_ID") + s3_secret_key = os.getenv("RUNPOD_S3_SECRET_KEY") + if not s3_access_key_id or not s3_secret_key: + raise ValueError("RUNPOD_S3_ACCESS_KEY_ID or RUNPOD_S3_SECRET_KEY not found in environment. Set it in your .env file.") + self.region = get_region_from_volume_id(self.network_volume_id) + self.s3_endpoint = get_s3_endpoint_from_volume_id(self.network_volume_id) + self._s3 = boto3.client( + "s3", + aws_access_key_id=s3_access_key_id, + aws_secret_access_key=s3_secret_key, + endpoint_url=self.s3_endpoint, + region_name=self.region, + ) + + def _build_docker_args(self, volume_mount_path: str, runpodcli_dir: str, runtime: int) -> str: + runpodcli_path = f"{volume_mount_path}/{runpodcli_dir}" + return ( + "/bin/bash -c '" + + (f"mkdir -p {runpodcli_path}; bash {runpodcli_path}/start_pod.sh; sleep {max(runtime * 60, 20)}; bash {runpodcli_path}/terminate_pod.sh") + + "'" + ) + + def _provision_and_wait(self, pod_id: str, n_attempts: int = 60) -> Dict: + for _ in range(n_attempts): + pod = runpod.get_pod(pod_id) + pod_runtime = pod.get("runtime") + if pod_runtime is None or not pod_runtime.get("ports"): + time.sleep(5) + else: + return pod + raise RuntimeError("Pod provisioning failed") + + def _get_public_ip_and_port(self, pod: Dict) -> Tuple[str, int]: public_ips = [i for i in pod["runtime"]["ports"] if i["isIpPublic"]] if len(public_ips) != 1: raise ValueError(f"Expected 1 public IP, got {public_ips}") @@ -261,7 +157,6 @@ def _parse_public_ip(self, pod: Dict) -> str: return ip, port def _parse_time_remaining(self, pod: 
Dict) -> str: - """Parse time remaining before pod shutdown from pod runtime info.""" _sleep_re = re.compile(r"\bsleep\s+(\d+)\b") _date_re = re.compile(r":\s*(\w{3}\s+\w{3}\s+\d{2}\s+\d{4}\s+\d{2}:\d{2}:\d{2})\s+GMT") start_dt = None @@ -290,7 +185,7 @@ def list(self, verbose: bool = False) -> None: Displays information about each pod including ID, name, GPU type, status, and connection details. """ - pods = self._client.get_pods() + pods = runpod.get_pods() # type: ignore for i, pod in enumerate(pods): logging.info(f"Pod {i + 1}:") @@ -299,15 +194,12 @@ def list(self, verbose: bool = False) -> None: time_remaining = self._parse_time_remaining(pod) logging.info(f" Time remaining (est.): {time_remaining}") if verbose: - public_ip, public_port = self._parse_public_ip(pod) + public_ip, public_port = self._get_public_ip_and_port(pod) logging.info(f" Public IP: {public_ip}") logging.info(f" Public port: {public_port}") logging.info(f" GPUs: {pod.get('gpuCount')} x {pod.get('machine', {}).get('gpuDisplayName')}") for key in ["memoryInGb", "vcpuCount", "containerDiskInGb", "volumeMountPath", "costPerHr"]: logging.info(f" {key}: {pod.get(key)}") - # For debugging - # for key in pod.keys(): - # logging.info(f" {key}: {pod.get(key)}") logging.info("") def create( @@ -347,8 +239,6 @@ def create( rpc create --runtime=240 --gpu_type="RTX A4000" --num_gpus=2 --name="dual-gpu-pod" """ spec = PodSpec() - - # Override spec with individual parameters if provided if gpu_type is not None: spec.gpu_type = gpu_type if image_name is not None: @@ -372,12 +262,16 @@ def create( if update_known_hosts is not None: spec.update_known_hosts = update_known_hosts + gpu_id = GPU_DISPLAY_NAME_TO_ID[spec.gpu_type] if spec.gpu_type in GPU_DISPLAY_NAME_TO_ID else spec.gpu_type + name = name or f"{os.getenv('USER')}-{GPU_ID_TO_DISPLAY_NAME[gpu_id]}" + spec.runpodcli_dir = spec.runpodcli_dir or f".tmp_{name.replace(' ', '_')}" + logging.info("Creating pod with:") logging.info(f" Name: {name}") 
logging.info(f" Image: {spec.image_name}") - logging.info(f" Network volume ID: {self._client.network_volume_id}") - logging.info(f" Region: {self._client.region}") - logging.info(f" S3 endpoint: {self._client.s3_endpoint}") + logging.info(f" Network volume ID: {self.network_volume_id}") + logging.info(f" Region: {self.region}") + logging.info(f" S3 endpoint: {self.s3_endpoint}") logging.info(f" GPU Type: {spec.gpu_type}") logging.info(f" GPU Count: {spec.gpu_count}") logging.info(f" Disk: {spec.container_disk_in_gb} GB") @@ -386,12 +280,42 @@ def create( logging.info(f" runpodcli directory: {spec.runpodcli_dir}") logging.info(f" Time limit: {runtime} minutes") + git_email = os.getenv("GIT_EMAIL", "") + git_name = os.getenv("GIT_NAME", "") + rpc_path = f"{spec.volume_mount_path}/{spec.runpodcli_dir}" + scripts = [ + get_setup_root(rpc_path, spec.volume_mount_path), + get_setup_user(rpc_path, git_email, git_name), + get_start(rpc_path), + get_terminate(rpc_path), + ] + for script_name, script_content in scripts: + self._s3.put_object(Bucket=self.network_volume_id, Key=f"{spec.runpodcli_dir}/{script_name}", Body=script_content.encode("utf-8")) + + docker_args = self._build_docker_args(volume_mount_path=spec.volume_mount_path, runpodcli_dir=spec.runpodcli_dir, runtime=runtime) + + pod = runpod.create_pod( + name=name, + image_name=spec.image_name, + gpu_type_id=gpu_id, + cloud_type="SECURE", + gpu_count=spec.gpu_count, + container_disk_in_gb=spec.container_disk_in_gb, + min_vcpu_count=spec.min_vcpu_count, + min_memory_in_gb=spec.min_memory_in_gb, + docker_args=docker_args, + env=spec.env, + ports="8888/http,22/tcp", + volume_mount_path=spec.volume_mount_path, + network_volume_id=self.network_volume_id, + ) + + pod_id: str = pod.get("id") # type: ignore logging.info("Pod created. 
Provisioning...") - pod = self._client.create_pod(name, spec, runtime) + pod = self._provision_and_wait(pod_id) logging.info("Pod provisioned.") - # Handle SSH config and known hosts - ip, port = self._client.get_pod_public_ip_and_port(pod) + ip, port = self._get_public_ip_and_port(pod) if spec.update_ssh_config: self._write_ssh_config(ip, port, spec.forward_agent) @@ -419,10 +343,17 @@ def _write_ssh_config(self, ip: str, port: int, forward_agent: bool, config_path logging.info(f"SSH config at {config_path} updated") def _update_known_hosts_file(self, public_ip: str, port: int, runpodcli_dir: str) -> None: - """Update SSH known hosts file with pod host keys.""" - host_keys = self._client.get_host_keys(runpodcli_dir) - known_hosts_path = os.path.expanduser("~/.ssh/known_hosts.runpod_cli") + host_keys: List[Tuple[str, str]] = [] + for file in ["ssh_ed25519_host_key", "ssh_ecdsa_host_key", "ssh_rsa_host_key", "ssh_dsa_host_key"]: + try: + obj = self._s3.get_object(Bucket=self.network_volume_id, Key=f"{runpodcli_dir}/{file}") + host_key_text = obj["Body"].read().decode("utf-8").strip() + alg, key, _ = host_key_text.split(" ") + host_keys.append((alg, key)) + except Exception: + continue + known_hosts_path = os.path.expanduser("~/.ssh/known_hosts.runpod_cli") for alg, key in host_keys: try: with open(known_hosts_path, "a") as dest: @@ -441,7 +372,7 @@ def terminate(self, pod_id: str) -> None: rpc terminate --pod_id=abc123 """ logging.info(f"Terminating pod {pod_id}") - self._client.terminate_pod(pod_id) + _ = runpod.terminate_pod(pod_id) def main(): From 9fa99c0e83441098921f84d94bdc387a99496f25 Mon Sep 17 00:00:00 2001 From: Stefan Heimersheim Date: Sun, 10 Aug 2025 10:00:13 +0100 Subject: [PATCH 09/16] Remove superfluous RunPodSpec class --- src/runpod_cli/cli.py | 127 ++++++++++++++---------------------------- 1 file changed, 42 insertions(+), 85 deletions(-) diff --git a/src/runpod_cli/cli.py b/src/runpod_cli/cli.py index 96b7b27..fe63b69 100644 --- 
a/src/runpod_cli/cli.py +++ b/src/runpod_cli/cli.py @@ -3,7 +3,6 @@ import re import textwrap import time -from dataclasses import dataclass from datetime import datetime, timedelta, timezone from typing import Dict, List, Optional, Tuple @@ -38,22 +37,6 @@ import runpod # noqa: E402 -@dataclass -class PodSpec: - gpu_type: str = "RTX A4000" - image_name: str = DEFAULT_IMAGE_NAME - gpu_count: int = 1 - min_vcpu_count: int = 1 - min_memory_in_gb: int = 1 - container_disk_in_gb: int = 30 - volume_mount_path: str = "/network" - runpodcli_dir: Optional[str] = None - env: Optional[Dict[str, str]] = None - update_ssh_config: bool = True - forward_agent: bool = False - update_known_hosts: bool = True - - def get_region_from_volume_id(volume_id: str) -> str: api_key = os.getenv("RUNPOD_API_KEY") url = f"https://rest.runpod.io/v1/networkvolumes/{volume_id}" @@ -206,17 +189,16 @@ def create( self, name: Optional[str] = None, runtime: int = 60, - gpu_type: Optional[str] = None, - cpus: Optional[int] = None, - disk: Optional[int] = None, - env: Optional[Dict[str, str]] = None, - forward_agent: Optional[bool] = None, - image_name: Optional[str] = None, - memory: Optional[int] = None, - num_gpus: Optional[int] = None, - update_known_hosts: Optional[bool] = None, - update_ssh_config: Optional[bool] = None, - volume_mount_path: Optional[str] = None, + gpu_type: str = "RTX A4000", + cpus: int = 1, + disk: int = 30, + forward_agent: bool = False, + image_name: str = DEFAULT_IMAGE_NAME, + memory: int = 1, + num_gpus: int = 1, + update_known_hosts: bool = True, + update_ssh_config: bool = True, + volume_mount_path: str = "/network", ) -> None: """Create a new RunPod instance with the specified parameters. 
@@ -225,7 +207,7 @@ def create( gpu_type: GPU type (default: "RTX A4000") num_gpus: Number of GPUs (default: 1) name: Name for the pod (default: "$USER-$GPU_TYPE") - env: File to load RunPod credentials from (default: .env and ~/.config/runpod_cli/.env) + env: Path to credentials .env (default: .env and ~/.config/runpod_cli/.env) disk: Container disk size in GB (default: 30) cpus: Minimum CPU count (default: 1) memory: Minimum RAM in GB (default: 1) @@ -234,79 +216,56 @@ def create( update_ssh_config: Whether to update SSH config (default: True) image_name: Docker image (default: "PyTorch 2.8.0 with CUDA 12.8.1") - Examples: - rpc create -r 60 -g "A100 PCIe" - rpc create --runtime=240 --gpu_type="RTX A4000" --num_gpus=2 --name="dual-gpu-pod" + Example: + rpc create -r 60 -g "A100 SXM" + rpc create --gpu_type="RTX A4000" --runtime=480 """ - spec = PodSpec() - if gpu_type is not None: - spec.gpu_type = gpu_type - if image_name is not None: - spec.image_name = image_name - if num_gpus is not None: - spec.gpu_count = num_gpus - if cpus is not None: - spec.min_vcpu_count = cpus - if memory is not None: - spec.min_memory_in_gb = memory - if disk is not None: - spec.container_disk_in_gb = disk - if volume_mount_path is not None: - spec.volume_mount_path = volume_mount_path - if env is not None: - spec.env = env - if update_ssh_config is not None: - spec.update_ssh_config = update_ssh_config - if forward_agent is not None: - spec.forward_agent = forward_agent - if update_known_hosts is not None: - spec.update_known_hosts = update_known_hosts - - gpu_id = GPU_DISPLAY_NAME_TO_ID[spec.gpu_type] if spec.gpu_type in GPU_DISPLAY_NAME_TO_ID else spec.gpu_type + gpu_id = GPU_DISPLAY_NAME_TO_ID[gpu_type] if gpu_type in GPU_DISPLAY_NAME_TO_ID else gpu_type name = name or f"{os.getenv('USER')}-{GPU_ID_TO_DISPLAY_NAME[gpu_id]}" - spec.runpodcli_dir = spec.runpodcli_dir or f".tmp_{name.replace(' ', '_')}" + runpodcli_dir = f".tmp_{name.replace(' ', '_')}" logging.info("Creating pod 
with:") logging.info(f" Name: {name}") - logging.info(f" Image: {spec.image_name}") + logging.info(f" Image: {image_name}") logging.info(f" Network volume ID: {self.network_volume_id}") logging.info(f" Region: {self.region}") logging.info(f" S3 endpoint: {self.s3_endpoint}") - logging.info(f" GPU Type: {spec.gpu_type}") - logging.info(f" GPU Count: {spec.gpu_count}") - logging.info(f" Disk: {spec.container_disk_in_gb} GB") - logging.info(f" Min CPU: {spec.min_vcpu_count}") - logging.info(f" Min Memory: {spec.min_memory_in_gb} GB") - logging.info(f" runpodcli directory: {spec.runpodcli_dir}") + logging.info(f" GPU Type: {gpu_type}") + logging.info(f" GPU Count: {num_gpus}") + logging.info(f" Disk: {disk} GB") + logging.info(f" Min CPU: {cpus}") + logging.info(f" Min Memory: {memory} GB") + logging.info(f" runpodcli directory: {runpodcli_dir}") logging.info(f" Time limit: {runtime} minutes") git_email = os.getenv("GIT_EMAIL", "") git_name = os.getenv("GIT_NAME", "") - rpc_path = f"{spec.volume_mount_path}/{spec.runpodcli_dir}" + remote_scripts_path = f"{volume_mount_path}/{runpodcli_dir}" scripts = [ - get_setup_root(rpc_path, spec.volume_mount_path), - get_setup_user(rpc_path, git_email, git_name), - get_start(rpc_path), - get_terminate(rpc_path), + get_setup_root(remote_scripts_path, volume_mount_path), + get_setup_user(remote_scripts_path, git_email, git_name), + get_start(remote_scripts_path), + get_terminate(remote_scripts_path), ] for script_name, script_content in scripts: - self._s3.put_object(Bucket=self.network_volume_id, Key=f"{spec.runpodcli_dir}/{script_name}", Body=script_content.encode("utf-8")) + # s3_key is relative to /volume_mount_path, while remote_scripts_path is relative to / + s3_key = f"{runpodcli_dir}/{script_name}" + self._s3.put_object(Bucket=self.network_volume_id, Key=s3_key, Body=script_content.encode("utf-8")) - docker_args = self._build_docker_args(volume_mount_path=spec.volume_mount_path, runpodcli_dir=spec.runpodcli_dir, 
runtime=runtime) + docker_args = self._build_docker_args(volume_mount_path=volume_mount_path, runpodcli_dir=runpodcli_dir, runtime=runtime) pod = runpod.create_pod( name=name, - image_name=spec.image_name, + image_name=image_name, gpu_type_id=gpu_id, cloud_type="SECURE", - gpu_count=spec.gpu_count, - container_disk_in_gb=spec.container_disk_in_gb, - min_vcpu_count=spec.min_vcpu_count, - min_memory_in_gb=spec.min_memory_in_gb, + gpu_count=num_gpus, + container_disk_in_gb=disk, + min_vcpu_count=cpus, + min_memory_in_gb=memory, docker_args=docker_args, - env=spec.env, ports="8888/http,22/tcp", - volume_mount_path=spec.volume_mount_path, + volume_mount_path=volume_mount_path, network_volume_id=self.network_volume_id, ) @@ -317,14 +276,12 @@ def create( ip, port = self._get_public_ip_and_port(pod) - if spec.update_ssh_config: - self._write_ssh_config(ip, port, spec.forward_agent) + if update_ssh_config: + self._write_ssh_config(ip, port, forward_agent) - if spec.update_known_hosts: + if update_known_hosts: time.sleep(5) - if spec.runpodcli_dir is None: - raise ValueError("runpodcli_dir should be set by this point") - self._update_known_hosts_file(ip, port, spec.runpodcli_dir) + self._update_known_hosts_file(ip, port, runpodcli_dir) def _generate_ssh_config(self, ip: str, port: int, forward_agent: bool = False) -> str: return textwrap.dedent(f""" From 5af08092e43aa9c207a793a70a3185032988750f Mon Sep 17 00:00:00 2001 From: Stefan Heimersheim Date: Sun, 10 Aug 2025 10:02:03 +0100 Subject: [PATCH 10/16] Simplify getenv --- src/runpod_cli/cli.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/src/runpod_cli/cli.py b/src/runpod_cli/cli.py index fe63b69..1a3cfe1 100644 --- a/src/runpod_cli/cli.py +++ b/src/runpod_cli/cli.py @@ -54,6 +54,13 @@ def get_s3_endpoint_from_volume_id(volume_id: str) -> str: return s3_endpoint +def getenv(key: str) -> str: + value = os.getenv(key) + if not value: + raise ValueError(f"{key} not found in 
environment. Set it in your .env file.") + return value + + class RunPodManager: """RunPod Management CLI - A command-line tool for managing RunPod instances via the RunPod API. @@ -89,20 +96,10 @@ def __init__(self, env: Optional[str] = None) -> None: raise FileExistsError(f"Multiple .env files found in {env_paths}") load_dotenv(override=True, dotenv_path=os.path.expanduser(env_paths[env_exists.index(True)])) - api_key = os.getenv("RUNPOD_API_KEY") - if not api_key: - raise ValueError("RUNPOD_API_KEY not found in environment. Set it in your .env file.") - runpod.api_key = api_key - - network_volume_id = os.getenv("RUNPOD_NETWORK_VOLUME_ID") - if not network_volume_id: - raise ValueError("RUNPOD_NETWORK_VOLUME_ID not found in environment. Set it in your .env file.") - self.network_volume_id: str = network_volume_id - - s3_access_key_id = os.getenv("RUNPOD_S3_ACCESS_KEY_ID") - s3_secret_key = os.getenv("RUNPOD_S3_SECRET_KEY") - if not s3_access_key_id or not s3_secret_key: - raise ValueError("RUNPOD_S3_ACCESS_KEY_ID or RUNPOD_S3_SECRET_KEY not found in environment. Set it in your .env file.") + runpod.api_key = getenv("RUNPOD_API_KEY") + self.network_volume_id: str = getenv("RUNPOD_NETWORK_VOLUME_ID") + s3_access_key_id = getenv("RUNPOD_S3_ACCESS_KEY_ID") + s3_secret_key = getenv("RUNPOD_S3_SECRET_KEY") self.region = get_region_from_volume_id(self.network_volume_id) self.s3_endpoint = get_s3_endpoint_from_volume_id(self.network_volume_id) self._s3 = boto3.client( From 2347162a59b198594ab00405cbee29b5a1a7de1e Mon Sep 17 00:00:00 2001 From: Stefan Heimersheim Date: Sun, 10 Aug 2025 10:20:50 +0100 Subject: [PATCH 11/16] Implement fuzzy GPU matching --- README.md | 2 +- src/runpod_cli/cli.py | 32 ++++++++++++++++++++++++++++---- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 82c6be5..bf994f1 100644 --- a/README.md +++ b/README.md @@ -130,7 +130,7 @@ into the environment. 
## Known issues Python Fire has a known issue (fixed & merged on [GitHub](https://github.com/google/python-fire/pull/588/files) but not released on PyPI yet) -with ipython==9.0 which will produce the following error: +with ipython>=9.0 which will produce the following error: ``` ERROR | Uncaught exception | ; Inspector.__init__() missing 1 required keyword-only argument: 'theme_name'; ``` diff --git a/src/runpod_cli/cli.py b/src/runpod_cli/cli.py index 1a3cfe1..b7a7a60 100644 --- a/src/runpod_cli/cli.py +++ b/src/runpod_cli/cli.py @@ -134,6 +134,8 @@ def _get_public_ip_and_port(self, pod: Dict) -> Tuple[str, int]: raise ValueError(f"Expected 1 public IP, got {public_ips}") ip = public_ips[0].get("ip") port = public_ips[0].get("publicPort") + if not ip or port is None: + raise ValueError(f"Expected public IP and port, got {ip} and {port} from {public_ips}") return ip, port def _parse_time_remaining(self, pod: Dict) -> str: @@ -155,11 +157,33 @@ def _parse_time_remaining(self, pod: Dict) -> str: now_dt = datetime.now(timezone.utc) shutdown_dt = start_dt + timedelta(seconds=sleep_secs) remaining = shutdown_dt - now_dt - remaining_str = f"{remaining.seconds // 3600}h {remaining.seconds % 3600 // 60}m" - return remaining_str if remaining.total_seconds() > 0 else "Unknown" + total = int(remaining.total_seconds()) + remaining_str = f"{total // 3600}h {(total % 3600) // 60}m" + return remaining_str if total > 0 else "Unknown" else: return "Unknown" + def _get_gpu_id(self, gpu_type: str) -> Tuple[str, str]: + if gpu_type in GPU_DISPLAY_NAME_TO_ID: + # A name was passed + gpu_id = GPU_DISPLAY_NAME_TO_ID[gpu_type] + gpu_name = gpu_type + elif gpu_type in GPU_ID_TO_DISPLAY_NAME: + # An ID was passed + gpu_id = gpu_type + gpu_name = GPU_ID_TO_DISPLAY_NAME[gpu_id] + else: + # Attempt fuzzy matching, but only if unique + matches = [gpu_id for gpu_name, gpu_id in GPU_DISPLAY_NAME_TO_ID.items() if gpu_type.lower() in gpu_id.lower() or gpu_type.lower() in gpu_name.lower()] + if 
len(matches) == 1: + gpu_id = matches[0] + gpu_name = GPU_ID_TO_DISPLAY_NAME[gpu_id] + elif len(matches) > 1: + raise ValueError(f"Ambiguous GPU type: {gpu_type} matches {matches}. Please use a full name or ID from https://docs.runpod.io/references/gpu-types") + else: + raise ValueError(f"Unknown GPU type: {gpu_type}") + return gpu_id, gpu_name + def list(self, verbose: bool = False) -> None: """List all pods in your RunPod account. @@ -217,8 +241,8 @@ def create( rpc create -r 60 -g "A100 SXM" rpc create --gpu_type="RTX A4000" --runtime=480 """ - gpu_id = GPU_DISPLAY_NAME_TO_ID[gpu_type] if gpu_type in GPU_DISPLAY_NAME_TO_ID else gpu_type - name = name or f"{os.getenv('USER')}-{GPU_ID_TO_DISPLAY_NAME[gpu_id]}" + gpu_id, gpu_name = self._get_gpu_id(gpu_type) + name = name or f"{os.getenv('USER')}-{gpu_name}" runpodcli_dir = f".tmp_{name.replace(' ', '_')}" logging.info("Creating pod with:") From 59e69a06176bf09622923a002e0bb5153ec4af3e Mon Sep 17 00:00:00 2001 From: Stefan Heimersheim Date: Sun, 10 Aug 2025 10:34:06 +0100 Subject: [PATCH 12/16] Make attributes private to not show up in Fire's VALUES help --- README.md | 3 +-- src/runpod_cli/cli.py | 23 +++++++++++------------ 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index bf994f1..6ae5d28 100644 --- a/README.md +++ b/README.md @@ -143,5 +143,4 @@ ERROR | Uncaught exception | ; Inspector.__init__() missing - Allow user to configre an SSH_PUBLIC_KEY_PATH in .env - Pre-install VS Code / Cursor server - Change names & ssh aliases if a user requests multiple GPUs (e.g. runpod, runpod-1, etc.) -- Create a .config/runpod_cli/config file to change the default values (e.g. GPU type, runtime, etc.) -- Allow for shorter cmdline arguments (e.g. -g or --gpu instead of --gpu_type) \ No newline at end of file +- Create a .config/runpod_cli/config file to change the default values (e.g. GPU type, runtime, etc.)
\ No newline at end of file diff --git a/src/runpod_cli/cli.py b/src/runpod_cli/cli.py index b7a7a60..cb48985 100644 --- a/src/runpod_cli/cli.py +++ b/src/runpod_cli/cli.py @@ -97,17 +97,17 @@ def __init__(self, env: Optional[str] = None) -> None: load_dotenv(override=True, dotenv_path=os.path.expanduser(env_paths[env_exists.index(True)])) runpod.api_key = getenv("RUNPOD_API_KEY") - self.network_volume_id: str = getenv("RUNPOD_NETWORK_VOLUME_ID") + self._network_volume_id: str = getenv("RUNPOD_NETWORK_VOLUME_ID") s3_access_key_id = getenv("RUNPOD_S3_ACCESS_KEY_ID") s3_secret_key = getenv("RUNPOD_S3_SECRET_KEY") - self.region = get_region_from_volume_id(self.network_volume_id) - self.s3_endpoint = get_s3_endpoint_from_volume_id(self.network_volume_id) + self._region = get_region_from_volume_id(self._network_volume_id) + s3_endpoint_url = get_s3_endpoint_from_volume_id(self._network_volume_id) self._s3 = boto3.client( "s3", aws_access_key_id=s3_access_key_id, aws_secret_access_key=s3_secret_key, - endpoint_url=self.s3_endpoint, - region_name=self.region, + endpoint_url=s3_endpoint_url, + region_name=self._region, ) def _build_docker_args(self, volume_mount_path: str, runpodcli_dir: str, runtime: int) -> str: @@ -136,7 +136,7 @@ def _get_public_ip_and_port(self, pod: Dict) -> Tuple[str, int]: port = public_ips[0].get("publicPort") if not ip or port is None: raise ValueError(f"Expected public IP and port, got {ip} and {port} from {public_ips}") - return ip, port + return str(ip), int(port) def _parse_time_remaining(self, pod: Dict) -> str: _sleep_re = re.compile(r"\bsleep\s+(\d+)\b") @@ -248,9 +248,8 @@ def create( logging.info("Creating pod with:") logging.info(f" Name: {name}") logging.info(f" Image: {image_name}") - logging.info(f" Network volume ID: {self.network_volume_id}") - logging.info(f" Region: {self.region}") - logging.info(f" S3 endpoint: {self.s3_endpoint}") + logging.info(f" Network volume ID: {self._network_volume_id}") + logging.info(f" Region: 
{self._region}") logging.info(f" GPU Type: {gpu_type}") logging.info(f" GPU Count: {num_gpus}") logging.info(f" Disk: {disk} GB") @@ -271,7 +270,7 @@ def create( for script_name, script_content in scripts: # s3_key is relative to /volume_mount_path, while remote_scripts_path is relative to / s3_key = f"{runpodcli_dir}/{script_name}" - self._s3.put_object(Bucket=self.network_volume_id, Key=s3_key, Body=script_content.encode("utf-8")) + self._s3.put_object(Bucket=self._network_volume_id, Key=s3_key, Body=script_content.encode("utf-8")) docker_args = self._build_docker_args(volume_mount_path=volume_mount_path, runpodcli_dir=runpodcli_dir, runtime=runtime) @@ -287,7 +286,7 @@ def create( docker_args=docker_args, ports="8888/http,22/tcp", volume_mount_path=volume_mount_path, - network_volume_id=self.network_volume_id, + network_volume_id=self._network_volume_id, ) pod_id: str = pod.get("id") # type: ignore @@ -324,7 +323,7 @@ def _update_known_hosts_file(self, public_ip: str, port: int, runpodcli_dir: str host_keys: List[Tuple[str, str]] = [] for file in ["ssh_ed25519_host_key", "ssh_ecdsa_host_key", "ssh_rsa_host_key", "ssh_dsa_host_key"]: try: - obj = self._s3.get_object(Bucket=self.network_volume_id, Key=f"{runpodcli_dir}/{file}") + obj = self._s3.get_object(Bucket=self._network_volume_id, Key=f"{runpodcli_dir}/{file}") host_key_text = obj["Body"].read().decode("utf-8").strip() alg, key, _ = host_key_text.split(" ") host_keys.append((alg, key)) From 47a2412557a4347bb41acaa8f4a731e6d0563de6 Mon Sep 17 00:00:00 2001 From: Stefan Heimersheim Date: Fri, 15 Aug 2025 12:21:37 +0100 Subject: [PATCH 13/16] Compile bytecode to avoid slow imports --- src/runpod_cli/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runpod_cli/utils.py b/src/runpod_cli/utils.py index a6cfff9..f07349b 100644 --- a/src/runpod_cli/utils.py +++ b/src/runpod_cli/utils.py @@ -123,7 +123,7 @@ def get_setup_user(runpodcli_path: str, git_email: str, git_name: str) -> Tuple[ # 
Install Python packages using uv sudo pip install uv - sudo uv pip install --system ipykernel kaleido nbformat numpy scipy scikit-learn transformers datasets torchvision pandas matplotlib seaborn plotly jaxtyping einops tqdm ruff basedpyright umap-learn ipywidgets virtualenv pytest git+https://github.com/callummcdougall/eindex.git transformer_lens nnsight + sudo uv pip install --system --compile-bytecode ipykernel kaleido nbformat numpy scipy scikit-learn transformers datasets torchvision pandas matplotlib seaborn plotly jaxtyping einops tqdm ruff basedpyright umap-learn ipywidgets virtualenv pytest git+https://github.com/callummcdougall/eindex.git transformer_lens nnsight # For plotly (kaleido) png export sudo apt-get install -y libnss3 libatk-bridge2.0-0 libcups2 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 libgbm1 libxkbcommon0 libpango-1.0-0 libcairo2 libasound2 sudo plotly_get_chrome -y From 707c4148b8d13fc265f8e42e112c7754b1483f86 Mon Sep 17 00:00:00 2001 From: Stefan Heimersheim Date: Mon, 18 Aug 2025 10:39:41 +0100 Subject: [PATCH 14/16] Convert GPU types like 5090 to str --- src/runpod_cli/cli.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/runpod_cli/cli.py b/src/runpod_cli/cli.py index cb48985..032309c 100644 --- a/src/runpod_cli/cli.py +++ b/src/runpod_cli/cli.py @@ -163,7 +163,8 @@ def _parse_time_remaining(self, pod: Dict) -> str: else: return "Unknown" - def _get_gpu_id(self, gpu_type: str) -> Tuple[str, str]: + def _get_gpu_id(self, gpu_type: str | int) -> Tuple[str, str]: + gpu_type = str(gpu_type) if gpu_type in GPU_DISPLAY_NAME_TO_ID: # A name was passed gpu_id = GPU_DISPLAY_NAME_TO_ID[gpu_type] From 9d968b8f01e6746d6d921e5f734bfe304e5da248 Mon Sep 17 00:00:00 2001 From: Stefan Heimersheim Date: Mon, 18 Aug 2025 16:03:01 +0100 Subject: [PATCH 15/16] Add scikit-image to pre-installed packages --- src/runpod_cli/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runpod_cli/utils.py 
b/src/runpod_cli/utils.py index f07349b..ce43ca9 100644 --- a/src/runpod_cli/utils.py +++ b/src/runpod_cli/utils.py @@ -123,7 +123,7 @@ def get_setup_user(runpodcli_path: str, git_email: str, git_name: str) -> Tuple[ # Install Python packages using uv sudo pip install uv - sudo uv pip install --system --compile-bytecode ipykernel kaleido nbformat numpy scipy scikit-learn transformers datasets torchvision pandas matplotlib seaborn plotly jaxtyping einops tqdm ruff basedpyright umap-learn ipywidgets virtualenv pytest git+https://github.com/callummcdougall/eindex.git transformer_lens nnsight + sudo uv pip install --system --compile-bytecode ipykernel kaleido nbformat numpy scipy scikit-learn scikit-image transformers datasets torchvision pandas matplotlib seaborn plotly jaxtyping einops tqdm ruff basedpyright umap-learn ipywidgets virtualenv pytest git+https://github.com/callummcdougall/eindex.git transformer_lens nnsight # For plotly (kaleido) png export sudo apt-get install -y libnss3 libatk-bridge2.0-0 libcups2 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 libgbm1 libxkbcommon0 libpango-1.0-0 libcairo2 libasound2 sudo plotly_get_chrome -y From e4d67ce237dce938434cc69d4671d21c786b2b19 Mon Sep 17 00:00:00 2001 From: Stefan Heimersheim Date: Tue, 30 Sep 2025 11:36:43 +0100 Subject: [PATCH 16/16] Fix .ssh permissions, making folder owned by user --- src/runpod_cli/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/runpod_cli/utils.py b/src/runpod_cli/utils.py index ce43ca9..c8ef53b 100644 --- a/src/runpod_cli/utils.py +++ b/src/runpod_cli/utils.py @@ -61,10 +61,10 @@ def get_setup_root(runpodcli_path: str, volume_mount_path: str) -> Tuple[str, st # Set NNSIGHT_LOG_PATH to avoid https://github.com/ndif-team/nnsight/issues/495 echo "export NNSIGHT_LOG_PATH=/root/.local/state/nnsight" >> /root/.profile echo "export NNSIGHT_LOG_PATH=/home/user/.local/state/nnsight" >> /home/user/.profile - chown user /home/user/.profile + chown user:user 
/home/user/.profile mkdir -p /home/user/.ssh/ cat /root/.ssh/authorized_keys >> /home/user/.ssh/authorized_keys - chown user /home/user/.ssh/authorized_keys + chown -R user:user /home/user/.ssh if [[ VOLUME_MOUNT_PATH != "/workspace" ]]; then rmdir /workspace