From 90ed323e07c8ef2eaba3578280878a9cbb902472 Mon Sep 17 00:00:00 2001 From: Stefan Heimersheim Date: Fri, 1 Aug 2025 23:59:59 +0100 Subject: [PATCH 01/16] [Breaking] Rename lesser used kwargs & remove deprecated kwargs --- src/runpod_cli/cli.py | 75 ++++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 47 deletions(-) diff --git a/src/runpod_cli/cli.py b/src/runpod_cli/cli.py index 4a066ef..b30064d 100644 --- a/src/runpod_cli/cli.py +++ b/src/runpod_cli/cli.py @@ -41,11 +41,9 @@ class PodSpec: """Specification for RunPod infrastructure parameters.""" - image_name: str = DEFAULT_IMAGE_NAME gpu_type: str = "RTX A4000" - cloud_type: str = "SECURE" + image_name: str = DEFAULT_IMAGE_NAME gpu_count: int = 1 - volume_in_gb: int = 10 min_vcpu_count: int = 1 min_memory_in_gb: int = 1 container_disk_in_gb: int = 30 @@ -167,9 +165,8 @@ def create_pod(self, name: Optional[str], spec: PodSpec, runtime: int) -> Dict: name=name, image_name=spec.image_name, gpu_type_id=gpu_id, - cloud_type=spec.cloud_type, + cloud_type="SECURE", gpu_count=spec.gpu_count, - volume_in_gb=spec.volume_in_gb, container_disk_in_gb=spec.container_disk_in_gb, min_vcpu_count=spec.min_vcpu_count, min_memory_in_gb=spec.min_memory_in_gb, @@ -276,70 +273,55 @@ def create( self, name: Optional[str] = None, runtime: int = 60, - spec: Optional[PodSpec] = None, gpu_type: Optional[str] = None, - image_name: Optional[str] = None, - cloud_type: Optional[str] = None, - gpu_count: Optional[int] = None, - volume_in_gb: Optional[int] = None, - min_vcpu_count: Optional[int] = None, - min_memory_in_gb: Optional[int] = None, - container_disk_in_gb: Optional[int] = None, - volume_mount_path: Optional[str] = None, - runpodcli_dir: Optional[str] = None, + cpus: Optional[int] = None, + disk: Optional[int] = None, env: Optional[Dict[str, str]] = None, - update_ssh_config: Optional[bool] = None, forward_agent: Optional[bool] = None, + image_name: Optional[str] = None, + memory: Optional[int] = None, + 
num_gpus: Optional[int] = None, update_known_hosts: Optional[bool] = None, + update_ssh_config: Optional[bool] = None, + volume_mount_path: Optional[str] = None, ) -> None: """Create a new RunPod instance with the specified parameters. Args: - name: Name for the pod (default: "$USER-$GPU_TYPE") runtime: Time in minutes for pod to run (default: 60) gpu_type: GPU type (default: "RTX A4000") - image_name: Docker image (default: PyTorch 2.8.0 with CUDA 12.8.1) - cloud_type: "SECURE" or "COMMUNITY" (default: "SECURE") - gpu_count: Number of GPUs (default: 1) - volume_in_gb: Ephemeral storage volume size in GB (default: 10) - min_vcpu_count: Minimum CPU count (default: 1) - min_memory_in_gb: Minimum RAM in GB (default: 1) - container_disk_in_gb: Container disk size in GB (default: 30) - volume_mount_path: Volume mount path (default: "/network") - runpodcli_dir: Directory name for runpodcli scripts (default: ".tmp_$name") - env: Environment variables to set in the container - update_ssh_config: Whether to update SSH config (default: True) + num_gpus: Number of GPUs (default: 1) + name: Name for the pod (default: "$USER-$GPU_TYPE") + env: File to load RunPod credentials from (default: .env and ~/.config/runpod_cli/.env) + disk: Container disk size in GB (default: 30) + cpus: Minimum CPU count (default: 1) + memory: Minimum RAM in GB (default: 1) forward_agent: Whether to forward SSH agent (default: False) update_known_hosts: Whether to update known hosts (default: True) + update_ssh_config: Whether to update SSH config (default: True) + image_name: Docker image (default: "PyTorch 2.8.0 with CUDA 12.8.1") Examples: - rpc create --gpu_type="A100 PCIe" --runtime=60 - rpc create --name="my-pod" --gpu_count=2 --runtime=240 + rpc create -r 60 -g "A100 PCIe" + rpc create --runtime=240 --gpu_type="RTX A4000" --num_gpus=2 --name="dual-gpu-pod" """ - if spec is None: - spec = PodSpec() + spec = PodSpec() # Override spec with individual parameters if provided if gpu_type is not None: 
spec.gpu_type = gpu_type if image_name is not None: spec.image_name = image_name - if cloud_type is not None: - spec.cloud_type = cloud_type - if gpu_count is not None: - spec.gpu_count = gpu_count - if volume_in_gb is not None: - spec.volume_in_gb = volume_in_gb - if min_vcpu_count is not None: - spec.min_vcpu_count = min_vcpu_count - if min_memory_in_gb is not None: - spec.min_memory_in_gb = min_memory_in_gb - if container_disk_in_gb is not None: - spec.container_disk_in_gb = container_disk_in_gb + if num_gpus is not None: + spec.gpu_count = num_gpus + if cpus is not None: + spec.min_vcpu_count = cpus + if memory is not None: + spec.min_memory_in_gb = memory + if disk is not None: + spec.container_disk_in_gb = disk if volume_mount_path is not None: spec.volume_mount_path = volume_mount_path - if runpodcli_dir is not None: - spec.runpodcli_dir = runpodcli_dir if env is not None: spec.env = env if update_ssh_config is not None: @@ -356,7 +338,6 @@ def create( logging.info(f" Region: {self._client.region}") logging.info(f" S3 endpoint: {self._client.s3_endpoint}") logging.info(f" GPU Type: {spec.gpu_type}") - logging.info(f" Cloud Type: {spec.cloud_type}") logging.info(f" GPU Count: {spec.gpu_count}") logging.info(f" runpodcli directory: {spec.runpodcli_dir}") logging.info(f" Time limit: {runtime} minutes") From db15faecb7467e35cf09c97bbe8f07273192c86e Mon Sep 17 00:00:00 2001 From: Stefan Heimersheim Date: Thu, 7 Aug 2025 10:36:33 +0100 Subject: [PATCH 02/16] Add time remaining to pod list --- src/runpod_cli/cli.py | 56 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 48 insertions(+), 8 deletions(-) diff --git a/src/runpod_cli/cli.py b/src/runpod_cli/cli.py index b30064d..bc71355 100644 --- a/src/runpod_cli/cli.py +++ b/src/runpod_cli/cli.py @@ -1,8 +1,10 @@ import logging import os +import re import textwrap import time from dataclasses import dataclass +from datetime import datetime, timedelta from typing import Dict, List, Optional, Tuple import 
boto3 @@ -249,7 +251,40 @@ def __init__(self, env: Optional[str] = None) -> None: self._client = RunPodClient() - def list(self) -> None: + def _parse_public_ip(self, pod: Dict) -> str: + """Parse public IP and port from pod runtime info.""" + public_ips = [i for i in pod["runtime"]["ports"] if i["isIpPublic"]] + if len(public_ips) != 1: + raise ValueError(f"Expected 1 public IP, got {public_ips}") + ip = public_ips[0].get("ip") + port = public_ips[0].get("publicPort") + return ip, port + + def _parse_time_remaining(self, pod: Dict) -> str: + """Parse time remaining before pod shutdown from pod runtime info.""" + _sleep_re = re.compile(r"\bsleep\s+(\d+)\b") + _date_re = re.compile(r":\s*(\w{3}\s+\w{3}\s+\d{2}\s+\d{4}\s+\d{2}:\d{2}:\d{2})\s+GMT") + start_dt = None + sleep_secs = None + last_status_change = pod.get("lastStatusChange", "") + if isinstance(last_status_change, str): + match = _date_re.search(last_status_change) + if match: + start_dt = datetime.strptime(match.group(1), "%a %b %d %Y %H:%M:%S") + docker_args = pod.get("dockerArgs", "") + if isinstance(docker_args, str): + match = _sleep_re.search(docker_args) + if match: + sleep_secs = int(match.group(1)) + if start_dt is not None and sleep_secs is not None: + shutdown_dt = start_dt + timedelta(seconds=sleep_secs) + remaining = shutdown_dt - datetime.now() + remaining_str = f"{remaining.seconds // 3600}h {remaining.seconds % 3600 // 60}m" + return remaining_str if remaining.total_seconds() > 0 else "Unknown" + else: + return "Unknown" + + def list(self, verbose: bool = False) -> None: """List all pods in your RunPod account. Displays information about each pod including ID, name, GPU type, status, and connection details. 
@@ -260,13 +295,15 @@ def list(self) -> None: logging.info(f"Pod {i + 1}:") logging.info(f" ID: {pod.get('id')}") logging.info(f" Name: {pod.get('name')}") - logging.info(f" Machine Type: {pod.get('machine', {}).get('gpuDisplayName')}") - logging.info(f" Status: {pod.get('desiredStatus')}") - public_ip = [i for i in pod.get("runtime", {}).get("ports", []) if i["isIpPublic"]] - if len(public_ip) != 1: - raise ValueError(f"Expected 1 public IP, got {len(public_ip)}") - logging.info(f" Public IP: {public_ip[0].get('ip')}") - logging.info(f" Public port: {public_ip[0].get('publicPort')}") + time_remaining = self._parse_time_remaining(pod) + logging.info(f" Time remaining (est.): {time_remaining}") + if verbose: + public_ip, public_port = self._parse_public_ip(pod) + logging.info(f" Public IP: {public_ip}") + logging.info(f" Public port: {public_port}") + logging.info(f" GPUs: {pod.get('gpuCount')} x {pod.get('machine', {}).get('gpuDisplayName')}") + for key in ["memoryInGb", "vcpuCount", "containerDiskInGb", "volumeMountPath", "costPerHr"]: + logging.info(f" {key}: {pod.get(key)}") logging.info("") def create( @@ -339,6 +376,9 @@ def create( logging.info(f" S3 endpoint: {self._client.s3_endpoint}") logging.info(f" GPU Type: {spec.gpu_type}") logging.info(f" GPU Count: {spec.gpu_count}") + logging.info(f" Disk: {spec.container_disk_in_gb} GB") + logging.info(f" Min CPU: {spec.min_vcpu_count}") + logging.info(f" Min Memory: {spec.min_memory_in_gb} GB") logging.info(f" runpodcli directory: {spec.runpodcli_dir}") logging.info(f" Time limit: {runtime} minutes") From 19ac5e7a485d26e5db4e79330f4b0ccc547f83ab Mon Sep 17 00:00:00 2001 From: Stefan Heimersheim Date: Thu, 7 Aug 2025 13:18:50 +0100 Subject: [PATCH 03/16] Fixed timezones --- src/runpod_cli/cli.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/runpod_cli/cli.py b/src/runpod_cli/cli.py index bc71355..d02118f 100644 --- a/src/runpod_cli/cli.py +++ b/src/runpod_cli/cli.py @@ -4,7 +4,7 
@@ import textwrap import time from dataclasses import dataclass -from datetime import datetime, timedelta +from datetime import datetime, timedelta, timezone from typing import Dict, List, Optional, Tuple import boto3 @@ -270,15 +270,16 @@ def _parse_time_remaining(self, pod: Dict) -> str: if isinstance(last_status_change, str): match = _date_re.search(last_status_change) if match: - start_dt = datetime.strptime(match.group(1), "%a %b %d %Y %H:%M:%S") + start_dt = datetime.strptime(match.group(1), "%a %b %d %Y %H:%M:%S").replace(tzinfo=timezone.utc) docker_args = pod.get("dockerArgs", "") if isinstance(docker_args, str): match = _sleep_re.search(docker_args) if match: sleep_secs = int(match.group(1)) if start_dt is not None and sleep_secs is not None: + now_dt = datetime.now(timezone.utc) shutdown_dt = start_dt + timedelta(seconds=sleep_secs) - remaining = shutdown_dt - datetime.now() + remaining = shutdown_dt - now_dt remaining_str = f"{remaining.seconds // 3600}h {remaining.seconds % 3600 // 60}m" return remaining_str if remaining.total_seconds() > 0 else "Unknown" else: @@ -304,6 +305,9 @@ def list(self, verbose: bool = False) -> None: logging.info(f" GPUs: {pod.get('gpuCount')} x {pod.get('machine', {}).get('gpuDisplayName')}") for key in ["memoryInGb", "vcpuCount", "containerDiskInGb", "volumeMountPath", "costPerHr"]: logging.info(f" {key}: {pod.get(key)}") + # For debugging + # for key in pod.keys(): + # logging.info(f" {key}: {pod.get(key)}") logging.info("") def create( From a1cc89615ae857615150c13b127c0d699a7c19b3 Mon Sep 17 00:00:00 2001 From: Stefan Heimersheim Date: Sun, 10 Aug 2025 07:37:30 +0100 Subject: [PATCH 04/16] Add locales for git --- src/runpod_cli/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runpod_cli/utils.py b/src/runpod_cli/utils.py index e7dcaa1..16390a7 100644 --- a/src/runpod_cli/utils.py +++ b/src/runpod_cli/utils.py @@ -70,7 +70,7 @@ def get_setup_root(runpodcli_path: str, volume_mount_path: str) -> 
Tuple[str, st apt-get update apt-get upgrade -y - apt-get install -y sudo git vim ssh net-tools htop curl zip unzip tmux rsync libopenmpi-dev iputils-ping make fzf restic ripgrep wget pandoc poppler-utils pigz bzip2 nano + apt-get install -y sudo git vim ssh net-tools htop curl zip unzip tmux rsync libopenmpi-dev iputils-ping make fzf restic ripgrep wget pandoc poppler-utils pigz bzip2 nano locales echo 'user ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers echo "export HF_HOME=/workspace/hf_home/" >> /home/user/.bashrc echo 'export PATH="$HOME/.local/bin:$PATH"' >> /home/user/.bashrc From 02ec64d291e67d1f712ea4782e99d6686123eb12 Mon Sep 17 00:00:00 2001 From: Stefan Heimersheim Date: Sun, 10 Aug 2025 07:50:01 +0100 Subject: [PATCH 05/16] Install plotly dependencies + move TransformerLens to system packages --- src/runpod_cli/utils.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/runpod_cli/utils.py b/src/runpod_cli/utils.py index 16390a7..5f8c83d 100644 --- a/src/runpod_cli/utils.py +++ b/src/runpod_cli/utils.py @@ -120,13 +120,15 @@ def get_setup_user(runpodcli_path: str, git_email: str, git_name: str) -> Tuple[ # Install Python packages using uv sudo pip install uv - sudo uv pip install ipykernel kaleido nbformat numpy scipy scikit-learn transformers datasets torchvision pandas matplotlib seaborn plotly jaxtyping einops tqdm ruff basedpyright umap-learn ipywidgets virtualenv pytest git+https://github.com/callummcdougall/eindex.git --system - - # Create a virtual environment for the user - uv venv ~/.venv --python 3.11 --system-site-packages + sudo uv pip install --system ipykernel kaleido nbformat numpy scipy scikit-learn transformers datasets torchvision pandas matplotlib seaborn plotly jaxtyping einops tqdm ruff basedpyright umap-learn ipywidgets virtualenv pytest git+https://github.com/callummcdougall/eindex.git transformer_lens + # For plotly (kaleido) png export + sudo apt-get install -y libnss3 libatk-bridge2.0-0 libcups2 
libxcomposite1 libxdamage1 libxfixes3 libxrandr2 libgbm1 libxkbcommon0 libpango-1.0-0 libcairo2 libasound2 + sudo plotly_get_chrome -y + # Create a virtual environment for the user, install nnsight locally due to https://github.com/ndif-team/nnsight/issues/495 + python_version=$(python --version | cut -d' ' -f2 | cut -d'.' -f1-2) + uv venv ~/.venv --python $python_version --system-site-packages source ~/.venv/bin/activate - pip install nnsight transformer_lens - + python -m pip install nnsight echo "...user setup completed!" """.replace("RUNPODCLI_PATH", runpodcli_path) .replace("GIT_EMAIL", git_email) From 542d9247a38a6a063f20ce37d21ba13adcc2f99b Mon Sep 17 00:00:00 2001 From: Stefan Heimersheim Date: Sun, 10 Aug 2025 08:33:47 +0100 Subject: [PATCH 06/16] Switch to virtualenv as it sets pip correctly
--- src/runpod_cli/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/runpod_cli/utils.py b/src/runpod_cli/utils.py index 5f8c83d..c4f6907 100644 --- a/src/runpod_cli/utils.py +++ b/src/runpod_cli/utils.py @@ -126,7 +126,8 @@ def get_setup_user(runpodcli_path: str, git_email: str, git_name: str) -> Tuple[ sudo plotly_get_chrome -y # Create a virtual environment for the user, install nnsight locally due to https://github.com/ndif-team/nnsight/issues/495 python_version=$(python --version | cut -d' ' -f2 | cut -d'.' -f1-2) - uv venv ~/.venv --python $python_version --system-site-packages + # uv venv ~/.venv --python $python_version --system-site-packages + virtualenv ~/.venv --system-site-packages source ~/.venv/bin/activate python -m pip install nnsight echo "...user setup completed!" From 099f2cc54db6f2ddcc1176c237021e2b29f9d65a Mon Sep 17 00:00:00 2001 From: Stefan Heimersheim Date: Sun, 10 Aug 2025 09:01:18 +0100 Subject: [PATCH 07/16] Install nnsight to system too --- src/runpod_cli/utils.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/runpod_cli/utils.py b/src/runpod_cli/utils.py index c4f6907..a6cfff9 100644 --- a/src/runpod_cli/utils.py +++ b/src/runpod_cli/utils.py @@ -58,10 +58,13 @@ def get_setup_root(runpodcli_path: str, volume_mount_path: str) -> Tuple[str, st echo "Setting up system environment..." 
useradd --uid 1000 --shell /bin/bash user --groups sudo --create-home + # Set NNSIGHT_LOG_PATH to avoid https://github.com/ndif-team/nnsight/issues/495 + echo "export NNSIGHT_LOG_PATH=/root/.local/state/nnsight" >> /root/.profile + echo "export NNSIGHT_LOG_PATH=/home/user/.local/state/nnsight" >> /home/user/.profile + chown user /home/user/.profile mkdir -p /home/user/.ssh/ - touch /home/user/.ssh/authorized_keys - chown user /home/user/.ssh/authorized_keys cat /root/.ssh/authorized_keys >> /home/user/.ssh/authorized_keys + chown user /home/user/.ssh/authorized_keys if [[ VOLUME_MOUNT_PATH != "/workspace" ]]; then rmdir /workspace @@ -120,16 +123,13 @@ def get_setup_user(runpodcli_path: str, git_email: str, git_name: str) -> Tuple[ # Install Python packages using uv sudo pip install uv - sudo uv pip install --system ipykernel kaleido nbformat numpy scipy scikit-learn transformers datasets torchvision pandas matplotlib seaborn plotly jaxtyping einops tqdm ruff basedpyright umap-learn ipywidgets virtualenv pytest git+https://github.com/callummcdougall/eindex.git transformer_lens + sudo uv pip install --system ipykernel kaleido nbformat numpy scipy scikit-learn transformers datasets torchvision pandas matplotlib seaborn plotly jaxtyping einops tqdm ruff basedpyright umap-learn ipywidgets virtualenv pytest git+https://github.com/callummcdougall/eindex.git transformer_lens nnsight # For plotly (kaleido) png export sudo apt-get install -y libnss3 libatk-bridge2.0-0 libcups2 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 libgbm1 libxkbcommon0 libpango-1.0-0 libcairo2 libasound2 sudo plotly_get_chrome -y - # Create a virtual environment for the user, install nnsight locally due to https://github.com/ndif-team/nnsight/issues/495 + # Create a virtual environment for the user python_version=$(python --version | cut -d' ' -f2 | cut -d'.' 
-f1-2) - # uv venv ~/.venv --python $python_version --system-site-packages - virtualenv ~/.venv --system-site-packages - source ~/.venv/bin/activate - python -m pip install nnsight + uv venv ~/.venv --python $python_version --system-site-packages echo "...user setup completed!" """.replace("RUNPODCLI_PATH", runpodcli_path) .replace("GIT_EMAIL", git_email) From c2eb6ac615f728b20091c555022ed29d26df7f43 Mon Sep 17 00:00:00 2001 From: Stefan Heimersheim Date: Sun, 10 Aug 2025 09:06:30 +0100 Subject: [PATCH 08/16] Refactor cli.py to remove extra class [by GPT-5] Remove RunPodClient, centralize logic in RunPodManager with PodSpec + tiny helpers; keep Fire CLI unchanged Co-authored-by: GPT-5 Thinking --- src/runpod_cli/cli.py | 259 ++++++++++++++++-------------------------- 1 file changed, 95 insertions(+), 164 deletions(-) diff --git a/src/runpod_cli/cli.py b/src/runpod_cli/cli.py index d02118f..96b7b27 100644 --- a/src/runpod_cli/cli.py +++ b/src/runpod_cli/cli.py @@ -23,7 +23,6 @@ get_terminate, ) except ImportError: - # Allow running cli.py directly from the repository from utils import ( # type: ignore DEFAULT_IMAGE_NAME, GPU_DISPLAY_NAME_TO_ID, @@ -41,8 +40,6 @@ @dataclass class PodSpec: - """Specification for RunPod infrastructure parameters.""" - gpu_type: str = "RTX A4000" image_name: str = DEFAULT_IMAGE_NAME gpu_count: int = 1 @@ -74,138 +71,6 @@ def get_s3_endpoint_from_volume_id(volume_id: str) -> str: return s3_endpoint -class RunPodClient: - """Pure logic client for RunPod and S3 operations without user I/O.""" - - def __init__(self) -> None: - self.api_key = os.getenv("RUNPOD_API_KEY") - if not self.api_key: - raise ValueError("RUNPOD_API_KEY not found in environment. Set it in your .env file.") - - network_volume_id = os.getenv("RUNPOD_NETWORK_VOLUME_ID") - if not network_volume_id: - raise ValueError("RUNPOD_NETWORK_VOLUME_ID not found in environment. 
Set it in your .env file.") - self.network_volume_id: str = network_volume_id - - self.s3_access_key_id = os.getenv("RUNPOD_S3_ACCESS_KEY_ID") - self.s3_secret_key = os.getenv("RUNPOD_S3_SECRET_KEY") - if not self.s3_access_key_id or not self.s3_secret_key: - raise ValueError("RUNPOD_S3_ACCESS_KEY_ID or RUNPOD_S3_SECRET_KEY not found in environment. Set it in your .env file.") - - runpod.api_key = self.api_key - self.region = get_region_from_volume_id(self.network_volume_id) - self.s3_endpoint = get_s3_endpoint_from_volume_id(self.network_volume_id) - - def _make_s3_client(self): - return boto3.client( - "s3", - aws_access_key_id=self.s3_access_key_id, - aws_secret_access_key=self.s3_secret_key, - endpoint_url=self.s3_endpoint, - region_name=self.region, - ) - - def get_pods(self) -> List[Dict]: - return runpod.get_pods() # type: ignore - - def get_pod(self, pod_id: str) -> Dict: - return runpod.get_pod(pod_id) - - def terminate_pod(self, pod_id: str) -> Optional[Dict]: - return runpod.terminate_pod(pod_id) - - def upload_script_content(self, content: str, target: str) -> None: - s3 = self._make_s3_client() - s3.put_object(Bucket=self.network_volume_id, Key=target, Body=content.encode("utf-8")) - - def download_file_content(self, remote_path: str) -> str: - s3 = self._make_s3_client() - response = s3.get_object(Bucket=self.network_volume_id, Key=remote_path) - return response["Body"].read().decode("utf-8") - - def _build_docker_args(self, volume_mount_path: str, runpodcli_dir: str, runtime: int) -> str: - runpodcli_path = f"{volume_mount_path}/{runpodcli_dir}" - return ( - "/bin/bash -c '" - + (f"mkdir -p {runpodcli_path}; bash {runpodcli_path}/start_pod.sh; sleep {max(runtime * 60, 20)}; bash {runpodcli_path}/terminate_pod.sh") - + "'" - ) - - def _provision_and_wait(self, pod_id: str, n_attempts: int = 60) -> Dict: - for i in range(n_attempts): - pod = runpod.get_pod(pod_id) - pod_runtime = pod.get("runtime") - if pod_runtime is None or not 
pod_runtime.get("ports"): - time.sleep(5) - else: - return pod - raise RuntimeError("Pod provisioning failed") - - def create_pod(self, name: Optional[str], spec: PodSpec, runtime: int) -> Dict: - gpu_id = GPU_DISPLAY_NAME_TO_ID[spec.gpu_type] if spec.gpu_type in GPU_DISPLAY_NAME_TO_ID else spec.gpu_type - - name = name or f"{os.getenv('USER')}-{GPU_ID_TO_DISPLAY_NAME[gpu_id]}" - spec.runpodcli_dir = spec.runpodcli_dir or f".tmp_{name.replace(' ', '_')}" - - # Get scripts and upload to network volume - git_email = os.getenv("GIT_EMAIL", "") - git_name = os.getenv("GIT_NAME", "") - rpc_path = f"{spec.volume_mount_path}/{spec.runpodcli_dir}" - scripts = [ - get_setup_root(rpc_path, spec.volume_mount_path), - get_setup_user(rpc_path, git_email, git_name), - get_start(rpc_path), - get_terminate(rpc_path), - ] - for script_name, script_content in scripts: - self.upload_script_content(content=script_content, target=f"{spec.runpodcli_dir}/{script_name}") - - docker_args = self._build_docker_args(volume_mount_path=spec.volume_mount_path, runpodcli_dir=spec.runpodcli_dir, runtime=runtime) - name = name or f"{os.getenv('USER')}-{GPU_ID_TO_DISPLAY_NAME[gpu_id]}" - - pod = runpod.create_pod( - name=name, - image_name=spec.image_name, - gpu_type_id=gpu_id, - cloud_type="SECURE", - gpu_count=spec.gpu_count, - container_disk_in_gb=spec.container_disk_in_gb, - min_vcpu_count=spec.min_vcpu_count, - min_memory_in_gb=spec.min_memory_in_gb, - docker_args=docker_args, - env=spec.env, - ports="8888/http,22/tcp", - volume_mount_path=spec.volume_mount_path, - network_volume_id=self.network_volume_id, - ) - - pod_id: str = pod.get("id") # type: ignore - pod = self._provision_and_wait(pod_id) - - return pod - - def get_pod_public_ip_and_port(self, pod: Dict) -> Tuple[str, int]: - """Extract public IP and port from pod runtime info.""" - public_ips = [i for i in pod["runtime"]["ports"] if i["isIpPublic"]] - if len(public_ips) != 1: - raise ValueError(f"Expected 1 public IP, got 
{public_ips}") - ip = public_ips[0].get("ip") - port = public_ips[0].get("publicPort") - return ip, port - - def get_host_keys(self, remote_path: str) -> List[Tuple[str, str]]: - """Download host keys from S3 and return list of (algorithm, key) pairs.""" - host_keys = [] - for file in ["ssh_ed25519_host_key", "ssh_ecdsa_host_key", "ssh_rsa_host_key", "ssh_dsa_host_key"]: - try: - host_key_text = self.download_file_content(f"{remote_path}/{file}").strip() - alg, key, _ = host_key_text.split(" ") - host_keys.append((alg, key)) - except Exception: - continue - return host_keys - - class RunPodManager: """RunPod Management CLI - A command-line tool for managing RunPod instances via the RunPod API. @@ -225,21 +90,13 @@ class RunPodManager: """ def __init__(self, env: Optional[str] = None) -> None: - """Initialize the RunPod manager. - - Args: - env: Path to the .env file (optional). If not provided, will search for .env files in default locations. - """ - # Load environment variables if env: logging.info(f"Using .env file: {env}") - # Use the specified .env file env_path = os.path.expanduser(env) if not os.path.exists(env_path): raise FileNotFoundError(f"Specified .env file not found: {env_path}") load_dotenv(override=True, dotenv_path=env_path) else: - # Use default .env file search logic xdg_config_dir = os.environ.get("XDG_CONFIG_HOME", os.path.expanduser("~/.config")) env_paths = [".env", os.path.join(xdg_config_dir, "runpod_cli/.env")] env_exists = [os.path.exists(os.path.expanduser(path)) for path in env_paths] @@ -249,10 +106,49 @@ def __init__(self, env: Optional[str] = None) -> None: raise FileExistsError(f"Multiple .env files found in {env_paths}") load_dotenv(override=True, dotenv_path=os.path.expanduser(env_paths[env_exists.index(True)])) - self._client = RunPodClient() + api_key = os.getenv("RUNPOD_API_KEY") + if not api_key: + raise ValueError("RUNPOD_API_KEY not found in environment. 
Set it in your .env file.") + runpod.api_key = api_key + + network_volume_id = os.getenv("RUNPOD_NETWORK_VOLUME_ID") + if not network_volume_id: + raise ValueError("RUNPOD_NETWORK_VOLUME_ID not found in environment. Set it in your .env file.") + self.network_volume_id: str = network_volume_id - def _parse_public_ip(self, pod: Dict) -> str: - """Parse public IP and port from pod runtime info.""" + s3_access_key_id = os.getenv("RUNPOD_S3_ACCESS_KEY_ID") + s3_secret_key = os.getenv("RUNPOD_S3_SECRET_KEY") + if not s3_access_key_id or not s3_secret_key: + raise ValueError("RUNPOD_S3_ACCESS_KEY_ID or RUNPOD_S3_SECRET_KEY not found in environment. Set it in your .env file.") + self.region = get_region_from_volume_id(self.network_volume_id) + self.s3_endpoint = get_s3_endpoint_from_volume_id(self.network_volume_id) + self._s3 = boto3.client( + "s3", + aws_access_key_id=s3_access_key_id, + aws_secret_access_key=s3_secret_key, + endpoint_url=self.s3_endpoint, + region_name=self.region, + ) + + def _build_docker_args(self, volume_mount_path: str, runpodcli_dir: str, runtime: int) -> str: + runpodcli_path = f"{volume_mount_path}/{runpodcli_dir}" + return ( + "/bin/bash -c '" + + (f"mkdir -p {runpodcli_path}; bash {runpodcli_path}/start_pod.sh; sleep {max(runtime * 60, 20)}; bash {runpodcli_path}/terminate_pod.sh") + + "'" + ) + + def _provision_and_wait(self, pod_id: str, n_attempts: int = 60) -> Dict: + for _ in range(n_attempts): + pod = runpod.get_pod(pod_id) + pod_runtime = pod.get("runtime") + if pod_runtime is None or not pod_runtime.get("ports"): + time.sleep(5) + else: + return pod + raise RuntimeError("Pod provisioning failed") + + def _get_public_ip_and_port(self, pod: Dict) -> Tuple[str, int]: public_ips = [i for i in pod["runtime"]["ports"] if i["isIpPublic"]] if len(public_ips) != 1: raise ValueError(f"Expected 1 public IP, got {public_ips}") @@ -261,7 +157,6 @@ def _parse_public_ip(self, pod: Dict) -> str: return ip, port def _parse_time_remaining(self, pod: 
Dict) -> str: - """Parse time remaining before pod shutdown from pod runtime info.""" _sleep_re = re.compile(r"\bsleep\s+(\d+)\b") _date_re = re.compile(r":\s*(\w{3}\s+\w{3}\s+\d{2}\s+\d{4}\s+\d{2}:\d{2}:\d{2})\s+GMT") start_dt = None @@ -290,7 +185,7 @@ def list(self, verbose: bool = False) -> None: Displays information about each pod including ID, name, GPU type, status, and connection details. """ - pods = self._client.get_pods() + pods = runpod.get_pods() # type: ignore for i, pod in enumerate(pods): logging.info(f"Pod {i + 1}:") @@ -299,15 +194,12 @@ def list(self, verbose: bool = False) -> None: time_remaining = self._parse_time_remaining(pod) logging.info(f" Time remaining (est.): {time_remaining}") if verbose: - public_ip, public_port = self._parse_public_ip(pod) + public_ip, public_port = self._get_public_ip_and_port(pod) logging.info(f" Public IP: {public_ip}") logging.info(f" Public port: {public_port}") logging.info(f" GPUs: {pod.get('gpuCount')} x {pod.get('machine', {}).get('gpuDisplayName')}") for key in ["memoryInGb", "vcpuCount", "containerDiskInGb", "volumeMountPath", "costPerHr"]: logging.info(f" {key}: {pod.get(key)}") - # For debugging - # for key in pod.keys(): - # logging.info(f" {key}: {pod.get(key)}") logging.info("") def create( @@ -347,8 +239,6 @@ def create( rpc create --runtime=240 --gpu_type="RTX A4000" --num_gpus=2 --name="dual-gpu-pod" """ spec = PodSpec() - - # Override spec with individual parameters if provided if gpu_type is not None: spec.gpu_type = gpu_type if image_name is not None: @@ -372,12 +262,16 @@ def create( if update_known_hosts is not None: spec.update_known_hosts = update_known_hosts + gpu_id = GPU_DISPLAY_NAME_TO_ID[spec.gpu_type] if spec.gpu_type in GPU_DISPLAY_NAME_TO_ID else spec.gpu_type + name = name or f"{os.getenv('USER')}-{GPU_ID_TO_DISPLAY_NAME[gpu_id]}" + spec.runpodcli_dir = spec.runpodcli_dir or f".tmp_{name.replace(' ', '_')}" + logging.info("Creating pod with:") logging.info(f" Name: {name}") 
logging.info(f" Image: {spec.image_name}") - logging.info(f" Network volume ID: {self._client.network_volume_id}") - logging.info(f" Region: {self._client.region}") - logging.info(f" S3 endpoint: {self._client.s3_endpoint}") + logging.info(f" Network volume ID: {self.network_volume_id}") + logging.info(f" Region: {self.region}") + logging.info(f" S3 endpoint: {self.s3_endpoint}") logging.info(f" GPU Type: {spec.gpu_type}") logging.info(f" GPU Count: {spec.gpu_count}") logging.info(f" Disk: {spec.container_disk_in_gb} GB") @@ -386,12 +280,42 @@ def create( logging.info(f" runpodcli directory: {spec.runpodcli_dir}") logging.info(f" Time limit: {runtime} minutes") + git_email = os.getenv("GIT_EMAIL", "") + git_name = os.getenv("GIT_NAME", "") + rpc_path = f"{spec.volume_mount_path}/{spec.runpodcli_dir}" + scripts = [ + get_setup_root(rpc_path, spec.volume_mount_path), + get_setup_user(rpc_path, git_email, git_name), + get_start(rpc_path), + get_terminate(rpc_path), + ] + for script_name, script_content in scripts: + self._s3.put_object(Bucket=self.network_volume_id, Key=f"{spec.runpodcli_dir}/{script_name}", Body=script_content.encode("utf-8")) + + docker_args = self._build_docker_args(volume_mount_path=spec.volume_mount_path, runpodcli_dir=spec.runpodcli_dir, runtime=runtime) + + pod = runpod.create_pod( + name=name, + image_name=spec.image_name, + gpu_type_id=gpu_id, + cloud_type="SECURE", + gpu_count=spec.gpu_count, + container_disk_in_gb=spec.container_disk_in_gb, + min_vcpu_count=spec.min_vcpu_count, + min_memory_in_gb=spec.min_memory_in_gb, + docker_args=docker_args, + env=spec.env, + ports="8888/http,22/tcp", + volume_mount_path=spec.volume_mount_path, + network_volume_id=self.network_volume_id, + ) + + pod_id: str = pod.get("id") # type: ignore logging.info("Pod created. 
Provisioning...") - pod = self._client.create_pod(name, spec, runtime) + pod = self._provision_and_wait(pod_id) logging.info("Pod provisioned.") - # Handle SSH config and known hosts - ip, port = self._client.get_pod_public_ip_and_port(pod) + ip, port = self._get_public_ip_and_port(pod) if spec.update_ssh_config: self._write_ssh_config(ip, port, spec.forward_agent) @@ -419,10 +343,17 @@ def _write_ssh_config(self, ip: str, port: int, forward_agent: bool, config_path logging.info(f"SSH config at {config_path} updated") def _update_known_hosts_file(self, public_ip: str, port: int, runpodcli_dir: str) -> None: - """Update SSH known hosts file with pod host keys.""" - host_keys = self._client.get_host_keys(runpodcli_dir) - known_hosts_path = os.path.expanduser("~/.ssh/known_hosts.runpod_cli") + host_keys: List[Tuple[str, str]] = [] + for file in ["ssh_ed25519_host_key", "ssh_ecdsa_host_key", "ssh_rsa_host_key", "ssh_dsa_host_key"]: + try: + obj = self._s3.get_object(Bucket=self.network_volume_id, Key=f"{runpodcli_dir}/{file}") + host_key_text = obj["Body"].read().decode("utf-8").strip() + alg, key, _ = host_key_text.split(" ") + host_keys.append((alg, key)) + except Exception: + continue + known_hosts_path = os.path.expanduser("~/.ssh/known_hosts.runpod_cli") for alg, key in host_keys: try: with open(known_hosts_path, "a") as dest: @@ -441,7 +372,7 @@ def terminate(self, pod_id: str) -> None: rpc terminate --pod_id=abc123 """ logging.info(f"Terminating pod {pod_id}") - self._client.terminate_pod(pod_id) + _ = runpod.terminate_pod(pod_id) def main(): From 9fa99c0e83441098921f84d94bdc387a99496f25 Mon Sep 17 00:00:00 2001 From: Stefan Heimersheim Date: Sun, 10 Aug 2025 10:00:13 +0100 Subject: [PATCH 09/16] Remove superfluous RunPodSpec class --- src/runpod_cli/cli.py | 127 ++++++++++++++---------------------------- 1 file changed, 42 insertions(+), 85 deletions(-) diff --git a/src/runpod_cli/cli.py b/src/runpod_cli/cli.py index 96b7b27..fe63b69 100644 --- 
a/src/runpod_cli/cli.py +++ b/src/runpod_cli/cli.py @@ -3,7 +3,6 @@ import re import textwrap import time -from dataclasses import dataclass from datetime import datetime, timedelta, timezone from typing import Dict, List, Optional, Tuple @@ -38,22 +37,6 @@ import runpod # noqa: E402 -@dataclass -class PodSpec: - gpu_type: str = "RTX A4000" - image_name: str = DEFAULT_IMAGE_NAME - gpu_count: int = 1 - min_vcpu_count: int = 1 - min_memory_in_gb: int = 1 - container_disk_in_gb: int = 30 - volume_mount_path: str = "/network" - runpodcli_dir: Optional[str] = None - env: Optional[Dict[str, str]] = None - update_ssh_config: bool = True - forward_agent: bool = False - update_known_hosts: bool = True - - def get_region_from_volume_id(volume_id: str) -> str: api_key = os.getenv("RUNPOD_API_KEY") url = f"https://rest.runpod.io/v1/networkvolumes/{volume_id}" @@ -206,17 +189,16 @@ def create( self, name: Optional[str] = None, runtime: int = 60, - gpu_type: Optional[str] = None, - cpus: Optional[int] = None, - disk: Optional[int] = None, - env: Optional[Dict[str, str]] = None, - forward_agent: Optional[bool] = None, - image_name: Optional[str] = None, - memory: Optional[int] = None, - num_gpus: Optional[int] = None, - update_known_hosts: Optional[bool] = None, - update_ssh_config: Optional[bool] = None, - volume_mount_path: Optional[str] = None, + gpu_type: str = "RTX A4000", + cpus: int = 1, + disk: int = 30, + forward_agent: bool = False, + image_name: str = DEFAULT_IMAGE_NAME, + memory: int = 1, + num_gpus: int = 1, + update_known_hosts: bool = True, + update_ssh_config: bool = True, + volume_mount_path: str = "/network", ) -> None: """Create a new RunPod instance with the specified parameters. 
@@ -225,7 +207,7 @@ def create( gpu_type: GPU type (default: "RTX A4000") num_gpus: Number of GPUs (default: 1) name: Name for the pod (default: "$USER-$GPU_TYPE") - env: File to load RunPod credentials from (default: .env and ~/.config/runpod_cli/.env) + env: Path to credentials .env (default: .env and ~/.config/runpod_cli/.env) disk: Container disk size in GB (default: 30) cpus: Minimum CPU count (default: 1) memory: Minimum RAM in GB (default: 1) @@ -234,79 +216,56 @@ def create( update_ssh_config: Whether to update SSH config (default: True) image_name: Docker image (default: "PyTorch 2.8.0 with CUDA 12.8.1") - Examples: - rpc create -r 60 -g "A100 PCIe" - rpc create --runtime=240 --gpu_type="RTX A4000" --num_gpus=2 --name="dual-gpu-pod" + Example: + rpc create -r 60 -g "A100 SXM" + rpc create --gpu_type="RTX A4000" --runtime=480 """ - spec = PodSpec() - if gpu_type is not None: - spec.gpu_type = gpu_type - if image_name is not None: - spec.image_name = image_name - if num_gpus is not None: - spec.gpu_count = num_gpus - if cpus is not None: - spec.min_vcpu_count = cpus - if memory is not None: - spec.min_memory_in_gb = memory - if disk is not None: - spec.container_disk_in_gb = disk - if volume_mount_path is not None: - spec.volume_mount_path = volume_mount_path - if env is not None: - spec.env = env - if update_ssh_config is not None: - spec.update_ssh_config = update_ssh_config - if forward_agent is not None: - spec.forward_agent = forward_agent - if update_known_hosts is not None: - spec.update_known_hosts = update_known_hosts - - gpu_id = GPU_DISPLAY_NAME_TO_ID[spec.gpu_type] if spec.gpu_type in GPU_DISPLAY_NAME_TO_ID else spec.gpu_type + gpu_id = GPU_DISPLAY_NAME_TO_ID[gpu_type] if gpu_type in GPU_DISPLAY_NAME_TO_ID else gpu_type name = name or f"{os.getenv('USER')}-{GPU_ID_TO_DISPLAY_NAME[gpu_id]}" - spec.runpodcli_dir = spec.runpodcli_dir or f".tmp_{name.replace(' ', '_')}" + runpodcli_dir = f".tmp_{name.replace(' ', '_')}" logging.info("Creating pod 
with:") logging.info(f" Name: {name}") - logging.info(f" Image: {spec.image_name}") + logging.info(f" Image: {image_name}") logging.info(f" Network volume ID: {self.network_volume_id}") logging.info(f" Region: {self.region}") logging.info(f" S3 endpoint: {self.s3_endpoint}") - logging.info(f" GPU Type: {spec.gpu_type}") - logging.info(f" GPU Count: {spec.gpu_count}") - logging.info(f" Disk: {spec.container_disk_in_gb} GB") - logging.info(f" Min CPU: {spec.min_vcpu_count}") - logging.info(f" Min Memory: {spec.min_memory_in_gb} GB") - logging.info(f" runpodcli directory: {spec.runpodcli_dir}") + logging.info(f" GPU Type: {gpu_type}") + logging.info(f" GPU Count: {num_gpus}") + logging.info(f" Disk: {disk} GB") + logging.info(f" Min CPU: {cpus}") + logging.info(f" Min Memory: {memory} GB") + logging.info(f" runpodcli directory: {runpodcli_dir}") logging.info(f" Time limit: {runtime} minutes") git_email = os.getenv("GIT_EMAIL", "") git_name = os.getenv("GIT_NAME", "") - rpc_path = f"{spec.volume_mount_path}/{spec.runpodcli_dir}" + remote_scripts_path = f"{volume_mount_path}/{runpodcli_dir}" scripts = [ - get_setup_root(rpc_path, spec.volume_mount_path), - get_setup_user(rpc_path, git_email, git_name), - get_start(rpc_path), - get_terminate(rpc_path), + get_setup_root(remote_scripts_path, volume_mount_path), + get_setup_user(remote_scripts_path, git_email, git_name), + get_start(remote_scripts_path), + get_terminate(remote_scripts_path), ] for script_name, script_content in scripts: - self._s3.put_object(Bucket=self.network_volume_id, Key=f"{spec.runpodcli_dir}/{script_name}", Body=script_content.encode("utf-8")) + # s3_key is relative to /volume_mount_path, while remote_scripts_path is relative to / + s3_key = f"{runpodcli_dir}/{script_name}" + self._s3.put_object(Bucket=self.network_volume_id, Key=s3_key, Body=script_content.encode("utf-8")) - docker_args = self._build_docker_args(volume_mount_path=spec.volume_mount_path, runpodcli_dir=spec.runpodcli_dir, 
runtime=runtime) + docker_args = self._build_docker_args(volume_mount_path=volume_mount_path, runpodcli_dir=runpodcli_dir, runtime=runtime) pod = runpod.create_pod( name=name, - image_name=spec.image_name, + image_name=image_name, gpu_type_id=gpu_id, cloud_type="SECURE", - gpu_count=spec.gpu_count, - container_disk_in_gb=spec.container_disk_in_gb, - min_vcpu_count=spec.min_vcpu_count, - min_memory_in_gb=spec.min_memory_in_gb, + gpu_count=num_gpus, + container_disk_in_gb=disk, + min_vcpu_count=cpus, + min_memory_in_gb=memory, docker_args=docker_args, - env=spec.env, ports="8888/http,22/tcp", - volume_mount_path=spec.volume_mount_path, + volume_mount_path=volume_mount_path, network_volume_id=self.network_volume_id, ) @@ -317,14 +276,12 @@ def create( ip, port = self._get_public_ip_and_port(pod) - if spec.update_ssh_config: - self._write_ssh_config(ip, port, spec.forward_agent) + if update_ssh_config: + self._write_ssh_config(ip, port, forward_agent) - if spec.update_known_hosts: + if update_known_hosts: time.sleep(5) - if spec.runpodcli_dir is None: - raise ValueError("runpodcli_dir should be set by this point") - self._update_known_hosts_file(ip, port, spec.runpodcli_dir) + self._update_known_hosts_file(ip, port, runpodcli_dir) def _generate_ssh_config(self, ip: str, port: int, forward_agent: bool = False) -> str: return textwrap.dedent(f""" From 5af08092e43aa9c207a793a70a3185032988750f Mon Sep 17 00:00:00 2001 From: Stefan Heimersheim Date: Sun, 10 Aug 2025 10:02:03 +0100 Subject: [PATCH 10/16] Simplify getenv --- src/runpod_cli/cli.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/src/runpod_cli/cli.py b/src/runpod_cli/cli.py index fe63b69..1a3cfe1 100644 --- a/src/runpod_cli/cli.py +++ b/src/runpod_cli/cli.py @@ -54,6 +54,13 @@ def get_s3_endpoint_from_volume_id(volume_id: str) -> str: return s3_endpoint +def getenv(key: str) -> str: + value = os.getenv(key) + if not value: + raise ValueError(f"{key} not found in 
environment. Set it in your .env file.") + return value + + class RunPodManager: """RunPod Management CLI - A command-line tool for managing RunPod instances via the RunPod API. @@ -89,20 +96,10 @@ def __init__(self, env: Optional[str] = None) -> None: raise FileExistsError(f"Multiple .env files found in {env_paths}") load_dotenv(override=True, dotenv_path=os.path.expanduser(env_paths[env_exists.index(True)])) - api_key = os.getenv("RUNPOD_API_KEY") - if not api_key: - raise ValueError("RUNPOD_API_KEY not found in environment. Set it in your .env file.") - runpod.api_key = api_key - - network_volume_id = os.getenv("RUNPOD_NETWORK_VOLUME_ID") - if not network_volume_id: - raise ValueError("RUNPOD_NETWORK_VOLUME_ID not found in environment. Set it in your .env file.") - self.network_volume_id: str = network_volume_id - - s3_access_key_id = os.getenv("RUNPOD_S3_ACCESS_KEY_ID") - s3_secret_key = os.getenv("RUNPOD_S3_SECRET_KEY") - if not s3_access_key_id or not s3_secret_key: - raise ValueError("RUNPOD_S3_ACCESS_KEY_ID or RUNPOD_S3_SECRET_KEY not found in environment. Set it in your .env file.") + runpod.api_key = getenv("RUNPOD_API_KEY") + self.network_volume_id: str = getenv("RUNPOD_NETWORK_VOLUME_ID") + s3_access_key_id = getenv("RUNPOD_S3_ACCESS_KEY_ID") + s3_secret_key = getenv("RUNPOD_S3_SECRET_KEY") self.region = get_region_from_volume_id(self.network_volume_id) self.s3_endpoint = get_s3_endpoint_from_volume_id(self.network_volume_id) self._s3 = boto3.client( From 2347162a59b198594ab00405cbee29b5a1a7de1e Mon Sep 17 00:00:00 2001 From: Stefan Heimersheim Date: Sun, 10 Aug 2025 10:20:50 +0100 Subject: [PATCH 11/16] Implement fuzzy GPU matching --- README.md | 2 +- src/runpod_cli/cli.py | 32 ++++++++++++++++++++++++++++---- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 82c6be5..bf994f1 100644 --- a/README.md +++ b/README.md @@ -130,7 +130,7 @@ into the environment. 
## Known issues Python Fire has a known issue (fixed & merged on [GitHub](https://github.com/google/python-fire/pull/588/files) but not released on PyPI yet) -with ipython==9.0 which will produce the following error: +with ipython>=9.0 which will produce the following error: ``` ERROR | Uncaught exception | ; Inspector.__init__() missing 1 required keyword-only argument: 'theme_name'; ``` diff --git a/src/runpod_cli/cli.py b/src/runpod_cli/cli.py index 1a3cfe1..b7a7a60 100644 --- a/src/runpod_cli/cli.py +++ b/src/runpod_cli/cli.py @@ -134,6 +134,8 @@ def _get_public_ip_and_port(self, pod: Dict) -> Tuple[str, int]: raise ValueError(f"Expected 1 public IP, got {public_ips}") ip = public_ips[0].get("ip") port = public_ips[0].get("publicPort") + if not ip or port is None: + raise ValueError(f"Expected public IP and port, got {ip} and {port} from {public_ips}") return ip, port def _parse_time_remaining(self, pod: Dict) -> str: @@ -155,11 +157,33 @@ def _parse_time_remaining(self, pod: Dict) -> str: now_dt = datetime.now(timezone.utc) shutdown_dt = start_dt + timedelta(seconds=sleep_secs) remaining = shutdown_dt - now_dt - remaining_str = f"{remaining.seconds // 3600}h {remaining.seconds % 3600 // 60}m" - return remaining_str if remaining.total_seconds() > 0 else "Unknown" + total = int(remaining.total_seconds()) + remaining_str = f"{total // 3600}h {(total % 3600) // 60}m" + return remaining_str if total > 0 else "Unknown" else: return "Unknown" + def _get_gpu_id(self, gpu_type: str) -> Tuple[str, str]: + if gpu_type in GPU_DISPLAY_NAME_TO_ID: + # A name was passed + gpu_id = GPU_DISPLAY_NAME_TO_ID[gpu_type] + gpu_name = gpu_type + elif gpu_type in GPU_ID_TO_DISPLAY_NAME: + # An ID was passed + gpu_id = gpu_type + gpu_name = GPU_ID_TO_DISPLAY_NAME[gpu_id] + else: + # Attempt fuzzy matching, but only if unique + matches = [gpu_id for gpu_name, gpu_id in GPU_DISPLAY_NAME_TO_ID.items() if gpu_type.lower() in gpu_id.lower() or gpu_type.lower() in gpu_name.lower()] + if 
len(matches) == 1: + gpu_id = matches[0] + gpu_name = GPU_ID_TO_DISPLAY_NAME[gpu_id] + elif len(matches) > 1: + raise ValueError(f"Ambiguous GPU type: {gpu_type} matches {matches}. Please use a full name or ID from https://docs.runpod.io/references/gpu-types") + else: + raise ValueError(f"Unknown GPU type: {gpu_type}") + return gpu_id, gpu_name + def list(self, verbose: bool = False) -> None: """List all pods in your RunPod account. @@ -217,8 +241,8 @@ def create( rpc create -r 60 -g "A100 SXM" rpc create --gpu_type="RTX A4000" --runtime=480 """ - gpu_id = GPU_DISPLAY_NAME_TO_ID[gpu_type] if gpu_type in GPU_DISPLAY_NAME_TO_ID else gpu_type - name = name or f"{os.getenv('USER')}-{GPU_ID_TO_DISPLAY_NAME[gpu_id]}" + gpu_id, gpu_name = self._get_gpu_id(gpu_type) + name = name or f"{os.getenv('USER')}-{gpu_name}" runpodcli_dir = f".tmp_{name.replace(' ', '_')}" logging.info("Creating pod with:") From 59e69a06176bf09622923a002e0bb5153ec4af3e Mon Sep 17 00:00:00 2001 From: Stefan Heimersheim Date: Sun, 10 Aug 2025 10:34:06 +0100 Subject: [PATCH 12/16] Make attributes private to not show up in Fire's VALUES help --- README.md | 3 +-- src/runpod_cli/cli.py | 23 +++++++++++------------ 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index bf994f1..6ae5d28 100644 --- a/README.md +++ b/README.md @@ -143,5 +143,4 @@ ERROR | Uncaught exception | ; Inspector.__init__() missing - Allow user to configre an SSH_PUBLIC_KEY_PATH in .env - Pre-install VS Code / Cursor server - Change names & ssh aliases if a user requests multiple GPUs (e.g. runpod, runpod-1, etc.) -- Create a .config/runpod_cli/config file to change the default values (e.g. GPU type, runtime, etc.) -- Allow for shorter cmdline arguments (e.g. -g or --gpu instead of --gpu_type) \ No newline at end of file +- Create a .config/runpod_cli/config file to change the default values (e.g. GPU type, runtime, etc.)
\ No newline at end of file diff --git a/src/runpod_cli/cli.py b/src/runpod_cli/cli.py index b7a7a60..cb48985 100644 --- a/src/runpod_cli/cli.py +++ b/src/runpod_cli/cli.py @@ -97,17 +97,17 @@ def __init__(self, env: Optional[str] = None) -> None: load_dotenv(override=True, dotenv_path=os.path.expanduser(env_paths[env_exists.index(True)])) runpod.api_key = getenv("RUNPOD_API_KEY") - self.network_volume_id: str = getenv("RUNPOD_NETWORK_VOLUME_ID") + self._network_volume_id: str = getenv("RUNPOD_NETWORK_VOLUME_ID") s3_access_key_id = getenv("RUNPOD_S3_ACCESS_KEY_ID") s3_secret_key = getenv("RUNPOD_S3_SECRET_KEY") - self.region = get_region_from_volume_id(self.network_volume_id) - self.s3_endpoint = get_s3_endpoint_from_volume_id(self.network_volume_id) + self._region = get_region_from_volume_id(self._network_volume_id) + s3_endpoint_url = get_s3_endpoint_from_volume_id(self._network_volume_id) self._s3 = boto3.client( "s3", aws_access_key_id=s3_access_key_id, aws_secret_access_key=s3_secret_key, - endpoint_url=self.s3_endpoint, - region_name=self.region, + endpoint_url=s3_endpoint_url, + region_name=self._region, ) def _build_docker_args(self, volume_mount_path: str, runpodcli_dir: str, runtime: int) -> str: @@ -136,7 +136,7 @@ def _get_public_ip_and_port(self, pod: Dict) -> Tuple[str, int]: port = public_ips[0].get("publicPort") if not ip or port is None: raise ValueError(f"Expected public IP and port, got {ip} and {port} from {public_ips}") - return ip, port + return str(ip), int(port) def _parse_time_remaining(self, pod: Dict) -> str: _sleep_re = re.compile(r"\bsleep\s+(\d+)\b") @@ -248,9 +248,8 @@ def create( logging.info("Creating pod with:") logging.info(f" Name: {name}") logging.info(f" Image: {image_name}") - logging.info(f" Network volume ID: {self.network_volume_id}") - logging.info(f" Region: {self.region}") - logging.info(f" S3 endpoint: {self.s3_endpoint}") + logging.info(f" Network volume ID: {self._network_volume_id}") + logging.info(f" Region: 
{self._region}") logging.info(f" GPU Type: {gpu_type}") logging.info(f" GPU Count: {num_gpus}") logging.info(f" Disk: {disk} GB") @@ -271,7 +270,7 @@ def create( for script_name, script_content in scripts: # s3_key is relative to /volume_mount_path, while remote_scripts_path is relative to / s3_key = f"{runpodcli_dir}/{script_name}" - self._s3.put_object(Bucket=self.network_volume_id, Key=s3_key, Body=script_content.encode("utf-8")) + self._s3.put_object(Bucket=self._network_volume_id, Key=s3_key, Body=script_content.encode("utf-8")) docker_args = self._build_docker_args(volume_mount_path=volume_mount_path, runpodcli_dir=runpodcli_dir, runtime=runtime) @@ -287,7 +286,7 @@ def create( docker_args=docker_args, ports="8888/http,22/tcp", volume_mount_path=volume_mount_path, - network_volume_id=self.network_volume_id, + network_volume_id=self._network_volume_id, ) pod_id: str = pod.get("id") # type: ignore @@ -324,7 +323,7 @@ def _update_known_hosts_file(self, public_ip: str, port: int, runpodcli_dir: str host_keys: List[Tuple[str, str]] = [] for file in ["ssh_ed25519_host_key", "ssh_ecdsa_host_key", "ssh_rsa_host_key", "ssh_dsa_host_key"]: try: - obj = self._s3.get_object(Bucket=self.network_volume_id, Key=f"{runpodcli_dir}/{file}") + obj = self._s3.get_object(Bucket=self._network_volume_id, Key=f"{runpodcli_dir}/{file}") host_key_text = obj["Body"].read().decode("utf-8").strip() alg, key, _ = host_key_text.split(" ") host_keys.append((alg, key)) From 47a2412557a4347bb41acaa8f4a731e6d0563de6 Mon Sep 17 00:00:00 2001 From: Stefan Heimersheim Date: Fri, 15 Aug 2025 12:21:37 +0100 Subject: [PATCH 13/16] Compile bytecode to avoid slow imports --- src/runpod_cli/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runpod_cli/utils.py b/src/runpod_cli/utils.py index a6cfff9..f07349b 100644 --- a/src/runpod_cli/utils.py +++ b/src/runpod_cli/utils.py @@ -123,7 +123,7 @@ def get_setup_user(runpodcli_path: str, git_email: str, git_name: str) -> Tuple[ # 
Install Python packages using uv sudo pip install uv - sudo uv pip install --system ipykernel kaleido nbformat numpy scipy scikit-learn transformers datasets torchvision pandas matplotlib seaborn plotly jaxtyping einops tqdm ruff basedpyright umap-learn ipywidgets virtualenv pytest git+https://github.com/callummcdougall/eindex.git transformer_lens nnsight + sudo uv pip install --system --compile-bytecode ipykernel kaleido nbformat numpy scipy scikit-learn transformers datasets torchvision pandas matplotlib seaborn plotly jaxtyping einops tqdm ruff basedpyright umap-learn ipywidgets virtualenv pytest git+https://github.com/callummcdougall/eindex.git transformer_lens nnsight # For plotly (kaleido) png export sudo apt-get install -y libnss3 libatk-bridge2.0-0 libcups2 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 libgbm1 libxkbcommon0 libpango-1.0-0 libcairo2 libasound2 sudo plotly_get_chrome -y From 707c4148b8d13fc265f8e42e112c7754b1483f86 Mon Sep 17 00:00:00 2001 From: Stefan Heimersheim Date: Mon, 18 Aug 2025 10:39:41 +0100 Subject: [PATCH 14/16] Convert GPU types like 5090 to str --- src/runpod_cli/cli.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/runpod_cli/cli.py b/src/runpod_cli/cli.py index cb48985..032309c 100644 --- a/src/runpod_cli/cli.py +++ b/src/runpod_cli/cli.py @@ -163,7 +163,8 @@ def _parse_time_remaining(self, pod: Dict) -> str: else: return "Unknown" - def _get_gpu_id(self, gpu_type: str) -> Tuple[str, str]: + def _get_gpu_id(self, gpu_type: str | int) -> Tuple[str, str]: + gpu_type = str(gpu_type) if gpu_type in GPU_DISPLAY_NAME_TO_ID: # A name was passed gpu_id = GPU_DISPLAY_NAME_TO_ID[gpu_type] From 9d968b8f01e6746d6d921e5f734bfe304e5da248 Mon Sep 17 00:00:00 2001 From: Stefan Heimersheim Date: Mon, 18 Aug 2025 16:03:01 +0100 Subject: [PATCH 15/16] Add scikit-image to pre-installed packages --- src/runpod_cli/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runpod_cli/utils.py 
b/src/runpod_cli/utils.py index f07349b..ce43ca9 100644 --- a/src/runpod_cli/utils.py +++ b/src/runpod_cli/utils.py @@ -123,7 +123,7 @@ def get_setup_user(runpodcli_path: str, git_email: str, git_name: str) -> Tuple[ # Install Python packages using uv sudo pip install uv - sudo uv pip install --system --compile-bytecode ipykernel kaleido nbformat numpy scipy scikit-learn transformers datasets torchvision pandas matplotlib seaborn plotly jaxtyping einops tqdm ruff basedpyright umap-learn ipywidgets virtualenv pytest git+https://github.com/callummcdougall/eindex.git transformer_lens nnsight + sudo uv pip install --system --compile-bytecode ipykernel kaleido nbformat numpy scipy scikit-learn scikit-image transformers datasets torchvision pandas matplotlib seaborn plotly jaxtyping einops tqdm ruff basedpyright umap-learn ipywidgets virtualenv pytest git+https://github.com/callummcdougall/eindex.git transformer_lens nnsight # For plotly (kaleido) png export sudo apt-get install -y libnss3 libatk-bridge2.0-0 libcups2 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 libgbm1 libxkbcommon0 libpango-1.0-0 libcairo2 libasound2 sudo plotly_get_chrome -y From e4d67ce237dce938434cc69d4671d21c786b2b19 Mon Sep 17 00:00:00 2001 From: Stefan Heimersheim Date: Tue, 30 Sep 2025 11:36:43 +0100 Subject: [PATCH 16/16] Fix .ssh permissions, making folder owned by user --- src/runpod_cli/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/runpod_cli/utils.py b/src/runpod_cli/utils.py index ce43ca9..c8ef53b 100644 --- a/src/runpod_cli/utils.py +++ b/src/runpod_cli/utils.py @@ -61,10 +61,10 @@ def get_setup_root(runpodcli_path: str, volume_mount_path: str) -> Tuple[str, st # Set NNSIGHT_LOG_PATH to avoid https://github.com/ndif-team/nnsight/issues/495 echo "export NNSIGHT_LOG_PATH=/root/.local/state/nnsight" >> /root/.profile echo "export NNSIGHT_LOG_PATH=/home/user/.local/state/nnsight" >> /home/user/.profile - chown user /home/user/.profile + chown user:user 
/home/user/.profile mkdir -p /home/user/.ssh/ cat /root/.ssh/authorized_keys >> /home/user/.ssh/authorized_keys - chown user /home/user/.ssh/authorized_keys + chown -R user:user /home/user/.ssh if [[ VOLUME_MOUNT_PATH != "/workspace" ]]; then rmdir /workspace