39 changes: 39 additions & 0 deletions deployment/kubernetes/charts/medcat-service-helm/README.md
@@ -71,3 +71,42 @@ env:
  DEID_MODE: "true"
  DEID_REDACT: "true"
```


## GPU Support

To run MedCAT Service with GPU acceleration, use the GPU-enabled image and set the pod runtime class accordingly.

Note: GPU support is currently only used for de-identification.

Create a values file like `values-gpu.yaml` with the following content:

```yaml
image:
  repository: ghcr.io/cogstack/medcat-service-gpu

runtimeClassName: nvidia

resources:
  limits:
    nvidia.com/gpu: 1

env:
  APP_CUDA_DEVICE_COUNT: "1"
  APP_TORCH_THREADS: "-1"
  DEID_MODE: "true"
```
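
Then install or upgrade the release with the GPU values file. A minimal sketch, assuming the command is run from the chart directory and the release is named `medcat-service` (adjust both for your setup):

```sh
# Apply the GPU overrides on top of the chart defaults
helm upgrade --install medcat-service . -f values-gpu.yaml
```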

> To use GPU acceleration, your Kubernetes cluster needs to be configured with either the NVIDIA GPU Operator or the following components:
> - [NVIDIA device plugin for Kubernetes](https://github.com/NVIDIA/k8s-device-plugin)
> - [NVIDIA GPU Feature Discovery](https://github.com/NVIDIA/gpu-feature-discovery)
> - The [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/)
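
Before deploying, you can check that the device plugin has advertised GPUs to the cluster. A quick check, assuming `kubectl` is pointed at the target cluster:

```sh
# A non-empty GPU column means nvidia.com/gpu is allocatable on that node
kubectl get nodes -o custom-columns='NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu'
```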

### Test GPU support
You can verify that the MedCAT Service pod has access to the GPU by executing `nvidia-smi` inside the pod.


```sh
kubectl exec -it <POD_NAME> -- nvidia-smi
```

You should see the NVIDIA GPU device listing if the GPU is properly accessible.
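
As a further check, you can confirm that PyTorch inside the container can see the device. A sketch, assuming the GPU image ships `python` with `torch` installed (this may vary by image version):

```sh
# "True" and a device count >= 1 indicate torch can use the GPU
kubectl exec -it <POD_NAME> -- python -c "import torch; print(torch.cuda.is_available(), torch.cuda.device_count())"
```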
@@ -134,3 +134,6 @@ spec:
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- if .Values.runtimeClassName }}
      runtimeClassName: {{ .Values.runtimeClassName | quote }}
      {{- end }}
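
For reference, with `runtimeClassName: "nvidia"` set in the values file, the conditional above renders roughly the following into the pod spec (a sketch of the rendered output, not a complete manifest):

```yaml
spec:
  # ...other rendered pod spec fields...
  runtimeClassName: "nvidia"
```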
11 changes: 10 additions & 1 deletion deployment/kubernetes/charts/medcat-service-helm/values.yaml
@@ -8,6 +8,7 @@ replicaCount: 1
# This sets the container image; more information can be found here: https://kubernetes.io/docs/concepts/containers/images/
image:
  repository: cogstacksystems/medcat-service
  # repository: cogstacksystems/medcat-service-gpu
  # This sets the pull policy for images.
  # pullPolicy: IfNotPresent
  pullPolicy: Always
@@ -32,7 +33,7 @@ env:
  # DEID_REDACT: true

  # Set SERVER_GUNICORN_MAX_REQUESTS to a high number instead of the default 1000; trust k8s to restart the pod when needed.
-  SERVER_GUNICORN_MAX_REQUESTS: 100000
+  SERVER_GUNICORN_MAX_REQUESTS: "100000"

  # Recommended env vars to set to try to limit to 1 CPU for scaling
  # OMP_NUM_THREADS: "1"
@@ -44,6 +45,10 @@ env:
  # PYTORCH_ENABLE_MPS_FALLBACK: "1"
  # SERVER_GUNICORN_EXTRA_ARGS: "--worker-connections 1 --backlog 1"

  # Recommended env vars for GPU support
  # APP_CUDA_DEVICE_COUNT: "1"
  # APP_TORCH_THREADS: "-1"

  # Observability Env Vars
  APP_ENABLE_METRICS: true
  APP_ENABLE_TRACING: false
@@ -203,6 +208,10 @@ volumeMounts: []
#   mountPath: "/etc/foo"
#   readOnly: true

# Runtime class name for the pod (e.g., "nvidia" for GPU workloads)
# More information: https://kubernetes.io/docs/concepts/containers/runtime-class/
runtimeClassName: ""

nodeSelector: {}

tolerations: []
@@ -134,3 +134,6 @@ spec:
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- if .Values.runtimeClassName }}
      runtimeClassName: {{ .Values.runtimeClassName | quote }}
      {{- end }}
3 changes: 3 additions & 0 deletions deployment/kubernetes/charts/medcat-trainer-helm/values.yaml
@@ -254,3 +254,6 @@ nodeSelector: {}
tolerations: []

affinity: {}

# Runtime class name for the pod (e.g., "nvidia" for GPU workloads)
runtimeClassName: ""