diff --git a/deployment/kubernetes/charts/medcat-service-helm/README.md b/deployment/kubernetes/charts/medcat-service-helm/README.md
index 0001f0d..084d30f 100644
--- a/deployment/kubernetes/charts/medcat-service-helm/README.md
+++ b/deployment/kubernetes/charts/medcat-service-helm/README.md
@@ -71,3 +71,48 @@ env:
   DEID_MODE: "true"
   DEID_REDACT: "true"
 ```
+
+
+## GPU Support
+
+To run MedCAT Service with GPU acceleration, use the GPU-enabled image and set the pod runtime class accordingly.
+
+Note: GPU support is currently only used for de-identification.
+
+Create a values file like `values-gpu.yaml` with the following content:
+
+```yaml
+image:
+  repository: ghcr.io/cogstack/medcat-service-gpu
+
+runtimeClassName: nvidia
+
+resources:
+  limits:
+    nvidia.com/gpu: 1
+env:
+  APP_CUDA_DEVICE_COUNT: 1
+  APP_TORCH_THREADS: -1
+  DEID_MODE: true
+```
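+
+Then install or upgrade the release with this values file. A minimal sketch, assuming a release named `medcat-service` and the chart in the current directory (both are illustrative; adjust them for your deployment):
+
+```sh
+# Release name and chart path are examples only; substitute your own.
+helm upgrade --install medcat-service . -f values-gpu.yaml
+```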
+
+> To use GPU acceleration, your Kubernetes cluster should be configured with either the NVIDIA GPU Operator or the following individual components:
+> - [NVIDIA device plugin for Kubernetes](https://github.com/NVIDIA/k8s-device-plugin)
+> - [NVIDIA GPU Feature Discovery](https://github.com/NVIDIA/gpu-feature-discovery)
+> - [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/)
+
+### Test GPU support
+You can verify that the MedCAT Service pod has access to the GPU by running `nvidia-smi` inside the pod:
+
+```sh
+kubectl exec -it <pod-name> -- nvidia-smi
+```
+
+You should see the NVIDIA GPU device listing if the GPU is accessible.
diff --git a/deployment/kubernetes/charts/medcat-service-helm/templates/deployment.yaml b/deployment/kubernetes/charts/medcat-service-helm/templates/deployment.yaml
index b92702f..1db23b4 100644
--- a/deployment/kubernetes/charts/medcat-service-helm/templates/deployment.yaml
+++ b/deployment/kubernetes/charts/medcat-service-helm/templates/deployment.yaml
@@ -134,3 +134,6 @@ spec:
       tolerations:
         {{- toYaml . | nindent 8 }}
       {{- end }}
+      {{- if .Values.runtimeClassName }}
+      runtimeClassName: {{ .Values.runtimeClassName | quote }}
+      {{- end }}
diff --git a/deployment/kubernetes/charts/medcat-service-helm/values.yaml b/deployment/kubernetes/charts/medcat-service-helm/values.yaml
index 73e50eb..b7e7df5 100644
--- a/deployment/kubernetes/charts/medcat-service-helm/values.yaml
+++ b/deployment/kubernetes/charts/medcat-service-helm/values.yaml
@@ -8,6 +8,7 @@ replicaCount: 1
 # This sets the container image more information can be found here: https://kubernetes.io/docs/concepts/containers/images/
 image:
   repository: cogstacksystems/medcat-service
+  # repository: cogstacksystems/medcat-service-gpu
   # This sets the pull policy for images.
   # pullPolicy: IfNotPresent
   pullPolicy: Always
@@ -32,7 +33,7 @@ env:
   # DEID_MODE: true
   # DEID_REDACT: true
   # Set SERVER_GUNICORN_MAX_REQUESTS to a high number instead of the default 1000. Trust k8s instead to restart pod when needed.
-  SERVER_GUNICORN_MAX_REQUESTS: 100000
+  SERVER_GUNICORN_MAX_REQUESTS: "100000"
 
   # Recommended env vars to set to try to limit to 1 CPU for scaling
   # OMP_NUM_THREADS: "1"
@@ -44,6 +45,10 @@ env:
   # PYTORCH_ENABLE_MPS_FALLBACK: "1"
   # SERVER_GUNICORN_EXTRA_ARGS: "--worker-connections 1 --backlog 1"
 
+  # Recommended env vars for GPU support
+  # APP_CUDA_DEVICE_COUNT: "1"
+  # APP_TORCH_THREADS: "-1"
+
   # Observability Env Vars
   APP_ENABLE_METRICS: true
   APP_ENABLE_TRACING: false
@@ -203,6 +208,10 @@ volumeMounts: []
 #  - name: foo
 #    mountPath: "/etc/foo"
 #    readOnly: true
+# Runtime class name for the pod (e.g., "nvidia" for GPU workloads)
+# More information: https://kubernetes.io/docs/concepts/containers/runtime-class/
+runtimeClassName: ""
+
 nodeSelector: {}
 
 tolerations: []
diff --git a/deployment/kubernetes/charts/medcat-trainer-helm/templates/medcat-trainer-deployment.yaml b/deployment/kubernetes/charts/medcat-trainer-helm/templates/medcat-trainer-deployment.yaml
index 49e676b..81ca21b 100644
--- a/deployment/kubernetes/charts/medcat-trainer-helm/templates/medcat-trainer-deployment.yaml
+++ b/deployment/kubernetes/charts/medcat-trainer-helm/templates/medcat-trainer-deployment.yaml
@@ -134,3 +134,6 @@ spec:
       tolerations:
         {{- toYaml . | nindent 8 }}
       {{- end }}
+      {{- if .Values.runtimeClassName }}
+      runtimeClassName: {{ .Values.runtimeClassName | quote }}
+      {{- end }}
diff --git a/deployment/kubernetes/charts/medcat-trainer-helm/values.yaml b/deployment/kubernetes/charts/medcat-trainer-helm/values.yaml
index be084a1..8929da0 100644
--- a/deployment/kubernetes/charts/medcat-trainer-helm/values.yaml
+++ b/deployment/kubernetes/charts/medcat-trainer-helm/values.yaml
@@ -254,3 +254,6 @@
 nodeSelector: {}
 tolerations: []
 affinity: {}
+
+# Runtime class name for the pod (e.g., "nvidia" for GPU workloads)
+runtimeClassName: ""
\ No newline at end of file