From 3ca41a9b458051db1c3262a78d57fba6f696a90c Mon Sep 17 00:00:00 2001 From: Jagadish Krishnamoorthy Date: Mon, 23 Feb 2026 10:29:45 -0800 Subject: [PATCH 1/2] [ROCm] Pin PyTorch ROCm wheels to 20260219 CI tests on ROCm 7.1 are failing with `hipblaslt solution not found` when using the 20260220 PyTorch ROCm nightly wheels. Pin the ROCm wheels to `2.12.0.dev20260219+rocm7.1` to unblock CI until the upstream issue is resolved. Signed-off-by: Jagadish Krishnamoorthy --- .../workflows/integration_test_8gpu_autoparallel.yaml | 9 ++++++++- .../integration_test_8gpu_compiler_toolkit.yaml | 9 ++++++++- .github/workflows/integration_test_8gpu_features.yaml | 9 ++++++++- .github/workflows/integration_test_8gpu_h100.yaml | 9 ++++++++- .github/workflows/integration_test_8gpu_models.yaml | 9 ++++++++- .github/workflows/integration_test_8gpu_simple_fsdp.yaml | 9 ++++++++- .github/workflows/integration_test_8gpu_torchft.yaml | 9 ++++++++- ...egration_test_8gpu_transformers_modeling_backend.yaml | 9 ++++++++- .github/workflows/integration_test_8gpu_vlm.yaml | 9 ++++++++- .github/workflows/set-matrix.yaml | 3 ++- 10 files changed, 74 insertions(+), 10 deletions(-) diff --git a/.github/workflows/integration_test_8gpu_autoparallel.yaml b/.github/workflows/integration_test_8gpu_autoparallel.yaml index 7ce391f71a..3b8807d338 100644 --- a/.github/workflows/integration_test_8gpu_autoparallel.yaml +++ b/.github/workflows/integration_test_8gpu_autoparallel.yaml @@ -66,7 +66,14 @@ jobs: pip config --user set global.progress_bar off - python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }} + if [ "${{ matrix.gpu-arch-type }}" = "rocm" ]; then + python -m pip install --force-reinstall --pre \ + "torch==${{ matrix.torch-version }}" \ + --index-url ${{ matrix.index-url }} + else + python -m pip install --force-reinstall --pre \ + torch --index-url ${{ matrix.index-url }} + fi # Install autoparallel - required dependency for autoparallel experiment python -m pip install git+https://github.com/meta-pytorch/autoparallel.git diff --git a/.github/workflows/integration_test_8gpu_compiler_toolkit.yaml b/.github/workflows/integration_test_8gpu_compiler_toolkit.yaml index e0a500ead9..2cba7cd749 100644 --- a/.github/workflows/integration_test_8gpu_compiler_toolkit.yaml +++ b/.github/workflows/integration_test_8gpu_compiler_toolkit.yaml @@ -68,7 +68,14 @@ jobs: pip config --user set global.progress_bar off - python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }} + if [ "${{ matrix.gpu-arch-type }}" = "rocm" ]; then + python -m pip install --force-reinstall --pre \ + "torch==${{ matrix.torch-version }}" \ + --index-url ${{ matrix.index-url }} + else + python -m pip install --force-reinstall --pre \ + torch --index-url ${{ matrix.index-url }} + fi sudo mkdir -p "$RUNNER_TEMP/artifacts-to-be-uploaded" sudo chown -R $(id -u):$(id -g) "$RUNNER_TEMP/artifacts-to-be-uploaded" diff --git a/.github/workflows/integration_test_8gpu_features.yaml b/.github/workflows/integration_test_8gpu_features.yaml index 4e094d4eb1..7b83f6b72c 100644 --- a/.github/workflows/integration_test_8gpu_features.yaml +++ b/.github/workflows/integration_test_8gpu_features.yaml @@ -59,7 +59,14 @@ jobs: pip config --user set global.progress_bar off - python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }} + if [ "${{ matrix.gpu-arch-type }}" = "rocm" ]; then + python -m pip install --force-reinstall --pre \ + "torch==${{ matrix.torch-version }}" \ + --index-url ${{ matrix.index-url }} + else + python -m pip install --force-reinstall --pre \ + torch --index-url ${{ matrix.index-url }} + fi USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }} diff --git a/.github/workflows/integration_test_8gpu_h100.yaml b/.github/workflows/integration_test_8gpu_h100.yaml index cb5a4e704d..98accdb942 100644 --- a/.github/workflows/integration_test_8gpu_h100.yaml +++ b/.github/workflows/integration_test_8gpu_h100.yaml @@ -63,7 +63,14 @@ jobs: pip config --user set global.progress_bar off - python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }} + if [ "${{ matrix.gpu-arch-type }}" = "rocm" ]; then + python -m pip install --force-reinstall --pre \ + "torch==${{ matrix.torch-version }}" \ + --index-url ${{ matrix.index-url }} + else + python -m pip install --force-reinstall --pre \ + torch --index-url ${{ matrix.index-url }} + fi USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }} diff --git a/.github/workflows/integration_test_8gpu_models.yaml b/.github/workflows/integration_test_8gpu_models.yaml index acdbe9cb06..8679ba57f3 100644 --- a/.github/workflows/integration_test_8gpu_models.yaml +++ b/.github/workflows/integration_test_8gpu_models.yaml @@ -60,7 +60,14 @@ jobs: pip config --user set global.progress_bar off - python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }} + if [ "${{ matrix.gpu-arch-type }}" = "rocm" ]; then + python -m pip install --force-reinstall --pre \ + "torch==${{ matrix.torch-version }}" \ + --index-url ${{ matrix.index-url }} + else + python -m pip install --force-reinstall --pre \ + torch --index-url ${{ matrix.index-url }} + fi USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }} diff --git a/.github/workflows/integration_test_8gpu_simple_fsdp.yaml b/.github/workflows/integration_test_8gpu_simple_fsdp.yaml index 302cd46555..3f69bcd02e 100644 --- a/.github/workflows/integration_test_8gpu_simple_fsdp.yaml +++ b/.github/workflows/integration_test_8gpu_simple_fsdp.yaml @@ -61,7 +61,14 @@ jobs: pip config --user set global.progress_bar off - python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }} + if [ "${{ matrix.gpu-arch-type }}" = "rocm" ]; then + python -m pip install --force-reinstall --pre \ + "torch==${{ matrix.torch-version }}" \ + --index-url ${{ matrix.index-url }} + else + python -m pip install --force-reinstall --pre \ + torch --index-url ${{ matrix.index-url }} + fi sudo mkdir -p "$RUNNER_TEMP/artifacts-to-be-uploaded" sudo chown -R $(id -u):$(id -g) "$RUNNER_TEMP/artifacts-to-be-uploaded" diff --git a/.github/workflows/integration_test_8gpu_torchft.yaml b/.github/workflows/integration_test_8gpu_torchft.yaml index 0931fa75b7..551f72a74f 100644 --- a/.github/workflows/integration_test_8gpu_torchft.yaml +++ b/.github/workflows/integration_test_8gpu_torchft.yaml @@ -61,7 +61,14 @@ jobs: pip config --user set global.progress_bar off python -m pip install torchft-nightly - python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }} + if [ "${{ matrix.gpu-arch-type }}" = "rocm" ]; then + python -m pip install --force-reinstall --pre \ + "torch==${{ matrix.torch-version }}" \ + --index-url ${{ matrix.index-url }} + else + python -m pip install --force-reinstall --pre \ + torch --index-url ${{ matrix.index-url }} + fi USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }} sudo mkdir -p "$RUNNER_TEMP/artifacts-to-be-uploaded" diff --git a/.github/workflows/integration_test_8gpu_transformers_modeling_backend.yaml b/.github/workflows/integration_test_8gpu_transformers_modeling_backend.yaml index e5299ce93d..26efc68587 100644 --- a/.github/workflows/integration_test_8gpu_transformers_modeling_backend.yaml +++ b/.github/workflows/integration_test_8gpu_transformers_modeling_backend.yaml @@ -61,7 +61,14 @@ jobs: pip config --user set global.progress_bar off - python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }} + if [ "${{ matrix.gpu-arch-type }}" = "rocm" ]; then + python -m pip install --force-reinstall --pre \ + "torch==${{ matrix.torch-version }}" \ + --index-url ${{ matrix.index-url }} + else + python -m pip install --force-reinstall --pre \ + torch --index-url ${{ matrix.index-url }} + fi USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }} diff --git a/.github/workflows/integration_test_8gpu_vlm.yaml b/.github/workflows/integration_test_8gpu_vlm.yaml index 8584f1f5b9..487842cb73 100644 --- a/.github/workflows/integration_test_8gpu_vlm.yaml +++ b/.github/workflows/integration_test_8gpu_vlm.yaml @@ -61,7 +61,14 @@ jobs: pip config --user set global.progress_bar off - python -m pip install --force-reinstall --pre torch --index-url ${{ matrix.index-url }} + if [ "${{ matrix.gpu-arch-type }}" = "rocm" ]; then + python -m pip install --force-reinstall --pre \ + "torch==${{ matrix.torch-version }}" \ + --index-url ${{ matrix.index-url }} + else + python -m pip install --force-reinstall --pre \ + torch --index-url ${{ matrix.index-url }} + fi USE_CPP=0 python -m pip install --pre torchao --index-url ${{ matrix.index-url }} diff --git a/.github/workflows/set-matrix.yaml b/.github/workflows/set-matrix.yaml index e22717fbda..bc3f839c53 100644 --- a/.github/workflows/set-matrix.yaml +++ b/.github/workflows/set-matrix.yaml @@ -53,7 +53,8 @@ jobs: "gpu-arch-type": "rocm", "gpu-arch-version": "7.1", "docker-image": "torchtitan-rocm-ubuntu-22.04-clang12", - "index-url": "https://download.pytorch.org/whl/nightly/rocm7.1" + "index-url": "https://download.pytorch.org/whl/nightly/rocm7.1", + "torch-version": "2.12.0.dev20260219+rocm7.1" } EOF ) From 011399257dd19a7bf6c17cf10b672def24b76b84 Mon Sep 17 00:00:00 2001 From: Jagadish Krishnamoorthy Date: Mon, 23 Feb 2026 11:09:04 -0800 Subject: [PATCH 2/2] Use 2.12.0.dev20260218+rocm7.1 Signed-off-by: Jagadish Krishnamoorthy --- .github/workflows/set-matrix.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/set-matrix.yaml b/.github/workflows/set-matrix.yaml index bc3f839c53..ac0e72e47d 100644 --- a/.github/workflows/set-matrix.yaml +++ b/.github/workflows/set-matrix.yaml @@ -54,7 +54,7 @@ jobs: "gpu-arch-version": "7.1", "docker-image": "torchtitan-rocm-ubuntu-22.04-clang12", "index-url": "https://download.pytorch.org/whl/nightly/rocm7.1", - "torch-version": "2.12.0.dev20260219+rocm7.1" + "torch-version": "2.12.0.dev20260218+rocm7.1" } EOF )