diff --git a/.github/workflows/test-notebooks.yml b/.github/workflows/test-notebooks.yml new file mode 100644 index 0000000..a1eb8d7 --- /dev/null +++ b/.github/workflows/test-notebooks.yml @@ -0,0 +1,127 @@ +name: Test Notebooks + +on: + push: + branches: [ main, develop ] + pull_request: + branches: [ main ] + schedule: + # Run tests weekly to catch SDK updates + - cron: '0 6 * * 1' # Monday at 6 AM UTC + +jobs: + test-notebooks: + runs-on: ubuntu-latest + + # Lightweight syntax validation on a standard runner; execution on Trainium happens in the test-execution job below + + strategy: + matrix: + notebook-group: + - "NxD" + - "FineTuning" + - "vLLM" + - "NKI" + fail-fast: false + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.10' + + - name: Install test dependencies + run: | + pip install -r tests/requirements-test.txt + + - name: Validate notebook syntax (without execution) + run: | + # Just validate that notebooks are well-formed JSON + python -c " + import json + from pathlib import Path + + notebooks = list(Path('labs/${{ matrix.notebook-group }}').rglob('*.ipynb')) + print(f'Validating {len(notebooks)} notebooks in ${{ matrix.notebook-group }}') + + for nb_path in notebooks: + if '.ipynb_checkpoints' in str(nb_path): + continue + try: + with open(nb_path) as f: + json.load(f) + print(f'✓ {nb_path}') + except Exception as e: + print(f'✗ {nb_path}: {e}') + exit(1) + " + + - name: Check for required files + run: | + # Verify that notebooks reference existing files + python -c " + import json + from pathlib import Path + import re + + def check_notebook_references(nb_path): + with open(nb_path) as f: + nb = json.load(f) + + issues = [] + for cell in nb.get('cells', []): + if cell.get('cell_type') == 'code': + source = ''.join(cell.get('source', [])) + + # Check for file references + file_refs = re.findall(r'[\"\']([\w\./\-]+\.(?:txt|py|json|yaml|yml|sh))[\"\']', source) + for file_ref in file_refs: + if not file_ref.startswith('/') and not (nb_path.parent / file_ref).exists(): + issues.append(f'Missing file reference: {file_ref}') + + return issues + + notebooks = list(Path('labs/${{ matrix.notebook-group }}').rglob('*.ipynb')) + all_issues = [] + + for nb_path in notebooks: + if '.ipynb_checkpoints' in str(nb_path): + continue + issues = check_notebook_references(nb_path) + if issues: + all_issues.extend([f'{nb_path}: {issue}' for issue in issues]) + + if all_issues: + print('Found issues:') + for issue in all_issues: + print(f' {issue}') + # Don't fail on missing files for now, just warn + # exit(1) + else: + print('No file reference issues found') + " + + # Full notebook execution on Trainium instances + test-execution: + runs-on: self-hosted # Trainium instance with Neuron SDK + needs: test-notebooks + if: github.event_name == 'push' || github.event_name == 'schedule' + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Run notebook tests + run: | + # Run on actual Trainium hardware with Neuron SDK + ./tests/run_tests.sh --fast --html-report + + - name: Upload test results + uses: actions/upload-artifact@v4 + if: always() + with: + name: test-results-execution + path: test_report.html \ No newline at end of file diff --git a/labs/FineTuning/HuggingFaceExample/01_finetuning/FT-Qwen3-1.7B-chess.ipynb b/labs/FineTuning/HuggingFaceExample/01_finetuning/FT-Qwen3-1.7B-chess.ipynb index aedcc91..7b37dbe 100644 --- a/labs/FineTuning/HuggingFaceExample/01_finetuning/FT-Qwen3-1.7B-chess.ipynb +++ 
b/labs/FineTuning/HuggingFaceExample/01_finetuning/FT-Qwen3-1.7B-chess.ipynb @@ -54,7 +54,7 @@ "metadata": {}, "outputs": [], "source": [ - "%cd /home/ubuntu/environment/FineTuning/HuggingFaceExample/01_finetuning/assets\n", + "%cd /home/ubuntu/build-on-trainium-workshop/labs/FineTuning/HuggingFaceExample/01_finetuning/assets\n", "%pip install -q -r requirements.txt\n" ] }, @@ -95,7 +95,7 @@ " finetune_chess_model.py \\\n", " --model_id Qwen/Qwen3-1.7B \\\n", " --tokenizer_id Qwen/Qwen3-1.7B \\\n", - " --output_dir ~/environment/ml/qwen-chess \\\n", + " --output_dir ~/ml/qwen-chess \\\n", " --bf16 True \\\n", " --gradient_checkpointing True \\\n", " --gradient_accumulation_steps 1 \\\n", @@ -128,11 +128,11 @@ "outputs": [], "source": [ "!optimum-cli export neuron \\\n", - " --model /home/ubuntu/environment/ml/qwen-chess/merged_model \\\n", + " --model ~/ml/qwen-chess/merged_model \\\n", " --task text-generation \\\n", " --sequence_length 2048 \\\n", " --batch_size 4 \\\n", - " /home/ubuntu/environment/ml/qwen-chess/compiled_model\n" + " ~/ml/qwen-chess/compiled_model\n" ] }, { @@ -163,7 +163,7 @@ "from vllm import LLM, SamplingParams\n", "\n", "llm = LLM(\n", - " model=\"/home/ubuntu/environment/ml/qwen-chess/compiled_model\", #local compiled model\n", + " model=\"/home/ubuntu/ml/qwen-chess/compiled_model\", #full path to local compiled model\n", " max_num_seqs=4,\n", " max_model_len=2048,\n", " device=\"neuron\",\n", diff --git a/labs/FineTuning/HuggingFaceExample/01_finetuning/Finetune-Qwen3-1.7B.ipynb b/labs/FineTuning/HuggingFaceExample/01_finetuning/Finetune-Qwen3-1.7B.ipynb index cd2ada0..b06dff8 100644 --- a/labs/FineTuning/HuggingFaceExample/01_finetuning/Finetune-Qwen3-1.7B.ipynb +++ b/labs/FineTuning/HuggingFaceExample/01_finetuning/Finetune-Qwen3-1.7B.ipynb @@ -38,7 +38,7 @@ "metadata": {}, "outputs": [], "source": [ - "%cd /home/ubuntu/environment/FineTuning/HuggingFaceExample/01_finetuning/assets\n", + "%cd /home/ubuntu/build-on-trainium-workshop/labs/FineTuning/HuggingFaceExample/01_finetuning/assets\n", "%pip install -r requirements.txt" ] }, @@ -79,7 +79,7 @@ " finetune_model.py \\\n", " --model_id Qwen/Qwen3-1.7B \\\n", " --tokenizer_id Qwen/Qwen3-1.7B \\\n", - " --output_dir ~/environment/ml/qwen \\\n", + " --output_dir ~/ml/qwen \\\n", " --bf16 True \\\n", " --gradient_checkpointing True \\\n", " --gradient_accumulation_steps 1 \\\n", @@ -112,11 +112,11 @@ "outputs": [], "source": [ "!optimum-cli export neuron \\\n", - " --model /home/ubuntu/environment/ml/qwen/merged_model \\\n", + " --model ~/ml/qwen/merged_model \\\n", " --task text-generation \\\n", " --sequence_length 512 \\\n", " --batch_size 1 \\\n", - " /home/ubuntu/environment/ml/qwen/compiled_model" + " ~/ml/qwen/compiled_model" ] }, { @@ -146,7 +146,7 @@ "import os\n", "from vllm import LLM, SamplingParams\n", "llm = LLM(\n", - " model=\"/home/ubuntu/environment/ml/qwen/compiled_model\", #local compiled model\n", + " model=\"/home/ubuntu/ml/qwen/compiled_model\", #local compiled model\n", " max_num_seqs=1,\n", " max_model_len=2048,\n", " device=\"neuron\",\n", diff --git a/labs/FineTuning/HuggingFaceExample/02_inference/Inference-TinyLlama-1.1B.ipynb b/labs/FineTuning/HuggingFaceExample/02_inference/Inference-TinyLlama-1.1B.ipynb index acdf9d9..96b47d4 100644 --- a/labs/FineTuning/HuggingFaceExample/02_inference/Inference-TinyLlama-1.1B.ipynb +++ b/labs/FineTuning/HuggingFaceExample/02_inference/Inference-TinyLlama-1.1B.ipynb @@ -200,7 +200,7 @@ "s3_output_path=f\"{s3_orig_model_path}compiled_model/\"\n", 
"print(\"s3_output_path\",s3_output_path)\n", "training_job_name = utils.name_from_base(\"TGICompilation\")\n", - "print(\"training_job_name\")\n", + "print(\"training_job_name\",training_job_name)\n", "s3_model_path = f\"{s3_orig_model_path}merged_model/\"\n", "print(\"s3_model_path\",s3_model_path)\n", "\n", diff --git a/labs/NxD/Lab_One_NxDI.ipynb b/labs/NxD/Lab_One_NxDI.ipynb index 70b6f24..5dc5e3e 100644 --- a/labs/NxD/Lab_One_NxDI.ipynb +++ b/labs/NxD/Lab_One_NxDI.ipynb @@ -81,7 +81,7 @@ "metadata": {}, "outputs": [], "source": [ - "!hf download NousResearch/Llama-3.2-1B --local-dir /home/ubuntu/environment/models/llama/" + "!hf download NousResearch/Llama-3.2-1B --local-dir /home/ubuntu/build-on-trainium-workshop/labs/models/llama/" ] }, { @@ -101,7 +101,7 @@ "outputs": [], "source": [ "# the original checkpoint\n", - "model_path = '/home/ubuntu/environment/models/llama/'" + "model_path = '/home/ubuntu/build-on-trainium-workshop/labs/models/llama/'" ] }, { @@ -112,7 +112,7 @@ "outputs": [], "source": [ "# where your NxD trace will go\n", - "traced_model_path = '/home/ubuntu/environment/models/traced_llama'" + "traced_model_path = '/home/ubuntu/build-on-trainium-workshop/labs/models/traced_llama'" ] }, { diff --git a/labs/vLLM/Chess/Chess-Deployment.ipynb b/labs/vLLM/Chess/Chess-Deployment.ipynb index 183b986..9d2050c 100644 --- a/labs/vLLM/Chess/Chess-Deployment.ipynb +++ b/labs/vLLM/Chess/Chess-Deployment.ipynb @@ -109,7 +109,7 @@ "import os\n", "\n", "# Option 1: Use your fine-tuned model (update path if you completed fine-tuning lab)\n", - "FINE_TUNED_MODEL_PATH = \"/home/ubuntu/environment/ml/qwen-chess/compiled_model\" # default path from previous lab\n", + "FINE_TUNED_MODEL_PATH = \"/home/ubuntu/ml/qwen-chess/compiled_model\" # default path from previous lab\n", "\n", "# Option 2: Use pre-compiled model from HF repository (default), this will be pulled down by optimum-neuron-vllm\n", "PRECOMPILED_MODEL_PATH = \"kunhunjon/ChessLM_Qwen3_Trainium_AWS_Format\"\n", @@ -171,7 +171,7 @@ "metadata": {}, "outputs": [], "source": [ - "%cd /home/ubuntu/environment/vLLM/Chess/assets/\n", + "%cd /home/ubuntu/build-on-trainium-workshop/labs/vLLM/Chess/assets/\n", "# Update vllm.sh with correct model path\n", "import os\n", "import re\n", @@ -210,7 +210,7 @@ "nohup bash vllm.sh > vllm-server.log 2>&1 &\n", "\n", "echo \"Server starting (PID: $!)\"\n", - "echo \"Check logs from /home/ubuntu/environment: tail -f vLLM/Chess/assets/vllm-server/vllm-server.log\"\n" + "echo \"Check logs from /home/ubuntu/build-on-trainium-workshop/labs: tail -f vLLM/Chess/assets/vllm-server/vllm-server.log\"\n" ] }, { @@ -221,7 +221,7 @@ "\n", "You can monitor the server logs in the terminal:\n", "```bash\n", - "tail -f /home/ubuntu/environment/vLLM/Chess/assets/vllm-server/vllm-server.log\n", + "tail -f /home/ubuntu/build-on-trainium-workshop/labs/vLLM/Chess/assets/vllm-server/vllm-server.log\n", "```\n", "\n", "Or check Neuron core usage:\n", @@ -299,7 +299,7 @@ "outputs": [], "source": [ "# copy your env vars in\n", - "%cd /home/ubuntu/environment/vLLM/Chess/assets/\n", + "%cd /home/ubuntu/build-on-trainium-workshop/labs/vLLM/Chess/assets/\n", "! cp env.example .env\n" ] }, @@ -313,7 +313,7 @@ "%load_ext dotenv\n", "\n", "# 2. 
Load your .env file (adjust path if needed!)\n", - "%dotenv /home/ubuntu/environment/vLLM/Chess/assets/.env" + "%dotenv /home/ubuntu/build-on-trainium-workshop/labs/vLLM/Chess/assets/.env" ] }, { @@ -347,7 +347,7 @@ "metadata": {}, "outputs": [], "source": [ - "%cd /home/ubuntu/environment/vLLM/Chess/\n", + "%cd /home/ubuntu/build-on-trainium-workshop/labs/vLLM/Chess/\n", "\n", "from assets.agents.vllm_agent import VLLMAgent\n", "from assets.agents.stockfish_agent import StockfishAgent\n", diff --git a/labs/vLLM/Chess/Chess-Tournament.ipynb b/labs/vLLM/Chess/Chess-Tournament.ipynb index 69be459..2041044 100644 --- a/labs/vLLM/Chess/Chess-Tournament.ipynb +++ b/labs/vLLM/Chess/Chess-Tournament.ipynb @@ -73,7 +73,7 @@ "outputs": [], "source": [ "# copy your env vars in\n", - "%cd /home/ubuntu/environment/vLLM/Chess/assets/\n", + "%cd /home/ubuntu/build-on-trainium-workshop/labs/vLLM/Chess/assets/\n", "! cp env.example .env\n" ] }, @@ -87,7 +87,7 @@ "%load_ext dotenv\n", "\n", "# 2. Load your .env file (adjust path if needed!)\n", - "%dotenv /home/ubuntu/environment/vLLM/Chess/assets/.env" + "%dotenv /home/ubuntu/build-on-trainium-workshop/labs/vLLM/Chess/assets/.env" ] }, { @@ -123,7 +123,7 @@ "metadata": {}, "outputs": [], "source": [ - "%cd /home/ubuntu/environment/vLLM/Chess/\n", + "%cd /home/ubuntu/build-on-trainium-workshop/labs/vLLM/Chess/\n", "\n", "# Play single game: vLLM vs Stockfish (skill 5)\n", "!python -m assets.run_game \\\n", diff --git a/labs/vLLM/Chess/README.md b/labs/vLLM/Chess/README.md index a62b6ff..7e15a24 100644 --- a/labs/vLLM/Chess/README.md +++ b/labs/vLLM/Chess/README.md @@ -53,7 +53,7 @@ Learn how to: ```bash # Navigate to lab directory -cd /home/ubuntu/environment/neuron-workshops/labs/vLLM/Chess +cd /home/ubuntu/build-on-trainium-workshop/labs/vLLM/Chess #install stockfish sudo apt install stockfish diff --git a/labs/vLLM/Chess/Trainium2_Chess-Deployment.ipynb b/labs/vLLM/Chess/Trainium2_Chess-Deployment.ipynb index dd050ac..cb1d073 100644 --- a/labs/vLLM/Chess/Trainium2_Chess-Deployment.ipynb +++ b/labs/vLLM/Chess/Trainium2_Chess-Deployment.ipynb @@ -109,7 +109,7 @@ "import os\n", "\n", "# Option 1: Use your fine-tuned model (update path if you completed fine-tuning lab)\n", - "FINE_TUNED_MODEL_PATH = \"/home/ubuntu/environment/ml/qwen-chess/compiled_model\" # default path from previous lab\n", + "FINE_TUNED_MODEL_PATH = \"/home/ubuntu/ml/qwen-chess/compiled_model\" # default path from previous lab\n", "\n", "# Option 2: Use pre-compiled model from HF repository (default), this will be pulled down by optimum-neuron-vllm\n", "PRECOMPILED_MODEL_PATH = \"aws-neuron/ChessLM_Qwen3_Trainium_2_AWS_Format\"\n", @@ -171,7 +171,7 @@ "metadata": {}, "outputs": [], "source": [ - "%cd /home/ubuntu/environment/vLLM/Chess/assets/\n", + "%cd /home/ubuntu/build-on-trainium-workshop/labs/vLLM/Chess/assets/\n", "# Update vllm.sh with correct model path\n", "import os\n", "import re\n", @@ -210,7 +210,7 @@ "nohup bash vllm.sh > vllm-server.log 2>&1 &\n", "\n", "echo \"Server starting (PID: $!)\"\n", - "echo \"Check logs from /home/ubuntu/environment: tail -f vLLM/Chess/assets/vllm-server/vllm-server.log\"\n" + "echo \"Check logs from /home/ubuntu/build-on-trainium-workshop/labs: tail -f vLLM/Chess/assets/vllm-server/vllm-server.log\"\n" ] }, { @@ -221,7 +221,7 @@ "\n", "You can monitor the server logs in the terminal to look for \"Application startup complete\":\n", "```bash\n", - "tail -f /home/ubuntu/environment/vLLM/Chess/assets/vllm-server/vllm-server.log\n", + "tail -f 
/home/ubuntu/build-on-trainium-workshop/labs/vLLM/Chess/assets/vllm-server/vllm-server.log\n", "```\n", "\n", "Or check Neuron core usage:\n", @@ -299,7 +299,7 @@ "outputs": [], "source": [ "# copy your env vars in\n", - "%cd /home/ubuntu/environment/vLLM/Chess/assets/\n", + "%cd /home/ubuntu/build-on-trainium-workshop/labs/vLLM/Chess/assets/\n", "! cp env.example .env\n" ] }, @@ -313,7 +313,7 @@ "%load_ext dotenv\n", "\n", "# 2. Load your .env file (adjust path if needed!)\n", - "%dotenv /home/ubuntu/environment/vLLM/Chess/assets/.env\n", + "%dotenv /home/ubuntu/build-on-trainium-workshop/labs/vLLM/Chess/assets/.env\n", "%env VLLM_MODEL=aws-neuron/ChessLM_Qwen3_Trainium_2_AWS_Format" ] }, @@ -348,7 +348,7 @@ "metadata": {}, "outputs": [], "source": [ - "%cd /home/ubuntu/environment/vLLM/Chess/\n", + "%cd /home/ubuntu/build-on-trainium-workshop/labs/vLLM/Chess/\n", "\n", "from assets.agents.vllm_agent import VLLMAgent\n", "from assets.agents.stockfish_agent import StockfishAgent\n", diff --git a/labs/vLLM/Chess/Trainium2_Chess-Tournament.ipynb b/labs/vLLM/Chess/Trainium2_Chess-Tournament.ipynb index b3583ae..b8d28f2 100644 --- a/labs/vLLM/Chess/Trainium2_Chess-Tournament.ipynb +++ b/labs/vLLM/Chess/Trainium2_Chess-Tournament.ipynb @@ -73,7 +73,7 @@ "outputs": [], "source": [ "# copy your env vars in\n", - "%cd /home/ubuntu/environment/vLLM/Chess/assets/\n", + "%cd /home/ubuntu/build-on-trainium-workshop/labs/vLLM/Chess/assets/\n", "! cp env.example .env\n" ] }, @@ -87,7 +87,7 @@ "%load_ext dotenv\n", "\n", "# 2. Load your .env file (adjust path if needed!)\n", - "%dotenv /home/ubuntu/environment/vLLM/Chess/assets/.env\n", + "%dotenv /home/ubuntu/build-on-trainium-workshop/labs/vLLM/Chess/assets/.env\n", "%env VLLM_MODEL=aws-neuron/ChessLM_Qwen3_Trainium_2_AWS_Format" ] }, @@ -124,7 +124,7 @@ "metadata": {}, "outputs": [], "source": [ - "%cd /home/ubuntu/environment/vLLM/Chess/\n", + "%cd /home/ubuntu/build-on-trainium-workshop/labs/vLLM/Chess/\n", "\n", "# Play single game: vLLM vs Stockfish (skill 5)\n", "!python -m assets.run_game \\\n", diff --git a/labs/vLLM/Servers.ipynb b/labs/vLLM/Servers.ipynb index 14c519f..6110761 100644 --- a/labs/vLLM/Servers.ipynb +++ b/labs/vLLM/Servers.ipynb @@ -39,7 +39,7 @@ } ], "source": [ - "%pip install --quiet -r /home/ubuntu/environment/vLLM/upstreaming-to-vllm/requirements/neuron.txt\n", + "%pip install --quiet -r /home/ubuntu/build-on-trainium-workshop/labs/vLLM/upstreaming-to-vllm/requirements/neuron.txt\n", "#expected to produce no output for 4 or 5 minutes. Remove the --quiet flag if you want to see ALL the packages installed! Or look in the neuron.txt requirements doc." 
] }, @@ -49,7 +49,7 @@ "metadata": {}, "outputs": [], "source": [ - "!VLLM_TARGET_DEVICE=\"neuron\" pip install --quiet -e /home/ubuntu/environment/vLLM/upstreaming-to-vllm/.\n", + "!VLLM_TARGET_DEVICE=\"neuron\" pip install --quiet -e /home/ubuntu/build-on-trainium-workshop/labs/vLLM/upstreaming-to-vllm/.\n", "# expected to produce no output for 5 or 6 minutes" ] }, @@ -67,7 +67,7 @@ "metadata": {}, "outputs": [], "source": [ - "!hf download aws-neuron/Qwen3-8BSharded --local-dir /home/ubuntu/environment/qwen3\n", + "!hf download aws-neuron/Qwen3-8BSharded --local-dir /home/ubuntu/build-on-trainium-workshop/labs/qwen3\n", "#this could take 3-4 minutes" ] }, @@ -77,7 +77,7 @@ "metadata": {}, "outputs": [], "source": [ - "!hf download Qwen/Qwen3-8B --local-dir /home/ubuntu/environment/Qwen3-8B --exclude \"*.safetensors\"\n", + "!hf download Qwen/Qwen3-8B --local-dir /home/ubuntu/build-on-trainium-workshop/labs/Qwen3-8B --exclude \"*.safetensors\"\n", "#This is the stock model. It will only take seconds because we don't need to download the weights." ] }, @@ -104,10 +104,10 @@ "import os\n", "from vllm import LLM, SamplingParams\n", "os.environ['VLLM_NEURON_FRAMEWORK'] = \"neuronx-distributed-inference\"\n", - "os.environ['NEURON_COMPILED_ARTIFACTS'] = \"/home/ubuntu/environment/qwen3\"\n", + "os.environ['NEURON_COMPILED_ARTIFACTS'] = \"/home/ubuntu/build-on-trainium-workshop/labs/qwen3\"\n", "#os.environ['BASE_COMPILE_WORK_DIR'] = \"/home/ubuntu/qwen3/\"\n", "llm = LLM(\n", - " model=\"/home/ubuntu/environment/Qwen3-8B\", #model weights\n", + " model=\"/home/ubuntu/build-on-trainium-workshop/labs/Qwen3-8B\", #model weights\n", " max_num_seqs=1,\n", " max_model_len=1024,\n", " device=\"neuron\",\n", diff --git a/tests/README_TESTING.md b/tests/README_TESTING.md new file mode 100644 index 0000000..6ab6f5b --- /dev/null +++ b/tests/README_TESTING.md @@ -0,0 +1,63 @@ +# Notebook Testing Setup + +This repository includes testing for all Jupyter notebooks to ensure they work correctly after SDK updates and changes. + +**Note: this is a work in progress and most tests will currently fail. 
Additionally, test order matters when notebooks install components.** + +## Quick Start + +```bash +# Clone repository +git clone https://github.com/aws-neuron/build-on-trainium-workshop.git +cd build-on-trainium-workshop + +# Run all tests +./tests/run_tests.sh + +# Run specific notebook +./tests/run_tests.sh --notebook "vLLM/Chess/Chess-Deployment.ipynb" + +# Run with HTML report +./tests/run_tests.sh --html-report +``` + +## What Gets Tested + +- **All notebooks execute without errors** +- **Cell outputs match expected results** (with nbval) +- **Import statements work correctly** +- **File paths resolve properly** +- **Model downloads and compilation succeed** +- **Hardware resources are available** + +## Test Categories + +| Category | Timeout | Description | +|----------|---------|-------------| +| NxD | 30 min | Model compilation and inference | +| FineTuning | 60 min | Model training and fine-tuning | +| vLLM | 30 min | Server setup and deployment | +| NKI | 15 min | Kernel development | + +## Environment + +- **Assumes**: Neuron SDK always available at `/opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/` + +- **Requires**: Trainium instance (trn1.2xlarge+) +- **Uses**: nbval + pytest for execution and validation + +## Files + +- `run_tests.sh` - Main test runner script +- `test_notebooks.py` - Test discovery and configuration +- `pytest.ini` - Pytest configuration +- `requirements-test.txt` - Test dependencies +- `TESTING.md` - Detailed testing guide + +## CI/CD + +GitHub Actions automatically: +- Validates notebook syntax on PRs +- Runs full execution tests on pushes +- Runs weekly regression tests + +See `TESTING.md` for complete documentation. diff --git a/tests/TESTING.md b/tests/TESTING.md new file mode 100644 index 0000000..0bcad19 --- /dev/null +++ b/tests/TESTING.md @@ -0,0 +1,276 @@ +# Notebook Testing Guide + +This repository includes automated testing for all Jupyter notebooks to ensure they work correctly after SDK updates and changes. + +## Overview + +The testing system uses: +- **nbval**: A pytest plugin that executes notebooks and validates outputs +- **pytest**: Test framework with parallel execution support +- **papermill**: Alternative execution engine for notebooks +- **GitHub Actions**: Automated testing on code changes + +## Quick Start + +### Prerequisites + +1. **AWS Trainium Instance**: trn1.2xlarge or larger +2. 
**Neuron SDK**: Pre-installed with virtual environment at `/opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/` + +```bash +git clone https://github.com/aws-neuron/build-on-trainium-workshop.git +cd build-on-trainium-workshop +``` + +### Running Tests + +#### Run All Notebooks +```bash +./tests/run_tests.sh +``` + +#### Run Specific Notebook +```bash +./tests/run_tests.sh --notebook "vLLM/Chess/Chess-Deployment.ipynb" +``` + +#### Run with Options +```bash +# Fast mode (stop on first failure) +./tests/run_tests.sh --fast + +# Generate HTML report +./tests/run_tests.sh --html-report + +# Verbose output +./tests/run_tests.sh --verbose +``` + +#### Run Specific Lab Category +```bash +# Test only NxD notebooks +pytest labs/NxD/ --nbval + +# Test only Fine-tuning notebooks +pytest labs/FineTuning/ --nbval + +# Test only vLLM notebooks +pytest labs/vLLM/ --nbval +``` + +## Test Configuration + +### Timeouts + +Different notebook categories have different timeout settings: + +- **NxD Labs**: 30 minutes (model compilation) +- **Fine-tuning Labs**: 60 minutes (training time) +- **vLLM Labs**: 30 minutes (server setup) +- **NKI Labs**: 15 minutes (kernel development) + +### Environment Requirements + +Each notebook category requires: +- Neuron SDK virtual environment +- Specific working directory +- Hardware resources (Neuron cores) + +## Understanding Test Results + +### Success ✅ +``` +labs/vLLM/Chess/Chess-Deployment.ipynb::test_notebook PASSED +``` + +### Failure ❌ +``` +labs/NxD/Lab_One_NxDI.ipynb::test_notebook FAILED +``` + +### Skipped ⏭️ +``` +labs/FineTuning/Finetune-Qwen3-1.7B.ipynb::test_notebook SKIPPED +``` +*Usually means the notebook has skip conditions or hardware constraints* + +### Common Failure Types + +1. **Import Errors**: Missing dependencies +2. **Path Errors**: Incorrect file paths +3. **Hardware Errors**: Neuron cores not available +4. **Timeout Errors**: Notebook took too long +5. **Output Mismatch**: Cell output changed (may be expected) + +## Handling Test Updates + +### When Notebooks Change + +If you update a notebook and the output legitimately changes: + +1. **Run the notebook manually** to verify it works +2. **Update the stored outputs** by re-executing the notebook and saving it (`--nbval-lax` only skips output comparison; it does not rewrite outputs) +3. **Commit the updated notebook** with new outputs + +```bash +# Re-execute in place so the stored outputs are refreshed +jupyter nbconvert --to notebook --execute --inplace labs/your-notebook.ipynb +``` + +### When SDK Updates + +After Neuron SDK updates: + +1. **Run all tests** to identify issues +2. **Update notebooks** as needed for new SDK +3. **Update requirements** if dependencies changed +4. 
**Update timeouts** if performance characteristics changed + +## Advanced Usage + +### Custom Test Markers + +```bash +# Run only fast tests +pytest -m "not slow" + +# Run only GPU/Neuron tests +pytest -m gpu + +# Run integration tests +pytest -m integration +``` + +### Debugging Failed Tests + +```bash +# Stop on first failure with detailed output +pytest --maxfail=1 -vv --tb=long + +# Raise the per-cell execution timeout (seconds) +pytest --nbval-cell-timeout=300 notebook.ipynb + +# Skip output comparison (execution only) +pytest --nbval-lax notebook.ipynb +``` + +### Sequential Execution + +```bash +# All notebooks run sequentially to avoid Neuron device conflicts +pytest labs/ --nbval + +# Control execution order if needed +pytest labs/NxD/ labs/FineTuning/ labs/vLLM/ labs/NKI/ --nbval +``` + +## CI/CD Integration + +### GitHub Actions + +The repository includes automated testing via GitHub Actions: + +- **Syntax Validation**: Runs on every PR +- **Execution Tests**: Runs on Trainium instances (when available) +- **Scheduled Tests**: Weekly runs to catch SDK issues + +### Local Pre-commit + +Set up pre-commit hooks to validate notebooks before pushing: + +```bash +# Install pre-commit +pip install pre-commit + +# Set up hooks (create .pre-commit-config.yaml) +pre-commit install + +# Run manually +pre-commit run --all-files +``` + +## Troubleshooting + +### Virtual Environment Issues + +```bash +# Check if Neuron SDK is available +ls -la /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/ + +# Manually activate and test +source /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/bin/activate +python -c "import neuronx_distributed_inference" +``` + +### Path Issues + +```bash +# Verify you're in the right directory +pwd # Should end with /build-on-trainium-workshop + +# Check labs directory exists +ls -la labs/ + +# Verify paths in notebooks match +grep -r "/home/ubuntu/build-on-trainium-workshop" labs/ +``` + +### Hardware Issues + +```bash +# Check Neuron cores +neuron-ls + +# Check system resources +neuron-top + +# Verify instance type +curl -s http://169.254.169.254/latest/meta-data/instance-type +``` + +### Test Dependencies + +```bash +# Install/update test dependencies +pip install -r tests/requirements-test.txt + +# Check nbval installation +python -c "import nbval; print(nbval.__version__)" + +# Verify pytest plugins +pytest --version +``` + +## Best Practices + +### For Notebook Authors + +1. **Keep outputs clean**: Clear unnecessary outputs before committing +2. **Use relative paths**: Avoid hardcoded absolute paths where possible +3. **Add error handling**: Include try/except for expected failures +4. **Document requirements**: Note any special setup in markdown cells +5. **Test locally**: Run notebooks manually before committing + +### For Maintainers + +1. **Run tests regularly**: Especially after SDK updates +2. **Monitor timeouts**: Adjust as hardware/software performance changes +3. **Update dependencies**: Keep test requirements current +4. **Review failures**: Distinguish between real issues and expected changes +5. **Document changes**: Update this guide when test setup changes + +## Configuration Files + +- `pytest.ini`: Main pytest configuration +- `requirements-test.txt`: Test dependencies +- `test_notebooks.py`: Test discovery and configuration +- `run_tests.sh`: Convenience script for running tests +- `.github/workflows/test-notebooks.yml`: CI/CD configuration + +## Getting Help + +1. **Check logs**: Look at detailed pytest output +2. **Run manually**: Execute problematic notebooks by hand +3. 
**Check environment**: Verify Neuron SDK and dependencies +4. **Review changes**: Compare with working versions +5. **Ask for help**: Include full error messages and environment details diff --git a/tests/pip-freeze-2.25-known-working.txt b/tests/pip-freeze-2.25-known-working.txt new file mode 100644 index 0000000..78775f1 --- /dev/null +++ b/tests/pip-freeze-2.25-known-working.txt @@ -0,0 +1,273 @@ +absl-py==2.3.1 +accelerate==1.8.1 +aiohappyeyeballs==2.6.1 +aiohttp==3.12.15 +aiosignal==1.4.0 +amqp==5.3.1 +annotated-types==0.7.0 +ansicolors==1.1.8 +anyio==4.9.0 +argon2-cffi==25.1.0 +argon2-cffi-bindings==25.1.0 +arrow==1.3.0 +astroid==3.3.11 +asttokens==3.0.0 +async-lru==2.0.5 +async-timeout==5.0.1 +attrs==25.3.0 +Automat==25.4.16 +awscli==1.41.17 +babel==2.17.0 +beautifulsoup4==4.13.4 +billiard==4.2.1 +bleach==6.2.0 +blobfile==3.0.0 +boto3==1.39.17 +botocore==1.39.17 +build==1.2.2.post1 +celery==5.5.3 +certifi==2025.7.14 +cffi==1.17.1 +charset-normalizer==3.4.2 +click==8.2.1 +click-didyoumean==0.3.1 +click-plugins==1.1.1.2 +click-repl==0.3.0 +cloudpickle==3.1.1 +cmake==4.0.3 +colorama==0.4.6 +comm==0.2.3 +constantly==23.10.4 +contourpy==1.3.2 +cryptography==45.0.5 +cssselect==1.3.0 +cycler==0.12.1 +dask==2025.7.0 +datasets==3.6.0 +debugpy==1.8.15 +decorator==5.2.1 +defusedxml==0.7.1 +dill==0.3.8 +distlib==0.4.0 +docstring_parser==0.17.0 +docutils==0.19 +ec2-metadata==2.14.0 +entrypoints==0.4 +environment-kernels==1.2.0 +exceptiongroup==1.3.0 +executing==2.2.0 +fastapi==0.116.1 +fastjsonschema==2.21.1 +filelock==3.18.0 +fonttools==4.59.0 +fqdn==1.5.1 +frozenlist==1.7.0 +fsspec==2025.3.0 +h11==0.16.0 +hf-xet==1.1.5 +httpcore==1.0.9 +httpie==3.2.4 +httpx==0.28.1 +huggingface-hub==0.33.4 +hyperlink==21.0.0 +idna==3.10 +imageio==2.37.0 +importlib_metadata==8.7.0 +incremental==24.7.2 +iniconfig==2.1.0 +ipykernel==6.30.0 +ipython==8.37.0 +ipywidgets==8.1.7 +islpy==2023.2.5 +isoduration==20.11.0 +isort==6.0.1 +itemadapter==0.12.0 +itemloaders==1.3.2 +jedi==0.19.2 +Jinja2==3.1.6 +jmespath==1.0.1 +joblib==1.5.1 +json5==0.12.0 +jsonpointer==3.0.0 +jsonschema==4.25.0 +jsonschema-specifications==2025.4.1 +jupyter==1.1.1 +jupyter-console==6.6.3 +jupyter-events==0.12.0 +jupyter-lsp==2.2.6 +jupyter_client==8.6.3 +jupyter_core==5.8.1 +jupyter_server==2.16.0 +jupyter_server_terminals==0.5.3 +jupyterlab==4.4.5 +jupyterlab_pygments==0.3.0 +jupyterlab_server==2.27.3 +jupyterlab_widgets==3.0.15 +kiwisolver==1.4.8 +kombu==5.5.4 +lark==1.2.2 +libneuronxla==2.2.8201.0+f46ac1ef +llvmlite==0.44.0 +locket==1.0.0 +lockfile==0.12.2 +lxml==6.0.0 +markdown-it-py==3.0.0 +MarkupSafe==3.0.2 +matplotlib==3.10.3 +matplotlib-inline==0.1.7 +mccabe==0.7.0 +mdurl==0.1.2 +mistune==3.1.3 +ml_dtypes==0.5.3 +mpmath==1.3.0 +multidict==6.6.3 +multiprocess==0.70.16 +narwhals==2.0.1 +nbclient==0.10.2 +nbconvert==7.16.6 +nbformat==5.10.4 +nest-asyncio==1.6.0 +networkx==2.8.8 +neuronx-cc==2.20.9961.0+0acef03a +neuronx-distributed==0.14.18122+d467a294 +neuronx-distributed-inference==0.5.9230+dcf1e2da +notebook==7.4.4 +notebook_shim==0.2.4 +numba==0.61.2 +numpy==1.25.2 +nvidia-cublas-cu12==12.6.4.1 +nvidia-cuda-cupti-cu12==12.6.80 +nvidia-cuda-nvrtc-cu12==12.6.77 +nvidia-cuda-runtime-cu12==12.6.77 +nvidia-cudnn-cu12==9.5.1.17 +nvidia-cufft-cu12==11.3.0.4 +nvidia-cufile-cu12==1.11.1.6 +nvidia-curand-cu12==10.3.7.77 +nvidia-cusolver-cu12==11.7.1.2 +nvidia-cusparse-cu12==12.5.4.2 +nvidia-cusparselt-cu12==0.6.3 +nvidia-nccl-cu12==2.26.2 +nvidia-nvjitlink-cu12==12.6.85 +nvidia-nvtx-cu12==12.6.77 +opencv-python==4.12.0.88 +optimum==1.24.0 
+optimum-neuron==0.3.0 +overrides==7.7.0 +packaging==25.0 +pandas==2.3.1 +pandocfilters==1.5.1 +papermill==2.6.0 +parsel==1.10.0 +parso==0.8.4 +partd==1.4.2 +peft==0.16.0 +pexpect==4.9.0 +pgzip==0.3.5 +pillow==11.3.0 +pip-tools==7.4.1 +pipenv==2025.0.4 +platformdirs==4.3.8 +plotly==6.2.0 +pluggy==1.6.0 +prometheus_client==0.22.1 +prompt_toolkit==3.0.51 +propcache==0.3.2 +Protego==0.5.0 +protobuf==3.20.3 +psutil==7.0.0 +ptyprocess==0.7.0 +pure_eval==0.2.3 +pyarrow==21.0.0 +pyasn1==0.6.1 +pyasn1_modules==0.4.2 +pycparser==2.22 +pycryptodomex==3.23.0 +pydantic==2.11.7 +pydantic_core==2.33.2 +PyDispatcher==2.0.7 +Pygments==2.19.2 +pylint==3.3.7 +pyOpenSSL==25.1.0 +pyparsing==3.2.3 +pyproject_hooks==1.2.0 +PySocks==1.7.1 +pytest==8.4.1 +python-daemon==3.1.2 +python-dateutil==2.9.0.post0 +python-json-logger==3.3.0 +pytz==2025.2 +PyYAML==6.0.2 +pyzmq==27.0.0 +queuelib==1.8.0 +referencing==0.36.2 +regex==2025.7.34 +requests==2.32.4 +requests-file==2.1.0 +requests-toolbelt==1.0.0 +requests-unixsocket==0.4.1 +rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 +rfc3987-syntax==1.1.0 +rich==14.1.0 +rpds-py==0.26.0 +rsa==4.7.2 +s3transfer==0.13.1 +safetensors==0.5.3 +scikit-learn==1.7.1 +scipy==1.12.0 +Scrapy==2.13.3 +seaborn==0.13.2 +Send2Trash==1.8.3 +sentencepiece==0.2.0 +service-identity==24.2.0 +shap==0.48.0 +shtab==1.7.2 +six==1.17.0 +slicer==0.0.8 +sniffio==1.3.1 +soupsieve==2.7 +stack-data==0.6.3 +starlette==0.47.2 +sympy==1.14.0 +tenacity==9.1.2 +terminado==0.18.1 +threadpoolctl==3.6.0 +tinycss2==1.4.0 +tldextract==5.3.0 +tokenizers==0.21.4 +tomli==2.2.1 +tomlkit==0.13.3 +toolz==1.0.0 +torch==2.7.1 +torch-neuronx==2.7.0.2.9.9357+08e1f40d +torch-xla==2.7.0 +torchvision==0.22.1 +tornado==6.5.1 +tqdm==4.67.1 +traitlets==5.14.3 +transformers==4.51.3 +triton==3.3.1 +trl==0.11.4 +Twisted==25.5.0 +typeguard==4.4.4 +types-python-dateutil==2.9.0.20250708 +typing-inspection==0.4.1 +typing_extensions==4.14.1 +tyro==0.9.32 +tzdata==2025.2 +uri-template==1.3.0 +urllib3==2.5.0 +vine==5.1.0 +virtualenv==20.32.0 +w3lib==2.3.1 +wcwidth==0.2.13 +webcolors==24.11.1 +webencodings==0.5.1 +websocket-client==1.8.0 +wget==3.2 +widgetsnbextension==4.0.14 +xxhash==3.5.0 +yarl==1.20.1 +zipp==3.23.0 +zope.interface==7.2 
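The known-working freeze above is most useful when drift from it can be detected mechanically after an SDK update. A minimal sketch of such a check (a hypothetical `tests/check_freeze_drift.py` helper, not part of this diff; only the freeze-file path is taken from above, and it uses only the standard library):

```python
# check_freeze_drift.py (hypothetical helper, not part of this diff)
# Reports packages whose installed version differs from the known-working pins.
from importlib.metadata import distributions
from pathlib import Path

PINS_FILE = Path("tests/pip-freeze-2.25-known-working.txt")


def parse_pins(text: str) -> dict:
    """Collect 'name==version' lines; comments and anything else are ignored."""
    pins = {}
    for line in text.splitlines():
        name, sep, version = line.partition("==")
        if sep and name and not name.startswith("#"):
            pins[name.strip().lower()] = version.strip()
    return pins


known = parse_pins(PINS_FILE.read_text())
installed = {d.metadata["Name"].lower(): d.version for d in distributions()}

drifted = [name for name in sorted(known) if installed.get(name) != known[name]]
for name in drifted:
    print(f"DRIFT {name}: pinned {known[name]}, installed {installed.get(name)}")
if not drifted:
    print("Environment matches the known-working pins.")
```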
diff --git a/tests/pytest.ini b/tests/pytest.ini new file mode 100644 index 0000000..852a2cd --- /dev/null +++ b/tests/pytest.ini @@ -0,0 +1,36 @@ +[pytest] +# Pytest configuration for notebook testing + +# Add nbval plugin for notebook validation +addopts = + --nbval + --nbval-lax + --tb=short + --maxfail=3 + -v + +# Timeout for notebook execution (in seconds) +timeout = 1800 + +# Ignore patterns +norecursedirs = + .git + .ipynb_checkpoints + __pycache__ + .pytest_cache + +# Test discovery patterns +python_files = test_*.py *_test.py +python_classes = Test* +python_functions = test_* + +# Markers for different types of tests +markers = + slow: marks tests as slow (deselect with '-m "not slow"') + gpu: marks tests as requiring GPU/Neuron hardware + integration: marks tests as integration tests + unit: marks tests as unit tests + +# Notebook-specific settings +nbval_ignore_stderr = true +nbval_skip_on_error = false diff --git a/tests/requirements-test.txt b/tests/requirements-test.txt new file mode 100644 index 0000000..404129c --- /dev/null +++ b/tests/requirements-test.txt @@ -0,0 +1,19 @@ +# Testing dependencies for Neuron Workshop notebooks + +# Core testing framework +pytest>=7.0.0 +pytest-timeout>=2.1.0 # Timeout support + +# Notebook testing +nbval>=0.10.0 # Notebook validation plugin for pytest +papermill>=2.4.0 # Alternative notebook execution engine +nbformat>=5.4.0 # Notebook format support +jupyter>=1.0.0 # Jupyter ecosystem + +# Additional utilities +pytest-html>=3.1.0 # HTML test reports +pytest-cov>=4.0.0 # Coverage reporting (optional) +pytest-mock>=3.8.0 # Mocking support + +# Environment and path handling +python-dotenv>=0.19.0 # Environment variable support diff --git a/tests/run_tests.sh b/tests/run_tests.sh new file mode 100755 index 0000000..db06fca --- /dev/null +++ b/tests/run_tests.sh @@ -0,0 +1,109 @@ +#!/bin/bash +# Test runner script for Neuron Workshop notebooks +# This script sets up the environment and runs notebook tests + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +echo -e "${GREEN}Neuron Workshop Notebook Testing${NC}" +echo "==================================" + +# Check if we're in the right directory and navigate if needed +if [ ! -d "labs" ]; then + # Try to find the build-on-trainium-workshop repository root + if [ -d "../build-on-trainium-workshop/labs" ]; then + echo -e "${YELLOW}Changing to build-on-trainium-workshop directory...${NC}" + cd ../build-on-trainium-workshop + elif [ -d "../labs" ]; then + echo -e "${YELLOW}Changing to parent directory...${NC}" + cd .. + else + echo -e "${RED}Error: labs directory not found. 
Please run from the repository root or parent directory.${NC}" + echo -e "${RED}Current directory: $(pwd)${NC}" + echo -e "${RED}Looking for: labs/ directory${NC}" + exit 1 + fi +fi + +echo -e "${GREEN}Working directory: $(pwd)${NC}" + +# Activate Neuron SDK virtual environment (required) +NEURON_VENV="/opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/bin/activate" +if [ -f "$NEURON_VENV" ]; then + echo -e "${GREEN}Activating Neuron SDK virtual environment...${NC}" + source "$NEURON_VENV" +else + echo -e "${RED}Error: Neuron SDK virtual environment not found at $NEURON_VENV${NC}" + echo -e "${RED}Please ensure you're running on a Trainium instance with Neuron SDK installed.${NC}" + exit 1 +fi + +# Install test dependencies +echo -e "${GREEN}Installing test dependencies...${NC}" +pip install -r tests/requirements-test.txt + +# Set environment variables for testing +export LABS_DIR="$(pwd)/labs" +export PYTHONPATH="$(pwd):$PYTHONPATH" + +# Parse command line arguments +PYTEST_ARGS="" +SPECIFIC_NOTEBOOK="" +REPORT_HTML="" + +while [[ $# -gt 0 ]]; do + case $1 in + --notebook) + SPECIFIC_NOTEBOOK="$2" + shift 2 + ;; + --html-report) + REPORT_HTML="--html=test_report.html --self-contained-html" + shift + ;; + --fast) + PYTEST_ARGS="$PYTEST_ARGS -x --tb=line" + shift + ;; + --verbose) + PYTEST_ARGS="$PYTEST_ARGS -vv" + shift + ;; + *) + PYTEST_ARGS="$PYTEST_ARGS $1" + shift + ;; + esac +done + +# Run tests +echo -e "${GREEN}Running notebook tests...${NC}" + echo "Working directory: $(pwd)" +echo "Labs directory: $LABS_DIR" + +if [ -n "$SPECIFIC_NOTEBOOK" ]; then + echo -e "${YELLOW}Testing specific notebook: $SPECIFIC_NOTEBOOK${NC}" + pytest $PYTEST_ARGS $REPORT_HTML --nbval "labs/$SPECIFIC_NOTEBOOK" || TEST_EXIT=$? +else + echo -e "${YELLOW}Testing all notebooks in labs/${NC}" + echo "Discovering notebooks..." + find labs/ -name "*.ipynb" -not -path "*/.ipynb_checkpoints/*" | head -5 + echo "" + pytest $PYTEST_ARGS $REPORT_HTML --nbval labs/ || TEST_EXIT=$? +fi + +# Check exit code (captured via || above so 'set -e' does not abort before we can report) +if [ "${TEST_EXIT:-0}" -eq 0 ]; then + echo -e "${GREEN}✓ All tests passed!${NC}" + if [ -n "$REPORT_HTML" ]; then + echo -e "${GREEN}HTML report generated: test_report.html${NC}" + fi +else + echo -e "${RED}✗ Some tests failed.${NC}" + exit 1 +fi diff --git a/tests/test_notebooks.py b/tests/test_notebooks.py new file mode 100644 index 0000000..9fc1a5f --- /dev/null +++ b/tests/test_notebooks.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python3 +""" +Notebook discovery utility for Neuron Workshop testing. +This file helps with test organization but nbval handles the actual execution. 
+""" +from pathlib import Path + +# Define notebook timeouts by category +NOTEBOOK_TIMEOUTS = { + "NxD": 1800, # 30 minutes for model compilation + "FineTuning": 3600, # 60 minutes for fine-tuning + "vLLM": 1800, # 30 minutes for vLLM setup + "NKI": 900, # 15 minutes for NKI labs +} + +def get_notebook_timeout(notebook_path: str) -> int: + """Get timeout for a notebook based on its path.""" + for category, timeout in NOTEBOOK_TIMEOUTS.items(): + if f"labs/{category}/" in notebook_path: + return timeout + return 900 # Default 15 minutes + +def get_notebooks(): + """Discover all notebook files in the labs directory.""" + notebooks = [] + labs_dir = Path("labs") + + if not labs_dir.exists(): + return notebooks + + for notebook_path in labs_dir.rglob("*.ipynb"): + # Skip checkpoint files + if ".ipynb_checkpoints" in str(notebook_path): + continue + + timeout = get_notebook_timeout(str(notebook_path)) + notebooks.append((str(notebook_path), timeout)) + + return notebooks + +if __name__ == "__main__": + # Print discovered notebooks for debugging + notebooks = get_notebooks() + print(f"Discovered {len(notebooks)} notebooks:") + for nb_path, timeout in notebooks: + print(f" {nb_path} (timeout: {timeout}s)")