diff --git a/README.md b/README.md
index 18792af0..1bd2a2a6 100644
--- a/README.md
+++ b/README.md
@@ -1,22 +1,27 @@
-# KernelAgent — Multi‑Agent GPU Kernel Synthesis
+# KernelAgent — Multi‑Agent GPU Kernel Synthesis and Optimization
-KernelAgent turns PyTorch programs into verified Triton kernels. It was designed around KernelBench workloads and combines:
+KernelAgent turns PyTorch programs into verified Triton kernels and optimizes their performance. It was designed around KernelBench workloads and combines:
- Static problem analysis to decide whether to run a lightweight path or a full pipeline
- LLM‑assisted refactoring that isolates fusable subgraphs
- Parallel Triton kernel generation with strict runtime verification
- End‑to‑end composition that rebuilds the original forward pass using only the synthesized kernels
+- Hardware‑guided optimization pipeline that iteratively improves performance
Blog post: [PyTorch KernelFalcon](https://pytorch.org/blog/kernelfalcon-autonomous-gpu-kernel-generation-via-deep-agents/)
-Additional docs: coming soon
-## Pipeline Overview
+## Kernel Generation Pipeline Overview
![](./assets/kernelagent2.excalidraw.svg)
Every stage writes artifacts to a run directory under `.fuse//`, including the fused PyTorch code, `subgraphs.json`, individual KernelAgent sessions, and the final `compose_out/composed_kernel.py`.
+## KernelAgent Multi-Worker Optimization Pipeline Overview
+![](./assets/opt_agent.svg)
+Every stage writes artifacts to a run directory under `.optimize//`, including the input Triton kernel, intermediate artifacts, individual optimization worker sessions, and the final `output/best_kernel.py`.
+
+
## Quickstart
### Requirements
@@ -143,6 +148,7 @@ More knobs live in `triton_kernel_agent/agent.py` and `Fuser/config.py`.
- Triton KernelAgent UI: `kernel-agent` or `python scripts/triton_ui.py` - Fuser orchestration UI: `fuser-ui` or `python scripts/fuser_ui` - Full pipeline UI: `pipeline-ui` or `python scripts/pipeline_ui` + - Optimization UI: `optimization-ui` or `python scripts/optimization_ui.py` ## Component Details @@ -158,6 +164,49 @@ More knobs live in `triton_kernel_agent/agent.py` and `Fuser/config.py`. - **Composer (`Fuser/compose_end_to_end.py`)**: stitches the verified kernels back into a single Triton program. The composed file contains one or more `@triton.jit` kernels plus a `kernel_function(...)` wrapper and a self-test that replays the original PyTorch problem. With `--verify`, the test is executed immediately and must succeed. +## Kernel Optimization Pipeline + +KernelAgent includes a hardware-guided optimization pipeline that iteratively improves a verified Triton kernel's performance using GPU profiling feedback. + +1. **Profile** — NCU collects 28 hardware metrics (compute utilization, memory bandwidth, cache hit rates, occupancy, stall breakdowns) +2. **Roofline Analysis** — Classifies the kernel as memory-bound, compute-bound, or underutilized based on SOL (speed-of-light) percentages +3. **Bottleneck Diagnosis** — An LLM analyzes the NCU metrics + kernel code to identify root causes and recommend specific fixes +4. **Optimization** — An LLM generates an optimized kernel applying the recommended fixes +5. **Verification** — The optimized kernel is tested for numerical correctness against PyTorch reference +6. **Benchmarking** — CUDA event timing measures the new kernel, tracking best-so-far with divergence-based revert + +The loop runs for up to N rounds, with early termination when the kernel reaches roofline (≥95% SOL) or when performance converges. 
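+The loop above can be sketched in a few lines of Python. This is an illustrative stand-in, not the actual orchestrator API: the function names (`profile`, `diagnose`, `rewrite`, `verify`, `benchmark`) and the `optimize_kernel` entry point are hypothetical; the real loop lives in `triton_kernel_agent/opt_worker_component/orchestrator/`.

```python
# Hypothetical sketch of the profile -> diagnose -> optimize -> verify ->
# benchmark loop. All stage callables are stand-ins for the real components.

def optimize_kernel(kernel, profile, diagnose, rewrite, verify, benchmark,
                    max_rounds=5, sol_target=0.95):
    """Iteratively improve `kernel`, keeping the best verified variant."""
    best_kernel, best_time = kernel, benchmark(kernel)
    for _ in range(max_rounds):
        metrics = profile(best_kernel)              # NCU hardware counters
        if metrics["sol"] >= sol_target:            # at roofline: stop early
            break
        strategy = diagnose(best_kernel, metrics)   # LLM bottleneck diagnosis
        candidate = rewrite(best_kernel, strategy)  # LLM-generated rewrite
        if not verify(candidate):                   # must match PyTorch ref
            continue                                # revert: keep best-so-far
        t = benchmark(candidate)                    # CUDA-event timing
        if t < best_time:                           # track best-so-far
            best_kernel, best_time = candidate, t
    return best_kernel, best_time
```

+Divergence-based revert falls out of the structure: a candidate that fails verification or regresses performance never replaces `best_kernel`, so each round always starts from the fastest verified kernel seen so far.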
+
+### Usage
+
+#### Gradio UI
+```bash
+optimization-ui --port 8088
+```
+
+
+### Key Components
+
+| Component | Location | Role |
+|---|---|---|
+| **OptimizationOrchestrator** | `triton_kernel_agent/opt_worker_component/orchestrator/` | Main optimization loop |
+| **KernelProfiler** | `triton_kernel_agent/opt_worker_component/profiling/` | NCU hardware profiling |
+| **BottleneckAnalyzer** | `triton_kernel_agent/opt_worker_component/prescribing/` | LLM-based bottleneck diagnosis |
+| **RooflineAnalyzer** | `kernel_perf_agent/kernel_opt/roofline/` | SOL classification and early stopping |
+| **Benchmark** | `triton_kernel_agent/opt_worker_component/benchmarking/` | CUDA event timing |
+
+### Optimization Artifacts
+
+```
+.optimize/workers///artifacts
+    kernel_round_0.py          # baseline kernel
+    kernel_round_N.py          # kernel after round N
+    round001_opt_prompt.txt    # optimization prompt sent to LLM
+    round001_opt_reply.txt     # LLM response
+    round001_strategy.json     # bottleneck analysis result
+    ...
+``` + ## Platform Support KernelAgent supports multiple GPU platforms for Triton kernel execution: @@ -218,6 +267,7 @@ It includes selected L1/L2/L3 problems with: ## Repository Layout - `triton_kernel_agent/` — KernelAgent core (agent, worker manager, provider adapters, prompt templates) +- `triton_kernel_agent/opt_worker_component/` — optimization pipeline (profiler, benchmarker, bottleneck analyzer, orchestrator) - `Fuser/` — auto-router, orchestration pipeline, CLIs, Gradio UIs - `triton_kernel_agent/templates/` — Jinja templates used when prompting TritonKernelAgent - `examples/` — sample problems and prompt snippets @@ -234,7 +284,8 @@ It includes selected L1/L2/L3 problems with: ## Documentation & Community -- Architecture and deep-dive docs: `Coming Soon` +- Optimization pipeline docs: see [Kernel Optimization Pipeline](#kernel-optimization-pipeline) above +- Open-source recommendations: see `docs/open_source_recommendations.md` - Issues: https://github.com/pytorch-labs/KernelAgent/issues - Blog post: https://pytorch.org/blog/kernelfalcon-autonomous-gpu-kernel-generation-via-deep-agents/ diff --git a/assets/opt_agent.svg b/assets/opt_agent.svg new file mode 100644 index 00000000..406a52d3 --- /dev/null +++ b/assets/opt_agent.svg @@ -0,0 +1,2 @@ +(2) Judge Agent (1) Profiler Agent Opt-Agent 1Opt-Agent 3Opt-Agent 2(5) Optimization ManagerKernel 1Kernel 2QueueTop candidates...Input Kernel...HistoryRAGKnowledgeReflextionDiagnose Prescribe SynthesizeExplore MeasureCollect(6) Benchmarking Agent(3) Analyzer Agent \ No newline at end of file diff --git a/examples/optimize_01_matvec/input.py b/examples/optimize_01_matvec/input.py new file mode 100644 index 00000000..0597a968 --- /dev/null +++ b/examples/optimize_01_matvec/input.py @@ -0,0 +1,166 @@ +# kernel.py +# Matrix-vector multiplication using Triton: C = A @ B +# Implements the exact problem from the test: +# - M = 2048 +# - K = 1,048,576 +# - A: (M, K), BF16 +# - B: (K, 1), BF16 +# - C: (M, 1), BF16 +# +# 
Notes on fusion: +# - The entire operation (matrix-vector product) is executed in a single Triton kernel. +# - There is nothing else to fuse (no bias/activation in the test), so no extra kernel stages are required. +# - All math is performed inside the Triton kernel; the Python wrapper only validates/allocates/configures. +# +# Triton programming guidelines followed: +# - Use @triton.jit for kernels. +# - Use tl.constexpr for compile-time constants (BLOCK_M, BLOCK_K). +# - Proper indexing with tl.program_id, tl.arange, and tl.cdiv. +# - Use tl.load/tl.store with masks for OOB protection and coalesced access on contiguous inputs. +# - Accumulate in FP32 for numerical stability and convert to BF16 on store. + +import triton +import triton.language as tl +import torch + + +@triton.jit +def _matvec_kernel( + a_ptr, + b_ptr, + c_ptr, + M, + K, + stride_am, + stride_ak, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + BLOCK_M: tl.constexpr, + BLOCK_K: tl.constexpr, +): + # Program id along the M dimension (each program computes a block of rows) + pid_m = tl.program_id(0) + + # Row indices this program handles + offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + m_mask = offs_m < M + + # Help compiler with alignment information + offs_m = tl.max_contiguous(tl.multiple_of(offs_m, BLOCK_M), BLOCK_M) + + # Initialize FP32 accumulator for BLOCK_M rows + acc = tl.zeros((BLOCK_M,), dtype=tl.float32) + + # Iterate over K dimension in chunks of BLOCK_K + # Use tl.range to ensure proper device-side looping + for k0 in tl.range(0, K, BLOCK_K): + offs_k = k0 + tl.arange(0, BLOCK_K) + k_mask = offs_k < K + # Also assist compiler with alignment info for K offsets + offs_k = tl.max_contiguous(tl.multiple_of(offs_k, BLOCK_K), BLOCK_K) + + # Compute pointers: + # A tile is [BLOCK_M, BLOCK_K] region starting at (offs_m, offs_k) + a_ptrs = a_ptr + (offs_m[:, None] * stride_am + offs_k[None, :] * stride_ak) + # B tile is a vector [BLOCK_K] at column 0 (since B is [K, 1]) + b_ptrs = 
b_ptr + (offs_k * stride_bk + 0 * stride_bn) + + # Load with masking to guard boundaries. Inputs are BF16; cast to FP32 for accumulation + a = tl.load(a_ptrs, mask=(m_mask[:, None] & k_mask[None, :]), other=0).to( + tl.float32 + ) + b = tl.load(b_ptrs, mask=k_mask, other=0).to(tl.float32) + + # Fused multiply-accumulate for rows in this tile: + # sum over K tile dimension for each row + acc += tl.sum(a * b[None, :], axis=1) + + # Convert accumulator to BF16 and store to C[:, 0] + out = acc.to(tl.bfloat16) + c_ptrs = c_ptr + (offs_m * stride_cm + 0 * stride_cn) + tl.store(c_ptrs, out, mask=m_mask) + + +def kernel_function(A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: + """ + Compute C = A @ B using a single Triton kernel. + + What is fused: + - Entire matrix-vector multiplication is done in one pass inside the kernel. + - No additional ops (e.g., bias/activation) are required by the test, so none are fused. + + Runtime constraints honored: + - Wrapper only validates arguments, allocates output, and launches the Triton kernel. + - All math is inside the Triton kernel; no torch.nn or torch.nn.functional usage. 
+ + Args: + A: [M, K] BF16 CUDA tensor + B: [K, 1] BF16 CUDA tensor (also accepts shape [K], it will be viewed as [K, 1]) + + Returns: + C: [M, 1] BF16 CUDA tensor + """ + # Validate device and dtype + if not A.is_cuda or not B.is_cuda: + raise ValueError("A and B must be CUDA tensors.") + if A.dtype != torch.bfloat16 or B.dtype != torch.bfloat16: + raise ValueError("A and B must be torch.bfloat16 tensors.") + + if A.ndim != 2: + raise ValueError("A must be 2D [M, K].") + M, K = A.shape + + # Accept B as [K] or [K, 1] + if B.ndim == 1: + if B.shape[0] != K: + raise ValueError( + f"When B is 1D, expected shape [K]={K}, but got {tuple(B.shape)}" + ) + Bv = B.view(K, 1) + elif B.ndim == 2: + if B.shape[0] != K or B.shape[1] != 1: + raise ValueError( + f"B must be [K, 1], got {tuple(B.shape)} (K must match A.shape[1])" + ) + Bv = B + else: + raise ValueError("B must be 1D [K] or 2D [K, 1].") + + # Allocate output C [M, 1] + C = torch.empty((M, 1), device=A.device, dtype=A.dtype) + + # Extract strides (in elements, not bytes) + stride_am, stride_ak = A.stride() + stride_bk, stride_bn = Bv.stride() + stride_cm, stride_cn = C.stride() + + # Kernel launch configuration + # Choose modest tile sizes to balance register usage and loop count over K. + # For the huge K in the test, BLOCK_K=256 works well without excessive register pressure. 
+ BLOCK_M = 128 + BLOCK_K = 256 + + def grid(meta): + return (triton.cdiv(M, meta["BLOCK_M"]),) + + _matvec_kernel[grid]( + A, + Bv, + C, + M, + K, + stride_am, + stride_ak, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + BLOCK_M=BLOCK_M, + BLOCK_K=BLOCK_K, + num_warps=4, + num_stages=2, + ) + + return C diff --git a/examples/optimize_01_matvec/problem.py b/examples/optimize_01_matvec/problem.py new file mode 100644 index 00000000..35a8a417 --- /dev/null +++ b/examples/optimize_01_matvec/problem.py @@ -0,0 +1,38 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs matrix-vector multiplication (C = A * B). + """ + + def __init__(self): + super(Model, self).__init__() + + def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: + """ + Performs matrix-vector multiplication. + + Args: + A: Input matrix of shape (M, K). + B: Input vector of shape (K, 1). + + Returns: + Output vector of shape (M, 1). + """ + return torch.matmul(A, B) + + +M = 256 * 8 # 2048 +K = 131072 * 8 # 1048576 + + +def get_inputs(): + A = torch.rand(M, K) + B = torch.rand(K, 1) + return [A, B] + + +def get_init_inputs(): + return [] # No special initialization inputs needed diff --git a/examples/optimize_01_matvec/test.py b/examples/optimize_01_matvec/test.py new file mode 100644 index 00000000..06a9d8bf --- /dev/null +++ b/examples/optimize_01_matvec/test.py @@ -0,0 +1,272 @@ +"""Correctness test for matrix-vector multiplication kernel.""" + +import inspect +import sys + +import torch +from kernel import kernel_function +from problem import get_init_inputs, get_inputs, Model + +_CONV_TYPES = ( + torch.nn.Conv1d, + torch.nn.Conv2d, + torch.nn.Conv3d, + torch.nn.ConvTranspose1d, + torch.nn.ConvTranspose2d, + torch.nn.ConvTranspose3d, +) +_NORM_TYPES = ( + torch.nn.BatchNorm1d, + torch.nn.BatchNorm2d, + torch.nn.BatchNorm3d, + torch.nn.LayerNorm, + torch.nn.GroupNorm, + torch.nn.InstanceNorm1d, + torch.nn.InstanceNorm2d, + 
torch.nn.InstanceNorm3d, +) +_POOL_TYPES = ( + torch.nn.MaxPool1d, + torch.nn.MaxPool2d, + torch.nn.MaxPool3d, + torch.nn.AvgPool1d, + torch.nn.AvgPool2d, + torch.nn.AvgPool3d, + torch.nn.AdaptiveAvgPool1d, + torch.nn.AdaptiveAvgPool2d, + torch.nn.AdaptiveAvgPool3d, + torch.nn.AdaptiveMaxPool1d, + torch.nn.AdaptiveMaxPool2d, + torch.nn.AdaptiveMaxPool3d, +) + + +def _extract_model_params(model): + """Extract learnable parameters and layer config from a PyTorch model.""" + params = {} + + for _, module in model.named_modules(): + if isinstance(module, (*_CONV_TYPES, torch.nn.Linear)): + if hasattr(module, "weight") and module.weight is not None: + params.setdefault("weight", module.weight) + params.setdefault("w", module.weight) + if getattr(module, "bias", None) is not None: + params.setdefault("conv_bias", module.bias) + params.setdefault("bias", module.bias) + for attr in ("stride", "padding", "dilation", "output_padding"): + val = getattr(module, attr, None) + if val is not None: + params.setdefault(attr, val) + if hasattr(module, "groups"): + params.setdefault("groups", module.groups) + + elif isinstance(module, _NORM_TYPES): + if getattr(module, "weight", None) is not None: + params.setdefault("weight", module.weight) + params.setdefault("w", module.weight) + if getattr(module, "bias", None) is not None: + params.setdefault("bias", module.bias) + if hasattr(module, "eps"): + params["eps"] = module.eps + if hasattr(module, "num_groups"): + params["num_groups"] = module.num_groups + if hasattr(module, "normalized_shape"): + params["normalized_shape"] = module.normalized_shape + + elif isinstance(module, _POOL_TYPES): + for attr in ("kernel_size", "stride", "padding", "dilation"): + val = getattr(module, attr, None) + if val is not None: + params.setdefault(attr, val) + + if hasattr(model, "bias") and isinstance( + model.bias, (torch.Tensor, torch.nn.Parameter) + ): + params["add_bias"] = model.bias + params.setdefault("bias", model.bias) + + # Extract simple 
scalar attributes stored by Model.__init__ + # (catches dim, negative_slope, min_val, max_val, etc.) + _INIT_SCALAR_NAMES = { + "dim", + "negative_slope", + "min_val", + "max_val", + "beta", + "threshold", + "alpha", + "lambd", + "upper", + "lower", + "p", + } + for attr_name in _INIT_SCALAR_NAMES: + if hasattr(model, attr_name) and not isinstance( + getattr(model, attr_name), (torch.Tensor, torch.nn.Module) + ): + params.setdefault(attr_name, getattr(model, attr_name)) + + return params + + +def test_kernel(): + device = "cuda" + dtype = torch.bfloat16 + + # Setup reference model + model = Model(*get_init_inputs()).to(device).to(dtype) + inputs = [ + ( + x.to(device).to(dtype) + if isinstance(x, torch.Tensor) and x.is_floating_point() + else (x.to(device) if isinstance(x, torch.Tensor) else x) + ) + for x in get_inputs() + ] + + # Get reference output + with torch.no_grad(): + ref_output = model(*inputs) + + # Smart parameter binding: detect if kernel needs model params + sig = inspect.signature(kernel_function) + kernel_params = list(sig.parameters.keys()) + param_kinds = [p.kind for p in sig.parameters.values()] + has_var_positional = any(k == inspect.Parameter.VAR_POSITIONAL for k in param_kinds) + has_var_keyword = any(k == inspect.Parameter.VAR_KEYWORD for k in param_kinds) + _MODEL_PARAM_NAMES = { + "weight", + "w", + "kernel_size", + "stride", + "padding", + "dilation", + "output_padding", + "groups", + "bias", + "conv_bias", + "eps", + "num_groups", + "normalized_shape", + "dim", + "negative_slope", + "min_val", + "max_val", + "beta", + "threshold", + "alpha", + "lambd", + "upper", + "lower", + "p", + } + needs_model = bool(_MODEL_PARAM_NAMES & set(kernel_params)) + # If kernel uses *args/**kwargs, inspect its source for weight-related hints + if not needs_model and (has_var_positional or has_var_keyword): + try: + src = inspect.getsource(kernel_function) + needs_model = any( + kw in src + for kw in ( + "weight", + "is_weight", + "w.shape", + "w.ndim", + 
"kernel_size", + "dilation", + ) + ) + except (OSError, TypeError): + pass + + if needs_model: + model_params = _extract_model_params(model) + has_weight = "weight" in model_params or "w" in model_params + if has_var_positional and has_weight: + # *args kernel with weight: pass (input, weight1, weight2, ...) positionally + pos_args = list(inputs) + # Collect ALL conv/linear weights from model + for _, mod in model.named_modules(): + if isinstance(mod, (*_CONV_TYPES, torch.nn.Linear)): + if hasattr(mod, "weight") and mod.weight is not None: + pos_args.append(mod.weight) + # Pass config params as kwargs + config_kwargs = {} + for k, v in model_params.items(): + if k not in ("weight", "w", "bias", "conv_bias", "add_bias"): + # Convert uniform tuples to scalar int for compatibility + if ( + isinstance(v, (tuple, list)) + and len(v) >= 1 + and all(e == v[0] for e in v) + ): + v = v[0] + config_kwargs[k] = v + kernel_output = kernel_function(*pos_args, **config_kwargs) + else: + # Bind keyword args, adapting tuple/int form to match defaults + call_args = {} + pos_idx = 0 + for pname in kernel_params: + p = sig.parameters[pname] + if ( + p.kind == inspect.Parameter.VAR_POSITIONAL + or p.kind == inspect.Parameter.VAR_KEYWORD + ): + continue + if pname in model_params: + val = model_params[pname] + # Convert tuple/list to scalar when kernel expects int + if isinstance(val, (tuple, list)): + if p.default is not inspect.Parameter.empty and isinstance( + p.default, int + ): + val = val[0] + elif len(val) == 1: + val = val[0] + call_args[pname] = val + elif pos_idx < len(inputs): + call_args[pname] = inputs[pos_idx] + pos_idx += 1 + kernel_output = kernel_function(**call_args) + else: + kernel_output = kernel_function(*inputs) + + # Compare + # Handle in-place kernels that return None + if kernel_output is None: + # Assume in-place modification of first input + kernel_output = inputs[0] + # Handle shape mismatch: kernel may return per-sample loss vs reference scalar mean + if 
ref_output.dim() == 0 and kernel_output.dim() >= 1: + kernel_output = kernel_output.mean() + elif kernel_output.dim() == 0 and ref_output.dim() >= 1: + ref_output = ref_output.mean() + # Align dtypes for comparison + if ref_output.dtype != kernel_output.dtype: + # If kernel outputs higher precision, recompute reference at that precision + # using the SAME inputs to ensure fair comparison + if kernel_output.dtype == torch.float32 and ref_output.dtype in ( + torch.bfloat16, + torch.float16, + ): + model_f32 = Model(*get_init_inputs()).to(device).to(torch.float32) + inputs_f32 = [ + x.to(torch.float32) if x.is_floating_point() else x for x in inputs + ] + with torch.no_grad(): + ref_output = model_f32(*inputs_f32) + else: + kernel_output = kernel_output.to(ref_output.dtype) + if torch.allclose(ref_output, kernel_output, rtol=1e-2, atol=1e-2): + print("PASS") + return True + else: + max_diff = (ref_output - kernel_output).abs().max().item() + print(f"FAIL: max difference = {max_diff}") + return False + + +if __name__ == "__main__": + success = test_kernel() + sys.exit(0 if success else 1) diff --git a/examples/optimize_02_rmsnorm/input.py b/examples/optimize_02_rmsnorm/input.py new file mode 100644 index 00000000..e5e7744d --- /dev/null +++ b/examples/optimize_02_rmsnorm/input.py @@ -0,0 +1,259 @@ +import torch +import triton +import triton.language as tl + + +""" +RMS Normalization over the channel/feature dimension (dim=1) for NCHW tensors using Triton. + +Fusion and design notes: +- We implement the whole RMSNorm in a single Triton kernel: reduction of sum-of-squares across channels + and the normalization write-back are fused into one kernel launch. Within the kernel, we do two passes + over the input per tile: first to accumulate the sum of squares along the feature dimension, second to + apply the normalization scale and store. This avoids allocating any intermediate tensors while keeping + the Python wrapper free of compute. 
+- The kernel is tiled along the contiguous W dimension for coalesced loads/stores, and iterates over C + (features) to perform the reduction and normalization. Masking is used for boundary conditions. +- The wrapper supports both in-place and out-of-place operation. If no output tensor is provided, we + default to in-place to reduce peak memory consumption for the large test tensor. + +Runtime constraints: +- All math is inside the Triton kernel (tl.load/tl.store/tl.math.rsqrt, etc.). +- The Python wrapper only validates arguments, allocates output (if requested), and launches the kernel. +- No torch.nn / torch.nn.functional usage anywhere in the execution path. +""" + + +@triton.jit +def _rmsnorm_nchw_kernel( + x_ptr, + y_ptr, + N, + C, + H, + W, + stride_nx, + stride_cx, + stride_hx, + stride_wx, + stride_ny, + stride_cy, + stride_hy, + stride_wy, + eps, + BLOCK_W: tl.constexpr, +): + # 2D launch: + # - axis 0 tiles along W + # - axis 1 enumerates all (N*H) rows + pid_w = tl.program_id(axis=0) + pid_nh = tl.program_id(axis=1) + + # Which n and h row are we processing? 
+ n = pid_nh // H + h = pid_nh - n * H # equivalent to pid_nh % H + + # Offsets along W for this tile + start_w = pid_w * BLOCK_W + offs_w = start_w + tl.arange(0, BLOCK_W) + mask_w = offs_w < W + + # Cast strides and indices to int64 for address arithmetic safety + stride_nx = tl.full([], stride_nx, tl.int64) + stride_cx = tl.full([], stride_cx, tl.int64) + stride_hx = tl.full([], stride_hx, tl.int64) + stride_wx = tl.full([], stride_wx, tl.int64) + stride_ny = tl.full([], stride_ny, tl.int64) + stride_cy = tl.full([], stride_cy, tl.int64) + stride_hy = tl.full([], stride_hy, tl.int64) + stride_wy = tl.full([], stride_wy, tl.int64) + + n = n.to(tl.int64) + h = h.to(tl.int64) + offs_w_i64 = offs_w.to(tl.int64) + + # Base offsets for given (n, h) + base_nh_x = n * stride_nx + h * stride_hx + base_nh_y = n * stride_ny + h * stride_hy + + # Accumulator for sum of squares across channels (compute in float32) + acc = tl.zeros([BLOCK_W], dtype=tl.float32) + + # First pass: accumulate sum of squares along C + # Use a dynamic loop since C is provided at runtime. 
+ for c in tl.range(0, C): + c_i64 = c.to(tl.int64) + x_offsets = base_nh_x + c_i64 * stride_cx + offs_w_i64 * stride_wx + x_vals = tl.load(x_ptr + x_offsets, mask=mask_w, other=0.0) + x_f32 = x_vals.to(tl.float32) + acc += x_f32 * x_f32 + + # Compute inverse RMS: inv_rms = 1 / sqrt(mean(x^2) + eps) + # mean is acc / C + c_f32 = tl.full([1], C, dtype=tl.float32) + mean = acc / c_f32 + inv_rms = tl.math.rsqrt(mean + eps) + + # Second pass: normalize and store + for c in tl.range(0, C): + c_i64 = c.to(tl.int64) + x_offsets = base_nh_x + c_i64 * stride_cx + offs_w_i64 * stride_wx + y_offsets = base_nh_y + c_i64 * stride_cy + offs_w_i64 * stride_wy + x_vals = tl.load(x_ptr + x_offsets, mask=mask_w, other=0.0) + x_f32 = x_vals.to(tl.float32) + y_f32 = x_f32 * inv_rms + y_vals = y_f32.to(x_vals.dtype) + tl.store(y_ptr + y_offsets, y_vals, mask=mask_w) + + +def _parse_kernel_args(x, args, kwargs): + """ + Parse flexible arguments from the test harness. + Returns: + eps (float), num_features (int or None), out_tensor (Tensor or None) + """ + # Defaults + eps = kwargs.pop("eps", None) + num_features = kwargs.pop("num_features", None) + # Some tests may pass `features=...` + if "features" in kwargs and num_features is None: + num_features = kwargs.pop("features") + # Accept multiple possible output keywords + out = kwargs.pop("out", None) + if out is None: + out = kwargs.pop("output", None) + if out is None: + out = kwargs.pop("y", None) + if out is None: + out = kwargs.pop("dst", None) + + # Handle positional args: could be (eps), (features), or (eps, features) + if len(args) == 1: + a0 = args[0] + if isinstance(a0, (int,)) and num_features is None: + num_features = int(a0) + else: + # assume eps + if eps is None: + eps = float(a0) + elif len(args) == 2: + a0, a1 = args + # try to identify by types + if isinstance(a0, (float,)) or not isinstance(a0, (int,)): + # assume eps first, features second + if eps is None: + eps = float(a0) + if num_features is None and 
isinstance(a1, (int,)): + num_features = int(a1) + else: + # assume features first, eps second + if num_features is None: + num_features = int(a0) + if eps is None: + eps = float(a1) + + # Finalize defaults + if eps is None: + eps = 1e-5 + # num_features can be None; we will infer from x.shape[1] + return eps, num_features, out + + +def kernel_function(x, *args, **kwargs): + """ + RMS Normalization over feature/channel dim (dim=1) for NCHW tensors on CUDA. + + Behavior: + - Normalizes each (n, h, w) vector across channels c in [0, C), computing: + rms = sqrt(mean(x[n, :, h, w]^2) + eps) + y[n, c, h, w] = x[n, c, h, w] / rms + - Uses a single fused Triton kernel launch with a two-pass streaming strategy: + 1) Reduce sum of squares across C + 2) Apply scale and write normalized values + This avoids Python-side compute and keeps memory usage low (no large intermediates). + - If an output tensor is provided via out/output/y/dst, writes there. Otherwise, performs in-place + normalization on x to minimize peak memory. + + Accepted call patterns (examples): + - kernel_function(x) + - kernel_function(x, eps) + - kernel_function(x, features) + - kernel_function(x, eps, features) + - kernel_function(x, num_features=..., eps=...) + - kernel_function(x, out=prealloc), kernel_function(x, output=...), y=..., dst=... + + Args: + x: CUDA tensor with shape (N, C, H, W). Dtype: float16 or bfloat16 recommended. + eps: small epsilon for numerical stability (default 1e-5) + num_features: expected C; if provided, validated against x.shape[1] + out/output/y/dst: optional output tensor. If omitted, operation runs in-place on x. + + Returns: + The normalized tensor (same shape/type/device as x). If run in-place and returning None is + acceptable to the caller, you may still return x for convenience. 
+ """ + if not isinstance(x, torch.Tensor): + raise TypeError("x must be a torch.Tensor") + if x.device.type != "cuda": + raise ValueError("x must be on CUDA device") + if x.ndim != 4: + raise ValueError(f"Expected 4D NCHW tensor, got shape {tuple(x.shape)}") + if x.dtype not in (torch.float16, torch.bfloat16, torch.float32): + raise ValueError( + f"Unsupported dtype: {x.dtype}. Use float16, bfloat16, or float32." + ) + + eps, num_features, out = _parse_kernel_args(x, args, kwargs) + + N, C, H, W = x.shape + if num_features is not None and int(num_features) != C: + raise ValueError( + f"num_features ({num_features}) does not match input channels ({C})." + ) + + # Setup output tensor. If not provided, do in-place to save memory (huge tensors in the test). + if out is None: + # In-place: write results directly to x + y = x + else: + if not isinstance(out, torch.Tensor): + raise TypeError("Provided output must be a torch.Tensor") + if out.shape != x.shape or out.device != x.device or out.dtype != x.dtype: + raise ValueError( + "Output tensor must match input in shape, device, and dtype." + ) + y = out + + # Strides in elements + sx0, sx1, sx2, sx3 = x.stride() + sy0, sy1, sy2, sy3 = y.stride() + + # Kernel launch configuration + # Tile along W for coalesced access + BLOCK_W = 256 + grid = (triton.cdiv(W, BLOCK_W), N * H) + + # Launch kernel + _rmsnorm_nchw_kernel[grid]( + x, + y, + N, + C, + H, + W, + sx0, + sx1, + sx2, + sx3, + sy0, + sy1, + sy2, + sy3, + float(eps), + BLOCK_W=BLOCK_W, + num_warps=4, + num_stages=2, + ) + + # Return result tensor. If in-place, return x to satisfy callers expecting a Tensor. + return y diff --git a/examples/optimize_02_rmsnorm/problem.py b/examples/optimize_02_rmsnorm/problem.py new file mode 100644 index 00000000..708c7001 --- /dev/null +++ b/examples/optimize_02_rmsnorm/problem.py @@ -0,0 +1,51 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs RMS Normalization. 
+ """ + + def __init__(self, num_features: int, eps: float = 1e-5): + """ + Initializes the RMSNorm layer. + + Args: + num_features (int): Number of features in the input tensor. + eps (float, optional): A small value added to the denominator to avoid division by zero. Defaults to 1e-5. + """ + super(Model, self).__init__() + self.num_features = num_features + self.eps = eps + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Applies RMS Normalization to the input tensor. + + Args: + x (torch.Tensor): Input tensor of shape (batch_size, num_features, *). + + Returns: + torch.Tensor: Output tensor with RMS Normalization applied, same shape as input. + """ + # Calculate the RMS along the feature dimension + rms = torch.sqrt(torch.mean(x**2, dim=1, keepdim=True) + self.eps) + + # Normalize the input by dividing by the RMS + return x / rms + + +batch_size = 112 +features = 64 +dim1 = 512 +dim2 = 512 + + +def get_inputs(): + x = torch.rand(batch_size, features, dim1, dim2) + return [x] + + +def get_init_inputs(): + return [features] diff --git a/examples/optimize_02_rmsnorm/test.py b/examples/optimize_02_rmsnorm/test.py new file mode 100644 index 00000000..b489c477 --- /dev/null +++ b/examples/optimize_02_rmsnorm/test.py @@ -0,0 +1,270 @@ +"""Correctness test for RMSNorm kernel.""" + +import inspect +import sys +import torch + +from problem import Model, get_inputs, get_init_inputs +from kernel import kernel_function + +_CONV_TYPES = ( + torch.nn.Conv1d, + torch.nn.Conv2d, + torch.nn.Conv3d, + torch.nn.ConvTranspose1d, + torch.nn.ConvTranspose2d, + torch.nn.ConvTranspose3d, +) +_NORM_TYPES = ( + torch.nn.BatchNorm1d, + torch.nn.BatchNorm2d, + torch.nn.BatchNorm3d, + torch.nn.LayerNorm, + torch.nn.GroupNorm, + torch.nn.InstanceNorm1d, + torch.nn.InstanceNorm2d, + torch.nn.InstanceNorm3d, +) +_POOL_TYPES = ( + torch.nn.MaxPool1d, + torch.nn.MaxPool2d, + torch.nn.MaxPool3d, + torch.nn.AvgPool1d, + torch.nn.AvgPool2d, + torch.nn.AvgPool3d, + 
torch.nn.AdaptiveAvgPool1d, + torch.nn.AdaptiveAvgPool2d, + torch.nn.AdaptiveAvgPool3d, + torch.nn.AdaptiveMaxPool1d, + torch.nn.AdaptiveMaxPool2d, + torch.nn.AdaptiveMaxPool3d, +) + + +def _extract_model_params(model): + """Extract learnable parameters and layer config from a PyTorch model.""" + params = {} + + for _, module in model.named_modules(): + if isinstance(module, (*_CONV_TYPES, torch.nn.Linear)): + if hasattr(module, "weight") and module.weight is not None: + params.setdefault("weight", module.weight) + params.setdefault("w", module.weight) + if getattr(module, "bias", None) is not None: + params.setdefault("conv_bias", module.bias) + params.setdefault("bias", module.bias) + for attr in ("stride", "padding", "dilation", "output_padding"): + val = getattr(module, attr, None) + if val is not None: + params.setdefault(attr, val) + if hasattr(module, "groups"): + params.setdefault("groups", module.groups) + + elif isinstance(module, _NORM_TYPES): + if getattr(module, "weight", None) is not None: + params.setdefault("weight", module.weight) + params.setdefault("w", module.weight) + if getattr(module, "bias", None) is not None: + params.setdefault("bias", module.bias) + if hasattr(module, "eps"): + params["eps"] = module.eps + if hasattr(module, "num_groups"): + params["num_groups"] = module.num_groups + if hasattr(module, "normalized_shape"): + params["normalized_shape"] = module.normalized_shape + + elif isinstance(module, _POOL_TYPES): + for attr in ("kernel_size", "stride", "padding", "dilation"): + val = getattr(module, attr, None) + if val is not None: + params.setdefault(attr, val) + + if hasattr(model, "bias") and isinstance( + model.bias, (torch.Tensor, torch.nn.Parameter) + ): + params["add_bias"] = model.bias + params.setdefault("bias", model.bias) + + # Extract simple scalar attributes stored by Model.__init__ + # (catches dim, negative_slope, min_val, max_val, etc.) 
+ _INIT_SCALAR_NAMES = { + "dim", + "negative_slope", + "min_val", + "max_val", + "beta", + "threshold", + "alpha", + "lambd", + "upper", + "lower", + "p", + } + for attr_name in _INIT_SCALAR_NAMES: + if hasattr(model, attr_name) and not isinstance( + getattr(model, attr_name), (torch.Tensor, torch.nn.Module) + ): + params.setdefault(attr_name, getattr(model, attr_name)) + + return params + + +def test_kernel(): + device = "cuda" + dtype = torch.bfloat16 + + # Setup reference model + model = Model(*get_init_inputs()).to(device).to(dtype) + inputs = [ + x.to(device).to(dtype) + if isinstance(x, torch.Tensor) and x.is_floating_point() + else (x.to(device) if isinstance(x, torch.Tensor) else x) + for x in get_inputs() + ] + + # Get reference output + with torch.no_grad(): + ref_output = model(*inputs) + + # Smart parameter binding: detect if kernel needs model params + sig = inspect.signature(kernel_function) + kernel_params = list(sig.parameters.keys()) + param_kinds = [p.kind for p in sig.parameters.values()] + has_var_positional = any(k == inspect.Parameter.VAR_POSITIONAL for k in param_kinds) + has_var_keyword = any(k == inspect.Parameter.VAR_KEYWORD for k in param_kinds) + _MODEL_PARAM_NAMES = { + "weight", + "w", + "kernel_size", + "stride", + "padding", + "dilation", + "output_padding", + "groups", + "bias", + "conv_bias", + "eps", + "num_groups", + "normalized_shape", + "dim", + "negative_slope", + "min_val", + "max_val", + "beta", + "threshold", + "alpha", + "lambd", + "upper", + "lower", + "p", + } + needs_model = bool(_MODEL_PARAM_NAMES & set(kernel_params)) + # If kernel uses *args/**kwargs, inspect its source for weight-related hints + if not needs_model and (has_var_positional or has_var_keyword): + try: + src = inspect.getsource(kernel_function) + needs_model = any( + kw in src + for kw in ( + "weight", + "is_weight", + "w.shape", + "w.ndim", + "kernel_size", + "dilation", + ) + ) + except (OSError, TypeError): + pass + + if needs_model: + model_params 
= _extract_model_params(model) + has_weight = "weight" in model_params or "w" in model_params + if has_var_positional and has_weight: + # *args kernel with weight: pass (input, weight1, weight2, ...) positionally + pos_args = list(inputs) + # Collect ALL conv/linear weights from model + for _, mod in model.named_modules(): + if isinstance(mod, (*_CONV_TYPES, torch.nn.Linear)): + if hasattr(mod, "weight") and mod.weight is not None: + pos_args.append(mod.weight) + # Pass config params as kwargs + config_kwargs = {} + for k, v in model_params.items(): + if k not in ("weight", "w", "bias", "conv_bias", "add_bias"): + # Convert uniform tuples to scalar int for compatibility + if ( + isinstance(v, (tuple, list)) + and len(v) >= 1 + and all(e == v[0] for e in v) + ): + v = v[0] + config_kwargs[k] = v + kernel_output = kernel_function(*pos_args, **config_kwargs) + else: + # Bind keyword args, adapting tuple/int form to match defaults + call_args = {} + pos_idx = 0 + for pname in kernel_params: + p = sig.parameters[pname] + if ( + p.kind == inspect.Parameter.VAR_POSITIONAL + or p.kind == inspect.Parameter.VAR_KEYWORD + ): + continue + if pname in model_params: + val = model_params[pname] + # Convert tuple/list to scalar when kernel expects int + if isinstance(val, (tuple, list)): + if p.default is not inspect.Parameter.empty and isinstance( + p.default, int + ): + val = val[0] + elif len(val) == 1: + val = val[0] + call_args[pname] = val + elif pos_idx < len(inputs): + call_args[pname] = inputs[pos_idx] + pos_idx += 1 + kernel_output = kernel_function(**call_args) + else: + kernel_output = kernel_function(*inputs) + + # Compare + # Handle in-place kernels that return None + if kernel_output is None: + # Assume in-place modification of first input + kernel_output = inputs[0] + # Handle shape mismatch: kernel may return per-sample loss vs reference scalar mean + if ref_output.dim() == 0 and kernel_output.dim() >= 1: + kernel_output = kernel_output.mean() + elif 
kernel_output.dim() == 0 and ref_output.dim() >= 1: + ref_output = ref_output.mean() + # Align dtypes for comparison + if ref_output.dtype != kernel_output.dtype: + # If kernel outputs higher precision, recompute reference at that precision + # using the SAME inputs to ensure fair comparison + if kernel_output.dtype == torch.float32 and ref_output.dtype in ( + torch.bfloat16, + torch.float16, + ): + model_f32 = Model(*get_init_inputs()).to(device).to(torch.float32) + inputs_f32 = [ + x.to(torch.float32) if x.is_floating_point() else x for x in inputs + ] + with torch.no_grad(): + ref_output = model_f32(*inputs_f32) + else: + kernel_output = kernel_output.to(ref_output.dtype) + if torch.allclose(ref_output, kernel_output, rtol=1e-2, atol=1e-2): + print("PASS") + return True + else: + max_diff = (ref_output - kernel_output).abs().max().item() + print(f"FAIL: max difference = {max_diff}") + return False + + +if __name__ == "__main__": + success = test_kernel() + sys.exit(0 if success else 1) diff --git a/examples/optimize_03_max_pooling/input.py b/examples/optimize_03_max_pooling/input.py new file mode 100644 index 00000000..8281ed6c --- /dev/null +++ b/examples/optimize_03_max_pooling/input.py @@ -0,0 +1,224 @@ +# kernel.py +# Triton-based MaxPool3d implementation specialized to the test configuration: +# - Input tensor shape: (N=16, C=32, D=128, H=128, W=128) +# - Pooling params typically called by the test: kernel_size=3, stride=2, padding=1, dilation=3 +# +# Notes on fusion: +# - MaxPool3d is a standalone reduction operator. There is no natural upstream/downstream op specified +# in the test to fuse with (e.g., bias, activation), so this kernel focuses on an efficient single-pass +# pooling implementation. If a pipeline included additional pointwise ops on the pooled output, those +# could be fused into the epilogue to reduce memory traffic. 
+ +import torch +import triton +import triton.language as tl + + +@triton.jit +def _maxpool3d_kernel( + x_ptr, # *const T + y_ptr, # *T + N, + C, + D, + H, + W, # input sizes + OD, + OH, + OW, # output sizes + strideN, + strideC, + strideD, + strideH, + strideW, # input strides (in elements) + ostrideN, + ostrideC, + ostrideD, + ostrideH, + ostrideW, # output strides (in elements) + KERNEL_SIZE: tl.constexpr, # pool kernel size (assumed cubic here) + STRIDE: tl.constexpr, # pool stride (assumed same across dims) + PADDING: tl.constexpr, # pool padding (assumed same across dims) + DILATION: tl.constexpr, # dilation (assumed same across dims) + BLOCK_W: tl.constexpr, # vectorized span of OW per program +): + # Program ids: + # axis 0: blocks along OW + # axis 1: specific OD index + # axis 2: flattened (N*C*OH) + pid_w = tl.program_id(axis=0) + pid_d = tl.program_id(axis=1) + pid_z = tl.program_id(axis=2) + + # Decode pid_z into (n, c, oh) + oh = pid_z % OH + nc = pid_z // OH + c = nc % C + n = nc // C + + # Compute OW offsets this program handles + ow_start = pid_w * BLOCK_W + ow_offsets = ow_start + tl.arange(0, BLOCK_W) + ow_mask = ow_offsets < OW + + # Output indices along D and H (scalars per program) + od = pid_d + + # Base starts in input space for the pooling window + d_base = od * STRIDE - PADDING + h_base = oh * STRIDE - PADDING + # Vector of base W inputs for this block + w_base = ow_offsets * STRIDE - PADDING + + # Accumulator in fp32 for numerical robustness (stores max across the 3x3x3 window) + acc = tl.full([BLOCK_W], -float("inf"), dtype=tl.float32) + + # Precompute base strides for (n, c) + base_nc = n * strideN + c * strideC + + # Iterate over the pooling window (kd, kh, kw) with compile-time unrolling + for kd in tl.static_range(0, KERNEL_SIZE): + d_idx = d_base + kd * DILATION + valid_d = (d_idx >= 0) & (d_idx < D) + # Safe index to keep addresses in-bounds for masked loads + d_idx_safe = tl.where(valid_d, d_idx, 0) + + for kh in 
tl.static_range(0, KERNEL_SIZE): + h_idx = h_base + kh * DILATION + valid_h = (h_idx >= 0) & (h_idx < H) + valid_dh = valid_d & valid_h + h_idx_safe = tl.where(valid_h, h_idx, 0) + + # Base pointer for current (n, c, d_idx, h_idx) + base_dh = base_nc + d_idx_safe * strideD + h_idx_safe * strideH + + for kw in tl.static_range(0, KERNEL_SIZE): + w_idx = w_base + kw * DILATION + # Check bounds per-lane; combine with ow_mask and valid_dh + w_valid = ow_mask & (w_idx >= 0) & (w_idx < W) & valid_dh + w_idx_safe = tl.where(w_valid, w_idx, 0) + + # Element pointers for this (kd, kh, kw) and OW lanes + ptrs = x_ptr + base_dh + w_idx_safe * strideW + + # Load with mask; out-of-bounds lanes use -inf so they don't affect max + vals = tl.load(ptrs, mask=w_valid, other=-float("inf")) + vals_f32 = vals.to(tl.float32) + acc = tl.maximum(acc, vals_f32) + + # Store result + out_base = y_ptr + n * ostrideN + c * ostrideC + od * ostrideD + oh * ostrideH + out_ptrs = out_base + ow_offsets * ostrideW + tl.store(out_ptrs, acc, mask=ow_mask) + + +def _compute_out_dim( + L_in: int, kernel: int, stride: int, padding: int, dilation: int +) -> int: + # PyTorch formula: floor((L_in + 2*padding - dilation*(kernel - 1) - 1) / stride + 1) + return (L_in + 2 * padding - dilation * (kernel - 1) - 1) // stride + 1 + + +def kernel_function( + x: torch.Tensor, kernel_size: int, stride: int, padding: int, dilation: int +): + """ + Triton-backed 3D Max Pooling (no indices), compatible with the test's call signature. + + Args: + x: Input tensor of shape (N, C, D, H, W), CUDA device. + kernel_size: int, pooling kernel size (assumed cubic) + stride: int, pooling stride (assumed same for D/H/W) + padding: int, zero-padding applied on each side (assumed same for D/H/W) + dilation: int, dilation factor (assumed same for D/H/W) + + Returns: + y: Output tensor of shape (N, C, OD, OH, OW) with the same dtype/device as x. 
+ + Design and fusion notes: + - This is a single-pass, fused pooling reduction: it computes the maximum over the 3D dilated window + directly from global memory and writes the result, with masking to handle padding/boundaries. + - No additional post-processing stages are specified in the test; thus, there are no further ops to fuse. + If a follow-up pointwise op were known, it could be integrated into the epilogue to reduce memory traffic. + + Runtime policy: + - The wrapper only validates arguments, computes output shape, allocates the output tensor, and launches + the Triton kernel. All math (window traversal and reduction) happens inside the Triton kernel. + """ + # Basic checks + if not x.is_cuda: + raise ValueError("Input must be a CUDA tensor.") + if x.ndim != 5: + raise ValueError( + f"Expected 5D input (N, C, D, H, W), got shape {tuple(x.shape)}" + ) + if ( + not isinstance(kernel_size, int) + or not isinstance(stride, int) + or not isinstance(padding, int) + or not isinstance(dilation, int) + ): + raise TypeError("kernel_size, stride, padding, dilation must be ints.") + + N, C, D, H, W = x.shape + K = kernel_size + S = stride + P = padding + Di = dilation + + # Compute output shape + OD = _compute_out_dim(D, K, S, P, Di) + OH = _compute_out_dim(H, K, S, P, Di) + OW = _compute_out_dim(W, K, S, P, Di) + if OD <= 0 or OH <= 0 or OW <= 0: + raise ValueError( + "Computed non-positive output dimension(s). Check pooling parameters." + ) + + # Allocate output + y = torch.empty((N, C, OD, OH, OW), device=x.device, dtype=x.dtype) + + # Get strides in "element" units (PyTorch strides are already in elements, not bytes) + strideN, strideC, strideD, strideH, strideW = x.stride() + ostrideN, ostrideC, ostrideD, ostrideH, ostrideW = y.stride() + + # Configure launch + # We tile along OW dimension with BLOCK_W elements per program. + # OW is 62 in the test, so BLOCK_W=64 covers each row in one program; remaining lanes are masked. 
+ BLOCK_W = 64 + + def grid(meta): + return (triton.cdiv(OW, meta["BLOCK_W"]), OD, N * C * OH) + + # Launch kernel + _maxpool3d_kernel[grid]( + x, + y, + N, + C, + D, + H, + W, + OD, + OH, + OW, + strideN, + strideC, + strideD, + strideH, + strideW, + ostrideN, + ostrideC, + ostrideD, + ostrideH, + ostrideW, + KERNEL_SIZE=K, + STRIDE=S, + PADDING=P, + DILATION=Di, + BLOCK_W=BLOCK_W, + num_warps=4, # Reasonable default for this memory-bound reduction + num_stages=2, + ) + + return y diff --git a/examples/optimize_03_max_pooling/problem.py b/examples/optimize_03_max_pooling/problem.py new file mode 100644 index 00000000..3859ba83 --- /dev/null +++ b/examples/optimize_03_max_pooling/problem.py @@ -0,0 +1,70 @@ +import torch +import torch.nn as nn + + +class Model(nn.Module): + """ + Simple model that performs Max Pooling 3D. + """ + + def __init__( + self, + kernel_size: int, + stride: int = None, + padding: int = 0, + dilation: int = 1, + return_indices: bool = False, + ceil_mode: bool = False, + ): + """ + Initializes the Max Pooling 3D layer. + + Args: + kernel_size (int): Size of the kernel for the max pooling operation. + stride (int, optional): Stride of the pooling operation. Defaults to None, which means stride is equal to kernel_size. + padding (int, optional): Padding applied to the input tensor. Defaults to 0. + dilation (int, optional): Spacing between kernel elements. Defaults to 1. + return_indices (bool, optional): Whether to return indices of the maximum values. Defaults to False. + ceil_mode (bool, optional): When True, the output size is ceil(input_size / stride) instead of floor. Defaults to False. + """ + super(Model, self).__init__() + self.maxpool = nn.MaxPool3d( + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + return_indices=return_indices, + ceil_mode=ceil_mode, + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Applies Max Pooling 3D to the input tensor. 
+ + Args: + x (torch.Tensor): Input tensor of shape (batch_size, channels, dim1, dim2, dim3). + + Returns: + torch.Tensor: Output tensor with Max Pooling 3D applied. + """ + return self.maxpool(x) + + +batch_size = 16 +channels = 32 +dim1 = 128 +dim2 = 128 +dim3 = 128 +kernel_size = 3 +stride = 2 +padding = 1 +dilation = 3 + + +def get_inputs(): + x = torch.rand(batch_size, channels, dim1, dim2, dim3) + return [x] + + +def get_init_inputs(): + return [kernel_size, stride, padding, dilation] diff --git a/examples/optimize_03_max_pooling/test.py b/examples/optimize_03_max_pooling/test.py new file mode 100644 index 00000000..575c5a9e --- /dev/null +++ b/examples/optimize_03_max_pooling/test.py @@ -0,0 +1,272 @@ +"""Correctness test for 3D max pooling kernel.""" + +import inspect +import sys + +import torch +from kernel import kernel_function +from problem import get_init_inputs, get_inputs, Model + +_CONV_TYPES = ( + torch.nn.Conv1d, + torch.nn.Conv2d, + torch.nn.Conv3d, + torch.nn.ConvTranspose1d, + torch.nn.ConvTranspose2d, + torch.nn.ConvTranspose3d, +) +_NORM_TYPES = ( + torch.nn.BatchNorm1d, + torch.nn.BatchNorm2d, + torch.nn.BatchNorm3d, + torch.nn.LayerNorm, + torch.nn.GroupNorm, + torch.nn.InstanceNorm1d, + torch.nn.InstanceNorm2d, + torch.nn.InstanceNorm3d, +) +_POOL_TYPES = ( + torch.nn.MaxPool1d, + torch.nn.MaxPool2d, + torch.nn.MaxPool3d, + torch.nn.AvgPool1d, + torch.nn.AvgPool2d, + torch.nn.AvgPool3d, + torch.nn.AdaptiveAvgPool1d, + torch.nn.AdaptiveAvgPool2d, + torch.nn.AdaptiveAvgPool3d, + torch.nn.AdaptiveMaxPool1d, + torch.nn.AdaptiveMaxPool2d, + torch.nn.AdaptiveMaxPool3d, +) + + +def _extract_model_params(model): + """Extract learnable parameters and layer config from a PyTorch model.""" + params = {} + + for _, module in model.named_modules(): + if isinstance(module, (*_CONV_TYPES, torch.nn.Linear)): + if hasattr(module, "weight") and module.weight is not None: + params.setdefault("weight", module.weight) + params.setdefault("w", module.weight) + 
if getattr(module, "bias", None) is not None: + params.setdefault("conv_bias", module.bias) + params.setdefault("bias", module.bias) + for attr in ("stride", "padding", "dilation", "output_padding"): + val = getattr(module, attr, None) + if val is not None: + params.setdefault(attr, val) + if hasattr(module, "groups"): + params.setdefault("groups", module.groups) + + elif isinstance(module, _NORM_TYPES): + if getattr(module, "weight", None) is not None: + params.setdefault("weight", module.weight) + params.setdefault("w", module.weight) + if getattr(module, "bias", None) is not None: + params.setdefault("bias", module.bias) + if hasattr(module, "eps"): + params["eps"] = module.eps + if hasattr(module, "num_groups"): + params["num_groups"] = module.num_groups + if hasattr(module, "normalized_shape"): + params["normalized_shape"] = module.normalized_shape + + elif isinstance(module, _POOL_TYPES): + for attr in ("kernel_size", "stride", "padding", "dilation"): + val = getattr(module, attr, None) + if val is not None: + params.setdefault(attr, val) + + if hasattr(model, "bias") and isinstance( + model.bias, (torch.Tensor, torch.nn.Parameter) + ): + params["add_bias"] = model.bias + params.setdefault("bias", model.bias) + + # Extract simple scalar attributes stored by Model.__init__ + # (catches dim, negative_slope, min_val, max_val, etc.) 
+ _INIT_SCALAR_NAMES = { + "dim", + "negative_slope", + "min_val", + "max_val", + "beta", + "threshold", + "alpha", + "lambd", + "upper", + "lower", + "p", + } + for attr_name in _INIT_SCALAR_NAMES: + if hasattr(model, attr_name) and not isinstance( + getattr(model, attr_name), (torch.Tensor, torch.nn.Module) + ): + params.setdefault(attr_name, getattr(model, attr_name)) + + return params + + +def test_kernel(): + device = "cuda" + dtype = torch.bfloat16 + + # Setup reference model + model = Model(*get_init_inputs()).to(device).to(dtype) + inputs = [ + ( + x.to(device).to(dtype) + if isinstance(x, torch.Tensor) and x.is_floating_point() + else (x.to(device) if isinstance(x, torch.Tensor) else x) + ) + for x in get_inputs() + ] + + # Get reference output + with torch.no_grad(): + ref_output = model(*inputs) + + # Smart parameter binding: detect if kernel needs model params + sig = inspect.signature(kernel_function) + kernel_params = list(sig.parameters.keys()) + param_kinds = [p.kind for p in sig.parameters.values()] + has_var_positional = any(k == inspect.Parameter.VAR_POSITIONAL for k in param_kinds) + has_var_keyword = any(k == inspect.Parameter.VAR_KEYWORD for k in param_kinds) + _MODEL_PARAM_NAMES = { + "weight", + "w", + "kernel_size", + "stride", + "padding", + "dilation", + "output_padding", + "groups", + "bias", + "conv_bias", + "eps", + "num_groups", + "normalized_shape", + "dim", + "negative_slope", + "min_val", + "max_val", + "beta", + "threshold", + "alpha", + "lambd", + "upper", + "lower", + "p", + } + needs_model = bool(_MODEL_PARAM_NAMES & set(kernel_params)) + # If kernel uses *args/**kwargs, inspect its source for weight-related hints + if not needs_model and (has_var_positional or has_var_keyword): + try: + src = inspect.getsource(kernel_function) + needs_model = any( + kw in src + for kw in ( + "weight", + "is_weight", + "w.shape", + "w.ndim", + "kernel_size", + "dilation", + ) + ) + except (OSError, TypeError): + pass + + if needs_model: + 
model_params = _extract_model_params(model) + has_weight = "weight" in model_params or "w" in model_params + if has_var_positional and has_weight: + # *args kernel with weight: pass (input, weight1, weight2, ...) positionally + pos_args = list(inputs) + # Collect ALL conv/linear weights from model + for _, mod in model.named_modules(): + if isinstance(mod, (*_CONV_TYPES, torch.nn.Linear)): + if hasattr(mod, "weight") and mod.weight is not None: + pos_args.append(mod.weight) + # Pass config params as kwargs + config_kwargs = {} + for k, v in model_params.items(): + if k not in ("weight", "w", "bias", "conv_bias", "add_bias"): + # Convert uniform tuples to scalar int for compatibility + if ( + isinstance(v, (tuple, list)) + and len(v) >= 1 + and all(e == v[0] for e in v) + ): + v = v[0] + config_kwargs[k] = v + kernel_output = kernel_function(*pos_args, **config_kwargs) + else: + # Bind keyword args, adapting tuple/int form to match defaults + call_args = {} + pos_idx = 0 + for pname in kernel_params: + p = sig.parameters[pname] + if ( + p.kind == inspect.Parameter.VAR_POSITIONAL + or p.kind == inspect.Parameter.VAR_KEYWORD + ): + continue + if pname in model_params: + val = model_params[pname] + # Convert tuple/list to scalar when kernel expects int + if isinstance(val, (tuple, list)): + if p.default is not inspect.Parameter.empty and isinstance( + p.default, int + ): + val = val[0] + elif len(val) == 1: + val = val[0] + call_args[pname] = val + elif pos_idx < len(inputs): + call_args[pname] = inputs[pos_idx] + pos_idx += 1 + kernel_output = kernel_function(**call_args) + else: + kernel_output = kernel_function(*inputs) + + # Compare + # Handle in-place kernels that return None + if kernel_output is None: + # Assume in-place modification of first input + kernel_output = inputs[0] + # Handle shape mismatch: kernel may return per-sample loss vs reference scalar mean + if ref_output.dim() == 0 and kernel_output.dim() >= 1: + kernel_output = kernel_output.mean() + elif 
kernel_output.dim() == 0 and ref_output.dim() >= 1: + ref_output = ref_output.mean() + # Align dtypes for comparison + if ref_output.dtype != kernel_output.dtype: + # If kernel outputs higher precision, recompute reference at that precision + # using the SAME inputs to ensure fair comparison + if kernel_output.dtype == torch.float32 and ref_output.dtype in ( + torch.bfloat16, + torch.float16, + ): + model_f32 = Model(*get_init_inputs()).to(device).to(torch.float32) + inputs_f32 = [ + x.to(torch.float32) if x.is_floating_point() else x for x in inputs + ] + with torch.no_grad(): + ref_output = model_f32(*inputs_f32) + else: + kernel_output = kernel_output.to(ref_output.dtype) + if torch.allclose(ref_output, kernel_output, rtol=1e-2, atol=1e-2): + print("PASS") + return True + else: + max_diff = (ref_output - kernel_output).abs().max().item() + print(f"FAIL: max difference = {max_diff}") + return False + + +if __name__ == "__main__": + success = test_kernel() + sys.exit(0 if success else 1) diff --git a/pyproject.toml b/pyproject.toml index b34a3097..833b6a45 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,6 +45,7 @@ dev = [ fuser-ui = "scripts.fuser_ui:main" kernel-agent = "scripts.triton_ui:main" list-models = "scripts.list_models:main" +optimization-ui = "scripts.optimization_ui:main" pipeline-ui = "scripts.pipeline_ui:main" [project.urls] diff --git a/scripts/optimization_ui.py b/scripts/optimization_ui.py new file mode 100644 index 00000000..8e956d9a --- /dev/null +++ b/scripts/optimization_ui.py @@ -0,0 +1,1013 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Gradio UI for the hardware-guided kernel optimization pipeline.""" + +from __future__ import annotations + +import argparse +import logging +import os +import re +import sys +import threading +import time +import traceback +from pathlib import Path + +import gradio as gr +from dotenv import load_dotenv + +# Ensure project root is importable when run as a script. +_PROJECT_ROOT = Path(__file__).resolve().parent.parent +if str(_PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(_PROJECT_ROOT)) + + +def _list_kernelbench_problems(base: Path) -> list[tuple[str, str]]: + """Return list of (label, absolute_path) pairs for KernelBench problems.""" + problems: list[tuple[str, str]] = [] + if not base.exists(): + return problems + for level_dir in sorted(base.glob("level*")): + if not level_dir.is_dir(): + continue + if level_dir.name.lower() == "level4": + continue + for problem in sorted(level_dir.glob("*.py")): + label = f"{level_dir.name}/{problem.name}" + problems.append((label, str(problem.resolve()))) + return problems + + +def _discover_problems() -> list[tuple[str, str]]: + """Find KernelBench problems from common locations.""" + candidate_roots = [ + Path.cwd() / "external" / "KernelBench" / "KernelBench", + Path.cwd() / "KernelBench" / "KernelBench", + Path.cwd().parent / "KernelBench" / "KernelBench", + Path.cwd().parent.parent / "KernelBench" / "KernelBench", + ] + seen: set[str] = set() + problems: list[tuple[str, str]] = [] + for root in candidate_roots: + for label, path in _list_kernelbench_problems(root): + if path not in seen: + 
seen.add(path) + problems.append((label, path)) + return problems + + +_EXAMPLES_DIR = Path(__file__).resolve().parent.parent / "examples" + +_CUSTOM_OPTION = "-- Custom (paste below) --" + + +def _discover_examples() -> list[tuple[str, str]]: + """Find optimization examples from the examples/ directory. + + Returns list of (label, directory_path) for dirs matching ``optimize_*`` + that contain ``input.py`` and ``test.py``. + """ + examples: list[tuple[str, str]] = [] + if not _EXAMPLES_DIR.is_dir(): + return examples + for d in sorted(_EXAMPLES_DIR.glob("optimize_*")): + if not d.is_dir(): + continue + if (d / "input.py").exists() and (d / "test.py").exists(): + # Turn "optimize_01_matvec" into "MatVec" + label = d.name.split("_", 2)[-1].replace("_", " ").title() + examples.append((label, str(d))) + return examples + + +def _build_input_choices() -> list[str]: + """Build the dropdown choices: examples + custom.""" + choices: list[str] = [] + for label, _ in _discover_examples(): + choices.append(f"Example: {label}") + choices.append(_CUSTOM_OPTION) + return choices + + +def _get_gpu_choices() -> list[str]: + """Return GPU names from the specs database.""" + from kernel_perf_agent.kernel_opt.diagnose_prompt.gpu_specs_database import ( + GPU_SPECS_DATABASE, + ) + + return sorted(GPU_SPECS_DATABASE.keys()) + + +def _env_var_for_model(model_name: str) -> str: + """Determine which API key env var a model needs.""" + if "claude" in model_name.lower() or "anthropic" in model_name.lower(): + return "ANTHROPIC_API_KEY" + return "OPENAI_API_KEY" + + +def _load_sibling_file(problem_path: str, filename: str) -> str: + """Load a sibling file (input.py, test.py) next to a problem file.""" + if not problem_path: + return "" + parent = Path(problem_path).parent + candidate = parent / filename + if candidate.exists(): + try: + return candidate.read_text(encoding="utf-8") + except OSError: + pass + return "" + + +def run_optimization( + problem_label: str, + kernel_code: str, + 
test_code: str, + model_name: str, + gpu_name: str, + max_rounds: int, + high_reasoning: bool, + platform: str, + api_key: str | None, + strategy: str = "greedy", + num_workers: int = 1, + strategy_config: dict | None = None, + problem_file_override: str | None = None, + log_capture: _LogCapture | None = None, +) -> tuple[str, str, str, str | None, str]: + """Run the optimization pipeline and return (status_md, best_kernel, log, download_path, per_round_html).""" + from triton_kernel_agent.opt_manager import OptimizationManager + + if not kernel_code or not kernel_code.strip(): + return "**Error:** No kernel code provided.", "", "", None, "" + if not test_code or not test_code.strip(): + return "**Error:** No test code provided.", "", "", None, "" + + # Resolve API key + env_var = _env_var_for_model(model_name) + user_key = api_key.strip() if api_key else None + original_env_key = os.environ.get(env_var) + temp_key_set = False + if user_key: + os.environ[env_var] = user_key + temp_key_set = True + + try: + # Set up run directory + ts = int(time.time()) + run_dir = Path.cwd() / ".optimize" / f"optimization_{ts}" + output_dir = run_dir / "output" + output_dir.mkdir(parents=True, exist_ok=True) + + # Resolve problem file: explicit override > KB label lookup > stub + if problem_file_override and Path(problem_file_override).exists(): + problem_file = Path(problem_file_override) + else: + problem_mapping = {label: path for label, path in _discover_problems()} + source_problem = problem_mapping.get(problem_label, "") + if source_problem and Path(source_problem).exists(): + problem_file = Path(source_problem) + else: + # Write a stub problem file from kernel code context + problem_file = run_dir / "problem.py" + problem_file.parent.mkdir(parents=True, exist_ok=True) + problem_file.write_text( + "# Auto-generated problem stub\n" + "import torch\nimport torch.nn as nn\n\n" + "class Model(nn.Module):\n" + " def __init__(self):\n" + " super().__init__()\n" + " def 
forward(self, x):\n" + " return x\n", + encoding="utf-8", + ) + + # Set up log capture on the OptimizationManager logger + if log_capture is None: + log_capture = _LogCapture() + log_capture.metadata["log_dir"] = str(run_dir) + + mgr_logger = logging.getLogger("OptimizationManager") + stream_handler = logging.StreamHandler(log_capture) + stream_handler.setFormatter( + logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") + ) + mgr_logger.addHandler(stream_handler) + + try: + manager = OptimizationManager( + strategy=strategy, + num_workers=num_workers, + max_rounds=max_rounds, + log_dir=run_dir, + strategy_config=strategy_config, + openai_model=model_name, + high_reasoning_effort=high_reasoning, + gpu_name=gpu_name, + target_platform=platform, + ) + + result = manager.run_optimization( + initial_kernel=kernel_code, + problem_file=problem_file, + test_code=test_code, + ) + finally: + mgr_logger.removeHandler(stream_handler) + + # Build status markdown + status_md = _build_status_markdown(result, strategy, num_workers) + + # Build per-round best data from database + per_round_best: dict[int, dict] = {} + try: + all_entries = manager.database.get_all() + for entry in all_entries: + gen = entry.generation + if gen is None or gen < 1: + continue + time_ms = entry.metrics.time_ms + if ( + gen not in per_round_best + or time_ms < per_round_best[gen]["time_ms"] + ): + per_round_best[gen] = { + "time_ms": time_ms, + "program_id": entry.program_id, + "kernel_code": entry.kernel_code, + } + except Exception: + pass + round_html = _build_per_round_html(per_round_best) + + # Save best kernel for download + best_kernel = result.get("kernel_code") or "" + download_path = None + if best_kernel: + best_file = output_dir / "best_kernel.py" + best_file.write_text(best_kernel, encoding="utf-8") + download_path = str(best_file) + + log_text = log_capture.getvalue() + return status_md, best_kernel, log_text, download_path, round_html + + except Exception as e: + tb = 
traceback.format_exc() + return f"## Error\n\n```\n{e}\n```\n\n```\n{tb}\n```", "", "", None, "" + finally: + if temp_key_set: + if original_env_key is not None: + os.environ[env_var] = original_env_key + else: + os.environ.pop(env_var, None) + + +def _build_status_markdown(result: dict, strategy: str, num_workers: int) -> str: + """Build the final status markdown from an OptimizationManager result dict.""" + if not result.get("success"): + return "## Optimization Failed\n\nNo improvement found." + + best_time = result.get("best_time_ms", 0) + total_rounds = result.get("total_rounds", 0) + top_kernels = result.get("top_kernels", []) + initial_kernel_time = result.get("initial_kernel_time_ms", float("inf")) + pytorch_baseline = result.get("pytorch_baseline_ms", float("inf")) + pytorch_compile = result.get("pytorch_compile_ms", float("inf")) + + strategy_label = ( + f"Beam Search ({num_workers} workers)" + if strategy == "beam_search" + else f"Greedy ({num_workers} worker)" + ) + + status_md = "## Optimization Complete\n\n" + status_md += "| Metric | Value |\n|---|---|\n" + status_md += f"| Best Time | {best_time:.4f} ms |\n" + if initial_kernel_time != float("inf"): + status_md += f"| Initial Kernel | {initial_kernel_time:.4f} ms |\n" + if pytorch_baseline != float("inf"): + status_md += f"| PyTorch Eager | {pytorch_baseline:.4f} ms |\n" + if pytorch_compile != float("inf"): + status_md += f"| PyTorch Compile | {pytorch_compile:.4f} ms |\n" + if initial_kernel_time != float("inf") and best_time > 0: + speedup = initial_kernel_time / best_time + status_md += f"| Speedup vs Initial | {speedup:.2f}x |\n" + status_md += f"| Rounds | {total_rounds} |\n" + status_md += f"| Strategy | {strategy_label} |\n" + + if len(top_kernels) > 1: + status_md += f"| Top Kernels | {len(top_kernels)} found |\n" + status_md += "\n### Top Kernels\n" + status_md += "| # | Time (ms) | Generation |\n|---|---|---|\n" + for i, k in enumerate(top_kernels, 1): + status_md += f"| {i} | 
{k['time_ms']:.4f} | {k.get('generation', '-')} |\n" + + return status_md + + +def _build_per_round_html(per_round_best: dict[int, dict]) -> str: + """Render per-round best results as collapsible HTML sections. + + Args: + per_round_best: Mapping of round number to best entry dict + with keys: time_ms, program_id, kernel_code (snippet). + + Returns: + HTML string with
<details>/<summary> sections, last round open. + """ + if not per_round_best: + return "" + parts = ["<h3>Per-Round Results</h3>"] + max_round = max(per_round_best) + for rnd in sorted(per_round_best): + entry = per_round_best[rnd] + time_ms = entry.get("time_ms", float("inf")) + prog_id = entry.get("program_id", "?") + # Last round is open by default + open_attr = " open" if rnd == max_round else "" + parts.append(f"<details{open_attr}>") + parts.append( + f"<summary>Round {rnd}: {time_ms:.4f} ms (ID: {prog_id})</summary>" + ) + code = entry.get("kernel_code", "") + if code: + # Show first 30 lines as preview + lines = code.splitlines()[:30] + preview = "\n".join(lines) + if len(code.splitlines()) > 30: + preview += "\n# ... (truncated)" + parts.append(f"<pre><code>{preview}</code></pre>") + parts.append("</details>
") + return "\n".join(parts) + + +class _LogCapture: + """Thread-safe stream-like object that captures log messages.""" + + def __init__(self) -> None: + self._parts: list[str] = [] + self._lock = threading.Lock() + self._read_index: int = 0 + self.metadata: dict = {} + + def write(self, msg: str) -> None: + with self._lock: + self._parts.append(msg) + + def flush(self) -> None: + pass + + def getvalue(self) -> str: + with self._lock: + return "".join(self._parts) + + def get_new_lines(self) -> str: + """Return log content appended since the last call.""" + with self._lock: + new = self._parts[self._read_index :] + self._read_index = len(self._parts) + return "".join(new) + + +# Patterns matched against the *message* portion of each log line (after the +# ``asctime - LEVEL - `` prefix). Order matters: first match wins per line. +_LOG_PATTERNS: list[tuple[re.Pattern[str], str]] = [ + # Round boundary + (re.compile(r"ROUND\s+(\d+)/(\d+)"), "round"), + # Orchestrator phase transitions (exact prefixes to avoid duplicates) + (re.compile(r"\[\d+\] Profiling current kernel with NCU"), "phase_profile"), + (re.compile(r"\[\d+\] Analyzing bottleneck"), "phase_analyze"), + (re.compile(r"\[\d+\] Using pre-computed bottleneck"), "phase_analyze"), + (re.compile(r"\[\d+\] Generating optimized kernel"), "phase_generate"), + (re.compile(r"\[\d+\] Verifying correctness"), "phase_verify"), + # Verification result + (re.compile(r"\[\d+\].*Correctness check passed"), "verify_pass"), + (re.compile(r"\[\d+\].*Correctness check failed"), "verify_fail"), + # Performance results + ( + re.compile(r"NEW BEST RUNTIME.*?(\d+\.?\d*)\s*ms.*?speedup:\s*(\d+\.?\d*)x"), + "new_best", + ), + ( + re.compile( + r"\[\d+\] No improvement:\s*(\d+\.?\d*)\s*ms.*?best\s+(\d+\.?\d*)\s*ms" + ), + "no_improve", + ), + # Manager-level baselines (must precede worker-level "Baseline time:") + (re.compile(r"PyTorch baseline:\s*(\d+\.?\d*)ms"), "pytorch_eager"), + (re.compile(r"PyTorch compile 
baseline:\s*(\d+\.?\d*)ms"), "pytorch_compile"), + (re.compile(r"Initial kernel time:\s*(\d+\.?\d*)ms"), "initial_kernel_time"), + (re.compile(r"Speedup vs initial kernel:\s*(\d+\.?\d*)x"), "final_speedup_initial"), + (re.compile(r"Speedup vs PyTorch eager:\s*(\d+\.?\d*)x"), "final_speedup_pytorch"), + # Worker-level baseline + (re.compile(r"Baseline time:\s*(\d+\.?\d*)\s*ms"), "baseline"), + (re.compile(r"Using known kernel time:\s*(\d+\.?\d*)\s*ms"), "baseline"), + # Roofline / SOL (orchestrator-level, has "-bound" context) + (re.compile(r"Baseline SOL:\s*(\d+\.?\d*)%.*?(\w+)-bound"), "baseline_sol"), + (re.compile(r"\[\d+\] Roofline.*?(\w+)-bound.*?(\d+\.?\d*)% SOL"), "roofline"), + # Per-round best from manager + (re.compile(r"Round (\d+) best: worker (\d+) at (\d+\.?\d*) ms"), "round_best"), + (re.compile(r"Round (\d+): no successful workers"), "round_no_success"), + # Early termination + (re.compile(r"\[\d+\].*Early termination:\s*(.+)"), "early_stop"), + # Final summary + (re.compile(r"OPTIMIZATION COMPLETE"), "done"), + (re.compile(r"Speedup vs baseline:\s*(\d+\.?\d*)x"), "final_speedup"), + # Errors + (re.compile(r"timeout|timed?\s*out", re.IGNORECASE), "error"), + (re.compile(r"LLM.*?failed", re.IGNORECASE), "error"), +] + +_WORKER_DIR_RE = re.compile(r"/w(\d+)/") + + +def _tail_worker_logs(log_dir: str, offsets: dict[str, int]) -> dict[int, str]: + """Read new content from worker log files since last poll. + + Args: + log_dir: Root log directory (same as run_dir). + offsets: Mutable dict mapping log file path -> last read position. + + Returns: + Dict mapping worker_id (int) to new log content for that worker. 
+ """ + per_worker: dict[int, list[str]] = {} + workers_dir = Path(log_dir) / "workers" + if not workers_dir.exists(): + return {} + for log_file in sorted(workers_dir.glob("w*/r*/logs/*.log")): + path_str = str(log_file) + prev = offsets.get(path_str, 0) + wid_match = _WORKER_DIR_RE.search(path_str) + wid = int(wid_match.group(1)) if wid_match else 0 + try: + with open(log_file, encoding="utf-8", errors="replace") as f: + f.seek(prev) + chunk = f.read() + if chunk: + offsets[path_str] = prev + len(chunk) + per_worker.setdefault(wid, []).append(chunk) + except OSError: + pass + return {wid: "".join(parts) for wid, parts in per_worker.items()} + + +_TIMESTAMP_RE = re.compile(r"(\d{2}:\d{2}:\d{2})") + + +def _parse_log_for_status(raw_lines: str, manager_round: str = "") -> str: + """Extract curated status lines from raw log output, prefixed with timestamps. + + Args: + raw_lines: Raw log text. + manager_round: If set (e.g. "3/5"), worker-level "Round 1/1" lines + are rewritten to show the real manager round instead. 
+ """ + if not raw_lines: + return "" + curated: list[str] = [] + for line in raw_lines.splitlines(): + # Extract HH:MM:SS from the log prefix + ts_match = _TIMESTAMP_RE.search(line) + ts = ts_match.group(1) if ts_match else "" + + for pattern, kind in _LOG_PATTERNS: + m = pattern.search(line) + if not m: + continue + prefix = f"[{ts}] " if ts else "" + if kind == "round": + round_label = ( + manager_round if manager_round else f"{m.group(1)}/{m.group(2)}" + ) + curated.append(f"\n{prefix}=== Round {round_label} ===") + elif kind == "phase_profile": + curated.append(f"{prefix} Profiling kernel (NCU)...") + elif kind == "phase_analyze": + curated.append(f"{prefix} Analyzing bottleneck...") + elif kind == "phase_generate": + curated.append(f"{prefix} Generating optimized kernel...") + elif kind == "phase_verify": + curated.append(f"{prefix} Verifying correctness...") + elif kind == "verify_pass": + curated.append(f"{prefix} Correctness: PASSED") + elif kind == "verify_fail": + curated.append(f"{prefix} Correctness: FAILED") + elif kind == "new_best": + time_val = m.group(1) + speedup_val = float(m.group(2)) + if speedup_val > 1.0: + curated.append( + f"{prefix} \U0001f389 SPEEDUP {speedup_val:.2f}x \u2014 NEW BEST: {time_val} ms" + ) + else: + curated.append( + f"{prefix} NEW BEST: {time_val} ms (speedup {m.group(2)}x)" + ) + elif kind == "no_improve": + curated.append( + f"{prefix} No improvement ({m.group(1)} ms, best {m.group(2)} ms)" + ) + elif kind == "pytorch_eager": + curated.append(f"{prefix}PyTorch eager baseline: {m.group(1)} ms") + elif kind == "pytorch_compile": + curated.append(f"{prefix}PyTorch compile baseline: {m.group(1)} ms") + elif kind == "initial_kernel_time": + curated.append(f"{prefix}Initial kernel: {m.group(1)} ms") + elif kind == "final_speedup_initial": + curated.append(f"{prefix} Speedup vs initial kernel: {m.group(1)}x") + elif kind == "final_speedup_pytorch": + curated.append(f"{prefix} Speedup vs PyTorch eager: {m.group(1)}x") + elif 
kind == "baseline": + curated.append(f"{prefix} Worker baseline: {m.group(1)} ms") + elif kind == "baseline_sol": + curated.append( + f"{prefix}Baseline SOL: {m.group(1)}% ({m.group(2)}-bound)" + ) + elif kind == "roofline": + curated.append( + f"{prefix} Roofline: {m.group(1)}-bound, {m.group(2)}% SOL" + ) + elif kind == "round_best": + curated.append( + f"{prefix} Round {m.group(1)} winner: worker {m.group(2)} at {m.group(3)} ms" + ) + elif kind == "round_no_success": + curated.append(f"{prefix} Round {m.group(1)}: no successful workers") + elif kind == "early_stop": + curated.append(f"{prefix} Early stop: {m.group(1).strip()}") + elif kind == "done": + curated.append(f"\n{prefix}OPTIMIZATION COMPLETE") + elif kind == "final_speedup": + curated.append(f"{prefix} Final speedup: {m.group(1)}x") + elif kind == "error": + curated.append(f"{prefix} [ERROR] {m.group(0)}") + break # first matching pattern per line + return "\n".join(curated) + + +def build_interface() -> gr.Blocks: + from utils.providers.models import _get_model_name_to_config + + # Build dropdown: examples + custom + input_choices = _build_input_choices() + default_input = input_choices[0] if input_choices else _CUSTOM_OPTION + + # Pre-load default example content so fields aren't empty on launch + _examples = _discover_examples() + _example_map_init: dict[str, str] = { + f"Example: {label}": dirpath for label, dirpath in _examples + } + default_kernel = "" + default_test = "" + if default_input in _example_map_init: + _d = Path(_example_map_init[default_input]) + try: + default_kernel = (_d / "input.py").read_text(encoding="utf-8") + except OSError: + pass + try: + default_test = (_d / "test.py").read_text(encoding="utf-8") + except OSError: + pass + + model_names = sorted(_get_model_name_to_config().keys()) or ["gpt-5"] + default_model = "gpt-5" if "gpt-5" in model_names else model_names[0] + + gpu_choices = _get_gpu_choices() + default_gpu = gpu_choices[0] if gpu_choices else "" + + with gr.Blocks( 
+ title="KernelAgent — Optimization UI", + theme=gr.themes.Soft(), + css=".worker-log textarea { background-color: #f5f5f5 !important; }", + ) as app: + gr.Markdown( + "# KernelAgent — Kernel Optimization\n\n" + "Hardware-guided optimization: NCU profiling, roofline analysis, " + "LLM bottleneck diagnosis, and iterative refinement.\n\n" + "We have prepared **three examples** to get started — pick one " + "from the dropdown, or paste your own kernel and test code.\n\n" + "**Note:** 5 rounds of optimization can take about 30 minutes." + ) + + with gr.Row(): + # Left column: configuration + with gr.Column(scale=1): + gr.Markdown("## Configuration") + + api_key_input = gr.Textbox( + label="API Key (optional)", + placeholder="sk-... or sk-ant-...", + type="password", + value="", + info="Session-only. Uses env var from .env if empty.", + ) + + input_dropdown = gr.Dropdown( + choices=input_choices, + label="Input Source", + value=default_input, + interactive=True, + info="Pick an example to get started, or select Custom to paste your own.", + ) + + kernel_input = gr.Textbox( + label="Kernel Code", + placeholder="Paste a verified Triton kernel here...", + lines=12, + max_lines=30, + value=default_kernel, + ) + + test_input = gr.Textbox( + label="Test Code", + placeholder="Paste test code here...", + lines=8, + max_lines=20, + value=default_test, + ) + + model_dropdown = gr.Dropdown( + choices=model_names, + label="Model", + value=default_model, + interactive=True, + ) + + strategy_radio = gr.Radio( + choices=["Greedy (1 worker)", "Beam Search (4 workers)"], + value="Greedy (1 worker)", + label="Search Strategy", + ) + + gpu_dropdown = gr.Dropdown( + choices=gpu_choices, + label="GPU", + value=default_gpu, + interactive=True, + info="Select the GPU on your machine.", + ) + + max_rounds_slider = gr.Slider( + 1, 10, value=5, step=1, label="Max Optimization Rounds" + ) + + high_reasoning_cb = gr.Checkbox( + label="High Reasoning Effort", + value=True, + info="Use high 
reasoning for better quality (o4-mini/o3 series).", + ) + + optimize_button = gr.Button("Optimize Kernel", variant="primary") + + # Right column: results with tabs + with gr.Column(scale=2): + gr.Markdown("## Results") + + status_output = gr.Markdown( + value="*Ready — select a problem and paste a kernel to optimize.*" + ) + + with gr.Tab("Log"): + manager_log_output = gr.Textbox( + label="Manager", + interactive=False, + lines=8, + max_lines=20, + ) + with gr.Row(): + with gr.Column() as w0_col: + w0_log = gr.Textbox( + label="Worker 0", + interactive=False, + lines=18, + max_lines=40, + elem_classes=["worker-log"], + ) + with gr.Column(visible=False) as w1_col: + w1_log = gr.Textbox( + label="Worker 1", + interactive=False, + lines=18, + max_lines=40, + elem_classes=["worker-log"], + ) + with gr.Column(visible=False) as w2_col: + w2_log = gr.Textbox( + label="Worker 2", + interactive=False, + lines=18, + max_lines=40, + elem_classes=["worker-log"], + ) + with gr.Column(visible=False) as w3_col: + w3_log = gr.Textbox( + label="Worker 3", + interactive=False, + lines=18, + max_lines=40, + elem_classes=["worker-log"], + ) + + with gr.Tab("Best Kernel"): + kernel_output = gr.Code( + label="Optimized Kernel", + language="python", + interactive=False, + lines=25, + ) + per_round_html = gr.HTML( + value="", + label="Per-Round Results", + ) + + with gr.Tab("Download"): + download_output = gr.File( + label="Download best kernel", + interactive=False, + ) + + # Wire input dropdown to auto-load kernel and test code + _example_map = _example_map_init + + def _read_file(path: Path) -> str: + try: + return path.read_text(encoding="utf-8") + except OSError: + return "" + + def on_input_selected(label: str) -> tuple[str, str]: + if label == _CUSTOM_OPTION or not label: + return "", "" + if label in _example_map: + d = Path(_example_map[label]) + return _read_file(d / "input.py"), _read_file(d / "test.py") + return "", "" + + input_dropdown.change( + fn=on_input_selected, + 
inputs=input_dropdown, + outputs=[kernel_input, test_input], + ) + + # Toggle worker column visibility based on strategy + def on_strategy_change(choice: str): + is_beam = choice == "Beam Search (4 workers)" + return ( + gr.update(visible=True), + gr.update(visible=is_beam), + gr.update(visible=is_beam), + gr.update(visible=is_beam), + ) + + strategy_radio.change( + fn=on_strategy_change, + inputs=strategy_radio, + outputs=[w0_col, w1_col, w2_col, w3_col], + ) + + # Wire optimize button + def _parse_strategy(choice: str) -> tuple[str, int, dict]: + """Map strategy radio label to (strategy, num_workers, strategy_config).""" + if choice == "Beam Search (4 workers)": + return "beam_search", 4, {"num_top_kernels": 2, "num_bottlenecks": 2} + return "greedy", 1, {"max_no_improvement": 3} + + def on_optimize( + input_label: str, + kernel_code: str, + test_code: str, + model_name: str, + strategy_choice: str, + gpu_name: str, + max_rounds: int, + high_reasoning: bool, + api_key: str | None, + ): + strategy, num_workers, strategy_config = _parse_strategy(strategy_choice) + + # Resolve problem_label and problem_file_override from input source + problem_label = "" + problem_file_override = None + if input_label.startswith("KB: "): + problem_label = input_label[4:] + elif input_label in _example_map: + problem_file_override = str( + Path(_example_map[input_label]) / "problem.py" + ) + + log_capture = _LogCapture() + # run_optimization yields a 5-tuple: (status_md, best_kernel, log_text, download_path, round_html) + result: list[tuple[str, str, str, str | None, str]] = [] + error: list[BaseException] = [] + + def _worker() -> None: + try: + result.append( + run_optimization( + problem_label=problem_label, + kernel_code=kernel_code, + test_code=test_code, + model_name=model_name, + gpu_name=gpu_name, + max_rounds=int(max_rounds), + high_reasoning=high_reasoning, + platform="cuda", + api_key=api_key, + strategy=strategy, + num_workers=num_workers, + strategy_config=strategy_config, + problem_file_override=problem_file_override, + log_capture=log_capture, + ) + ) + except 
BaseException as exc: + error.append(exc) + + thread = threading.Thread(target=_worker, daemon=True) + thread.start() + + # Accumulated curated logs: manager + per-worker + mgr_curated = "" + worker_curated: dict[int, str] = {i: "" for i in range(4)} + # Track live status from log lines + current_round = "" + current_phase = "" + best_info = "" + _round_re = re.compile(r"Round (\d+/\d+)") + _best_re = re.compile(r"NEW BEST: (.+)") + worker_log_offsets: dict[str, int] = {} + + def _poll_logs() -> None: + nonlocal mgr_curated, current_round, current_phase, best_info + # Manager-level log + mgr_new = log_capture.get_new_lines() + if mgr_new: + parsed = _parse_log_for_status(mgr_new) + if parsed: + mgr_curated += parsed + "\n" + for cline in (parsed or "").splitlines(): + rm = _round_re.search(cline) + if rm: + current_round = rm.group(1) + current_phase = "" + for kw in ("Profiling", "Analyzing", "Generating", "Verifying"): + if kw in cline: + current_phase = kw.lower() + bm = _best_re.search(cline) + if bm: + best_info = bm.group(1) + # Worker-level logs (per-worker) + log_dir = log_capture.metadata.get("log_dir", "") + if log_dir: + per_worker = _tail_worker_logs(log_dir, worker_log_offsets) + for wid, raw in per_worker.items(): + parsed = _parse_log_for_status(raw, manager_round=current_round) + if parsed: + worker_curated[wid] = ( + worker_curated.get(wid, "") + parsed + "\n" + ) + + round_html_val: list[str] = [] + + def _make_yield(status, kernel_code, download): + return ( + status, + kernel_code, + mgr_curated.rstrip(), + worker_curated.get(0, "").rstrip(), + worker_curated.get(1, "").rstrip(), + worker_curated.get(2, "").rstrip(), + worker_curated.get(3, "").rstrip(), + download, + round_html_val[-1] if round_html_val else "", + ) + + # Poll with a hard timeout so the generator always terminates. + # 30 min per round × max_rounds + extra margin for baselines. 
+ poll_deadline = time.time() + int(max_rounds) * 1800 + 600 + while thread.is_alive(): + thread.join(timeout=2) + _poll_logs() + status_parts = ["**Optimizing…**"] + if current_round: + status_parts.append(f"Round {current_round}") + if current_phase: + status_parts.append(f"({current_phase})") + if best_info: + status_parts.append(f"| Best so far: {best_info}") + yield _make_yield(" ".join(status_parts), "", None) + if time.time() > poll_deadline: + error.append( + TimeoutError("Optimization exceeded maximum wall time") + ) + break + + # Drain remaining logs + _poll_logs() + + if error: + tb = "".join(traceback.format_exception(error[0])) + yield _make_yield( + f"## Error\n\n```\n{error[0]}\n```\n\n```\n{tb}\n```", + "", + None, + ) + elif result: + status, best_kernel, raw_log, download_path, rh = result[0] + round_html_val.append(rh) + # If no curated manager log, fall back to raw + if not mgr_curated.strip(): + mgr_curated = raw_log + yield _make_yield(status, best_kernel, download_path) + else: + yield _make_yield( + "## Error\n\nOptimization thread finished without result.", + "", + None, + ) + + optimize_button.click( + fn=on_optimize, + inputs=[ + input_dropdown, + kernel_input, + test_input, + model_dropdown, + strategy_radio, + gpu_dropdown, + max_rounds_slider, + high_reasoning_cb, + api_key_input, + ], + outputs=[ + status_output, + kernel_output, + manager_log_output, + w0_log, + w1_log, + w2_log, + w3_log, + download_output, + per_round_html, + ], + show_progress="hidden", + ) + + return app + + +def main() -> None: + parser = argparse.ArgumentParser(description="Optimization UI") + parser.add_argument("--port", type=int, default=8088) + parser.add_argument("--host", type=str, default="localhost") + args = parser.parse_args() + + load_dotenv() + app = build_interface() + + print("Starting Optimization UI...") + + meta_keyfile = Path("/var/facebook/x509_identities/server.pem") + is_meta_devserver = meta_keyfile.exists() + + if is_meta_devserver: + 
server_name = os.uname()[1] + print(f"Meta devserver detected. Visit https://{server_name}:{args.port}/") + app.launch( + share=False, + show_error=True, + server_name=server_name, + server_port=args.port, + ssl_keyfile=str(meta_keyfile), + ssl_certfile=str(meta_keyfile), + ssl_verify=False, + inbrowser=False, + ) + else: + print(f"Visit http://{args.host}:{args.port}/") + app.launch( + share=False, + show_error=True, + server_name=args.host, + server_port=args.port, + inbrowser=True, + ) + + +if __name__ == "__main__": + main() diff --git a/triton_kernel_agent/opt_manager.py b/triton_kernel_agent/opt_manager.py index 09bf6957..ac0d358a 100644 --- a/triton_kernel_agent/opt_manager.py +++ b/triton_kernel_agent/opt_manager.py @@ -242,6 +242,14 @@ def run_optimization( # Benchmark PyTorch baseline once (before spawning workers) pytorch_baseline = self._benchmark_pytorch_baseline(problem_file) + # Benchmark torch.compile baseline + pytorch_compile_time = self._benchmark_pytorch_compile(problem_file) + + # Benchmark the initial kernel + initial_kernel_time = self._benchmark_initial_kernel( + initial_kernel, problem_file + ) + # Round loop round_num = 0 for round_num in range(1, max_rounds + 1): @@ -262,6 +270,16 @@ def run_optimization( # 3. Update strategy with results self.strategy.update_with_results(results, round_num) + # Log per-round winner summary + successful = [r for r in results if r.get("success")] + if successful: + best = min(successful, key=lambda r: r.get("time_ms", float("inf"))) + self.logger.info( + f"Round {round_num} best: worker {best['worker_id']} at {best['time_ms']:.4f} ms" + ) + else: + self.logger.info(f"Round {round_num}: no successful workers") + # 4. 
Check termination if self.strategy.should_terminate(round_num, max_rounds): self.logger.info("Strategy signaled termination") @@ -277,12 +295,21 @@ def run_optimization( if best: self.logger.info(f"Best time: {best.metrics.time_ms:.4f}ms") + if initial_kernel_time != float("inf") and best.metrics.time_ms > 0: + speedup = initial_kernel_time / best.metrics.time_ms + self.logger.info(f"Speedup vs initial kernel: {speedup:.2f}x") + if pytorch_baseline != float("inf") and best.metrics.time_ms > 0: + speedup_pt = pytorch_baseline / best.metrics.time_ms + self.logger.info(f"Speedup vs PyTorch eager: {speedup_pt:.2f}x") return { "success": best is not None and best.metrics.time_ms != float("inf"), "kernel_code": best.kernel_code if best else None, "best_time_ms": best.metrics.time_ms if best else float("inf"), "total_rounds": round_num, + "pytorch_baseline_ms": pytorch_baseline, + "pytorch_compile_ms": pytorch_compile_time, + "initial_kernel_time_ms": initial_kernel_time, "top_kernels": [ { "kernel_code": p.kernel_code, @@ -325,6 +352,75 @@ def _benchmark_pytorch_baseline(self, problem_file: Path) -> float: return pytorch_time + def _benchmark_initial_kernel( + self, initial_kernel: str, problem_file: Path + ) -> float: + """Benchmark the initial kernel before optimization begins. 
+ + Args: + initial_kernel: Kernel source code + problem_file: Path to problem.py + + Returns: + Initial kernel time in ms + """ + from triton_kernel_agent.opt_worker_component.benchmarking.benchmark import ( + Benchmark, + ) + + artifacts_dir = self.log_dir / "artifacts" + artifacts_dir.mkdir(parents=True, exist_ok=True) + + # Write kernel to a temp file + kernel_file = artifacts_dir / "initial_kernel.py" + kernel_file.write_text(initial_kernel, encoding="utf-8") + + benchmarker = Benchmark( + logger=self.logger, + artifacts_dir=artifacts_dir, + benchmark_lock=self.benchmark_lock, + worker_id=-1, + ) + + result = benchmarker.benchmark_kernel(kernel_file, problem_file) + kernel_time = result.get("time_ms", float("inf")) + + if kernel_time != float("inf"): + self.logger.info(f"Initial kernel time: {kernel_time:.4f}ms") + + return kernel_time + + def _benchmark_pytorch_compile(self, problem_file: Path) -> float: + """Benchmark torch.compile'd PyTorch baseline. + + Args: + problem_file: Path to problem.py + + Returns: + torch.compile baseline time in ms + """ + from triton_kernel_agent.opt_worker_component.benchmarking.benchmark import ( + Benchmark, + ) + + artifacts_dir = self.log_dir / "artifacts" + artifacts_dir.mkdir(parents=True, exist_ok=True) + + benchmarker = Benchmark( + logger=self.logger, + artifacts_dir=artifacts_dir, + benchmark_lock=self.benchmark_lock, + worker_id=-1, + ) + + result = benchmarker.benchmark_pytorch_compile(problem_file) + compile_time = result.get("time_ms", float("inf")) + + if compile_time != float("inf"): + self.logger.info(f"PyTorch compile baseline: {compile_time:.4f}ms") + + return compile_time + def _run_workers( self, candidates: list[dict[str, Any]], @@ -400,6 +496,11 @@ def _run_workers( self.logger.warning(f"Worker {w.pid} timed out, terminating") w.terminate() w.join(timeout=5) + if w.is_alive(): + self.logger.warning(f"Worker {w.pid} still alive, killing") + w.kill() + w.join(timeout=2) + w.close() # Collect results results 
= [] @@ -409,6 +510,10 @@ def _run_workers( except Exception: break + # Clean up queue resources to prevent thread hangs during GC + result_queue.close() + result_queue.join_thread() + successful = sum(1 for r in results if r.get("success")) self.logger.info( f"Round {round_num}: {successful}/{len(candidates)} workers succeeded " diff --git a/triton_kernel_agent/opt_worker_component/benchmarking/benchmark.py b/triton_kernel_agent/opt_worker_component/benchmarking/benchmark.py index 8ee39e46..9f8314ac 100644 --- a/triton_kernel_agent/opt_worker_component/benchmarking/benchmark.py +++ b/triton_kernel_agent/opt_worker_component/benchmarking/benchmark.py @@ -234,3 +234,68 @@ def benchmark_pytorch( self.logger.error(f"PyTorch baseline benchmark failed: {e}") self.logger.error(traceback.format_exc()) return {"time_ms": float("inf")} + + def benchmark_pytorch_compile( + self, + problem_file: Path, + dtype: Optional[torch.dtype] = None, + ) -> dict[str, Any]: + """Benchmark torch.compile'd PyTorch baseline using direct in-process timing. + + Mirrors benchmark_pytorch() but wraps the model with torch.compile() + and uses extended warmup (3 forward calls) before timing to allow + compilation and warm caches. + + Args: + problem_file: Path to problem file (must define Model class and get_inputs()) + dtype: Data type to use (default: auto-detect based on model parameters) + + Returns: + Dictionary with benchmark results: + - time_ms: Mean time in ms + - stats: Full timing statistics (mean, std, min, max, all_times, etc.) 
+ """ + try: + with self.lock_manager: + model, inputs = prepare_pytorch_model( + problem_file=problem_file, + device="cuda", + dtype=dtype, + ) + + model = torch.compile(model) + + # Extended warmup: 3 forward calls to trigger compilation + for _ in range(3): + model(*inputs) + torch.cuda.synchronize() + + if self.timing_method == "do_bench": + times = time_with_triton_do_bench( + lambda: model(*inputs), + [], + warmup=self.warmup, + rep=self.repeat, + verbose=False, + ) + else: # cuda_event + times = time_with_cuda_events( + lambda: model(*inputs), + [], + num_warmup=self.warmup, + num_trials=self.repeat, + clear_cache=True, + verbose=False, + ) + + stats = compute_timing_stats(times) + + return { + "time_ms": stats["mean"], + "stats": stats, + } + + except Exception as e: + self.logger.error(f"PyTorch compile benchmark failed: {e}") + self.logger.error(traceback.format_exc()) + return {"time_ms": float("inf")} diff --git a/triton_kernel_agent/opt_worker_component/orchestrator/optimization_orchestrator.py b/triton_kernel_agent/opt_worker_component/orchestrator/optimization_orchestrator.py index d4832f49..2d05ce02 100644 --- a/triton_kernel_agent/opt_worker_component/orchestrator/optimization_orchestrator.py +++ b/triton_kernel_agent/opt_worker_component/orchestrator/optimization_orchestrator.py @@ -29,11 +29,7 @@ from kernel_perf_agent.kernel_opt.roofline.ncu_roofline import RooflineAnalyzer from triton_kernel_agent.prompt_manager import PromptManager from triton_kernel_agent.worker import VerificationWorker -from triton_kernel_agent.worker_util import ( - _call_llm, - _extract_code_from_response, - _write_kernel_file, -) +from triton_kernel_agent.worker_util import _write_kernel_file from utils.providers.base import BaseProvider @@ -854,12 +850,8 @@ def _generate_optimized_kernel(self, opt_prompt: str, round_num: int) -> str | N self.logger.info(f"[{round_num}] Generating optimized kernel...") try: messages = [{"role": "user", "content": opt_prompt}] - 
response_text = _call_llm( - provider=self.provider, - model=self.model, - messages=messages, - high_reasoning_effort=self.high_reasoning_effort, - logger=self.logger, + response_text = self.verification_worker._call_llm( + messages, max_tokens=24576, ) @@ -869,9 +861,8 @@ def _generate_optimized_kernel(self, opt_prompt: str, round_num: int) -> str | N f.write(response_text) # Extract code - optimized_kernel = _extract_code_from_response( - response_text=response_text, - logger=self.logger, + optimized_kernel = self.verification_worker._extract_code_from_response( + response_text, ) if not optimized_kernel or len(optimized_kernel) < 100: @@ -949,14 +940,14 @@ def _generate_reflexion(self, attempt: OptimizationAttempt) -> Reflexion | None: reflexion_prompt = self.prompt_manager.render_reflexion_prompt(attempt) messages = [{"role": "user", "content": reflexion_prompt}] - response_text = _call_llm( - provider=self.provider, - model=self.model, - messages=messages, - high_reasoning_effort=False, # Use standard reasoning for reflexion - logger=self.logger, + # Use provider directly with high_reasoning_effort=False + # (worker._call_llm would force high_reasoning=True if worker was configured that way) + response = self.provider.get_response( + self.model, + messages, max_tokens=2048, ) + response_text = response.content # Save reflexion response reflexion_file = ( diff --git a/triton_kernel_agent/opt_worker_component/profiling/ncu_wrapper_template.j2 b/triton_kernel_agent/opt_worker_component/profiling/ncu_wrapper_template.j2 index 29a7d413..48866810 100644 --- a/triton_kernel_agent/opt_worker_component/profiling/ncu_wrapper_template.j2 +++ b/triton_kernel_agent/opt_worker_component/profiling/ncu_wrapper_template.j2 @@ -15,6 +15,7 @@ limitations under the License. 
#} """NCU profiling wrapper.""" +import importlib import sys import torch import inspect @@ -22,14 +23,15 @@ sys.path.insert(0, str({{ kernel_file_parent }})) sys.path.insert(0, str({{ problem_file_parent }})) from {{ kernel_module }} import kernel_function -from {{ problem_module }} import get_inputs, get_init_inputs -# Try to import Model if it exists (for Conv, Linear, etc.) -try: - from {{ problem_module }} import Model - has_model = True -except ImportError: - has_model = False +_problem_mod = importlib.import_module({{ problem_module | tojson }}) +get_inputs = _problem_mod.get_inputs +get_init_inputs = _problem_mod.get_init_inputs + +# Try to get Model if it exists (for Conv, Linear, etc.) +has_model = hasattr(_problem_mod, 'Model') +if has_model: + Model = _problem_mod.Model # Get inputs inputs = get_inputs() diff --git a/triton_kernel_agent/worker_util.py b/triton_kernel_agent/worker_util.py index ee5ef955..2e113d44 100644 --- a/triton_kernel_agent/worker_util.py +++ b/triton_kernel_agent/worker_util.py @@ -25,6 +25,18 @@ # ------------------------ +def _call_llm( + provider, + model: str, + messages: list, + logger: Logger | None = None, + **kwargs, +) -> str: + """Call an LLM provider and return the response text.""" + response = provider.get_response(model, messages, **kwargs) + return response.content + + def _extract_history_usage_from_response( response_text: str, logger: Logger | None = None, diff --git a/utils/providers/relay_provider.py b/utils/providers/relay_provider.py index 6edc4d7e..3c35781a 100644 --- a/utils/providers/relay_provider.py +++ b/utils/providers/relay_provider.py @@ -93,7 +93,7 @@ def _handle_request( self.server_url, json=request_data, headers={"Content-Type": "application/json"}, - timeout=int(os.environ.get("LLM_RELAY_TIMEOUT_S", 120)), + timeout=int(os.environ.get("LLM_RELAY_TIMEOUT_S", 600)), ) if response.status_code != 200:
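The last hunk above raises the relay provider's default request timeout from 120 s to 600 s while keeping the `LLM_RELAY_TIMEOUT_S` environment override intact. A minimal sketch of that lookup behavior, assuming only the variable name taken from the diff (the helper function name is hypothetical):

```python
import os


def relay_timeout_s(default: int = 600) -> int:
    """Resolve the relay HTTP timeout as the diff does.

    The LLM_RELAY_TIMEOUT_S env var wins when set; otherwise the new
    600 s default applies. (Helper name is illustrative only.)
    """
    # int() raises ValueError on a non-numeric override, matching the
    # inline int(os.environ.get(...)) call in relay_provider.py.
    return int(os.environ.get("LLM_RELAY_TIMEOUT_S", default))


os.environ.pop("LLM_RELAY_TIMEOUT_S", None)
assert relay_timeout_s() == 600  # new default after the change

os.environ["LLM_RELAY_TIMEOUT_S"] = "120"
assert relay_timeout_s() == 120  # old behavior, restored via env var
os.environ.pop("LLM_RELAY_TIMEOUT_S", None)
```

Deployments that relied on the previous 120 s limit can pin it back by exporting `LLM_RELAY_TIMEOUT_S=120`.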