From 870c733aefc5e8e5bbafd67a5932acf0a75eef2c Mon Sep 17 00:00:00 2001 From: Simon Byrne Date: Thu, 15 Jan 2026 22:03:51 +0000 Subject: [PATCH 1/2] fix onnxscript export for irfft Signed-off-by: Simon Byrne --- onnxscript/function_libs/torch_lib/ops/fft.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxscript/function_libs/torch_lib/ops/fft.py b/onnxscript/function_libs/torch_lib/ops/fft.py index ea92dc347d..2b07209ed6 100644 --- a/onnxscript/function_libs/torch_lib/ops/fft.py +++ b/onnxscript/function_libs/torch_lib/ops/fft.py @@ -123,7 +123,7 @@ def aten__fft_c2r( dft_length=last_dim_size, axis=dimension, inverse=True, - onesided=False, + onesided=True, ) transformed = _fftn_onnx_normalization( transformed, From b9462d247a5c8831e1cf1eedc0c651463dbe2c7d Mon Sep 17 00:00:00 2001 From: Simon Byrne Date: Thu, 15 Jan 2026 22:24:39 +0000 Subject: [PATCH 2/2] update onnx_opset Signed-off-by: Simon Byrne --- onnxscript/onnx_opset/__init__.py | 24 + onnxscript/onnx_opset/_impl/opset1.py | 137 +- onnxscript/onnx_opset/_impl/opset10.py | 2 +- onnxscript/onnx_opset/_impl/opset11.py | 71 +- onnxscript/onnx_opset/_impl/opset12.py | 28 +- onnxscript/onnx_opset/_impl/opset13.py | 140 +- onnxscript/onnx_opset/_impl/opset14.py | 2 +- onnxscript/onnx_opset/_impl/opset15.py | 63 +- onnxscript/onnx_opset/_impl/opset16.py | 49 +- onnxscript/onnx_opset/_impl/opset17.py | 22 +- onnxscript/onnx_opset/_impl/opset18.py | 183 +- onnxscript/onnx_opset/_impl/opset19.py | 55 +- onnxscript/onnx_opset/_impl/opset2.py | 9 +- onnxscript/onnx_opset/_impl/opset20.py | 57 +- onnxscript/onnx_opset/_impl/opset21.py | 74 +- onnxscript/onnx_opset/_impl/opset22.py | 14 +- onnxscript/onnx_opset/_impl/opset23.py | 136 +- onnxscript/onnx_opset/_impl/opset24.py | 85 +- onnxscript/onnx_opset/_impl/opset25.py | 1983 +++++++++++++++++ onnxscript/onnx_opset/_impl/opset26.py | 88 + onnxscript/onnx_opset/_impl/opset3.py | 2 +- onnxscript/onnx_opset/_impl/opset4.py | 2 +- onnxscript/onnx_opset/_impl/opset5.py | 2 +- onnxscript/onnx_opset/_impl/opset6.py | 15 +- onnxscript/onnx_opset/_impl/opset7.py | 2 +- onnxscript/onnx_opset/_impl/opset8.py | 2 +- onnxscript/onnx_opset/_impl/opset9.py | 35 +- .../onnx_opset/_impl/opset_ai_onnx_ml1.py | 2 +- .../onnx_opset/_impl/opset_ai_onnx_ml2.py | 2 +- .../onnx_opset/_impl/opset_ai_onnx_ml3.py | 2 +- .../onnx_opset/_impl/opset_ai_onnx_ml4.py | 2 +- .../onnx_opset/_impl/opset_ai_onnx_ml5.py | 2 +- .../_impl/opset_ai_onnx_preview_training1.py | 576 +++++ 33 files changed, 3263 insertions(+), 605 deletions(-) create mode 100644 onnxscript/onnx_opset/_impl/opset25.py create mode 100644 onnxscript/onnx_opset/_impl/opset26.py create mode 100644 onnxscript/onnx_opset/_impl/opset_ai_onnx_preview_training1.py diff --git a/onnxscript/onnx_opset/__init__.py b/onnxscript/onnx_opset/__init__.py index 9b6ed0915c..cbcd967f89 100644 --- a/onnxscript/onnx_opset/__init__.py +++ b/onnxscript/onnx_opset/__init__.py @@ -7,6 +7,7 @@ # -------------------------------------------------------------------------- # pylint: disable=W0221,W0222,R0901,W0237 # mypy: disable-error-code=override +# ruff: noqa: N801,E741,RUF036,D214,D402,D405,D411,D412,D416,D417 # -------------------------------------------------------------------------- from __future__ import annotations @@ -39,11 +40,16 @@ from onnxscript.onnx_opset._impl.opset22 import Opset22 from onnxscript.onnx_opset._impl.opset23 import Opset23 from onnxscript.onnx_opset._impl.opset24 import Opset24 +from onnxscript.onnx_opset._impl.opset25 import Opset25 
+from onnxscript.onnx_opset._impl.opset26 import Opset26 from onnxscript.onnx_opset._impl.opset_ai_onnx_ml1 import Opset_ai_onnx_ml1 from onnxscript.onnx_opset._impl.opset_ai_onnx_ml2 import Opset_ai_onnx_ml2 from onnxscript.onnx_opset._impl.opset_ai_onnx_ml3 import Opset_ai_onnx_ml3 from onnxscript.onnx_opset._impl.opset_ai_onnx_ml4 import Opset_ai_onnx_ml4 from onnxscript.onnx_opset._impl.opset_ai_onnx_ml5 import Opset_ai_onnx_ml5 +from onnxscript.onnx_opset._impl.opset_ai_onnx_preview_training1 import ( + Opset_ai_onnx_preview_training1, +) from onnxscript.values import Opset __all__ = [ @@ -72,11 +78,14 @@ "opset22", "opset23", "opset24", + "opset25", + "opset26", "opset_ai_onnx_ml1", "opset_ai_onnx_ml2", "opset_ai_onnx_ml3", "opset_ai_onnx_ml4", "opset_ai_onnx_ml5", + "opset_ai_onnx_preview_training1", ] @@ -110,11 +119,14 @@ opset22 = Opset22() opset23 = Opset23() opset24 = Opset24() +opset25 = Opset25() +opset26 = Opset26() opset_ai_onnx_ml1 = Opset_ai_onnx_ml1() opset_ai_onnx_ml2 = Opset_ai_onnx_ml2() opset_ai_onnx_ml3 = Opset_ai_onnx_ml3() opset_ai_onnx_ml4 = Opset_ai_onnx_ml4() opset_ai_onnx_ml5 = Opset_ai_onnx_ml5() +opset_ai_onnx_preview_training1 = Opset_ai_onnx_preview_training1() all_opsets: Mapping[Tuple[str, int], Opset] = { ( "", @@ -212,6 +224,14 @@ "", 24, ): opset24, + ( + "", + 25, + ): opset25, + ( + "", + 26, + ): opset26, ( "ai.onnx.ml", 1, @@ -232,4 +252,8 @@ "ai.onnx.ml", 5, ): opset_ai_onnx_ml5, + ( + "ai.onnx.preview.training", + 1, + ): opset_ai_onnx_preview_training1, } diff --git a/onnxscript/onnx_opset/_impl/opset1.py b/onnxscript/onnx_opset/_impl/opset1.py index 4af313184d..454169f69f 100644 --- a/onnxscript/onnx_opset/_impl/opset1.py +++ b/onnxscript/onnx_opset/_impl/opset1.py @@ -7,7 +7,7 @@ # -------------------------------------------------------------------------- # pylint: disable=W0221,W0222,R0901,W0237 # mypy: disable-error-code=override -# ruff: noqa: D214, D402, D405, D411, D416, D417 +# ruff: noqa: N801,E741,RUF036,D214,D402,D405,D411,D412,D416,D417 # -------------------------------------------------------------------------- from __future__ import annotations @@ -397,18 +397,7 @@ def BatchNormalization( ) T2_Cast: TypeAlias = Union[ - BOOL, - DOUBLE, - FLOAT, - FLOAT16, - INT16, - INT32, - INT64, - INT8, - UINT16, - UINT32, - UINT64, - UINT8, + BOOL, DOUBLE, FLOAT, FLOAT16, INT16, INT32, INT64, INT8, UINT16, UINT32, UINT64, UINT8 ] def Cast(self, input: T1_Cast, *, to: str) -> T2_Cast: @@ -847,11 +836,7 @@ def Dropout( T_Elu = TypeVar("T_Elu", DOUBLE, FLOAT, FLOAT16) def Elu( - self, - X: T_Elu, - *, - alpha: float = 1.0, - consumed_inputs: Optional[Sequence[int]] = None, + self, X: T_Elu, *, alpha: float = 1.0, consumed_inputs: Optional[Sequence[int]] = None ) -> T_Elu: r"""[🌐 Elu(1)](https://onnx.ai/onnx/operators/onnx__Elu.html#elu-1 "Online Documentation") @@ -873,9 +858,7 @@ def Elu( schema = get_schema("Elu", 1, "") op = Op(self, "Elu", schema) return op( - *self._prepare_inputs(schema, X), - alpha=alpha, - consumed_inputs=consumed_inputs, + *self._prepare_inputs(schema, X), alpha=alpha, consumed_inputs=consumed_inputs ) T_Equal = TypeVar("T_Equal", BOOL, INT32, INT64) @@ -1354,12 +1337,7 @@ def GlobalMaxPool(self, X: T_GlobalMaxPool) -> T_GlobalMaxPool: T1_Greater: TypeAlias = BOOL def Greater( - self, - A: T_Greater, - B: T_Greater, - *, - axis: Optional[int] = None, - broadcast: int = 0, + self, A: T_Greater, B: T_Greater, *, axis: Optional[int] = None, broadcast: int = 0 ) -> T1_Greater: r"""[🌐 
Greater(1)](https://onnx.ai/onnx/operators/onnx__Greater.html#greater-1 "Online Documentation") @@ -1624,11 +1602,7 @@ def LRN( schema = get_schema("LRN", 1, "") op = Op(self, "LRN", schema) return op( - *self._prepare_inputs(schema, X), - alpha=alpha, - beta=beta, - bias=bias, - size=size, + *self._prepare_inputs(schema, X), alpha=alpha, beta=beta, bias=bias, size=size ) T_LSTM = TypeVar("T_LSTM", DOUBLE, FLOAT, FLOAT16) @@ -1847,9 +1821,7 @@ def LeakyRelu( schema = get_schema("LeakyRelu", 1, "") op = Op(self, "LeakyRelu", schema) return op( - *self._prepare_inputs(schema, X), - alpha=alpha, - consumed_inputs=consumed_inputs, + *self._prepare_inputs(schema, X), alpha=alpha, consumed_inputs=consumed_inputs ) T_Less = TypeVar("T_Less", DOUBLE, FLOAT, FLOAT16) @@ -1962,11 +1934,7 @@ def LogSoftmax(self, input: T_LogSoftmax, *, axis: int = 1) -> T_LogSoftmax: ) def Loop( - self, - M: Optional[I_Loop], - cond: Optional[B_Loop], - *v_initial: V_Loop, - body: GraphProto, + self, M: Optional[I_Loop], cond: Optional[B_Loop], *v_initial: V_Loop, body: GraphProto ) -> V_Loop: r"""[🌐 Loop(1)](https://onnx.ai/onnx/operators/onnx__Loop.html#loop-1 "Online Documentation") @@ -2524,11 +2492,7 @@ def Or(self, A: T_Or, B: T_Or, *, axis: Optional[int] = None, broadcast: int = 0 T_PRelu = TypeVar("T_PRelu", DOUBLE, FLOAT, FLOAT16) def PRelu( - self, - X: T_PRelu, - slope: T_PRelu, - *, - consumed_inputs: Optional[Sequence[int]] = None, + self, X: T_PRelu, slope: T_PRelu, *, consumed_inputs: Optional[Sequence[int]] = None ) -> T_PRelu: r"""[🌐 PRelu(1)](https://onnx.ai/onnx/operators/onnx__PRelu.html#prelu-1 "Online Documentation") @@ -2602,10 +2566,7 @@ def Pad( schema = get_schema("Pad", 1, "") op = Op(self, "Pad", schema) return op( - *self._prepare_inputs(schema, data), - mode=mode, - paddings=paddings, - value=value, + *self._prepare_inputs(schema, data), mode=mode, paddings=paddings, value=value ) T_Pow = TypeVar("T_Pow", DOUBLE, FLOAT, FLOAT16) @@ -3013,11 +2974,7 @@ def RandomUniformLike( schema = get_schema("RandomUniformLike", 1, "") op = Op(self, "RandomUniformLike", schema) return op( - *self._prepare_inputs(schema, input), - dtype=dtype, - high=high, - low=low, - seed=seed, + *self._prepare_inputs(schema, input), dtype=dtype, high=high, low=low, seed=seed ) T_Reciprocal = TypeVar("T_Reciprocal", DOUBLE, FLOAT, FLOAT16) @@ -3046,11 +3003,7 @@ def Reciprocal( T_ReduceL1 = TypeVar("T_ReduceL1", DOUBLE, FLOAT, FLOAT16, INT32, INT64, UINT32, UINT64) def ReduceL1( - self, - data: T_ReduceL1, - *, - axes: Optional[Sequence[int]] = None, - keepdims: int = 1, + self, data: T_ReduceL1, *, axes: Optional[Sequence[int]] = None, keepdims: int = 1 ) -> T_ReduceL1: r"""[🌐 ReduceL1(1)](https://onnx.ai/onnx/operators/onnx__ReduceL1.html#reducel1-1 "Online Documentation") @@ -3080,11 +3033,7 @@ def ReduceL1( T_ReduceL2 = TypeVar("T_ReduceL2", DOUBLE, FLOAT, FLOAT16, INT32, INT64, UINT32, UINT64) def ReduceL2( - self, - data: T_ReduceL2, - *, - axes: Optional[Sequence[int]] = None, - keepdims: int = 1, + self, data: T_ReduceL2, *, axes: Optional[Sequence[int]] = None, keepdims: int = 1 ) -> T_ReduceL2: r"""[🌐 ReduceL2(1)](https://onnx.ai/onnx/operators/onnx__ReduceL2.html#reducel2-1 "Online Documentation") @@ -3116,11 +3065,7 @@ def ReduceL2( ) def ReduceLogSum( - self, - data: T_ReduceLogSum, - *, - axes: Optional[Sequence[int]] = None, - keepdims: int = 1, + self, data: T_ReduceLogSum, *, axes: Optional[Sequence[int]] = None, keepdims: int = 1 ) -> T_ReduceLogSum: r"""[🌐 
ReduceLogSum(1)](https://onnx.ai/onnx/operators/onnx__ReduceLogSum.html#reducelogsum-1 "Online Documentation") @@ -3186,11 +3131,7 @@ def ReduceLogSumExp( T_ReduceMax = TypeVar("T_ReduceMax", DOUBLE, FLOAT, FLOAT16, INT32, INT64, UINT32, UINT64) def ReduceMax( - self, - data: T_ReduceMax, - *, - axes: Optional[Sequence[int]] = None, - keepdims: int = 1, + self, data: T_ReduceMax, *, axes: Optional[Sequence[int]] = None, keepdims: int = 1 ) -> T_ReduceMax: r"""[🌐 ReduceMax(1)](https://onnx.ai/onnx/operators/onnx__ReduceMax.html#reducemax-1 "Online Documentation") @@ -3222,11 +3163,7 @@ def ReduceMax( ) def ReduceMean( - self, - data: T_ReduceMean, - *, - axes: Optional[Sequence[int]] = None, - keepdims: int = 1, + self, data: T_ReduceMean, *, axes: Optional[Sequence[int]] = None, keepdims: int = 1 ) -> T_ReduceMean: r"""[🌐 ReduceMean(1)](https://onnx.ai/onnx/operators/onnx__ReduceMean.html#reducemean-1 "Online Documentation") @@ -3256,11 +3193,7 @@ def ReduceMean( T_ReduceMin = TypeVar("T_ReduceMin", DOUBLE, FLOAT, FLOAT16, INT32, INT64, UINT32, UINT64) def ReduceMin( - self, - data: T_ReduceMin, - *, - axes: Optional[Sequence[int]] = None, - keepdims: int = 1, + self, data: T_ReduceMin, *, axes: Optional[Sequence[int]] = None, keepdims: int = 1 ) -> T_ReduceMin: r"""[🌐 ReduceMin(1)](https://onnx.ai/onnx/operators/onnx__ReduceMin.html#reducemin-1 "Online Documentation") @@ -3292,11 +3225,7 @@ def ReduceMin( ) def ReduceProd( - self, - data: T_ReduceProd, - *, - axes: Optional[Sequence[int]] = None, - keepdims: int = 1, + self, data: T_ReduceProd, *, axes: Optional[Sequence[int]] = None, keepdims: int = 1 ) -> T_ReduceProd: r"""[🌐 ReduceProd(1)](https://onnx.ai/onnx/operators/onnx__ReduceProd.html#reduceprod-1 "Online Documentation") @@ -3326,11 +3255,7 @@ def ReduceProd( T_ReduceSum = TypeVar("T_ReduceSum", DOUBLE, FLOAT, FLOAT16, INT32, INT64, UINT32, UINT64) def ReduceSum( - self, - data: T_ReduceSum, - *, - axes: Optional[Sequence[int]] = None, - keepdims: int = 1, + self, data: T_ReduceSum, *, axes: Optional[Sequence[int]] = None, keepdims: int = 1 ) -> T_ReduceSum: r"""[🌐 ReduceSum(1)](https://onnx.ai/onnx/operators/onnx__ReduceSum.html#reducesum-1 "Online Documentation") @@ -3445,9 +3370,7 @@ def Reshape( schema = get_schema("Reshape", 1, "") op = Op(self, "Reshape", schema) return op( - *self._prepare_inputs(schema, data), - consumed_inputs=consumed_inputs, - shape=shape, + *self._prepare_inputs(schema, data), consumed_inputs=consumed_inputs, shape=shape ) T_Selu = TypeVar("T_Selu", DOUBLE, FLOAT, FLOAT16) @@ -4036,9 +3959,16 @@ def Transpose( r"""[🌐 Transpose(1)](https://onnx.ai/onnx/operators/onnx__Transpose.html#transpose-1 "Online Documentation") - Transpose the input tensor similar to numpy.transpose. For example, when - perm=(1, 0, 2), given an input tensor of shape (1, 2, 3), the output shape - will be (2, 1, 3). + Returns a transpose of the input tensor. (Similar to `numpy.transpose`). + The optional attribute `perm` must be a permutation of the dimensions of + the input tensor. Axis `i` of the output tensor corresponds to the axis + `perm[i]` of the input tensor. + For example, when perm=(1, 0, 2), given an input tensor of shape (1, 2, 3), + the output shape will be (2, 1, 3). + When perm=(1, 2, 0), given an input tensor of shape (1, 2, 3), + the output shape will be (2, 3, 1). + If the attribute `perm` is omitted, its default value is `(n-1, ..., 0)`, + where `n` is the rank of the input tensor. 
Args: @@ -4095,12 +4025,7 @@ def Unsqueeze(self, data: T_Unsqueeze, *, axes: Sequence[int]) -> T_Unsqueeze: T_Upsample = TypeVar("T_Upsample", BOOL, DOUBLE, FLOAT, FLOAT16, INT32, INT64) def Upsample( - self, - X: T_Upsample, - *, - height_scale: float, - mode: str = "nearest", - width_scale: float, + self, X: T_Upsample, *, height_scale: float, mode: str = "nearest", width_scale: float ) -> T_Upsample: r"""[🌐 Upsample(1)](https://onnx.ai/onnx/operators/onnx__Upsample.html#upsample-1 "Online Documentation") diff --git a/onnxscript/onnx_opset/_impl/opset10.py b/onnxscript/onnx_opset/_impl/opset10.py index ec1734b266..f4aee792f6 100644 --- a/onnxscript/onnx_opset/_impl/opset10.py +++ b/onnxscript/onnx_opset/_impl/opset10.py @@ -7,7 +7,7 @@ # -------------------------------------------------------------------------- # pylint: disable=W0221,W0222,R0901,W0237 # mypy: disable-error-code=override -# ruff: noqa: D402 +# ruff: noqa: N801,E741,RUF036,D214,D402,D405,D411,D412,D416,D417 # -------------------------------------------------------------------------- from __future__ import annotations diff --git a/onnxscript/onnx_opset/_impl/opset11.py b/onnxscript/onnx_opset/_impl/opset11.py index 6538ac3afb..094a9aaa81 100644 --- a/onnxscript/onnx_opset/_impl/opset11.py +++ b/onnxscript/onnx_opset/_impl/opset11.py @@ -7,7 +7,7 @@ # -------------------------------------------------------------------------- # pylint: disable=W0221,W0222,R0901,W0237 # mypy: disable-error-code=override -# ruff: noqa: E741, D214, D402, D405, D411, D416 +# ruff: noqa: N801,E741,RUF036,D214,D402,D405,D411,D412,D416,D417 # -------------------------------------------------------------------------- from __future__ import annotations @@ -846,7 +846,7 @@ def DynamicQuantizeLinear( :: intermediate_zero_point = qmin - min(x)/y_scale - y_zero_point = cast(round(saturate(itermediate_zero_point))) + y_zero_point = cast(round(saturate(intermediate_zero_point))) @@ -1464,11 +1464,7 @@ def LogSoftmax(self, input: T_LogSoftmax, *, axis: int = 1) -> T_LogSoftmax: ) def Loop( - self, - M: Optional[I_Loop], - cond: Optional[B_Loop], - *v_initial: V_Loop, - body: GraphProto, + self, M: Optional[I_Loop], cond: Optional[B_Loop], *v_initial: V_Loop, body: GraphProto ) -> V_Loop: r"""[🌐 Loop(11)](https://onnx.ai/onnx/operators/onnx__Loop.html#loop-11 "Online Documentation") @@ -1586,7 +1582,7 @@ def Loop( 1) Values from the enclosing scope (i.e. variable "a" here) are in scope and can be referenced in the inputs of the loop. 2) Any values computed in the loop body that needs to be used in a subsequent - iteration or after the loop are modelled using a pair of variables in the loop-body, + iteration or after the loop are modeled using a pair of variables in the loop-body, consisting of an input variable (eg., b_in) and an output variable (eg., b_out). These are referred to as loop-carried dependences. 
The loop operation node supplies the input value of the input variable for the first iteration, and @@ -2241,11 +2237,7 @@ def Range(self, start: T_Range, limit: T_Range, delta: T_Range) -> T_Range: T_ReduceL1 = TypeVar("T_ReduceL1", DOUBLE, FLOAT, FLOAT16, INT32, INT64, UINT32, UINT64) def ReduceL1( - self, - data: T_ReduceL1, - *, - axes: Optional[Sequence[int]] = None, - keepdims: int = 1, + self, data: T_ReduceL1, *, axes: Optional[Sequence[int]] = None, keepdims: int = 1 ) -> T_ReduceL1: r"""[🌐 ReduceL1(11)](https://onnx.ai/onnx/operators/onnx__ReduceL1.html#reducel1-11 "Online Documentation") @@ -2275,11 +2267,7 @@ def ReduceL1( T_ReduceL2 = TypeVar("T_ReduceL2", DOUBLE, FLOAT, FLOAT16, INT32, INT64, UINT32, UINT64) def ReduceL2( - self, - data: T_ReduceL2, - *, - axes: Optional[Sequence[int]] = None, - keepdims: int = 1, + self, data: T_ReduceL2, *, axes: Optional[Sequence[int]] = None, keepdims: int = 1 ) -> T_ReduceL2: r"""[🌐 ReduceL2(11)](https://onnx.ai/onnx/operators/onnx__ReduceL2.html#reducel2-11 "Online Documentation") @@ -2311,11 +2299,7 @@ def ReduceL2( ) def ReduceLogSum( - self, - data: T_ReduceLogSum, - *, - axes: Optional[Sequence[int]] = None, - keepdims: int = 1, + self, data: T_ReduceLogSum, *, axes: Optional[Sequence[int]] = None, keepdims: int = 1 ) -> T_ReduceLogSum: r"""[🌐 ReduceLogSum(11)](https://onnx.ai/onnx/operators/onnx__ReduceLogSum.html#reducelogsum-11 "Online Documentation") @@ -2381,11 +2365,7 @@ def ReduceLogSumExp( T_ReduceMax = TypeVar("T_ReduceMax", DOUBLE, FLOAT, FLOAT16, INT32, INT64, UINT32, UINT64) def ReduceMax( - self, - data: T_ReduceMax, - *, - axes: Optional[Sequence[int]] = None, - keepdims: int = 1, + self, data: T_ReduceMax, *, axes: Optional[Sequence[int]] = None, keepdims: int = 1 ) -> T_ReduceMax: r"""[🌐 ReduceMax(11)](https://onnx.ai/onnx/operators/onnx__ReduceMax.html#reducemax-11 "Online Documentation") @@ -2418,11 +2398,7 @@ def ReduceMax( ) def ReduceMean( - self, - data: T_ReduceMean, - *, - axes: Optional[Sequence[int]] = None, - keepdims: int = 1, + self, data: T_ReduceMean, *, axes: Optional[Sequence[int]] = None, keepdims: int = 1 ) -> T_ReduceMean: r"""[🌐 ReduceMean(11)](https://onnx.ai/onnx/operators/onnx__ReduceMean.html#reducemean-11 "Online Documentation") @@ -2452,11 +2428,7 @@ def ReduceMean( T_ReduceMin = TypeVar("T_ReduceMin", DOUBLE, FLOAT, FLOAT16, INT32, INT64, UINT32, UINT64) def ReduceMin( - self, - data: T_ReduceMin, - *, - axes: Optional[Sequence[int]] = None, - keepdims: int = 1, + self, data: T_ReduceMin, *, axes: Optional[Sequence[int]] = None, keepdims: int = 1 ) -> T_ReduceMin: r"""[🌐 ReduceMin(11)](https://onnx.ai/onnx/operators/onnx__ReduceMin.html#reducemin-11 "Online Documentation") @@ -2489,11 +2461,7 @@ def ReduceMin( ) def ReduceProd( - self, - data: T_ReduceProd, - *, - axes: Optional[Sequence[int]] = None, - keepdims: int = 1, + self, data: T_ReduceProd, *, axes: Optional[Sequence[int]] = None, keepdims: int = 1 ) -> T_ReduceProd: r"""[🌐 ReduceProd(11)](https://onnx.ai/onnx/operators/onnx__ReduceProd.html#reduceprod-11 "Online Documentation") @@ -2523,11 +2491,7 @@ def ReduceProd( T_ReduceSum = TypeVar("T_ReduceSum", DOUBLE, FLOAT, FLOAT16, INT32, INT64, UINT32, UINT64) def ReduceSum( - self, - data: T_ReduceSum, - *, - axes: Optional[Sequence[int]] = None, - keepdims: int = 1, + self, data: T_ReduceSum, *, axes: Optional[Sequence[int]] = None, keepdims: int = 1 ) -> T_ReduceSum: r"""[🌐 ReduceSum(11)](https://onnx.ai/onnx/operators/onnx__ReduceSum.html#reducesum-11 "Online Documentation") @@ 
-3130,7 +3094,7 @@ def ScatterND( output = np.copy(data) update_indices = indices.shape[:-1] for idx in np.ndindex(update_indices): - output[indices[idx]] = updates[idx] + output[tuple(indices[idx])] = updates[idx] The order of iteration in the above loop is not specified. In particular, indices should not have duplicate entries: that is, if idx1 != idx2, then indices[idx1] != indices[idx2]. @@ -3349,9 +3313,7 @@ def SequenceEmpty(self, *, dtype: Optional[int] = None) -> S_SequenceEmpty: I_SequenceErase = TypeVar("I_SequenceErase", INT32, INT64) def SequenceErase( - self, - input_sequence: S_SequenceErase, - position: Optional[I_SequenceErase] = None, + self, input_sequence: S_SequenceErase, position: Optional[I_SequenceErase] = None ) -> S_SequenceErase: r"""[🌐 SequenceErase(11)](https://onnx.ai/onnx/operators/onnx__SequenceErase.html#sequenceerase-11 "Online Documentation") @@ -3835,10 +3797,7 @@ def TopK( schema = get_schema("TopK", 11, "") op = Op(self, "TopK", schema) return op( - *self._prepare_inputs(schema, X, K), - axis=axis, - largest=largest, - sorted=sorted, + *self._prepare_inputs(schema, X, K), axis=axis, largest=largest, sorted=sorted ) T_Unique = TypeVar( diff --git a/onnxscript/onnx_opset/_impl/opset12.py b/onnxscript/onnx_opset/_impl/opset12.py index 95b2ea83c5..91dd7ab5c7 100644 --- a/onnxscript/onnx_opset/_impl/opset12.py +++ b/onnxscript/onnx_opset/_impl/opset12.py @@ -7,7 +7,7 @@ # -------------------------------------------------------------------------- # pylint: disable=W0221,W0222,R0901,W0237 # mypy: disable-error-code=override -# ruff: noqa: D402 +# ruff: noqa: N801,E741,RUF036,D214,D402,D405,D411,D412,D416,D417 # -------------------------------------------------------------------------- from __future__ import annotations @@ -59,12 +59,7 @@ def __new__(cls): ) def ArgMax( - self, - data: T_ArgMax, - *, - axis: int = 0, - keepdims: int = 1, - select_last_index: int = 0, + self, data: T_ArgMax, *, axis: int = 0, keepdims: int = 1, select_last_index: int = 0 ) -> INT64: r"""[🌐 ArgMax(12)](https://onnx.ai/onnx/operators/onnx__ArgMax.html#argmax-12 "Online Documentation") @@ -115,12 +110,7 @@ def ArgMax( ) def ArgMin( - self, - data: T_ArgMin, - *, - axis: int = 0, - keepdims: int = 1, - select_last_index: int = 0, + self, data: T_ArgMin, *, axis: int = 0, keepdims: int = 1, select_last_index: int = 0 ) -> INT64: r"""[🌐 ArgMin(12)](https://onnx.ai/onnx/operators/onnx__ArgMin.html#argmin-12 "Online Documentation") @@ -947,11 +937,7 @@ def Pow(self, X: T_Pow, Y: T1_Pow) -> T_Pow: ) def ReduceMax( - self, - data: T_ReduceMax, - *, - axes: Optional[Sequence[int]] = None, - keepdims: int = 1, + self, data: T_ReduceMax, *, axes: Optional[Sequence[int]] = None, keepdims: int = 1 ) -> T_ReduceMax: r"""[🌐 ReduceMax(12)](https://onnx.ai/onnx/operators/onnx__ReduceMax.html#reducemax-12 "Online Documentation") @@ -983,11 +969,7 @@ def ReduceMax( ) def ReduceMin( - self, - data: T_ReduceMin, - *, - axes: Optional[Sequence[int]] = None, - keepdims: int = 1, + self, data: T_ReduceMin, *, axes: Optional[Sequence[int]] = None, keepdims: int = 1 ) -> T_ReduceMin: r"""[🌐 ReduceMin(12)](https://onnx.ai/onnx/operators/onnx__ReduceMin.html#reducemin-12 "Online Documentation") diff --git a/onnxscript/onnx_opset/_impl/opset13.py b/onnxscript/onnx_opset/_impl/opset13.py index 5403df22cf..b1e02bb813 100644 --- a/onnxscript/onnx_opset/_impl/opset13.py +++ b/onnxscript/onnx_opset/_impl/opset13.py @@ -7,7 +7,7 @@ # -------------------------------------------------------------------------- # 
pylint: disable=W0221,W0222,R0901,W0237 # mypy: disable-error-code=override -# ruff: noqa: D214, D402, D405, D411, D416, D417 +# ruff: noqa: N801,E741,RUF036,D214,D402,D405,D411,D412,D416,D417 # -------------------------------------------------------------------------- from __future__ import annotations @@ -115,12 +115,7 @@ def Add(self, A: T_Add, B: T_Add) -> T_Add: ) def ArgMax( - self, - data: T_ArgMax, - *, - axis: int = 0, - keepdims: int = 1, - select_last_index: int = 0, + self, data: T_ArgMax, *, axis: int = 0, keepdims: int = 1, select_last_index: int = 0 ) -> INT64: r"""[🌐 ArgMax(13)](https://onnx.ai/onnx/operators/onnx__ArgMax.html#argmax-13 "Online Documentation") @@ -172,12 +167,7 @@ def ArgMax( ) def ArgMin( - self, - data: T_ArgMin, - *, - axis: int = 0, - keepdims: int = 1, - select_last_index: int = 0, + self, data: T_ArgMin, *, axis: int = 0, keepdims: int = 1, select_last_index: int = 0 ) -> INT64: r"""[🌐 ArgMin(13)](https://onnx.ai/onnx/operators/onnx__ArgMin.html#argmin-13 "Online Documentation") @@ -700,21 +690,7 @@ def Equal(self, A: T_Equal, B: T_Equal) -> T1_Equal: op = Op(self, "Equal", schema) return op(*self._prepare_inputs(schema, A, B)) - T_Erf = TypeVar( - "T_Erf", - BFLOAT16, - DOUBLE, - FLOAT, - FLOAT16, - INT16, - INT32, - INT64, - INT8, - UINT16, - UINT32, - UINT64, - UINT8, - ) + T_Erf = TypeVar("T_Erf", BFLOAT16, DOUBLE, FLOAT, FLOAT16) def Erf(self, input: T_Erf) -> T_Erf: r"""[🌐 Erf(13)](https://onnx.ai/onnx/operators/onnx__Erf.html#erf-13 "Online Documentation") @@ -1488,11 +1464,7 @@ def LRN( schema = get_schema("LRN", 13, "") op = Op(self, "LRN", schema) return op( - *self._prepare_inputs(schema, X), - alpha=alpha, - beta=beta, - bias=bias, - size=size, + *self._prepare_inputs(schema, X), alpha=alpha, beta=beta, bias=bias, size=size ) T_Less = TypeVar( @@ -1619,11 +1591,7 @@ def LogSoftmax(self, input: T_LogSoftmax, *, axis: int = -1) -> T_LogSoftmax: ) def Loop( - self, - M: Optional[I_Loop], - cond: Optional[B_Loop], - *v_initial: V_Loop, - body: GraphProto, + self, M: Optional[I_Loop], cond: Optional[B_Loop], *v_initial: V_Loop, body: GraphProto ) -> V_Loop: r"""[🌐 Loop(13)](https://onnx.ai/onnx/operators/onnx__Loop.html#loop-13 "Online Documentation") @@ -1741,7 +1709,7 @@ def Loop( 1) Values from the enclosing scope (i.e. variable "a" here) are in scope and can be referenced in the inputs of the loop. 2) Any values computed in the loop body that needs to be used in a subsequent - iteration or after the loop are modelled using a pair of variables in the loop-body, + iteration or after the loop are modeled using a pair of variables in the loop-body, consisting of an input variable (eg., b_in) and an output variable (eg., b_out). These are referred to as loop-carried dependences. 
The loop operation node supplies the input value of the input variable for the first iteration, and @@ -2452,11 +2420,7 @@ def Reciprocal(self, X: T_Reciprocal) -> T_Reciprocal: ) def ReduceL1( - self, - data: T_ReduceL1, - *, - axes: Optional[Sequence[int]] = None, - keepdims: int = 1, + self, data: T_ReduceL1, *, axes: Optional[Sequence[int]] = None, keepdims: int = 1 ) -> T_ReduceL1: r"""[🌐 ReduceL1(13)](https://onnx.ai/onnx/operators/onnx__ReduceL1.html#reducel1-13 "Online Documentation") @@ -2490,11 +2454,7 @@ def ReduceL1( ) def ReduceL2( - self, - data: T_ReduceL2, - *, - axes: Optional[Sequence[int]] = None, - keepdims: int = 1, + self, data: T_ReduceL2, *, axes: Optional[Sequence[int]] = None, keepdims: int = 1 ) -> T_ReduceL2: r"""[🌐 ReduceL2(13)](https://onnx.ai/onnx/operators/onnx__ReduceL2.html#reducel2-13 "Online Documentation") @@ -2528,11 +2488,7 @@ def ReduceL2( ) def ReduceLogSum( - self, - data: T_ReduceLogSum, - *, - axes: Optional[Sequence[int]] = None, - keepdims: int = 1, + self, data: T_ReduceLogSum, *, axes: Optional[Sequence[int]] = None, keepdims: int = 1 ) -> T_ReduceLogSum: r"""[🌐 ReduceLogSum(13)](https://onnx.ai/onnx/operators/onnx__ReduceLogSum.html#reducelogsum-13 "Online Documentation") @@ -2562,15 +2518,7 @@ def ReduceLogSum( return op(*self._prepare_inputs(schema, data), axes=axes, keepdims=keepdims) T_ReduceLogSumExp = TypeVar( - "T_ReduceLogSumExp", - BFLOAT16, - DOUBLE, - FLOAT, - FLOAT16, - INT32, - INT64, - UINT32, - UINT64, + "T_ReduceLogSumExp", BFLOAT16, DOUBLE, FLOAT, FLOAT16, INT32, INT64, UINT32, UINT64 ) def ReduceLogSumExp( @@ -2622,11 +2570,7 @@ def ReduceLogSumExp( ) def ReduceMax( - self, - data: T_ReduceMax, - *, - axes: Optional[Sequence[int]] = None, - keepdims: int = 1, + self, data: T_ReduceMax, *, axes: Optional[Sequence[int]] = None, keepdims: int = 1 ) -> T_ReduceMax: r"""[🌐 ReduceMax(13)](https://onnx.ai/onnx/operators/onnx__ReduceMax.html#reducemax-13 "Online Documentation") @@ -2660,11 +2604,7 @@ def ReduceMax( ) def ReduceMean( - self, - data: T_ReduceMean, - *, - axes: Optional[Sequence[int]] = None, - keepdims: int = 1, + self, data: T_ReduceMean, *, axes: Optional[Sequence[int]] = None, keepdims: int = 1 ) -> T_ReduceMean: r"""[🌐 ReduceMean(13)](https://onnx.ai/onnx/operators/onnx__ReduceMean.html#reducemean-13 "Online Documentation") @@ -2708,11 +2648,7 @@ def ReduceMean( ) def ReduceMin( - self, - data: T_ReduceMin, - *, - axes: Optional[Sequence[int]] = None, - keepdims: int = 1, + self, data: T_ReduceMin, *, axes: Optional[Sequence[int]] = None, keepdims: int = 1 ) -> T_ReduceMin: r"""[🌐 ReduceMin(13)](https://onnx.ai/onnx/operators/onnx__ReduceMin.html#reducemin-13 "Online Documentation") @@ -2746,11 +2682,7 @@ def ReduceMin( ) def ReduceProd( - self, - data: T_ReduceProd, - *, - axes: Optional[Sequence[int]] = None, - keepdims: int = 1, + self, data: T_ReduceProd, *, axes: Optional[Sequence[int]] = None, keepdims: int = 1 ) -> T_ReduceProd: r"""[🌐 ReduceProd(13)](https://onnx.ai/onnx/operators/onnx__ReduceProd.html#reduceprod-13 "Online Documentation") @@ -2810,17 +2742,22 @@ def ReduceSum( which to reduce. The default is to reduce over empty axes. When axes is empty (either not provided or explicitly empty), behavior depends on 'noop_with_empty_axes': reduction over all axes if - 'noop_with_empty_axes' is false, or no reduction is applied if - 'noop_with_empty_axes' is true (but other operations will be performed). - Accepted range is [-r, r-1] where r = rank(data). 
+ 'noop_with_empty_axes' is false, and reduction over the empty set of + axes when 'noop_with_empty_axes' is true. Accepted range is [-r, r-1] + where r = rank(data). keepdims: Keep the reduced dimension or not, default 1 means keep reduced dimension. noop_with_empty_axes: Defines behavior when axes is not provided or is - empty. If false (default), reduction happens over all axes. If true, no - reduction is applied, but other operations will be performed. For - example, ReduceSumSquare acts as a vanilla Square. + empty. If false (default), reduction happens over all axes (similar to + the case when `axis=None` in numpy). If true, reduction happens over an + empty set of axes (similar to the case when `axis=()` in numpy). Note + that reduction over an empty set of axes means that the reduction step + behaves like a no-op (identity function), but composite-reduction + operators will still perform the non-reduction steps as needed. Thus, + ReduceLogSum returns the Log of input tensor, and ReduceSumSquare + returns the Square of the input tensor, in this case. """ schema = get_schema("ReduceSum", 13, "") @@ -2832,15 +2769,7 @@ def ReduceSum( ) T_ReduceSumSquare = TypeVar( - "T_ReduceSumSquare", - BFLOAT16, - DOUBLE, - FLOAT, - FLOAT16, - INT32, - INT64, - UINT32, - UINT64, + "T_ReduceSumSquare", BFLOAT16, DOUBLE, FLOAT, FLOAT16, INT32, INT64, UINT32, UINT64 ) def ReduceSumSquare( @@ -3246,7 +3175,7 @@ def ScatterND( output = np.copy(data) update_indices = indices.shape[:-1] for idx in np.ndindex(update_indices): - output[indices[idx]] = updates[idx] + output[tuple(indices[idx])] = updates[idx] The order of iteration in the above loop is not specified. In particular, indices should not have duplicate entries: that is, if idx1 != idx2, then indices[idx1] != indices[idx2]. @@ -3943,9 +3872,16 @@ def Transpose( r"""[🌐 Transpose(13)](https://onnx.ai/onnx/operators/onnx__Transpose.html#transpose-13 "Online Documentation") - Transpose the input tensor similar to numpy.transpose. For example, when - perm=(1, 0, 2), given an input tensor of shape (1, 2, 3), the output shape - will be (2, 1, 3). + Returns a transpose of the input tensor. (Similar to `numpy.transpose`). + The optional attribute `perm` must be a permutation of the dimensions of + the input tensor. Axis `i` of the output tensor corresponds to the axis + `perm[i]` of the input tensor. + For example, when perm=(1, 0, 2), given an input tensor of shape (1, 2, 3), + the output shape will be (2, 1, 3). + When perm=(1, 2, 0), given an input tensor of shape (1, 2, 3), + the output shape will be (2, 3, 1). + If the attribute `perm` is omitted, its default value is `(n-1, ..., 0)`, + where `n` is the rank of the input tensor. 
Args: diff --git a/onnxscript/onnx_opset/_impl/opset14.py b/onnxscript/onnx_opset/_impl/opset14.py index a9ec21f0d8..99a3ab596d 100644 --- a/onnxscript/onnx_opset/_impl/opset14.py +++ b/onnxscript/onnx_opset/_impl/opset14.py @@ -7,7 +7,7 @@ # -------------------------------------------------------------------------- # pylint: disable=W0221,W0222,R0901,W0237 # mypy: disable-error-code=override -# ruff: noqa: D402, D405 +# ruff: noqa: N801,E741,RUF036,D214,D402,D405,D411,D412,D416,D417 # -------------------------------------------------------------------------- from __future__ import annotations diff --git a/onnxscript/onnx_opset/_impl/opset15.py b/onnxscript/onnx_opset/_impl/opset15.py index c0758999f0..5906de97ea 100644 --- a/onnxscript/onnx_opset/_impl/opset15.py +++ b/onnxscript/onnx_opset/_impl/opset15.py @@ -7,7 +7,7 @@ # -------------------------------------------------------------------------- # pylint: disable=W0221,W0222,R0901,W0237 # mypy: disable-error-code=override -# ruff: noqa: D402, D412 +# ruff: noqa: N801,E741,RUF036,D214,D402,D405,D411,D412,D416,D417 # -------------------------------------------------------------------------- from __future__ import annotations @@ -290,37 +290,36 @@ def CastLike(self, input: T1_CastLike, target_type: T2_CastLike) -> T2_CastLike: ) O_Optional: TypeAlias = Union[ - None, - Sequence[BOOL], - Sequence[COMPLEX128], - Sequence[COMPLEX64], - Sequence[DOUBLE], - Sequence[FLOAT], - Sequence[FLOAT16], - Sequence[INT16], - Sequence[INT32], - Sequence[INT64], - Sequence[INT8], - Sequence[STRING], - Sequence[UINT16], - Sequence[UINT32], - Sequence[UINT64], - Sequence[UINT8], - BOOL, - COMPLEX128, - COMPLEX64, - DOUBLE, - FLOAT, - FLOAT16, - INT16, - INT32, - INT64, - INT8, - STRING, - UINT16, - UINT32, - UINT64, - UINT8, + _Optional[Sequence[BOOL]], + _Optional[Sequence[COMPLEX128]], + _Optional[Sequence[COMPLEX64]], + _Optional[Sequence[DOUBLE]], + _Optional[Sequence[FLOAT]], + _Optional[Sequence[FLOAT16]], + _Optional[Sequence[INT16]], + _Optional[Sequence[INT32]], + _Optional[Sequence[INT64]], + _Optional[Sequence[INT8]], + _Optional[Sequence[STRING]], + _Optional[Sequence[UINT16]], + _Optional[Sequence[UINT32]], + _Optional[Sequence[UINT64]], + _Optional[Sequence[UINT8]], + _Optional[BOOL], + _Optional[COMPLEX128], + _Optional[COMPLEX64], + _Optional[DOUBLE], + _Optional[FLOAT], + _Optional[FLOAT16], + _Optional[INT16], + _Optional[INT32], + _Optional[INT64], + _Optional[INT8], + _Optional[STRING], + _Optional[UINT16], + _Optional[UINT32], + _Optional[UINT64], + _Optional[UINT8], ] def Optional( diff --git a/onnxscript/onnx_opset/_impl/opset16.py b/onnxscript/onnx_opset/_impl/opset16.py index 21a92a6026..d3f1ad860e 100644 --- a/onnxscript/onnx_opset/_impl/opset16.py +++ b/onnxscript/onnx_opset/_impl/opset16.py @@ -7,7 +7,7 @@ # -------------------------------------------------------------------------- # pylint: disable=W0221,W0222,R0901,W0237 # mypy: disable-error-code=override -# ruff: noqa: D214, D402, D405, D411, D416 +# ruff: noqa: N801,E741,RUF036,D214,D402,D405,D411,D412,D416,D417 # -------------------------------------------------------------------------- from __future__ import annotations @@ -252,7 +252,38 @@ def Identity(self, input: V_Identity) -> V_Identity: B_If: TypeAlias = BOOL V_If: TypeAlias = Union[ - None, + Optional[Sequence[BFLOAT16]], + Optional[Sequence[BOOL]], + Optional[Sequence[COMPLEX128]], + Optional[Sequence[COMPLEX64]], + Optional[Sequence[DOUBLE]], + Optional[Sequence[FLOAT]], + Optional[Sequence[FLOAT16]], + 
Optional[Sequence[INT16]], + Optional[Sequence[INT32]], + Optional[Sequence[INT64]], + Optional[Sequence[INT8]], + Optional[Sequence[STRING]], + Optional[Sequence[UINT16]], + Optional[Sequence[UINT32]], + Optional[Sequence[UINT64]], + Optional[Sequence[UINT8]], + Optional[BFLOAT16], + Optional[BOOL], + Optional[COMPLEX128], + Optional[COMPLEX64], + Optional[DOUBLE], + Optional[FLOAT], + Optional[FLOAT16], + Optional[INT16], + Optional[INT32], + Optional[INT64], + Optional[INT8], + Optional[STRING], + Optional[UINT16], + Optional[UINT32], + Optional[UINT64], + Optional[UINT8], Sequence[BFLOAT16], Sequence[BOOL], Sequence[COMPLEX128], @@ -444,11 +475,7 @@ def LessOrEqual(self, A: T_LessOrEqual, B: T_LessOrEqual) -> T1_LessOrEqual: ) def Loop( - self, - M: Optional[I_Loop], - cond: Optional[B_Loop], - *v_initial: V_Loop, - body: GraphProto, + self, M: Optional[I_Loop], cond: Optional[B_Loop], *v_initial: V_Loop, body: GraphProto ) -> V_Loop: r"""[🌐 Loop(16)](https://onnx.ai/onnx/operators/onnx__Loop.html#loop-16 "Online Documentation") @@ -566,7 +593,7 @@ def Loop( 1) Values from the enclosing scope (i.e. variable "a" here) are in scope and can be referenced in the inputs of the loop. 2) Any values computed in the loop body that needs to be used in a subsequent - iteration or after the loop are modelled using a pair of variables in the loop-body, + iteration or after the loop are modeled using a pair of variables in the loop-body, consisting of an input variable (eg., b_in) and an output variable (eg., b_out). These are referred to as loop-carried dependences. The loop operation node supplies the input value of the input variable for the first iteration, and @@ -1117,7 +1144,7 @@ def ScatterND( output = np.copy(data) update_indices = indices.shape[:-1] for idx in np.ndindex(update_indices): - output[indices[idx]] = updates[idx] + output[tuple(indices[idx])] = updates[idx] The order of iteration in the above loop is not specified. In particular, indices should not have duplicate entries: that is, if idx1 != idx2, then indices[idx1] != indices[idx2]. This ensures that the output value does not depend on the iteration order. @@ -1130,12 +1157,12 @@ def ScatterND( output = np.copy(data) update_indices = indices.shape[:-1] for idx in np.ndindex(update_indices): - output[indices[idx]] += updates[idx] + output[tuple(indices[idx])] += updates[idx] When `reduction` is set to "mul", `output` is calculated as follows: output = np.copy(data) update_indices = indices.shape[:-1] for idx in np.ndindex(update_indices): - output[indices[idx]] *= updates[idx] + output[tuple(indices[idx])] *= updates[idx] This operator is the inverse of GatherND. Example 1: :: diff --git a/onnxscript/onnx_opset/_impl/opset17.py b/onnxscript/onnx_opset/_impl/opset17.py index 092658a502..431dd8334a 100644 --- a/onnxscript/onnx_opset/_impl/opset17.py +++ b/onnxscript/onnx_opset/_impl/opset17.py @@ -7,7 +7,7 @@ # -------------------------------------------------------------------------- # pylint: disable=W0221,W0222,R0901,W0237 # mypy: disable-error-code=override -# ruff: noqa: D402 +# ruff: noqa: N801,E741,RUF036,D214,D402,D405,D411,D412,D416,D417 # -------------------------------------------------------------------------- from __future__ import annotations @@ -121,8 +121,10 @@ def DFT( dft_length: (optional, non-differentiable) The length of the signal as a scalar. If greater than the axis dimension, the signal will be zero-padded up to dft_length. 
If less than the axis dimension, only the - first dft_length values will be used as the signal. It's an optional - value. + first dft_length values will be used as the signal. If not provided, the + default dft_length = signal_dim_axis, except for the IRFFT case + (onesided=1, inverse=1), in which case the default dft_length is 2 * + (signal_dim_axis - 1). It's an optional value. axis: The axis on which to perform the DFT. By default this value is set to 1, which corresponds to the first dimension after the batch index. @@ -134,13 +136,13 @@ def DFT( default this value is set to 0, which corresponds to false. onesided: If onesided is 1, only values for w in [0, 1, 2, ..., - floor(n_fft/2) + 1] are returned because the real-to-complex Fourier - transform satisfies the conjugate symmetry, i.e., X[m, w] = X[m, - n_fft-w]*. Note if the input or window tensors are complex, then - onesided output is not possible. Enabling onesided with real inputs - performs a Real-valued fast Fourier transform (RFFT). When invoked with - real or complex valued input, the default value is 0. Values can be 0 or - 1. + floor(n_fft/2) + 1] are used or returned because the real-to-complex + Fourier transform satisfies the conjugate symmetry, i.e., X[m, w] = X[m, + n_fft-w]*. When onesided=1 and inverse=0 (forward DFT), only real input + is supported and a one-sided complex spectrum is returned (RFFT). When + onesided=1 and inverse=1 (inverse DFT), only complex input is supported + and a full real signal is returned (IRFFT). When invoked with real or + complex valued input, the default value is 0. Values can be 0 or 1. """ schema = get_schema("DFT", 17, "") diff --git a/onnxscript/onnx_opset/_impl/opset18.py b/onnxscript/onnx_opset/_impl/opset18.py index a795391355..773ad1a3d2 100644 --- a/onnxscript/onnx_opset/_impl/opset18.py +++ b/onnxscript/onnx_opset/_impl/opset18.py @@ -7,7 +7,7 @@ # -------------------------------------------------------------------------- # pylint: disable=W0221,W0222,R0901,W0237 # mypy: disable-error-code=override -# ruff: noqa: D402, D405 +# ruff: noqa: N801,E741,RUF036,D214,D402,D405,D411,D412,D416,D417 # -------------------------------------------------------------------------- from __future__ import annotations @@ -787,17 +787,22 @@ def ReduceL1( which to reduce. The default is to reduce over empty axes. When axes is empty (either not provided or explicitly empty), behavior depends on 'noop_with_empty_axes': reduction over all axes if - 'noop_with_empty_axes' is false, or no reduction is applied if - 'noop_with_empty_axes' is true (but other operations will be performed). - Accepted range is [-r, r-1] where r = rank(data). + 'noop_with_empty_axes' is false, and reduction over the empty set of + axes when 'noop_with_empty_axes' is true. Accepted range is [-r, r-1] + where r = rank(data). keepdims: Keep the reduced dimension or not, default 1 means keep reduced dimension. noop_with_empty_axes: Defines behavior when axes is not provided or is - empty. If false (default), reduction happens over all axes. If true, no - reduction is applied, but other operations will be performed. For - example, ReduceSumSquare acts as a vanilla Square. + empty. If false (default), reduction happens over all axes (similar to + the case when `axis=None` in numpy). If true, reduction happens over an + empty set of axes (similar to the case when `axis=()` in numpy). 
Note + that reduction over an empty set of axes means that the reduction step + behaves like a no-op (identity function), but composite-reduction + operators will still perform the non-reduction steps as needed. Thus, + ReduceLogSum returns the Log of input tensor, and ReduceSumSquare + returns the Square of the input tensor, in this case. """ schema = get_schema("ReduceL1", 18, "") @@ -839,17 +844,22 @@ def ReduceL2( which to reduce. The default is to reduce over empty axes. When axes is empty (either not provided or explicitly empty), behavior depends on 'noop_with_empty_axes': reduction over all axes if - 'noop_with_empty_axes' is false, or no reduction is applied if - 'noop_with_empty_axes' is true (but other operations will be performed). - Accepted range is [-r, r-1] where r = rank(data). + 'noop_with_empty_axes' is false, and reduction over the empty set of + axes when 'noop_with_empty_axes' is true. Accepted range is [-r, r-1] + where r = rank(data). keepdims: Keep the reduced dimension or not, default 1 means keep reduced dimension. noop_with_empty_axes: Defines behavior when axes is not provided or is - empty. If false (default), reduction happens over all axes. If true, no - reduction is applied, but other operations will be performed. For - example, ReduceSumSquare acts as a vanilla Square. + empty. If false (default), reduction happens over all axes (similar to + the case when `axis=None` in numpy). If true, reduction happens over an + empty set of axes (similar to the case when `axis=()` in numpy). Note + that reduction over an empty set of axes means that the reduction step + behaves like a no-op (identity function), but composite-reduction + operators will still perform the non-reduction steps as needed. Thus, + ReduceLogSum returns the Log of input tensor, and ReduceSumSquare + returns the Square of the input tensor, in this case. """ schema = get_schema("ReduceL2", 18, "") @@ -891,17 +901,22 @@ def ReduceLogSum( which to reduce. The default is to reduce over empty axes. When axes is empty (either not provided or explicitly empty), behavior depends on 'noop_with_empty_axes': reduction over all axes if - 'noop_with_empty_axes' is false, or no reduction is applied if - 'noop_with_empty_axes' is true (but other operations will be performed). - Accepted range is [-r, r-1] where r = rank(data). + 'noop_with_empty_axes' is false, and reduction over the empty set of + axes when 'noop_with_empty_axes' is true. Accepted range is [-r, r-1] + where r = rank(data). keepdims: Keep the reduced dimension or not, default 1 means keep reduced dimension. noop_with_empty_axes: Defines behavior when axes is not provided or is - empty. If false (default), reduction happens over all axes. If true, no - reduction is applied, but other operations will be performed. For - example, ReduceSumSquare acts as a vanilla Square. + empty. If false (default), reduction happens over all axes (similar to + the case when `axis=None` in numpy). If true, reduction happens over an + empty set of axes (similar to the case when `axis=()` in numpy). Note + that reduction over an empty set of axes means that the reduction step + behaves like a no-op (identity function), but composite-reduction + operators will still perform the non-reduction steps as needed. Thus, + ReduceLogSum returns the Log of input tensor, and ReduceSumSquare + returns the Square of the input tensor, in this case. 
""" schema = get_schema("ReduceLogSum", 18, "") @@ -913,15 +928,7 @@ def ReduceLogSum( ) T_ReduceLogSumExp = TypeVar( - "T_ReduceLogSumExp", - BFLOAT16, - DOUBLE, - FLOAT, - FLOAT16, - INT32, - INT64, - UINT32, - UINT64, + "T_ReduceLogSumExp", BFLOAT16, DOUBLE, FLOAT, FLOAT16, INT32, INT64, UINT32, UINT64 ) def ReduceLogSumExp( @@ -951,17 +958,22 @@ def ReduceLogSumExp( which to reduce. The default is to reduce over empty axes. When axes is empty (either not provided or explicitly empty), behavior depends on 'noop_with_empty_axes': reduction over all axes if - 'noop_with_empty_axes' is false, or no reduction is applied if - 'noop_with_empty_axes' is true (but other operations will be performed). - Accepted range is [-r, r-1] where r = rank(data). + 'noop_with_empty_axes' is false, and reduction over the empty set of + axes when 'noop_with_empty_axes' is true. Accepted range is [-r, r-1] + where r = rank(data). keepdims: Keep the reduced dimension or not, default 1 means keep reduced dimension. noop_with_empty_axes: Defines behavior when axes is not provided or is - empty. If false (default), reduction happens over all axes. If true, no - reduction is applied, but other operations will be performed. For - example, ReduceSumSquare acts as a vanilla Square. + empty. If false (default), reduction happens over all axes (similar to + the case when `axis=None` in numpy). If true, reduction happens over an + empty set of axes (similar to the case when `axis=()` in numpy). Note + that reduction over an empty set of axes means that the reduction step + behaves like a no-op (identity function), but composite-reduction + operators will still perform the non-reduction steps as needed. Thus, + ReduceLogSum returns the Log of input tensor, and ReduceSumSquare + returns the Square of the input tensor, in this case. """ schema = get_schema("ReduceLogSumExp", 18, "") @@ -1013,17 +1025,22 @@ def ReduceMax( which to reduce. The default is to reduce over empty axes. When axes is empty (either not provided or explicitly empty), behavior depends on 'noop_with_empty_axes': reduction over all axes if - 'noop_with_empty_axes' is false, or no reduction is applied if - 'noop_with_empty_axes' is true (but other operations will be performed). - Accepted range is [-r, r-1] where r = rank(data). + 'noop_with_empty_axes' is false, and reduction over the empty set of + axes when 'noop_with_empty_axes' is true. Accepted range is [-r, r-1] + where r = rank(data). keepdims: Keep the reduced dimension or not, default 1 means keep reduced dimension. noop_with_empty_axes: Defines behavior when axes is not provided or is - empty. If false (default), reduction happens over all axes. If true, no - reduction is applied, but other operations will be performed. For - example, ReduceSumSquare acts as a vanilla Square. + empty. If false (default), reduction happens over all axes (similar to + the case when `axis=None` in numpy). If true, reduction happens over an + empty set of axes (similar to the case when `axis=()` in numpy). Note + that reduction over an empty set of axes means that the reduction step + behaves like a no-op (identity function), but composite-reduction + operators will still perform the non-reduction steps as needed. Thus, + ReduceLogSum returns the Log of input tensor, and ReduceSumSquare + returns the Square of the input tensor, in this case. """ schema = get_schema("ReduceMax", 18, "") @@ -1065,17 +1082,22 @@ def ReduceMean( which to reduce. The default is to reduce over empty axes. 
When axes is empty (either not provided or explicitly empty), behavior depends on 'noop_with_empty_axes': reduction over all axes if - 'noop_with_empty_axes' is false, or no reduction is applied if - 'noop_with_empty_axes' is true (but other operations will be performed). - Accepted range is [-r, r-1] where r = rank(data). + 'noop_with_empty_axes' is false, and reduction over the empty set of + axes when 'noop_with_empty_axes' is true. Accepted range is [-r, r-1] + where r = rank(data). keepdims: Keep the reduced dimension or not, default 1 means keep reduced dimension. noop_with_empty_axes: Defines behavior when axes is not provided or is - empty. If false (default), reduction happens over all axes. If true, no - reduction is applied, but other operations will be performed. For - example, ReduceSumSquare acts as a vanilla Square. + empty. If false (default), reduction happens over all axes (similar to + the case when `axis=None` in numpy). If true, reduction happens over an + empty set of axes (similar to the case when `axis=()` in numpy). Note + that reduction over an empty set of axes means that the reduction step + behaves like a no-op (identity function), but composite-reduction + operators will still perform the non-reduction steps as needed. Thus, + ReduceLogSum returns the Log of input tensor, and ReduceSumSquare + returns the Square of the input tensor, in this case. """ schema = get_schema("ReduceMean", 18, "") @@ -1127,17 +1149,22 @@ def ReduceMin( which to reduce. The default is to reduce over empty axes. When axes is empty (either not provided or explicitly empty), behavior depends on 'noop_with_empty_axes': reduction over all axes if - 'noop_with_empty_axes' is false, or no reduction is applied if - 'noop_with_empty_axes' is true (but other operations will be performed). - Accepted range is [-r, r-1] where r = rank(data). + 'noop_with_empty_axes' is false, and reduction over the empty set of + axes when 'noop_with_empty_axes' is true. Accepted range is [-r, r-1] + where r = rank(data). keepdims: Keep the reduced dimension or not, default 1 means keep reduced dimension. noop_with_empty_axes: Defines behavior when axes is not provided or is - empty. If false (default), reduction happens over all axes. If true, no - reduction is applied, but other operations will be performed. For - example, ReduceSumSquare acts as a vanilla Square. + empty. If false (default), reduction happens over all axes (similar to + the case when `axis=None` in numpy). If true, reduction happens over an + empty set of axes (similar to the case when `axis=()` in numpy). Note + that reduction over an empty set of axes means that the reduction step + behaves like a no-op (identity function), but composite-reduction + operators will still perform the non-reduction steps as needed. Thus, + ReduceLogSum returns the Log of input tensor, and ReduceSumSquare + returns the Square of the input tensor, in this case. """ schema = get_schema("ReduceMin", 18, "") @@ -1179,17 +1206,22 @@ def ReduceProd( which to reduce. The default is to reduce over empty axes. When axes is empty (either not provided or explicitly empty), behavior depends on 'noop_with_empty_axes': reduction over all axes if - 'noop_with_empty_axes' is false, or no reduction is applied if - 'noop_with_empty_axes' is true (but other operations will be performed). - Accepted range is [-r, r-1] where r = rank(data). + 'noop_with_empty_axes' is false, and reduction over the empty set of + axes when 'noop_with_empty_axes' is true. 
Accepted range is [-r, r-1] + where r = rank(data). keepdims: Keep the reduced dimension or not, default 1 means keep reduced dimension. noop_with_empty_axes: Defines behavior when axes is not provided or is - empty. If false (default), reduction happens over all axes. If true, no - reduction is applied, but other operations will be performed. For - example, ReduceSumSquare acts as a vanilla Square. + empty. If false (default), reduction happens over all axes (similar to + the case when `axis=None` in numpy). If true, reduction happens over an + empty set of axes (similar to the case when `axis=()` in numpy). Note + that reduction over an empty set of axes means that the reduction step + behaves like a no-op (identity function), but composite-reduction + operators will still perform the non-reduction steps as needed. Thus, + ReduceLogSum returns the Log of input tensor, and ReduceSumSquare + returns the Square of the input tensor, in this case. """ schema = get_schema("ReduceProd", 18, "") @@ -1201,15 +1233,7 @@ def ReduceProd( ) T_ReduceSumSquare = TypeVar( - "T_ReduceSumSquare", - BFLOAT16, - DOUBLE, - FLOAT, - FLOAT16, - INT32, - INT64, - UINT32, - UINT64, + "T_ReduceSumSquare", BFLOAT16, DOUBLE, FLOAT, FLOAT16, INT32, INT64, UINT32, UINT64 ) def ReduceSumSquare( @@ -1239,17 +1263,22 @@ def ReduceSumSquare( which to reduce. The default is to reduce over empty axes. When axes is empty (either not provided or explicitly empty), behavior depends on 'noop_with_empty_axes': reduction over all axes if - 'noop_with_empty_axes' is false, or no reduction is applied if - 'noop_with_empty_axes' is true (but other operations will be performed). - Accepted range is [-r, r-1] where r = rank(data). + 'noop_with_empty_axes' is false, and reduction over the empty set of + axes when 'noop_with_empty_axes' is true. Accepted range is [-r, r-1] + where r = rank(data). keepdims: Keep the reduced dimension or not, default 1 means keep reduced dimension. noop_with_empty_axes: Defines behavior when axes is not provided or is - empty. If false (default), reduction happens over all axes. If true, no - reduction is applied, but other operations will be performed. For - example, ReduceSumSquare acts as a vanilla Square. + empty. If false (default), reduction happens over all axes (similar to + the case when `axis=None` in numpy). If true, reduction happens over an + empty set of axes (similar to the case when `axis=()` in numpy). Note + that reduction over an empty set of axes means that the reduction step + behaves like a no-op (identity function), but composite-reduction + operators will still perform the non-reduction steps as needed. Thus, + ReduceLogSum returns the Log of input tensor, and ReduceSumSquare + returns the Square of the input tensor, in this case. 
""" schema = get_schema("ReduceSumSquare", 18, "") @@ -1650,7 +1679,7 @@ def ScatterND( output = np.copy(data) update_indices = indices.shape[:-1] for idx in np.ndindex(update_indices): - output[indices[idx]] = updates[idx] + output[tuple(indices[idx])] = updates[idx] @@ -1669,7 +1698,7 @@ def ScatterND( output = np.copy(data) update_indices = indices.shape[:-1] for idx in np.ndindex(update_indices): - output[indices[idx]] = f(output[indices[idx]], updates[idx]) + output[tuple(indices[idx])] = f(output[tuple(indices[idx])], updates[idx]) @@ -1779,7 +1808,5 @@ def Split( schema = get_schema("Split", 18, "") op = Op(self, "Split", schema) return op( - *self._prepare_inputs(schema, input, split), - axis=axis, - num_outputs=num_outputs, + *self._prepare_inputs(schema, input, split), axis=axis, num_outputs=num_outputs ) diff --git a/onnxscript/onnx_opset/_impl/opset19.py b/onnxscript/onnx_opset/_impl/opset19.py index 18a7cba17a..5aa9c75932 100644 --- a/onnxscript/onnx_opset/_impl/opset19.py +++ b/onnxscript/onnx_opset/_impl/opset19.py @@ -7,7 +7,7 @@ # -------------------------------------------------------------------------- # pylint: disable=W0221,W0222,R0901,W0237 # mypy: disable-error-code=override -# ruff: noqa: D214, D402, D405, D411, D412, D416 +# ruff: noqa: N801,E741,RUF036,D214,D402,D405,D411,D412,D416,D417 # -------------------------------------------------------------------------- from __future__ import annotations @@ -703,7 +703,42 @@ def Identity(self, input: V_Identity) -> V_Identity: B_If: TypeAlias = BOOL V_If: TypeAlias = Union[ - None, + Optional[Sequence[BFLOAT16]], + Optional[Sequence[BOOL]], + Optional[Sequence[COMPLEX128]], + Optional[Sequence[COMPLEX64]], + Optional[Sequence[DOUBLE]], + Optional[Sequence[FLOAT]], + Optional[Sequence[FLOAT16]], + Optional[Sequence[INT16]], + Optional[Sequence[INT32]], + Optional[Sequence[INT64]], + Optional[Sequence[INT8]], + Optional[Sequence[STRING]], + Optional[Sequence[UINT16]], + Optional[Sequence[UINT32]], + Optional[Sequence[UINT64]], + Optional[Sequence[UINT8]], + Optional[BFLOAT16], + Optional[BOOL], + Optional[COMPLEX128], + Optional[COMPLEX64], + Optional[DOUBLE], + Optional[FLOAT], + Optional[FLOAT16], + Optional[FLOAT8E4M3FN], + Optional[FLOAT8E4M3FNUZ], + Optional[FLOAT8E5M2], + Optional[FLOAT8E5M2FNUZ], + Optional[INT16], + Optional[INT32], + Optional[INT64], + Optional[INT8], + Optional[STRING], + Optional[UINT16], + Optional[UINT32], + Optional[UINT64], + Optional[UINT8], Sequence[BFLOAT16], Sequence[BOOL], Sequence[COMPLEX128], @@ -711,6 +746,10 @@ def Identity(self, input: V_Identity) -> V_Identity: Sequence[DOUBLE], Sequence[FLOAT], Sequence[FLOAT16], + Sequence[FLOAT8E4M3FN], + Sequence[FLOAT8E4M3FNUZ], + Sequence[FLOAT8E5M2], + Sequence[FLOAT8E5M2FNUZ], Sequence[INT16], Sequence[INT32], Sequence[INT64], @@ -740,10 +779,6 @@ def Identity(self, input: V_Identity) -> V_Identity: UINT32, UINT64, UINT8, - Sequence[FLOAT8E4M3FN], - Sequence[FLOAT8E4M3FNUZ], - Sequence[FLOAT8E5M2], - Sequence[FLOAT8E5M2FNUZ], ] def If(self, cond: B_If, *, else_branch: GraphProto, then_branch: GraphProto) -> V_If: @@ -856,11 +891,7 @@ def If(self, cond: B_If, *, else_branch: GraphProto, then_branch: GraphProto) -> ) def Loop( - self, - M: Optional[I_Loop], - cond: Optional[B_Loop], - *v_initial: V_Loop, - body: GraphProto, + self, M: Optional[I_Loop], cond: Optional[B_Loop], *v_initial: V_Loop, body: GraphProto ) -> V_Loop: r"""[🌐 Loop(19)](https://onnx.ai/onnx/operators/onnx__Loop.html#loop-19 "Online Documentation") @@ -978,7 +1009,7 @@ 
def Loop( 1) Values from the enclosing scope (i.e. variable "a" here) are in scope and can be referenced in the inputs of the loop. 2) Any values computed in the loop body that needs to be used in a subsequent - iteration or after the loop are modelled using a pair of variables in the loop-body, + iteration or after the loop are modeled using a pair of variables in the loop-body, consisting of an input variable (eg., b_in) and an output variable (eg., b_out). These are referred to as loop-carried dependences. The loop operation node supplies the input value of the input variable for the first iteration, and diff --git a/onnxscript/onnx_opset/_impl/opset2.py b/onnxscript/onnx_opset/_impl/opset2.py index a4a0e7f291..a44ffea074 100644 --- a/onnxscript/onnx_opset/_impl/opset2.py +++ b/onnxscript/onnx_opset/_impl/opset2.py @@ -7,7 +7,7 @@ # -------------------------------------------------------------------------- # pylint: disable=W0221,W0222,R0901,W0237 # mypy: disable-error-code=override -# ruff: noqa: D402, D411 +# ruff: noqa: N801,E741,RUF036,D214,D402,D405,D411,D412,D416,D417 # -------------------------------------------------------------------------- from __future__ import annotations @@ -131,12 +131,7 @@ def LpPool( T_Pad = TypeVar("T_Pad", DOUBLE, FLOAT, FLOAT16) def Pad( - self, - data: T_Pad, - *, - mode: str = "constant", - pads: Sequence[int], - value: float = 0.0, + self, data: T_Pad, *, mode: str = "constant", pads: Sequence[int], value: float = 0.0 ) -> T_Pad: r"""[🌐 Pad(2)](https://onnx.ai/onnx/operators/onnx__Pad.html#pad-2 "Online Documentation") diff --git a/onnxscript/onnx_opset/_impl/opset20.py b/onnxscript/onnx_opset/_impl/opset20.py index 2f3f264c2a..afa52eabde 100644 --- a/onnxscript/onnx_opset/_impl/opset20.py +++ b/onnxscript/onnx_opset/_impl/opset20.py @@ -7,7 +7,7 @@ # -------------------------------------------------------------------------- # pylint: disable=W0221,W0222,R0901,W0237 # mypy: disable-error-code=override -# ruff: noqa: D402 +# ruff: noqa: N801,E741,RUF036,D214,D402,D405,D411,D412,D416,D417 # -------------------------------------------------------------------------- from __future__ import annotations @@ -198,7 +198,10 @@ def DFT( dft_length: (optional, non-differentiable) The length of the signal as a scalar. If greater than the axis dimension, the signal will be zero-padded up to `dft_length`. If less than the axis dimension, only - the first `dft_length` values will be used as the signal. + the first `dft_length` values will be used as the signal. If not + provided, the default `dft_length = signal_dim_axis`, except for the + IRFFT case (`onesided=1`, `inverse=1`), in which case the default + dft_length is `2 * (signal_dim_axis - 1)`. axis: (optional, non-differentiable) The axis as a scalar on which to perform the DFT. Default is `-2` (last signal axis). Negative value @@ -209,13 +212,15 @@ def DFT( inverse: Whether to perform the inverse discrete Fourier Transform. Default is 0, which corresponds to `false`. - onesided: If `onesided` is `1` and input is real, only values for `k` in - `[0, 1, 2, ..., floor(n_fft/2) + 1]` are returned because the - real-to-complex Fourier transform satisfies the conjugate symmetry, - i.e., `X[m, k] = X[m, n_fft-k]*`, where `m` denotes "all other - dimensions" DFT was not applied on. If the input tensor is complex, - onesided output is not possible. Value can be `0` or `1`. Default is - `0`. 
+ onesided: If `onesided` is `1`, only values for `k` in `[0, 1, 2, ..., + floor(n_fft/2) + 1]` are used or returned because the real-to-complex + Fourier transform satisfies the conjugate symmetry, i.e., `X[m, k] = + X[m, n_fft-k]*`, where `m` denotes "all other dimensions" DFT was not + applied on. When `onesided=1` and `inverse=0` (forward DFT), only real + input is supported and a one-sided complex spectrum is returned (RFFT). + When `onesided=1` and `inverse=1` (inverse DFT), only complex input is + supported and a full real signal is returned (IRFFT). Value can be `0` + or `1`. Default is `0`. """ schema = get_schema("DFT", 20, "") @@ -515,17 +520,22 @@ def ReduceMax( which to reduce. The default is to reduce over empty axes. When axes is empty (either not provided or explicitly empty), behavior depends on 'noop_with_empty_axes': reduction over all axes if - 'noop_with_empty_axes' is false, or no reduction is applied if - 'noop_with_empty_axes' is true (but other operations will be performed). - Accepted range is [-r, r-1] where r = rank(data). + 'noop_with_empty_axes' is false, and reduction over the empty set of + axes when 'noop_with_empty_axes' is true. Accepted range is [-r, r-1] + where r = rank(data). keepdims: Keep the reduced dimension or not, default 1 means keep reduced dimension. noop_with_empty_axes: Defines behavior when axes is not provided or is - empty. If false (default), reduction happens over all axes. If true, no - reduction is applied, but other operations will be performed. For - example, ReduceSumSquare acts as a vanilla Square. + empty. If false (default), reduction happens over all axes (similar to + the case when `axis=None` in numpy). If true, reduction happens over an + empty set of axes (similar to the case when `axis=()` in numpy). Note + that reduction over an empty set of axes means that the reduction step + behaves like a no-op (identity function), but composite-reduction + operators will still perform the non-reduction steps as needed. Thus, + ReduceLogSum returns the Log of input tensor, and ReduceSumSquare + returns the Square of the input tensor, in this case. """ schema = get_schema("ReduceMax", 20, "") @@ -580,17 +590,22 @@ def ReduceMin( which to reduce. The default is to reduce over empty axes. When axes is empty (either not provided or explicitly empty), behavior depends on 'noop_with_empty_axes': reduction over all axes if - 'noop_with_empty_axes' is false, or no reduction is applied if - 'noop_with_empty_axes' is true (but other operations will be performed). - Accepted range is [-r, r-1] where r = rank(data). + 'noop_with_empty_axes' is false, and reduction over the empty set of + axes when 'noop_with_empty_axes' is true. Accepted range is [-r, r-1] + where r = rank(data). keepdims: Keep the reduced dimension or not, default 1 means keep reduced dimension. noop_with_empty_axes: Defines behavior when axes is not provided or is - empty. If false (default), reduction happens over all axes. If true, no - reduction is applied, but other operations will be performed. For - example, ReduceSumSquare acts as a vanilla Square. + empty. If false (default), reduction happens over all axes (similar to + the case when `axis=None` in numpy). If true, reduction happens over an + empty set of axes (similar to the case when `axis=()` in numpy). Note + that reduction over an empty set of axes means that the reduction step + behaves like a no-op (identity function), but composite-reduction + operators will still perform the non-reduction steps as needed. 
Thus, + ReduceLogSum returns the Log of input tensor, and ReduceSumSquare + returns the Square of the input tensor, in this case. """ schema = get_schema("ReduceMin", 20, "") diff --git a/onnxscript/onnx_opset/_impl/opset21.py b/onnxscript/onnx_opset/_impl/opset21.py index b0ae5a2e9c..52dd8152d0 100644 --- a/onnxscript/onnx_opset/_impl/opset21.py +++ b/onnxscript/onnx_opset/_impl/opset21.py @@ -7,7 +7,7 @@ # -------------------------------------------------------------------------- # pylint: disable=W0221,W0222,R0901,W0237 # mypy: disable-error-code=override -# ruff: noqa: D214, D402, D405, D411, D412, D416 +# ruff: noqa: N801,E741,RUF036,D214,D402,D405,D411,D412,D416,D417 # -------------------------------------------------------------------------- from __future__ import annotations @@ -670,7 +670,44 @@ def Identity(self, input: V_Identity) -> V_Identity: B_If: TypeAlias = BOOL V_If: TypeAlias = Union[ - None, + Optional[Sequence[BFLOAT16]], + Optional[Sequence[BOOL]], + Optional[Sequence[COMPLEX128]], + Optional[Sequence[COMPLEX64]], + Optional[Sequence[DOUBLE]], + Optional[Sequence[FLOAT]], + Optional[Sequence[FLOAT16]], + Optional[Sequence[INT16]], + Optional[Sequence[INT32]], + Optional[Sequence[INT64]], + Optional[Sequence[INT8]], + Optional[Sequence[STRING]], + Optional[Sequence[UINT16]], + Optional[Sequence[UINT32]], + Optional[Sequence[UINT64]], + Optional[Sequence[UINT8]], + Optional[BFLOAT16], + Optional[BOOL], + Optional[COMPLEX128], + Optional[COMPLEX64], + Optional[DOUBLE], + Optional[FLOAT], + Optional[FLOAT16], + Optional[FLOAT8E4M3FN], + Optional[FLOAT8E4M3FNUZ], + Optional[FLOAT8E5M2], + Optional[FLOAT8E5M2FNUZ], + Optional[INT16], + Optional[INT32], + Optional[INT4], + Optional[INT64], + Optional[INT8], + Optional[STRING], + Optional[UINT16], + Optional[UINT32], + Optional[UINT4], + Optional[UINT64], + Optional[UINT8], Sequence[BFLOAT16], Sequence[BOOL], Sequence[COMPLEX128], @@ -678,13 +715,19 @@ def Identity(self, input: V_Identity) -> V_Identity: Sequence[DOUBLE], Sequence[FLOAT], Sequence[FLOAT16], + Sequence[FLOAT8E4M3FN], + Sequence[FLOAT8E4M3FNUZ], + Sequence[FLOAT8E5M2], + Sequence[FLOAT8E5M2FNUZ], Sequence[INT16], Sequence[INT32], + Sequence[INT4], Sequence[INT64], Sequence[INT8], Sequence[STRING], Sequence[UINT16], Sequence[UINT32], + Sequence[UINT4], Sequence[UINT64], Sequence[UINT8], BFLOAT16, @@ -709,12 +752,6 @@ def Identity(self, input: V_Identity) -> V_Identity: UINT4, UINT64, UINT8, - Sequence[FLOAT8E4M3FN], - Sequence[FLOAT8E4M3FNUZ], - Sequence[FLOAT8E5M2], - Sequence[FLOAT8E5M2FNUZ], - Sequence[INT4], - Sequence[UINT4], ] def If(self, cond: B_If, *, else_branch: GraphProto, then_branch: GraphProto) -> V_If: @@ -833,11 +870,7 @@ def If(self, cond: B_If, *, else_branch: GraphProto, then_branch: GraphProto) -> ) def Loop( - self, - M: Optional[I_Loop], - cond: Optional[B_Loop], - *v_initial: V_Loop, - body: GraphProto, + self, M: Optional[I_Loop], cond: Optional[B_Loop], *v_initial: V_Loop, body: GraphProto ) -> V_Loop: r"""[🌐 Loop(21)](https://onnx.ai/onnx/operators/onnx__Loop.html#loop-21 "Online Documentation") @@ -955,7 +988,7 @@ def Loop( 1) Values from the enclosing scope (i.e. variable "a" here) are in scope and can be referenced in the inputs of the loop. 
2) Any values computed in the loop body that needs to be used in a subsequent - iteration or after the loop are modelled using a pair of variables in the loop-body, + iteration or after the loop are modeled using a pair of variables in the loop-body, consisting of an input variable (eg., b_in) and an output variable (eg., b_out). These are referred to as loop-carried dependences. The loop operation node supplies the input value of the input variable for the first iteration, and @@ -1868,9 +1901,16 @@ def Transpose( r"""[🌐 Transpose(21)](https://onnx.ai/onnx/operators/onnx__Transpose.html#transpose-21 "Online Documentation") - Transpose the input tensor similar to numpy.transpose. For example, when - perm=(1, 0, 2), given an input tensor of shape (1, 2, 3), the output shape - will be (2, 1, 3). + Returns a transpose of the input tensor. (Similar to `numpy.transpose`). + The optional attribute `perm` must be a permutation of the dimensions of + the input tensor. Axis `i` of the output tensor corresponds to the axis + `perm[i]` of the input tensor. + For example, when perm=(1, 0, 2), given an input tensor of shape (1, 2, 3), + the output shape will be (2, 1, 3). + When perm=(1, 2, 0), given an input tensor of shape (1, 2, 3), + the output shape will be (2, 3, 1). + If the attribute `perm` is omitted, its default value is `(n-1, ..., 0)`, + where `n` is the rank of the input tensor. Args: diff --git a/onnxscript/onnx_opset/_impl/opset22.py b/onnxscript/onnx_opset/_impl/opset22.py index 2b1656ed2a..d6deedad42 100644 --- a/onnxscript/onnx_opset/_impl/opset22.py +++ b/onnxscript/onnx_opset/_impl/opset22.py @@ -7,7 +7,7 @@ # -------------------------------------------------------------------------- # pylint: disable=W0221,W0222,R0901,W0237 # mypy: disable-error-code=override -# ruff: noqa: E741, D402, D405 +# ruff: noqa: N801,E741,RUF036,D214,D402,D405,D411,D412,D416,D417 # -------------------------------------------------------------------------- from __future__ import annotations @@ -275,11 +275,7 @@ def AveragePool( ] def Bernoulli( - self, - input: T1_Bernoulli, - *, - dtype: Optional[int] = None, - seed: Optional[float] = None, + self, input: T1_Bernoulli, *, dtype: Optional[int] = None, seed: Optional[float] = None ) -> T2_Bernoulli: r"""[🌐 Bernoulli(22)](https://onnx.ai/onnx/operators/onnx__Bernoulli.html#bernoulli-22 "Online Documentation") @@ -2328,11 +2324,7 @@ def RandomUniformLike( schema = get_schema("RandomUniformLike", 22, "") op = Op(self, "RandomUniformLike", schema) return op( - *self._prepare_inputs(schema, input), - dtype=dtype, - high=high, - low=low, - seed=seed, + *self._prepare_inputs(schema, input), dtype=dtype, high=high, low=low, seed=seed ) T1_RoiAlign = TypeVar("T1_RoiAlign", BFLOAT16, DOUBLE, FLOAT, FLOAT16) diff --git a/onnxscript/onnx_opset/_impl/opset23.py b/onnxscript/onnx_opset/_impl/opset23.py index 73b7480073..1052c995c6 100644 --- a/onnxscript/onnx_opset/_impl/opset23.py +++ b/onnxscript/onnx_opset/_impl/opset23.py @@ -7,7 +7,7 @@ # -------------------------------------------------------------------------- # pylint: disable=W0221,W0222,R0901,W0237 # mypy: disable-error-code=override -# ruff: noqa: D214, D402, D405, D411, D412, D416 +# ruff: noqa: N801,E741,RUF036,D214,D402,D405,D411,D412,D416,D417 # -------------------------------------------------------------------------- from __future__ import annotations @@ -777,7 +777,45 @@ def Identity(self, input: V_Identity) -> V_Identity: B_If: TypeAlias = BOOL V_If: TypeAlias = Union[ - None, + 
Optional[Sequence[BFLOAT16]], + Optional[Sequence[BOOL]], + Optional[Sequence[COMPLEX128]], + Optional[Sequence[COMPLEX64]], + Optional[Sequence[DOUBLE]], + Optional[Sequence[FLOAT]], + Optional[Sequence[FLOAT16]], + Optional[Sequence[INT16]], + Optional[Sequence[INT32]], + Optional[Sequence[INT64]], + Optional[Sequence[INT8]], + Optional[Sequence[STRING]], + Optional[Sequence[UINT16]], + Optional[Sequence[UINT32]], + Optional[Sequence[UINT64]], + Optional[Sequence[UINT8]], + Optional[BFLOAT16], + Optional[BOOL], + Optional[COMPLEX128], + Optional[COMPLEX64], + Optional[DOUBLE], + Optional[FLOAT], + Optional[FLOAT16], + Optional[FLOAT4E2M1], + Optional[FLOAT8E4M3FN], + Optional[FLOAT8E4M3FNUZ], + Optional[FLOAT8E5M2], + Optional[FLOAT8E5M2FNUZ], + Optional[INT16], + Optional[INT32], + Optional[INT4], + Optional[INT64], + Optional[INT8], + Optional[STRING], + Optional[UINT16], + Optional[UINT32], + Optional[UINT4], + Optional[UINT64], + Optional[UINT8], Sequence[BFLOAT16], Sequence[BOOL], Sequence[COMPLEX128], @@ -785,13 +823,20 @@ def Identity(self, input: V_Identity) -> V_Identity: Sequence[DOUBLE], Sequence[FLOAT], Sequence[FLOAT16], + Sequence[FLOAT4E2M1], + Sequence[FLOAT8E4M3FN], + Sequence[FLOAT8E4M3FNUZ], + Sequence[FLOAT8E5M2], + Sequence[FLOAT8E5M2FNUZ], Sequence[INT16], Sequence[INT32], + Sequence[INT4], Sequence[INT64], Sequence[INT8], Sequence[STRING], Sequence[UINT16], Sequence[UINT32], + Sequence[UINT4], Sequence[UINT64], Sequence[UINT8], BFLOAT16, @@ -817,13 +862,6 @@ def Identity(self, input: V_Identity) -> V_Identity: UINT4, UINT64, UINT8, - Sequence[FLOAT4E2M1], - Sequence[FLOAT8E4M3FN], - Sequence[FLOAT8E4M3FNUZ], - Sequence[FLOAT8E5M2], - Sequence[FLOAT8E5M2FNUZ], - Sequence[INT4], - Sequence[UINT4], ] def If(self, cond: B_If, *, else_branch: GraphProto, then_branch: GraphProto) -> V_If: @@ -945,11 +983,7 @@ def If(self, cond: B_If, *, else_branch: GraphProto, then_branch: GraphProto) -> ) def Loop( - self, - M: Optional[I_Loop], - cond: Optional[B_Loop], - *v_initial: V_Loop, - body: GraphProto, + self, M: Optional[I_Loop], cond: Optional[B_Loop], *v_initial: V_Loop, body: GraphProto ) -> V_Loop: r"""[🌐 Loop(23)](https://onnx.ai/onnx/operators/onnx__Loop.html#loop-23 "Online Documentation") @@ -1067,7 +1101,7 @@ def Loop( 1) Values from the enclosing scope (i.e. variable "a" here) are in scope and can be referenced in the inputs of the loop. 2) Any values computed in the loop body that needs to be used in a subsequent - iteration or after the loop are modelled using a pair of variables in the loop-body, + iteration or after the loop are modeled using a pair of variables in the loop-body, consisting of an input variable (eg., b_in) and an output variable (eg., b_out). These are referred to as loop-carried dependences. 
The loop operation node supplies the input value of the input variable for the first iteration, and @@ -1579,15 +1613,16 @@ def RotaryEmbedding( :: - def compute_rotary_embedding( - input, - position_ids, - sin_cache, - cos_cache, - interleaved=0, - rotary_embedding_dim=0, - num_heads=0, - ): + def rotary_embedding( + input: np.ndarray, + cos_cache: np.ndarray, + sin_cache: np.ndarray, + position_ids: np.ndarray | None = None, + interleaved=None, + rotary_embedding_dim=None, + num_heads=None, + ) -> np.ndarray: + original_input_shape = input.shape # First ensure input to be processed has shape [batch_size, seq_len, num_heads, head_size] if len(input.shape) == 4: input = np.transpose(input, (0, 2, 1, 3)) @@ -1603,7 +1638,7 @@ def compute_rotary_embedding( head_size = input.shape[3] # Fully or partially perform rotation on input based on rotary_embedding_dim attribute - if rotary_embedding_dim == 0: + if rotary_embedding_dim is None or rotary_embedding_dim == 0: # If rotary_embedding_dim not provided, perform full rotation by using head_size rotary_embedding_dim = head_size x_rotate = input[:, :, :, :rotary_embedding_dim] @@ -1612,15 +1647,29 @@ def compute_rotary_embedding( # Retrieve sin and cos caches using position ids if position_ids is not None: - cos = cos_cache[position_ids] # Shape: [batch_size, sequence_length, head_size/2] - sin = sin_cache[position_ids] # Shape: [batch_size, sequence_length, head_size/2] - else: - cos = cos_cache - sin = sin_cache - cos = cos[:, :, :rotary_embedding_dim_half] # Shape: [batch_size, sequence_length, rotary_embedding_dim/2] - sin = sin[:, :, :rotary_embedding_dim_half] # Shape: [batch_size, sequence_length, rotary_embedding_dim/2] - cos = np.expand_dims(cos, axis=2) # Shape: [batch_size, sequence_length, 1, rotary_embedding_dim/2] - sin = np.expand_dims(sin, axis=2) # Shape: [batch_size, sequence_length, 1, rotary_embedding_dim/2] + cos_cache = cos_cache[ + position_ids + ] # Shape: [batch_size, sequence_length, rotary_embedding_dim/2] + sin_cache = sin_cache[ + position_ids + ] # Shape: [batch_size, sequence_length, rotary_embedding_dim/2] + + # Shape: [batch_size, sequence_length, rotary_embedding_dim/2] + if cos_cache.shape[-1] != rotary_embedding_dim_half: + raise ValueError( + f"Last dimension of cos cache ({cos_cache.shape[-1]}) does not match rotary_embedding_dim/2 ({rotary_embedding_dim_half})." + ) + if sin_cache.shape[-1] != rotary_embedding_dim_half: + raise ValueError( + f"Last dimension of sin cache ({sin_cache.shape[-1]}) does not match rotary_embedding_dim/2 ({rotary_embedding_dim_half})." 
+ ) + + cos_cache = np.expand_dims( + cos_cache, axis=2 + ) # Shape: [batch_size, sequence_length, 1, rotary_embedding_dim/2] + sin_cache = np.expand_dims( + sin_cache, axis=2 + ) # Shape: [batch_size, sequence_length, 1, rotary_embedding_dim/2] # Either divide the input in halves or interleave (based on interleaved attribute) if interleaved: @@ -1630,8 +1679,8 @@ def compute_rotary_embedding( x1, x2 = np.split(x_rotate, 2, axis=-1) # Calculate real and imaginary values - real = cos * x1 - sin * x2 - imag = sin * x1 + cos * x2 + real = (cos_cache * x1) - (sin_cache * x2) + imag = (sin_cache * x1) + (cos_cache * x2) # Inserted rotated embeddings back to the original input if interleaved: @@ -1645,7 +1694,7 @@ def compute_rotary_embedding( x_rotate = np.concatenate((real, imag), axis=-1) output = np.concatenate((x_rotate, x_not_rotate), axis=-1) if len(original_input_shape) == 3: - output = np.reshape(output, input.shape) + output = np.reshape(output, original_input_shape) else: output = np.transpose(output, (0, 2, 1, 3)) return output @@ -2137,9 +2186,16 @@ def Transpose( r"""[🌐 Transpose(23)](https://onnx.ai/onnx/operators/onnx__Transpose.html#transpose-23 "Online Documentation") - Transpose the input tensor similar to numpy.transpose. For example, when - perm=(1, 0, 2), given an input tensor of shape (1, 2, 3), the output shape - will be (2, 1, 3). + Returns a transpose of the input tensor. (Similar to `numpy.transpose`). + The optional attribute `perm` must be a permutation of the dimensions of + the input tensor. Axis `i` of the output tensor corresponds to the axis + `perm[i]` of the input tensor. + For example, when perm=(1, 0, 2), given an input tensor of shape (1, 2, 3), + the output shape will be (2, 1, 3). + When perm=(1, 2, 0), given an input tensor of shape (1, 2, 3), + the output shape will be (2, 3, 1). + If the attribute `perm` is omitted, its default value is `(n-1, ..., 0)`, + where `n` is the rank of the input tensor. 
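        A minimal numpy sketch of the `perm` semantics above, assuming an input
        of shape (1, 2, 3):

        ::

            import numpy as np

            x = np.ones((1, 2, 3))
            np.transpose(x, (1, 0, 2)).shape  # (2, 1, 3)
            np.transpose(x, (1, 2, 0)).shape  # (2, 3, 1)
            np.transpose(x).shape             # perm omitted: (n-1, ..., 0) -> (3, 2, 1)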
Args: diff --git a/onnxscript/onnx_opset/_impl/opset24.py b/onnxscript/onnx_opset/_impl/opset24.py index d85fcaefe5..8da6879022 100644 --- a/onnxscript/onnx_opset/_impl/opset24.py +++ b/onnxscript/onnx_opset/_impl/opset24.py @@ -7,7 +7,7 @@ # -------------------------------------------------------------------------- # pylint: disable=W0221,W0222,R0901,W0237 # mypy: disable-error-code=override -# ruff: noqa: D214, D402, D405, D411, D412, D416 +# ruff: noqa: N801,E741,RUF036,D214,D402,D405,D411,D412,D416,D417 # -------------------------------------------------------------------------- from __future__ import annotations @@ -854,7 +854,46 @@ def Identity(self, input: V_Identity) -> V_Identity: B_If: TypeAlias = BOOL V_If: TypeAlias = Union[ - None, + Optional[Sequence[BFLOAT16]], + Optional[Sequence[BOOL]], + Optional[Sequence[COMPLEX128]], + Optional[Sequence[COMPLEX64]], + Optional[Sequence[DOUBLE]], + Optional[Sequence[FLOAT]], + Optional[Sequence[FLOAT16]], + Optional[Sequence[INT16]], + Optional[Sequence[INT32]], + Optional[Sequence[INT64]], + Optional[Sequence[INT8]], + Optional[Sequence[STRING]], + Optional[Sequence[UINT16]], + Optional[Sequence[UINT32]], + Optional[Sequence[UINT64]], + Optional[Sequence[UINT8]], + Optional[BFLOAT16], + Optional[BOOL], + Optional[COMPLEX128], + Optional[COMPLEX64], + Optional[DOUBLE], + Optional[FLOAT], + Optional[FLOAT16], + Optional[FLOAT4E2M1], + Optional[FLOAT8E4M3FN], + Optional[FLOAT8E4M3FNUZ], + Optional[FLOAT8E5M2], + Optional[FLOAT8E5M2FNUZ], + Optional[FLOAT8E8M0], + Optional[INT16], + Optional[INT32], + Optional[INT4], + Optional[INT64], + Optional[INT8], + Optional[STRING], + Optional[UINT16], + Optional[UINT32], + Optional[UINT4], + Optional[UINT64], + Optional[UINT8], Sequence[BFLOAT16], Sequence[BOOL], Sequence[COMPLEX128], @@ -862,13 +901,21 @@ def Identity(self, input: V_Identity) -> V_Identity: Sequence[DOUBLE], Sequence[FLOAT], Sequence[FLOAT16], + Sequence[FLOAT4E2M1], + Sequence[FLOAT8E4M3FN], + Sequence[FLOAT8E4M3FNUZ], + Sequence[FLOAT8E5M2], + Sequence[FLOAT8E5M2FNUZ], + Sequence[FLOAT8E8M0], Sequence[INT16], Sequence[INT32], + Sequence[INT4], Sequence[INT64], Sequence[INT8], Sequence[STRING], Sequence[UINT16], Sequence[UINT32], + Sequence[UINT4], Sequence[UINT64], Sequence[UINT8], BFLOAT16, @@ -895,14 +942,6 @@ def Identity(self, input: V_Identity) -> V_Identity: UINT4, UINT64, UINT8, - Sequence[FLOAT4E2M1], - Sequence[FLOAT8E4M3FN], - Sequence[FLOAT8E4M3FNUZ], - Sequence[FLOAT8E5M2], - Sequence[FLOAT8E5M2FNUZ], - Sequence[FLOAT8E8M0], - Sequence[INT4], - Sequence[UINT4], ] def If(self, cond: B_If, *, else_branch: GraphProto, then_branch: GraphProto) -> V_If: @@ -1027,11 +1066,7 @@ def If(self, cond: B_If, *, else_branch: GraphProto, then_branch: GraphProto) -> ) def Loop( - self, - M: Optional[I_Loop], - cond: Optional[B_Loop], - *v_initial: V_Loop, - body: GraphProto, + self, M: Optional[I_Loop], cond: Optional[B_Loop], *v_initial: V_Loop, body: GraphProto ) -> V_Loop: r"""[🌐 Loop(24)](https://onnx.ai/onnx/operators/onnx__Loop.html#loop-24 "Online Documentation") @@ -1149,7 +1184,7 @@ def Loop( 1) Values from the enclosing scope (i.e. variable "a" here) are in scope and can be referenced in the inputs of the loop. 
2) Any values computed in the loop body that needs to be used in a subsequent - iteration or after the loop are modelled using a pair of variables in the loop-body, + iteration or after the loop are modeled using a pair of variables in the loop-body, consisting of an input variable (eg., b_in) and an output variable (eg., b_out). These are referred to as loop-carried dependences. The loop operation node supplies the input value of the input variable for the first iteration, and @@ -2228,10 +2263,7 @@ def TopK( schema = get_schema("TopK", 24, "") op = Op(self, "TopK", schema) return op( - *self._prepare_inputs(schema, X, K), - axis=axis, - largest=largest, - sorted=sorted, + *self._prepare_inputs(schema, X, K), axis=axis, largest=largest, sorted=sorted ) T_Transpose = TypeVar( @@ -2268,9 +2300,16 @@ def Transpose( r"""[🌐 Transpose(24)](https://onnx.ai/onnx/operators/onnx__Transpose.html#transpose-24 "Online Documentation") - Transpose the input tensor similar to numpy.transpose. For example, when - perm=(1, 0, 2), given an input tensor of shape (1, 2, 3), the output shape - will be (2, 1, 3). + Returns a transpose of the input tensor. (Similar to `numpy.transpose`). + The optional attribute `perm` must be a permutation of the dimensions of + the input tensor. Axis `i` of the output tensor corresponds to the axis + `perm[i]` of the input tensor. + For example, when perm=(1, 0, 2), given an input tensor of shape (1, 2, 3), + the output shape will be (2, 1, 3). + When perm=(1, 2, 0), given an input tensor of shape (1, 2, 3), + the output shape will be (2, 3, 1). + If the attribute `perm` is omitted, its default value is `(n-1, ..., 0)`, + where `n` is the rank of the input tensor. Args: diff --git a/onnxscript/onnx_opset/_impl/opset25.py b/onnxscript/onnx_opset/_impl/opset25.py new file mode 100644 index 0000000000..247f978a06 --- /dev/null +++ b/onnxscript/onnx_opset/_impl/opset25.py @@ -0,0 +1,1983 @@ +# -------------------------------------------------------------------------- +# ⚠️ WARNING - AUTO-GENERATED CODE - DO NOT EDIT ⚠️ +# ⚙️ Generated by 'python -m opgen' +# -------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- +# pylint: disable=W0221,W0222,R0901,W0237 +# mypy: disable-error-code=override +# ruff: noqa: N801,E741,RUF036,D214,D402,D405,D411,D412,D416,D417 +# -------------------------------------------------------------------------- + +from __future__ import annotations + +from typing import Optional, Sequence, TypeVar, Union + +from onnx import GraphProto, SparseTensorProto, TensorProto +from onnx.defs import get_schema +from typing_extensions import TypeAlias + +from onnxscript.onnx_opset._impl.opset24 import Opset24 +from onnxscript.onnx_types import ( + BFLOAT16, + BOOL, + COMPLEX64, + COMPLEX128, + DOUBLE, + FLOAT, + FLOAT4E2M1, + FLOAT8E4M3FN, + FLOAT8E4M3FNUZ, + FLOAT8E5M2, + FLOAT8E5M2FNUZ, + FLOAT8E8M0, + FLOAT16, + INT2, + INT4, + INT8, + INT16, + INT32, + INT64, + STRING, + UINT2, + UINT4, + UINT8, + UINT16, + UINT32, + UINT64, +) +from onnxscript.values import Op, Opset + + +class Opset25(Opset24): + def __new__(cls): + return Opset.__new__(cls, "", 25) + + T1_Cast = TypeVar( + "T1_Cast", + BFLOAT16, + BOOL, + DOUBLE, + FLOAT, + FLOAT16, + FLOAT4E2M1, + FLOAT8E4M3FN, + FLOAT8E4M3FNUZ, + FLOAT8E5M2, + FLOAT8E5M2FNUZ, + FLOAT8E8M0, + INT16, + INT2, + INT32, + INT4, + INT64, + INT8, + STRING, + UINT16, + UINT2, + UINT32, + UINT4, + UINT64, + UINT8, + ) + + T2_Cast: TypeAlias = Union[ + BFLOAT16, + BOOL, + DOUBLE, + FLOAT, + FLOAT16, + FLOAT4E2M1, + FLOAT8E4M3FN, + FLOAT8E4M3FNUZ, + FLOAT8E5M2, + FLOAT8E5M2FNUZ, + FLOAT8E8M0, + INT16, + INT2, + INT32, + INT4, + INT64, + INT8, + STRING, + UINT16, + UINT2, + UINT32, + UINT4, + UINT64, + UINT8, + ] + + def Cast( + self, input: T1_Cast, *, round_mode: str = "up", saturate: int = 1, to: int + ) -> T2_Cast: + r"""[🌐 Cast(25)](https://onnx.ai/onnx/operators/onnx__Cast.html#cast-25 "Online Documentation") + + + The operator casts the elements of a given input tensor to a data type + specified by the 'to' argument and returns an output tensor of the same size in + the converted type. The 'to' argument must be one of the data types specified + in the 'DataType' enum field in the TensorProto message. + + Casting from string tensor in plain (e.g., "3.14" and "1000") and scientific numeric representations + (e.g., "1e-5" and "1E8") to float types is supported. For example, converting string "100.5" to an integer may + yield result 100. There are some string literals reserved for special floating-point values; + "+INF" (and "INF"), "-INF", and "NaN" are positive infinity, negative infinity, and not-a-number, respectively. + Any string which can exactly match "+INF" in a case-insensitive way would be mapped to positive infinite. Similarly, + this case-insensitive rule is applied to "INF" and "NaN". When casting from numeric tensors + to string tensors, plain floating-point representation (such as "314.15926") would be used. + Converting non-numerical-literal string such as "Hello World!" is an undefined behavior. Cases + of converting string representing floating-point arithmetic value, such as "2.718", to INT is an undefined behavior. + + Conversion from a numerical type to any numerical type is always allowed. + User must be aware of precision loss and value change caused by range difference between two types. + For example, a 64-bit float 3.1415926459 may be round to a 32-bit float 3.141592. Similarly, converting + an integer 36 to Boolean may produce 1 because we truncate bits which can't be stored in the targeted type. 
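        A minimal numpy sketch of a few of the conversions discussed in this
        section (illustrative values only):

        ::

            import numpy as np

            np.float32(3.1415926459)                       # 3.1415927 (mantissa rounded)
            np.array(200, dtype=np.int16).astype(np.int8)  # -56 (higher bits discarded)
            bool(np.int32(36))                             # True (nonzero maps to True)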
+ + In more detail, the conversion among numerical types should follow these rules + if the destination type is not a float 8 type. + + * Casting from floating point to: + * floating point: +/- infinity if OOR (out of range). + * fixed point: undefined if OOR. + * bool: +/- 0.0 to False; all else to True. + * Casting from fixed point to: + * floating point: +/- infinity if OOR. (+ infinity in the case of uint) + * fixed point: when OOR, discard higher bits and reinterpret (with respect to two's complement representation for + signed types). For example, 200 (int16) -> -56 (int8). + * bool: zero to False; nonzero to True. + * Casting from bool to: + * floating point: `{1.0, 0.0}`. + * fixed point: `{1, 0}`. + * bool: no change. + + Float 8 types (E4M3FN, E4M3FNUZ, E5M2, E5M2FNUZ) were introduced to speed up the training of + deep models. By default the conversion of a float *x* obeys + to the following rules. `[x]` means the value rounded to + the target mantissa width. + + | x | E4M3FN | E4M3FNUZ | E5M2 | E5M2FNUZ | + | ----------------- | -------- | -------- | -------- | -------- | + | 0 | 0 | 0 | 0 | 0 | + | -0 | -0 | 0 | -0 | 0 | + | NaN | NaN | NaN | NaN | NaN | + | Inf | FLT_MAX | FLT_MAX | FLT_MAX | FLT_MAX | + | -Inf | -FLT_MAX | -FLT_MAX | -FLT_MAX | -FLT_MAX | + | \[x\] > FLT_MAX | FLT_MAX | FLT_MAX | FLT_MAX | FLT_MAX | + | \[x\] \< -FLT_MAX | -FLT_MAX | -FLT_MAX | -FLT_MAX | -FLT_MAX | + | else | RNE | RNE | RNE | RNE | + + The behavior changes if the parameter 'saturate' is set to False. + The rules then become: + + | x | E4M3FN | E4M3FNUZ | E5M2 | E5M2FNUZ | + | ----------------- | ------ | -------- | ---- | -------- | + | 0 | 0 | 0 | 0 | 0 | + | -0 | -0 | 0 | -0 | 0 | + | NaN | NaN | NaN | NaN | NaN | + | -NaN | -NaN | NaN | -NaN | NaN | + | Inf | NaN | NaN | Inf | NaN | + | -Inf | -NaN | NaN | -Inf | NaN | + | \[x\] > FLT_MAX | NaN | NaN | Inf | NaN | + | \[x\] \< -FLT_MAX | NaN | NaN | -Inf | NaN | + | else | RNE | RNE | RNE | RNE | + + FLOAT8E8M0 type was introduced to enable [Microscaling (MX) formats](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf). + When casting to FLOAT8E8M0, the rounding behavior can be specified using the `round_mode` and `saturate` attributes. + The current CUDA behavior is to round up and saturate. Casting negative values to FLOAT8E8M0 gives undefined behavior. + The following table describes the casting behavior of special values to FLOAT8E8M0 in the two most common cases. + + | x | saturate + up | non-saturate + nearest | + | ----------------- | ------------- | --------------------- | + | 0 | 0 | NaN | + | -0 | Unspecified | Unspecified | + | NaN | NaN | NaN | + | Inf | E8M0_MAX | NaN | + | x > E8M0_MAX | E8M0_MAX | NaN | + | x \< E8M0_MIN | E8M0_MIN | NaN | + | x \< 0 | Unspecified | Unspecified | + + + Args: + input: (differentiable) Input tensor to be cast. + + round_mode: Rounding mode for conversion to float8e8m0. It only applies to + casting to float8e8m0 and is `up` by default. `up`: round to nearest + value away from zero, `down`: round to nearest value towards zero, + `nearest`: round to nearest value and ties round up. + + saturate: The parameter defines how the conversion behaves if an input value + is out of range of the destination type. It only applies for float 8 + conversion (float8e4m3fn, float8e4m3fnuz, float8e5m2, float8e5m2fnuz, + float8e8m0). It is true by default. All cases are fully described in the + tables inserted in the operator description. 
+ + to: The data type to which the elements of the input tensor are cast. + Strictly must be one of the types from DataType enum in TensorProto + """ + + schema = get_schema("Cast", 25, "") + op = Op(self, "Cast", schema) + return op( + *self._prepare_inputs(schema, input), + round_mode=round_mode, + saturate=saturate, + to=to, + ) + + T1_CastLike = TypeVar( + "T1_CastLike", + BFLOAT16, + BOOL, + DOUBLE, + FLOAT, + FLOAT16, + FLOAT4E2M1, + FLOAT8E4M3FN, + FLOAT8E4M3FNUZ, + FLOAT8E5M2, + FLOAT8E5M2FNUZ, + FLOAT8E8M0, + INT16, + INT2, + INT32, + INT4, + INT64, + INT8, + STRING, + UINT16, + UINT2, + UINT32, + UINT4, + UINT64, + UINT8, + ) + + T2_CastLike = TypeVar( + "T2_CastLike", + BFLOAT16, + BOOL, + DOUBLE, + FLOAT, + FLOAT16, + FLOAT4E2M1, + FLOAT8E4M3FN, + FLOAT8E4M3FNUZ, + FLOAT8E5M2, + FLOAT8E5M2FNUZ, + FLOAT8E8M0, + INT16, + INT2, + INT32, + INT4, + INT64, + INT8, + STRING, + UINT16, + UINT2, + UINT32, + UINT4, + UINT64, + UINT8, + ) + + def CastLike( + self, + input: T1_CastLike, + target_type: T2_CastLike, + *, + round_mode: str = "up", + saturate: int = 1, + ) -> T2_CastLike: + r"""[🌐 CastLike(25)](https://onnx.ai/onnx/operators/onnx__CastLike.html#castlike-25 "Online Documentation") + + + The operator casts the elements of a given input tensor (the first input) to + the same data type as the elements of the second input tensor. + See documentation of the Cast operator for further details. + + + Args: + input: (differentiable) Input tensor to be cast. + + target_type: (non-differentiable) The (first) input tensor will be cast to + produce a tensor of the same type as this (second input) tensor. + + round_mode: Rounding mode for conversion to float8e8m0. It only applies to + casting to float8e8m0 and is `up` by default. `up`: round to nearest + value away from zero, `down`: round to nearest value towards zero, + `nearest`: round to nearest value and ties round up. Please refer to + operator Cast description for further details. + + saturate: The parameter defines how the conversion behaves if an input value + is out of range of the destination type. It only applies for float 8 + conversion (float8e4m3fn, float8e4m3fnuz, float8e5m2, float8e5m2fnuz, + float8e8m0). It is true by default. Please refer to operator Cast + description for further details. + """ + + schema = get_schema("CastLike", 25, "") + op = Op(self, "CastLike", schema) + return op( + *self._prepare_inputs(schema, input, target_type), + round_mode=round_mode, + saturate=saturate, + ) + + T_Constant: TypeAlias = Union[ + BFLOAT16, + BOOL, + COMPLEX128, + COMPLEX64, + DOUBLE, + FLOAT, + FLOAT16, + FLOAT4E2M1, + FLOAT8E4M3FN, + FLOAT8E4M3FNUZ, + FLOAT8E5M2, + FLOAT8E5M2FNUZ, + FLOAT8E8M0, + INT16, + INT2, + INT32, + INT4, + INT64, + INT8, + STRING, + UINT16, + UINT2, + UINT32, + UINT4, + UINT64, + UINT8, + ] + + def Constant( + self, + *, + sparse_value: Optional[SparseTensorProto] = None, + value: Optional[TensorProto] = None, + value_float: Optional[float] = None, + value_floats: Optional[Sequence[float]] = None, + value_int: Optional[int] = None, + value_ints: Optional[Sequence[int]] = None, + value_string: Optional[str] = None, + value_strings: Optional[Sequence[str]] = None, + ) -> T_Constant: + r"""[🌐 Constant(25)](https://onnx.ai/onnx/operators/onnx__Constant.html#constant-25 "Online Documentation") + + + This operator produces a constant tensor. Exactly one of the provided attributes, either value, sparse_value, + or value_* must be specified. 
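        A minimal usage sketch, assuming the new Opset25 instance is exported as
        `opset25` alongside the existing opset aliases (the function and tensor
        names below are illustrative):

        ::

            from onnxscript import FLOAT, script
            from onnxscript.onnx_opset import opset25 as op

            @script()
            def add_bias(x: FLOAT[...]) -> FLOAT[...]:
                # exactly one value_* attribute is supplied, as required above
                bias = op.Constant(value_floats=[1.0, 2.0, 3.0])
                return op.Add(x, bias)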
+ + + Args: + sparse_value: The value for the elements of the output tensor in sparse + format. + + value: The value for the elements of the output tensor. + + value_float: The value for the sole element for the scalar, float32, output + tensor. + + value_floats: The values for the elements for the 1D, float32, output + tensor. + + value_int: The value for the sole element for the scalar, int64, output + tensor. + + value_ints: The values for the elements for the 1D, int64, output tensor. + + value_string: The value for the sole element for the scalar, UTF-8 string, + output tensor. + + value_strings: The values for the elements for the 1D, UTF-8 string, output + tensor. + """ + + schema = get_schema("Constant", 25, "") + op = Op(self, "Constant", schema) + return op( + sparse_value=sparse_value, + value=value, + value_float=value_float, + value_floats=value_floats, + value_int=value_int, + value_ints=value_ints, + value_string=value_string, + value_strings=value_strings, + ) + + T1_ConstantOfShape: TypeAlias = INT64 + + T2_ConstantOfShape: TypeAlias = Union[ + BFLOAT16, + BOOL, + DOUBLE, + FLOAT, + FLOAT16, + FLOAT4E2M1, + FLOAT8E4M3FN, + FLOAT8E4M3FNUZ, + FLOAT8E5M2, + FLOAT8E5M2FNUZ, + FLOAT8E8M0, + INT16, + INT2, + INT32, + INT4, + INT64, + INT8, + UINT16, + UINT2, + UINT32, + UINT4, + UINT64, + UINT8, + ] + + def ConstantOfShape( + self, input: T1_ConstantOfShape, *, value: Optional[TensorProto] = None + ) -> T2_ConstantOfShape: + r"""[🌐 ConstantOfShape(25)](https://onnx.ai/onnx/operators/onnx__ConstantOfShape.html#constantofshape-25 "Online Documentation") + + + Generate a tensor with given value and shape. + + + Args: + input: 1D tensor. The shape of the expected output tensor. If empty tensor + is given, the output would be a scalar. All values must be >= 0. + + value: (Optional) The value of the output elements.Should be a one-element + tensor. If not specified, it defaults to a tensor of value 0 and + datatype float32 + """ + + schema = get_schema("ConstantOfShape", 25, "") + op = Op(self, "ConstantOfShape", schema) + return op(*self._prepare_inputs(schema, input), value=value) + + T1_DequantizeLinear = TypeVar( + "T1_DequantizeLinear", + FLOAT4E2M1, + FLOAT8E4M3FN, + FLOAT8E4M3FNUZ, + FLOAT8E5M2, + FLOAT8E5M2FNUZ, + INT16, + INT2, + INT32, + INT4, + INT8, + UINT16, + UINT2, + UINT4, + UINT8, + ) + + T2_DequantizeLinear = TypeVar("T2_DequantizeLinear", BFLOAT16, FLOAT, FLOAT16, FLOAT8E8M0) + + T3_DequantizeLinear: TypeAlias = Union[BFLOAT16, FLOAT, FLOAT16] + + def DequantizeLinear( + self, + x: T1_DequantizeLinear, + x_scale: T2_DequantizeLinear, + x_zero_point: Optional[T1_DequantizeLinear] = None, + *, + axis: int = 1, + block_size: int = 0, + output_dtype: int = 0, + ) -> T3_DequantizeLinear: + r"""[🌐 DequantizeLinear(25)](https://onnx.ai/onnx/operators/onnx__DequantizeLinear.html#dequantizelinear-25 "Online Documentation") + + + The linear dequantization operator. It consumes a quantized tensor, a scale, and a zero point to compute the + full-precision tensor. The dequantization formula is `y = (x - x_zero_point) * x_scale`. `x_scale` and `x_zero_point` + must have the same shape, determining the quantization's granularity: a scalar for per-tensor/per-layer quantization, + a 1-D tensor for per-axis quantization, or have a rank identical to the input for blocked quantization. + See QuantizeLinear for details on quantization granularity. + + `x_zero_point` and `x` must have the same type. `x` and `y` must have the same shape. 
In the case of dequantizing + `int32`, there's no zero point (zero point is supposed to be 0). + `zero-point` is usually not used in the case of float8 and 4-bit types quantization, but the dequantization formula remains the same + for consistency. The output type is determined by the attribute `output_dtype`. If `output_dtype` is not supplied then the output type + is the same as `x_scale`. The output type also determines the precision of the multiplication operation. + + + + Args: + x: N-D quantized input tensor to be de-quantized. + + x_scale: Scale for input `x`. For per-tensor/layer dequantization the scale + is a scalar, for per per-axis dequantization it is a 1-D Tensor and for + blocked dequantization it has the same shape as the input, except for + one dimension in which blocking is performed. + + x_zero_point: (optional) Zero point for input `x`. Shape must match x_scale. + It's optional. Zero point is 0 when it's not specified. + + axis: (Optional) The axis of the dequantizing dimension of the input tensor. + Used for per-axis and blocked quantization. Negative value means + counting dimensions from the back. Accepted range is `[-r, r-1]` where + `r = rank(input)`. + + block_size: (Optional) The size of the quantization block (number of times + every scale is replicated). Used only for blocked quantization. The + block size is a positive integer. Given `x` shape `(D0, ..., Di, ..., + Dn)`, `y_scale` shape `(S0, ... Si, ...Sn)` and `axis=i`, the accepted + range is `[ceil(Di/Si), ceil(Di/(Si-1))-1]` + + output_dtype: (Optional) The output data type. If not supplied, the output + data type is inferred from `x_scale` data type (`T2`) + """ + + schema = get_schema("DequantizeLinear", 25, "") + op = Op(self, "DequantizeLinear", schema) + return op( + *self._prepare_inputs(schema, x, x_scale, x_zero_point), + axis=axis, + block_size=block_size, + output_dtype=output_dtype, + ) + + T_Flatten = TypeVar( + "T_Flatten", + BFLOAT16, + BOOL, + COMPLEX128, + COMPLEX64, + DOUBLE, + FLOAT, + FLOAT16, + FLOAT4E2M1, + FLOAT8E4M3FN, + FLOAT8E4M3FNUZ, + FLOAT8E5M2, + FLOAT8E5M2FNUZ, + FLOAT8E8M0, + INT16, + INT2, + INT32, + INT4, + INT64, + INT8, + STRING, + UINT16, + UINT2, + UINT32, + UINT4, + UINT64, + UINT8, + ) + + def Flatten(self, input: T_Flatten, *, axis: int = 1) -> T_Flatten: + r"""[🌐 Flatten(25)](https://onnx.ai/onnx/operators/onnx__Flatten.html#flatten-25 "Online Documentation") + + + Flattens the input tensor into a 2D matrix. If input tensor has shape + (d_0, d_1, ... d_n) then the output will have shape + (d_0 X d_1 ... d_(axis-1), d_axis X d_(axis+1) ... X dn). + + + Args: + input: (differentiable) A tensor of rank >= axis. + + axis: Indicate up to which input dimensions (exclusive) should be flattened + to the outer dimension of the output. The value for axis must be in the + range [-r, r], where r is the rank of the input tensor. Negative value + means counting dimensions from the back. When axis = 0, the shape of the + output tensor is (1, (d_0 X d_1 ... d_n), where the shape of the input + tensor is (d_0, d_1, ... d_n). 
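        A minimal numpy sketch of the output-shape rule above, assuming a
        (2, 3, 4, 5) input and axis=2:

        ::

            import numpy as np

            x = np.zeros((2, 3, 4, 5))
            axis = 2
            y = x.reshape(int(np.prod(x.shape[:axis])), -1)
            y.shape  # (6, 20); with axis=0 the result would be (1, 120)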
+ """ + + schema = get_schema("Flatten", 25, "") + op = Op(self, "Flatten", schema) + return op(*self._prepare_inputs(schema, input), axis=axis) + + V_Identity = TypeVar( + "V_Identity", + Optional[Sequence[BOOL]], + Optional[Sequence[COMPLEX128]], + Optional[Sequence[COMPLEX64]], + Optional[Sequence[DOUBLE]], + Optional[Sequence[FLOAT]], + Optional[Sequence[FLOAT16]], + Optional[Sequence[INT16]], + Optional[Sequence[INT32]], + Optional[Sequence[INT64]], + Optional[Sequence[INT8]], + Optional[Sequence[STRING]], + Optional[Sequence[UINT16]], + Optional[Sequence[UINT32]], + Optional[Sequence[UINT64]], + Optional[Sequence[UINT8]], + Optional[BOOL], + Optional[COMPLEX128], + Optional[COMPLEX64], + Optional[DOUBLE], + Optional[FLOAT], + Optional[FLOAT16], + Optional[INT16], + Optional[INT32], + Optional[INT64], + Optional[INT8], + Optional[STRING], + Optional[UINT16], + Optional[UINT32], + Optional[UINT64], + Optional[UINT8], + Sequence[BOOL], + Sequence[COMPLEX128], + Sequence[COMPLEX64], + Sequence[DOUBLE], + Sequence[FLOAT], + Sequence[FLOAT16], + Sequence[INT16], + Sequence[INT32], + Sequence[INT64], + Sequence[INT8], + Sequence[STRING], + Sequence[UINT16], + Sequence[UINT32], + Sequence[UINT64], + Sequence[UINT8], + BFLOAT16, + BOOL, + COMPLEX128, + COMPLEX64, + DOUBLE, + FLOAT, + FLOAT16, + FLOAT4E2M1, + FLOAT8E4M3FN, + FLOAT8E4M3FNUZ, + FLOAT8E5M2, + FLOAT8E5M2FNUZ, + FLOAT8E8M0, + INT16, + INT2, + INT32, + INT4, + INT64, + INT8, + STRING, + UINT16, + UINT2, + UINT32, + UINT4, + UINT64, + UINT8, + ) + + def Identity(self, input: V_Identity) -> V_Identity: + r"""[🌐 Identity(25)](https://onnx.ai/onnx/operators/onnx__Identity.html#identity-25 "Online Documentation") + + Identity operator + + Args: + input: (differentiable) Input tensor + """ + + schema = get_schema("Identity", 25, "") + op = Op(self, "Identity", schema) + return op(*self._prepare_inputs(schema, input)) + + B_If: TypeAlias = BOOL + + V_If: TypeAlias = Union[ + Optional[Sequence[BFLOAT16]], + Optional[Sequence[BOOL]], + Optional[Sequence[COMPLEX128]], + Optional[Sequence[COMPLEX64]], + Optional[Sequence[DOUBLE]], + Optional[Sequence[FLOAT]], + Optional[Sequence[FLOAT16]], + Optional[Sequence[INT16]], + Optional[Sequence[INT32]], + Optional[Sequence[INT64]], + Optional[Sequence[INT8]], + Optional[Sequence[STRING]], + Optional[Sequence[UINT16]], + Optional[Sequence[UINT32]], + Optional[Sequence[UINT64]], + Optional[Sequence[UINT8]], + Optional[BFLOAT16], + Optional[BOOL], + Optional[COMPLEX128], + Optional[COMPLEX64], + Optional[DOUBLE], + Optional[FLOAT], + Optional[FLOAT16], + Optional[FLOAT4E2M1], + Optional[FLOAT8E4M3FN], + Optional[FLOAT8E4M3FNUZ], + Optional[FLOAT8E5M2], + Optional[FLOAT8E5M2FNUZ], + Optional[FLOAT8E8M0], + Optional[INT16], + Optional[INT2], + Optional[INT32], + Optional[INT4], + Optional[INT64], + Optional[INT8], + Optional[STRING], + Optional[UINT16], + Optional[UINT2], + Optional[UINT32], + Optional[UINT4], + Optional[UINT64], + Optional[UINT8], + Sequence[BFLOAT16], + Sequence[BOOL], + Sequence[COMPLEX128], + Sequence[COMPLEX64], + Sequence[DOUBLE], + Sequence[FLOAT], + Sequence[FLOAT16], + Sequence[FLOAT4E2M1], + Sequence[FLOAT8E4M3FN], + Sequence[FLOAT8E4M3FNUZ], + Sequence[FLOAT8E5M2], + Sequence[FLOAT8E5M2FNUZ], + Sequence[FLOAT8E8M0], + Sequence[INT16], + Sequence[INT2], + Sequence[INT32], + Sequence[INT4], + Sequence[INT64], + Sequence[INT8], + Sequence[STRING], + Sequence[UINT16], + Sequence[UINT2], + Sequence[UINT32], + Sequence[UINT4], + Sequence[UINT64], + Sequence[UINT8], + BFLOAT16, + 
BOOL, + COMPLEX128, + COMPLEX64, + DOUBLE, + FLOAT, + FLOAT16, + FLOAT4E2M1, + FLOAT8E4M3FN, + FLOAT8E4M3FNUZ, + FLOAT8E5M2, + FLOAT8E5M2FNUZ, + FLOAT8E8M0, + INT16, + INT2, + INT32, + INT4, + INT64, + INT8, + STRING, + UINT16, + UINT2, + UINT32, + UINT4, + UINT64, + UINT8, + ] + + def If(self, cond: B_If, *, else_branch: GraphProto, then_branch: GraphProto) -> V_If: + r"""[🌐 If(25)](https://onnx.ai/onnx/operators/onnx__If.html#if-25 "Online Documentation") + + If conditional + + Args: + cond: Condition for the if. The tensor must contain a single element. + + else_branch: Graph to run if condition is false. Has N outputs: values you + wish to be live-out to the enclosing scope. The number of outputs must + match the number of outputs in the then_branch. + + then_branch: Graph to run if condition is true. Has N outputs: values you + wish to be live-out to the enclosing scope. The number of outputs must + match the number of outputs in the else_branch. + """ + + schema = get_schema("If", 25, "") + op = Op(self, "If", schema) + return op( + *self._prepare_inputs(schema, cond), + else_branch=else_branch, + then_branch=then_branch, + ) + + I_Loop: TypeAlias = INT64 + + B_Loop: TypeAlias = BOOL + + V_Loop = TypeVar( + "V_Loop", + Optional[Sequence[BFLOAT16]], + Optional[Sequence[BOOL]], + Optional[Sequence[COMPLEX128]], + Optional[Sequence[COMPLEX64]], + Optional[Sequence[DOUBLE]], + Optional[Sequence[FLOAT]], + Optional[Sequence[FLOAT16]], + Optional[Sequence[INT16]], + Optional[Sequence[INT32]], + Optional[Sequence[INT64]], + Optional[Sequence[INT8]], + Optional[Sequence[STRING]], + Optional[Sequence[UINT16]], + Optional[Sequence[UINT32]], + Optional[Sequence[UINT64]], + Optional[Sequence[UINT8]], + Optional[BFLOAT16], + Optional[BOOL], + Optional[COMPLEX128], + Optional[COMPLEX64], + Optional[DOUBLE], + Optional[FLOAT], + Optional[FLOAT16], + Optional[FLOAT4E2M1], + Optional[FLOAT8E4M3FN], + Optional[FLOAT8E4M3FNUZ], + Optional[FLOAT8E5M2], + Optional[FLOAT8E5M2FNUZ], + Optional[FLOAT8E8M0], + Optional[INT16], + Optional[INT2], + Optional[INT32], + Optional[INT4], + Optional[INT64], + Optional[INT8], + Optional[STRING], + Optional[UINT16], + Optional[UINT2], + Optional[UINT32], + Optional[UINT4], + Optional[UINT64], + Optional[UINT8], + Sequence[BFLOAT16], + Sequence[BOOL], + Sequence[COMPLEX128], + Sequence[COMPLEX64], + Sequence[DOUBLE], + Sequence[FLOAT], + Sequence[FLOAT16], + Sequence[FLOAT4E2M1], + Sequence[FLOAT8E4M3FN], + Sequence[FLOAT8E4M3FNUZ], + Sequence[FLOAT8E5M2], + Sequence[FLOAT8E5M2FNUZ], + Sequence[FLOAT8E8M0], + Sequence[INT16], + Sequence[INT2], + Sequence[INT32], + Sequence[INT4], + Sequence[INT64], + Sequence[INT8], + Sequence[STRING], + Sequence[UINT16], + Sequence[UINT2], + Sequence[UINT32], + Sequence[UINT4], + Sequence[UINT64], + Sequence[UINT8], + BFLOAT16, + BOOL, + COMPLEX128, + COMPLEX64, + DOUBLE, + FLOAT, + FLOAT16, + FLOAT4E2M1, + FLOAT8E4M3FN, + FLOAT8E4M3FNUZ, + FLOAT8E5M2, + FLOAT8E5M2FNUZ, + FLOAT8E8M0, + INT16, + INT2, + INT32, + INT4, + INT64, + INT8, + STRING, + UINT16, + UINT2, + UINT32, + UINT4, + UINT64, + UINT8, + ) + + def Loop( + self, M: Optional[I_Loop], cond: Optional[B_Loop], *v_initial: V_Loop, body: GraphProto + ) -> V_Loop: + r"""[🌐 Loop(25)](https://onnx.ai/onnx/operators/onnx__Loop.html#loop-25 "Online Documentation") + + + Generic Looping construct. This loop has multiple termination conditions: + + 1) Trip count. Iteration count specified at runtime. Set by + specifying the input M. Optional. Set to empty string to omit. 
+ Note that a static trip count (specified at graph construction time) can be + specified by passing in a constant node for input M. + 2) Loop termination condition. This is an input to the op that determines + whether to run the first iteration and also a loop-carried dependency for + the body graph. The body graph must yield a value for the condition variable, + whether this input is provided or not. + + This table summarizes the operating modes of this operator with equivalent + C-style code: + + Operator inputs defined as (max_trip_count, condition_var). + + * input ("", ""): + for (int i=0; ; ++i) { + cond = ... // Note this value is ignored, but is required in the body + } + + * input ("", cond) // Note this is analogous to a while loop + bool cond = ...; + for (int i=0; cond; ++i) { + cond = ...; + } + + * input ("", 1) // Note this is analogous to a do-while loop + bool cond = true + for (int i=0; cond; ++i) { + cond = ...; + } + + * input (trip_count, "") // Note this is analogous to a for loop + int trip_count = ... + for (int i=0; i < trip_count; ++i) { + cond = ...; // ignored + } + + * input (trip_count, cond) + int trip_count = ...; + bool cond = ...; + for (int i=0; i < trip_count && cond; ++i) { + cond = ...; + } + + + *Sample usage - cond as well as trip count* + + graph predict-net { + %a = Constant[value = ]() + %b = Constant[value = ]() + %keepgoing = Constant[value = ]() + %max_trip_count = Constant[value = ]() + %keepgoing_out, %b_out, %user_defined_vals = Loop[body = ](%max_trip_count, %keepgoing, %b) + return + } + + graph body-net ( + %i[INT32, scalar] // iteration number + %keepgoing_in[BOOL, scalar] // incoming loop-termination-condition; not used + %b_in[INT32, scalar] // incoming value of loop-carried-dependency b + ) { + %my_local = Add(%a, %b_in) + %b_out = Sub(%a, %b_in) // outgoing value of loop-carried-dependency b + %keepgoing_out = Greater(%my_local, %b_out) // outgoing loop-termination-condition + %user_defined_val = Add(%b_in, %b_in) // scan-output value to be accumulated + return %keepgoing_out, %b_out, %user_defined_val + } + + *Sample equivalent C code* + + { + /* User-defined code (enclosing scope) */ + int a = 3, b = 6; + bool keepgoing = true; // Analogous to input cond + /* End user-defined code */ + + /* Implicitly-defined code */ + const int max_trip_count = 10; // Analogous to input M + int user_defined_vals[]; // Imagine this is resizable + /* End implicitly-defined code */ + /* initialize loop-carried variables and scan-output variables */ + bool keepgoing_out = keepgoing + int b_out = b + + for (int i=0; i < max_trip_count && keepgoing_out; ++i) { + /* Implicitly-defined code: bind actual parameter values + to formal parameter variables of loop-body */ + bool keepgoing_in = keepgoing_out; + bool b_in = b_out; + + /* User-defined code (loop body) */ + int my_local = a + b_in; // Reading value "a" from the enclosing scope is fine + b_out = a - b_in; + keepgoing_out = my_local > b_out; + user_defined_val = b_in + b_in; // b_in and b_out are different variables + /* End user-defined code */ + + /* Implicitly defined-code */ + user_defined_vals[i] = user_defined_val // accumulate scan-output values + } + // int t = my_local; // Can't do this. my_local is not accessible here. + + // The values below are bound to the output variables of the loop and therefore accessible + // b_out; user_defined_vals; keepgoing_out; + } + + There are several things of note in this code snippet: + + 1) Values from the enclosing scope (i.e. 
variable "a" here) are in scope and can + be referenced in the inputs of the loop. + 2) Any values computed in the loop body that needs to be used in a subsequent + iteration or after the loop are modeled using a pair of variables in the loop-body, + consisting of an input variable (eg., b_in) and an output variable (eg., b_out). + These are referred to as loop-carried dependences. The loop operation node + supplies the input value of the input variable for the first iteration, and + returns the output value of the output variable produced by the final + iteration. + 3) Scan_output variables are used to implicitly concatenate values computed across + all the iterations. In the above example, the value of user_defined_val computed + over all iterations are concatenated and returned as the value of user_defined_vals + after the loop. + 4) Values created in the body cannot be accessed in the enclosing scope, + except using the mechanism described above. + + Note that the semantics of this op support "diagonal" or "wavefront" execution. + (See Step 3 here for an example: + https://devblogs.nvidia.com/optimizing-recurrent-neural-networks-cudnn-5/). + Frontends should emit multi-layer RNNs as a series of While operators (with + time being the inner looping dimension), with each successive layer consuming + the scan_outputs from the previous layer, possibly going through several + point-wise operators (e.g. dropout, residual connections, linear layer). + + The input/output of subgraph (produced by loop node) matching is based on order instead of name. The implementation will figure out the names based on this order. + + + Args: + M: (optional) A maximum trip-count for the loop specified at runtime. + Optional. Pass empty string to skip. + + cond: (optional) A boolean termination condition. Optional. Pass empty + string to skip. + + v_initial: (variadic, heterogeneous) The initial values of any loop-carried + dependencies (values that change across loop iterations) + + body: The graph run each iteration. It has 2+N inputs: (iteration_num, + condition, loop carried dependencies...). It has 1+N+K outputs: + (condition, loop carried dependencies..., scan_outputs...). Each + scan_output is created by concatenating the value of the specified + output value at the end of each iteration of the loop. It is an error if + the dimensions or data type of these scan_outputs change across loop + iterations. + """ + + schema = get_schema("Loop", 25, "") + op = Op(self, "Loop", schema) + return op(*self._prepare_inputs(schema, M, cond, *v_initial), body=body) + + T_Pad = TypeVar( + "T_Pad", + BFLOAT16, + BOOL, + COMPLEX128, + COMPLEX64, + DOUBLE, + FLOAT, + FLOAT16, + FLOAT4E2M1, + FLOAT8E4M3FN, + FLOAT8E4M3FNUZ, + FLOAT8E5M2, + FLOAT8E5M2FNUZ, + FLOAT8E8M0, + INT16, + INT2, + INT32, + INT4, + INT64, + INT8, + STRING, + UINT16, + UINT2, + UINT32, + UINT4, + UINT64, + UINT8, + ) + + Tind_Pad = TypeVar("Tind_Pad", INT32, INT64) + + def Pad( + self, + data: T_Pad, + pads: INT64, + constant_value: Optional[T_Pad] = None, + axes: Optional[Tind_Pad] = None, + *, + mode: str = "constant", + ) -> T_Pad: + r"""[🌐 Pad(25)](https://onnx.ai/onnx/operators/onnx__Pad.html#pad-25 "Online Documentation") + + + Given a tensor containing the data to be padded (`data`), a tensor containing the number of start and end pad values for axis (`pads`), (optionally) a `mode`, and (optionally) `constant_value`, + a padded tensor (`output`) is generated. 
+ + The three supported `modes` are (similar to corresponding modes supported by `numpy.pad`): + + 1) `constant`(default) - pads with a given constant value as specified by `constant_value` (which defaults to 0, empty string, or False) + + 2) `reflect` - pads with the reflection of the vector mirrored on the first and last values of the vector along each axis + + 3) `edge` - pads with the edge values of array + + 4) `wrap` - wrap-around padding as if the data tensor forms a torus + + + Example 1 (`constant` mode): + + Insert 0 pads to the beginning of the second dimension. + + :: + + data = [ + [1.0, 1.2], + [2.3, 3.4], + [4.5, 5.7], + ] + + pads = [0, 2, 0, 0] + + mode = 'constant' + + constant_value = 0.0 + + output = [ + [0.0, 0.0, 1.0, 1.2], + [0.0, 0.0, 2.3, 3.4], + [0.0, 0.0, 4.5, 5.7], + ] + + + + Example 2 (`reflect` mode): + + :: + + data = [ + [1.0, 1.2], + [2.3, 3.4], + [4.5, 5.7], + ] + + pads = [0, 2, 0, 0] + + mode = 'reflect' + + output = [ + [1.0, 1.2, 1.0, 1.2], + [2.3, 3.4, 2.3, 3.4], + [4.5, 5.7, 4.5, 5.7], + ] + + + + Example 3 (`edge` mode): + + :: + + data = [ + [1.0, 1.2], + [2.3, 3.4], + [4.5, 5.7], + ] + + pads = [0, 2, 0, 0] + + mode = 'edge' + + output = [ + [1.0, 1.0, 1.0, 1.2], + [2.3, 2.3, 2.3, 3.4], + [4.5, 4.5, 4.5, 5.7], + ] + + + + Example 4 (`wrap` mode): + + :: + + data = [ + [1.0, 1.2], + [2.3, 3.4], + [4.5, 5.7], + ] + + pads = [2, 1, 1, 1] + + mode = 'wrap' + + output = [ + [3.4, 2.3, 3.4, 2.3], + [5.7, 4.5, 5.7, 4.5], + [1.2, 1.0, 1.2, 1.0], + [3.4, 2.3, 3.4, 2.3], + [5.7, 4.5, 5.7, 4.5], + [1.2, 1.0, 1.2, 1.0], + ] + + + + + Args: + data: (differentiable) Input tensor. + + pads: (non-differentiable) Tensor of integers indicating the number of + padding elements to add or remove (if negative) at the beginning and end + of each axis. For 2D input tensor, it is the number of pixels. `pads` + should be a 1D tensor of shape [2 * num_axes] where `num_axes` refers to + the number of elements in the `axes` input or the input rank if `axes` + are not provided explicitly. `pads` format should be: [x1_begin, + x2_begin, ..., x1_end, x2_end,...], where xi_begin is the number of pad + values added at the beginning of axis `axes[i]` and xi_end, the number + of pad values added at the end of axis `axes[i]`. + + constant_value: (optional, non-differentiable) (Optional) A scalar value to + be used if the mode chosen is `constant` (by default it is 0, empty + string or False). + + axes: (optional, non-differentiable) 1-D tensor of axes that `pads` apply + to. Negative value means counting dimensions from the back. Accepted + range is [-r, r-1] where r = rank(data). Behavior is undefined if an + axis is repeated. If not provided, all axes are assumed (`[0, 1, ..., + input_rank-1]`). 
+ + mode: Supported modes: `constant`(default), `reflect`, `edge`, `wrap` + """ + + schema = get_schema("Pad", 25, "") + op = Op(self, "Pad", schema) + return op(*self._prepare_inputs(schema, data, pads, constant_value, axes), mode=mode) + + T1_QuantizeLinear = TypeVar("T1_QuantizeLinear", BFLOAT16, FLOAT, FLOAT16, INT32) + + T2_QuantizeLinear = TypeVar( + "T2_QuantizeLinear", BFLOAT16, FLOAT, FLOAT16, FLOAT8E8M0, INT32 + ) + + T3_QuantizeLinear = TypeVar( + "T3_QuantizeLinear", + FLOAT4E2M1, + FLOAT8E4M3FN, + FLOAT8E4M3FNUZ, + FLOAT8E5M2, + FLOAT8E5M2FNUZ, + INT16, + INT2, + INT4, + INT8, + UINT16, + UINT2, + UINT4, + UINT8, + ) + + def QuantizeLinear( + self, + x: T1_QuantizeLinear, + y_scale: T2_QuantizeLinear, + y_zero_point: Optional[T3_QuantizeLinear] = None, + *, + axis: int = 1, + block_size: int = 0, + output_dtype: int = 0, + precision: int = 0, + saturate: int = 1, + ) -> T3_QuantizeLinear: + r"""[🌐 QuantizeLinear(25)](https://onnx.ai/onnx/operators/onnx__QuantizeLinear.html#quantizelinear-25 "Online Documentation") + + + The linear quantization operator consumes a high-precision tensor, a scale, and a zero point to compute the + low-precision/quantized tensor. The scale factor and zero point must have the same shape, determining the quantization + granularity. The quantization formula is `y = saturate((x / y_scale) + y_zero_point)`. + + Saturation is done according to: + - uint16: [0, 65535] + - int16: [-32768, 32767] + - uint8: [0, 255] + - int8: [-128, 127] + - uint4: [0, 15] + - int4: [-8, 7] + - uint2: [0, 3] + - int2: [-2, 1] + + For `(x / y_scale)`, it rounds to the nearest even. Refer to https://en.wikipedia.org/wiki/Rounding for details. + + `y_zero_point` and `y` must have the same type. `y_zero_point` is usually not used for quantization to float8 and 4bit types, but the quantization + formula remains the same for consistency, and the type of the attribute `y_zero_point` still determines the quantization type. + `x` and `y_scale` are allowed to have different types. The type of `y_scale` determines the precision of the division operation between `x` and + `y_scale`, unless the `precision` attribute is specified. + + There are three supported quantization granularities, determined by the shape of `y_scale`. + In all cases, `y_zero_point` must have the same shape as `y_scale`. + - Per-tensor (per-layer) quantization: `y_scale` is a scalar. + - Per-axis quantization: The scale must be a 1-D tensor, with the length of the quantization axis. For an input shape + `(D0, ..., Di, ..., Dn)` and `axis=i`, `y_scale` is a 1-D tensor of length `Di`. + - Blocked quantization: The scale's shape is identical to the input's shape, except for one dimension, in which + blocking is performed. Given `x` shape `(D0, ..., Di, ..., Dn)`, `axis=i`, and block size `B`: `y_scale` shape is + `(D0, ..., ceil(Di/B), ..., Dn)`. + + + Args: + x: N-D full precision Input tensor to be quantized. + + y_scale: Scale for doing quantization to get `y`. For per-tensor/layer + quantization the scale is a scalar, for per-axis quantization it is a + 1-D Tensor and for blocked quantization it has the same shape as the + input, except for one dimension in which blocking is performed. + + y_zero_point: (optional) Zero point for doing quantization to get `y`. Shape + must match `y_scale`. Default is uint8 with zero point of 0 if it's not + specified. + + axis: (Optional) The axis of the dequantizing dimension of the input tensor. + Used only for per-axis and blocked quantization. 
Negative value means + counting dimensions from the back. Accepted range is `[-r, r-1]` where + `r = rank(input)`. When the rank of the input is 1, per-tensor + quantization is applied, rendering the axis unnecessary in this + scenario. + + block_size: (Optional) The size of the quantization block (number of times + every scale is replicated). Used only for blocked quantization. The + block size is a positive integer. Given `x` shape `(D0, ..., Di, ..., + Dn)`, `y_scale` shape `(S0, ... Si, ...Sn)` and `axis=i`, the accepted + range is `[ceil(Di/Si), ceil(Di/(Si-1))-1]` + + output_dtype: (Optional) The output data type. If not supplied, the output + data type is inferred from `y_zero_point` data type (`T3`). If neither + `output_dtype` nor `y_zero_point` are supplied, output data type is + uint8. If both `output_dtype` and `y_zero_point` are specified, + `output_dtype` must be `T3`. + + precision: (Optional) The precision of the division operation between `x` + and `y_scale`. If not provided, it will be the same as the type of + `y_scale`. + + saturate: The parameter defines how the conversion behaves if an input value + is out of range of the destination type. It only applies for float 8 + quantization (float8e4m3fn, float8e4m3fnuz, float8e5m2, float8e5m2fnuz). + It is true by default. All cases are fully described in two tables + inserted in the operator description. + """ + + schema = get_schema("QuantizeLinear", 25, "") + op = Op(self, "QuantizeLinear", schema) + return op( + *self._prepare_inputs(schema, x, y_scale, y_zero_point), + axis=axis, + block_size=block_size, + output_dtype=output_dtype, + precision=precision, + saturate=saturate, + ) + + T_Reshape = TypeVar( + "T_Reshape", + BFLOAT16, + BOOL, + COMPLEX128, + COMPLEX64, + DOUBLE, + FLOAT, + FLOAT16, + FLOAT4E2M1, + FLOAT8E4M3FN, + FLOAT8E4M3FNUZ, + FLOAT8E5M2, + FLOAT8E5M2FNUZ, + FLOAT8E8M0, + INT16, + INT2, + INT32, + INT4, + INT64, + INT8, + STRING, + UINT16, + UINT2, + UINT32, + UINT4, + UINT64, + UINT8, + ) + + def Reshape(self, data: T_Reshape, shape: INT64, *, allowzero: int = 0) -> T_Reshape: + r"""[🌐 Reshape(25)](https://onnx.ai/onnx/operators/onnx__Reshape.html#reshape-25 "Online Documentation") + + + Reshape the input tensor similar to numpy.reshape. + First input is the data tensor, second input is a shape tensor which specifies the output shape. It outputs the reshaped tensor. + At most one dimension of the new shape can be -1. In this case, the value is + inferred from the size of the tensor and the remaining dimensions. A dimension + could also be 0, in which case the actual dimension value is unchanged (i.e. taken + from the input tensor). If 'allowzero' is set, and the new shape includes 0, the + dimension will be set explicitly to zero (i.e. not taken from input tensor). + Shape (second input) could be an empty shape, which means converting to a scalar. + The input tensor's shape and the output tensor's shape are required to have the same number of elements. + + If the attribute 'allowzero' is set, it is invalid for the specified shape to + contain both a zero value and -1, as the value of the dimension corresponding + to -1 cannot be determined uniquely. + + + Args: + data: (differentiable) An input tensor. + + shape: (non-differentiable) Specified shape for output. + + allowzero: (Optional) By default, when any value in the 'shape' input is + equal to zero the corresponding dimension value is copied from the input + tensor dynamically. 
allowzero=1 indicates that if any value in the + 'shape' input is set to zero, the zero value is honored, similar to + NumPy. + """ + + schema = get_schema("Reshape", 25, "") + op = Op(self, "Reshape", schema) + return op(*self._prepare_inputs(schema, data, shape), allowzero=allowzero) + + V_Scan = TypeVar( + "V_Scan", + BFLOAT16, + BOOL, + COMPLEX128, + COMPLEX64, + DOUBLE, + FLOAT, + FLOAT16, + FLOAT4E2M1, + FLOAT8E4M3FN, + FLOAT8E4M3FNUZ, + FLOAT8E5M2, + FLOAT8E5M2FNUZ, + FLOAT8E8M0, + INT16, + INT2, + INT32, + INT4, + INT64, + INT8, + STRING, + UINT16, + UINT2, + UINT32, + UINT4, + UINT64, + UINT8, + ) + + def Scan( + self, + *initial_state_and_scan_inputs: V_Scan, + body: GraphProto, + num_scan_inputs: int, + scan_input_axes: Optional[Sequence[int]] = None, + scan_input_directions: Optional[Sequence[int]] = None, + scan_output_axes: Optional[Sequence[int]] = None, + scan_output_directions: Optional[Sequence[int]] = None, + ) -> V_Scan: + r"""[🌐 Scan(25)](https://onnx.ai/onnx/operators/onnx__Scan.html#scan-25 "Online Documentation") + + + Scan can be used to iterate over one or more scan_input tensors, + constructing zero or more scan_output tensors. It combines ideas from general recurrences, + functional programming constructs such as scan, fold, map, and zip, and is intended to enable + generalizations of RNN-like constructs for sequence-to-sequence processing. + Other tensors (referred to as state_variables here) can be used to carry a state + when iterating from one element to another (similar to hidden-state in RNNs, also referred + to as loop-carried dependences in the context of loops). + Many common usages involve a single scan_input tensor (where functionality + similar to scan, fold and map can be obtained). When more than one scan_input is used, + a behavior similar to zip is obtained. + + The attribute body must be a graph, specifying the computation to be performed in + every iteration. It takes as input the current values of the state_variables and + the current iterated element of the scan_inputs. It must return the (updated) values + of the state_variables and zero or more scan_output_element tensors. The values of the + scan_output_element tensors are concatenated over all the iterations to produce the + scan_output values of the scan construct (similar to the concatenated intermediate + hidden-state values of RNN-like constructs). All the output tensors (state_variables as + well as scan_output_element tensors) are required to have the same shape in each iteration + of the loop (a restriction imposed to enable efficient memory allocation). + + Note that the iterated element passed to the body subgraph does not have a sequence + axis. It will have a rank one less than the rank of the corresponding scan_input. + + The scan operation returns the final values of the state_variables as well as the + scan_outputs. + + The optional attribute scan_input_directions specifies the direction (forward or backward) + for each scan input. If this attribute is omitted, all sequences are scanned in the forward + direction. A bidirectional scan may be performed by specifying the same tensor input twice + in the scan_inputs, once with a forward direction, and once with a backward direction. + + The scan_output of the operation is produced by concatenating the scan_output_element + values produced by the body in each iteration. 
The optional attribute scan_output_directions + specifies the direction in which scan_output is constructed (by appending or prepending the + scan_output_element to scan_output in each iteration) for each scan_output. If this attribute + is omitted, the scan_output_element is appended to the scan_output in each iteration. + + The optional attribute scan_input_axes specifies the axis to be scanned for each scan_input. + If omitted, every scan_input will be scanned in axis 0. For example, if axis 0 is the + batch axis and axis 1 is the time axis (to be scanned), specify an axis value of 1. + Note that scanning a non-zero axis may be less efficient than scanning axis zero. + + The optional attribute scan_output_axes specifies the axis along which the scan_outputs + are accumulated for each scan_output. For example, if axis 1 is the time axis (to be + scanned) for both inputs and outputs, specify a scan_input axis and scan_output axis + value of 1. + + Note that because of the ONNX restriction that only the last parameter of an operator can + be variadic, the initial-states and scan-inputs are listed together as one input parameter. + Similarly, the final-states and scan-outputs are listed together as one output parameter. + The attribute num_scan_inputs indicates the number M of scan-inputs. + + The behavior of + + Scan < + num_scan_inputs = m, + body = loop-body, + scan_input_axes = [axis_1, ..., axis_m] + > (init_1, ..., init_n, scan_1, ..., scan_m) + + is equivalent to the following pseudo-code: + + // scan_i.shape[axis_i] denotes the (max) sequence-length of scan_i + // scan_i.shape[axis_i] is required to be equal to scan_j.shape[axis_j] for all i,j. + sequence_length = scan_1.shape[axis_1]; + + // initialize state-variables + st_1 = init_1; ... st_n = init_n; + // initialize scan-output variables: [] denotes an empty tensor + scan_out_1 = []; ...; scan_out_k = []; + // identify number of iterations: + + // execute loop + for (int t = 0; t < sequence_length; ++t) { + // generate the scan-input elements: the notation T[t] indicates the sub-tensor + // of rank one less than T obtained by indexing T at position t along axis k. + si_1 = scan_1[t]; + ... ; + si_m = scan_m[t]; + // execute loop-body + st_1, ..., st_n, so_1, ..., so_k = loop-body(st_1, ..., st_n, si_1, ..., si_m) + // accumulate the scan-output elements + scan_out_1 = Concat(scan_out_1, so_1); ... ; scan_out_k = Concat(scan_out_k, so_k); + } + + return st_1, ..., st_n, scan_out_1, ..., scan_out_k; + + *Sample usage: Encoding RNN using a Scan* + + The following example shows how a simple RNN over an input tensor %X, with weight tensor %Wi, + recurrence weight tensor %Ri, bias tensors %Wbi and %Rbi, and initial hidden-state %H_0 can + be encoded as a ScanLoop. Note that the loop-body is a nested graph, and it directly computes + %Wi, %Ri, %Wbi, and %Rbi (typically constants or initializers in the body graph). If these + values are computed in the outer graph, they need to be passed in as extra state_variables. + + graph rnn-encoding { + %H_0 = ... + %X = ... + %Y_h, %Y = Scan[body = , num_scan_inputs=1](%H_0, %X) + return %Y, %Y_h + } + + graph rnn-cell-1 ( + %H_tminus1[FLOAT, tensor] + %X_t[FLOAT, tensor] + ) { + %Wi = ... + %Ri = ... + %Wbi = ... + %Rbi = ... 
+ %t1 = X_t * (Wi^T) + %t2 = H_tminus1*(Ri^T) + %t3 = Add(%t1, %t2) + %t4 = Add(%t3, %Wbi) + %t5 = Add(%t4, %Rbi) + %Ht = Tanh(%t5) + %Accumulate = Identity(%Ht) + return %Ht, %Accumulate + } + + + + Args: + initial_state_and_scan_inputs: (variadic, heterogeneous) Initial values of + the loop's N state variables followed by M scan_inputs + + body: The graph run each iteration. It has N+M inputs: (loop state + variables..., scan_input_elts...). It has N+K outputs: (loop state + variables..., scan_output_elts...). Each scan_output is created by + concatenating the value of the specified scan_output_elt value at the + end of each iteration of the loop. It is an error if the dimensions of + these values change across loop iterations. + + num_scan_inputs: An attribute specifying the number of scan_inputs M. + + scan_input_axes: An optional list of M flags. The i-th element of the list + specifies the axis to be scanned (the sequence axis) for the i-th + scan_input. If omitted, 0 will be used as the scan axis for every + scan_input. Negative value for an axis means counting dimensions from + the back. Accepted range is [-r, r-1] where r = rank(input). + + scan_input_directions: An optional list of M flags. The i-th element of the + list specifies the direction to be scanned for the i-th scan_input + tensor: 0 indicates forward direction and 1 indicates reverse direction. + If omitted, all scan_input tensors will be scanned in the forward + direction. + + scan_output_axes: An optional list of K flags. The i-th element of the list + specifies the axis for the i-th scan_output. The scan outputs are + accumulated along the specified axis. If omitted, 0 will be used as the + scan axis for every scan_output. Negative value for an axis means + counting dimensions from the back. Accepted range is [-r, r-1]. + + scan_output_directions: An optional list of K flags, one for each + scan_output. The i-th element of the list specifies whether the i-th + scan_output should be constructed by appending or prepending a new value + in each iteration: 0 indicates appending and 1 indicates prepending. If + omitted, all scan_output tensors will be produced by appending a value + in each iteration. + """ + + schema = get_schema("Scan", 25, "") + op = Op(self, "Scan", schema) + return op( + *self._prepare_inputs(schema, *initial_state_and_scan_inputs), + body=body, + num_scan_inputs=num_scan_inputs, + scan_input_axes=scan_input_axes, + scan_input_directions=scan_input_directions, + scan_output_axes=scan_output_axes, + scan_output_directions=scan_output_directions, + ) + + T_Shape = TypeVar( + "T_Shape", + BFLOAT16, + BOOL, + COMPLEX128, + COMPLEX64, + DOUBLE, + FLOAT, + FLOAT16, + FLOAT4E2M1, + FLOAT8E4M3FN, + FLOAT8E4M3FNUZ, + FLOAT8E5M2, + FLOAT8E5M2FNUZ, + FLOAT8E8M0, + INT16, + INT2, + INT32, + INT4, + INT64, + INT8, + STRING, + UINT16, + UINT2, + UINT32, + UINT4, + UINT64, + UINT8, + ) + + T1_Shape: TypeAlias = INT64 + + def Shape(self, data: T_Shape, *, end: Optional[int] = None, start: int = 0) -> T1_Shape: + r"""[🌐 Shape(25)](https://onnx.ai/onnx/operators/onnx__Shape.html#shape-25 "Online Documentation") + + + Takes a tensor as input and outputs an 1D int64 tensor containing the shape of the input tensor. + Optional attributes start and end can be used to compute a slice of the input tensor's shape. + If start axis is omitted, the slice starts from axis 0. + The end axis, if specified, is exclusive (and the returned value will not include the size of that axis). 
+ If the end axis is omitted, the axes upto the last one will be included. + Negative axes indicate counting back from the last axis. + Note that axes will be clamped to the range [0, r], where r is the + rank of the input tensor if they are out-of-range (after adding r in the case of + negative axis). Thus, specifying any end value > r is equivalent to specifying an end + value of r, and specifying any start value < -r is equivalent to specifying a start + value of 0. If start > end, the result will be an empty shape. + + Examples: + + :: + + Input tensor with shape: [2, 3, 4] + No attributes specified. + Output: [2, 3, 4] + + + + :: + + Input tensor with shape: [2, 3, 4] + start: -1 + Output: [4] + + + + :: + + Input tensor with shape: [2, 3, 4] + end: -1 + Output: [2, 3] + + + + :: + + Input tensor with shape: [2, 3, 4] + start: 1 + end: 2 + Output: [3] + + + + + Args: + data: (non-differentiable) An input tensor. + + end: (Optional) Ending axis for slicing the shape. Negative value means + counting dimensions from the back. If omitted, sizes of all axes upto + (including) the last one will be included. + + start: (Optional) Starting axis for slicing the shape. Default value is + 0.Negative value means counting dimensions from the back. + """ + + schema = get_schema("Shape", 25, "") + op = Op(self, "Shape", schema) + return op(*self._prepare_inputs(schema, data), end=end, start=start) + + T_Size = TypeVar( + "T_Size", + BFLOAT16, + BOOL, + COMPLEX128, + COMPLEX64, + DOUBLE, + FLOAT, + FLOAT16, + FLOAT4E2M1, + FLOAT8E4M3FN, + FLOAT8E4M3FNUZ, + FLOAT8E5M2, + FLOAT8E5M2FNUZ, + FLOAT8E8M0, + INT16, + INT2, + INT32, + INT4, + INT64, + INT8, + STRING, + UINT16, + UINT2, + UINT32, + UINT4, + UINT64, + UINT8, + ) + + T1_Size: TypeAlias = INT64 + + def Size(self, data: T_Size) -> T1_Size: + r"""[🌐 Size(25)](https://onnx.ai/onnx/operators/onnx__Size.html#size-25 "Online Documentation") + + + Takes a tensor as input and outputs a int64 scalar that equals to the total number of elements of the input tensor. + + + Args: + data: (non-differentiable) An input tensor. + """ + + schema = get_schema("Size", 25, "") + op = Op(self, "Size", schema) + return op(*self._prepare_inputs(schema, data)) + + T_Squeeze = TypeVar( + "T_Squeeze", + BFLOAT16, + BOOL, + COMPLEX128, + COMPLEX64, + DOUBLE, + FLOAT, + FLOAT16, + FLOAT4E2M1, + FLOAT8E4M3FN, + FLOAT8E4M3FNUZ, + FLOAT8E5M2, + FLOAT8E5M2FNUZ, + FLOAT8E8M0, + INT16, + INT2, + INT32, + INT4, + INT64, + INT8, + STRING, + UINT16, + UINT2, + UINT32, + UINT4, + UINT64, + UINT8, + ) + + def Squeeze(self, data: T_Squeeze, axes: Optional[INT64] = None) -> T_Squeeze: + r"""[🌐 Squeeze(25)](https://onnx.ai/onnx/operators/onnx__Squeeze.html#squeeze-25 "Online Documentation") + + + Remove single-dimensional entries from the shape of a tensor. + Takes an input `axes` with a list of axes to squeeze. + If `axes` is not provided, all the single dimensions will be removed from + the shape. If an axis is selected with shape entry not equal to one, an error is raised. + + + Args: + data: (differentiable) Tensors with at least max(dims) dimensions. + + axes: (optional, non-differentiable) 1D tensor of integers indicating the + dimensions to squeeze. Negative value means counting dimensions from the + back. Accepted range is [-r, r-1] where r = rank(data). 
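+
+        For illustration, an informal example (shapes only, not taken from the
+        operator schema):
+
+        ::
+
+            data has shape [1, 3, 1, 2]
+            axes = [0, 2]   ->  output shape [3, 2]
+            axes omitted    ->  output shape [3, 2]    (all size-1 dims removed)
+            axes = [-2]     ->  output shape [1, 3, 2] (-2 refers to axis 2)
+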
+ """ + + schema = get_schema("Squeeze", 25, "") + op = Op(self, "Squeeze", schema) + return op(*self._prepare_inputs(schema, data, axes)) + + T_Transpose = TypeVar( + "T_Transpose", + BFLOAT16, + BOOL, + COMPLEX128, + COMPLEX64, + DOUBLE, + FLOAT, + FLOAT16, + FLOAT4E2M1, + FLOAT8E4M3FN, + FLOAT8E4M3FNUZ, + FLOAT8E5M2, + FLOAT8E5M2FNUZ, + FLOAT8E8M0, + INT16, + INT2, + INT32, + INT4, + INT64, + INT8, + STRING, + UINT16, + UINT2, + UINT32, + UINT4, + UINT64, + UINT8, + ) + + def Transpose( + self, data: T_Transpose, *, perm: Optional[Sequence[int]] = None + ) -> T_Transpose: + r"""[🌐 Transpose(25)](https://onnx.ai/onnx/operators/onnx__Transpose.html#transpose-25 "Online Documentation") + + + Returns a transpose of the input tensor. (Similar to `numpy.transpose`). + The optional attribute `perm` must be a permutation of the dimensions of + the input tensor. Axis `i` of the output tensor corresponds to the axis + `perm[i]` of the input tensor. + For example, when perm=(1, 0, 2), given an input tensor of shape (1, 2, 3), + the output shape will be (2, 1, 3). + When perm=(1, 2, 0), given an input tensor of shape (1, 2, 3), + the output shape will be (2, 3, 1). + If the attribute `perm` is omitted, its default value is `(n-1, ..., 0)`, + where `n` is the rank of the input tensor. + + + Args: + data: (differentiable) An input tensor. + + perm: A list of integers. By default, reverse the dimensions, otherwise + permute the axes according to the values given. Its length must be equal + to the rank of the input. + """ + + schema = get_schema("Transpose", 25, "") + op = Op(self, "Transpose", schema) + return op(*self._prepare_inputs(schema, data), perm=perm) + + T_Unsqueeze = TypeVar( + "T_Unsqueeze", + BFLOAT16, + BOOL, + COMPLEX128, + COMPLEX64, + DOUBLE, + FLOAT, + FLOAT16, + FLOAT4E2M1, + FLOAT8E4M3FN, + FLOAT8E4M3FNUZ, + FLOAT8E5M2, + FLOAT8E5M2FNUZ, + FLOAT8E8M0, + INT16, + INT2, + INT32, + INT4, + INT64, + INT8, + STRING, + UINT16, + UINT2, + UINT32, + UINT4, + UINT64, + UINT8, + ) + + def Unsqueeze(self, data: T_Unsqueeze, axes: INT64) -> T_Unsqueeze: + r"""[🌐 Unsqueeze(25)](https://onnx.ai/onnx/operators/onnx__Unsqueeze.html#unsqueeze-25 "Online Documentation") + + + Insert single-dimensional entries to the shape of an input tensor (`data`). + Takes one required input `axes` - which contains a list of dimension indices and this operator will insert a dimension of value `1` into the corresponding index of the output tensor (`expanded`). + + For example, given an input tensor (`data`) of shape [3, 4, 5], then + Unsqueeze(data, axes=[0, 4]) outputs a tensor (`expanded`) containing same data as `data` but with shape [1, 3, 4, 5, 1]. + + The input `axes` should not contain any duplicate entries. It is an error if it contains duplicates. + The rank of the output tensor (`output_rank`) is the rank of the input tensor (`data`) plus the number of values in `axes`. + Each value in `axes` should be within the (inclusive) range [-output_rank , output_rank - 1]. + The order of values in `axes` does not matter and can come in any order. + + + Args: + data: (differentiable) Original tensor + + axes: (non-differentiable) 1D tensor of integers indicating the dimensions + to be inserted. Negative value means counting dimensions from the back. + Accepted range is [-r, r-1] where r = rank(expanded). 
+ """ + + schema = get_schema("Unsqueeze", 25, "") + op = Op(self, "Unsqueeze", schema) + return op(*self._prepare_inputs(schema, data, axes)) diff --git a/onnxscript/onnx_opset/_impl/opset26.py b/onnxscript/onnx_opset/_impl/opset26.py new file mode 100644 index 0000000000..1bc1aa6b3b --- /dev/null +++ b/onnxscript/onnx_opset/_impl/opset26.py @@ -0,0 +1,88 @@ +# -------------------------------------------------------------------------- +# ⚠️ WARNING - AUTO-GENERATED CODE - DO NOT EDIT ⚠️ +# ⚙️ Generated by 'python -m opgen' +# -------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +# pylint: disable=W0221,W0222,R0901,W0237 +# mypy: disable-error-code=override +# ruff: noqa: N801,E741,RUF036,D214,D402,D405,D411,D412,D416,D417 +# -------------------------------------------------------------------------- + +from __future__ import annotations + +from typing import TypeVar + +from onnx.defs import get_schema + +from onnxscript.onnx_opset._impl.opset25 import Opset25 +from onnxscript.onnx_types import ( + BFLOAT16, + DOUBLE, + FLOAT, + FLOAT16, + INT32, + INT64, + UINT32, + UINT64, +) +from onnxscript.values import Op, Opset + + +class Opset26(Opset25): + def __new__(cls): + return Opset.__new__(cls, "", 26) + + T_CumProd = TypeVar( + "T_CumProd", BFLOAT16, DOUBLE, FLOAT, FLOAT16, INT32, INT64, UINT32, UINT64 + ) + + T2_CumProd = TypeVar("T2_CumProd", INT32, INT64) + + def CumProd( + self, x: T_CumProd, axis: T2_CumProd, *, exclusive: int = 0, reverse: int = 0 + ) -> T_CumProd: + r"""[🌐 CumProd(26)](https://onnx.ai/onnx/operators/onnx__CumProd.html#cumprod-26 "Online Documentation") + + + Performs cumulative product of the input elements along the given axis. + By default, it will do the product inclusively meaning the first element is copied as is. + Through an `exclusive` attribute, this behavior can change to exclude the first element. + It can also perform product in the opposite direction of the axis. For that, set `reverse` attribute to 1. + + Example: + :: + + input_x = [1, 2, 3] + axis=0 + output = [1, 2, 6] + exclusive=1 + output = [1, 1, 2] + exclusive=0 + reverse=1 + output = [6, 6, 3] + exclusive=1 + reverse=1 + output = [6, 3, 1] + + + + + Args: + x: (differentiable) An input tensor that is to be processed. + + axis: (non-differentiable) A 0-D tensor. Must be in the range [-rank(x), + rank(x)-1]. Negative value means counting dimensions from the back. + + exclusive: If set to 1 will return exclusive product in which the top + element is not included. In other terms, if set to 1, the j-th output + element would be the product of the first (j-1) elements. Otherwise, it + would be the product of the first j elements. + + reverse: If set to 1 will perform the products in reverse direction. 
+ """ + + schema = get_schema("CumProd", 26, "") + op = Op(self, "CumProd", schema) + return op(*self._prepare_inputs(schema, x, axis), exclusive=exclusive, reverse=reverse) diff --git a/onnxscript/onnx_opset/_impl/opset3.py b/onnxscript/onnx_opset/_impl/opset3.py index fd684dd238..79d16d52a2 100644 --- a/onnxscript/onnx_opset/_impl/opset3.py +++ b/onnxscript/onnx_opset/_impl/opset3.py @@ -7,7 +7,7 @@ # -------------------------------------------------------------------------- # pylint: disable=W0221,W0222,R0901,W0237 # mypy: disable-error-code=override -# ruff: noqa: D402 +# ruff: noqa: N801,E741,RUF036,D214,D402,D405,D411,D412,D416,D417 # -------------------------------------------------------------------------- from __future__ import annotations diff --git a/onnxscript/onnx_opset/_impl/opset4.py b/onnxscript/onnx_opset/_impl/opset4.py index a1b7fb890b..8cea891d5f 100644 --- a/onnxscript/onnx_opset/_impl/opset4.py +++ b/onnxscript/onnx_opset/_impl/opset4.py @@ -7,7 +7,7 @@ # -------------------------------------------------------------------------- # pylint: disable=W0221,W0222,R0901,W0237 # mypy: disable-error-code=override -# ruff: noqa: D402 +# ruff: noqa: N801,E741,RUF036,D214,D402,D405,D411,D412,D416,D417 # -------------------------------------------------------------------------- from __future__ import annotations diff --git a/onnxscript/onnx_opset/_impl/opset5.py b/onnxscript/onnx_opset/_impl/opset5.py index d7e34f8d5d..79b062b73e 100644 --- a/onnxscript/onnx_opset/_impl/opset5.py +++ b/onnxscript/onnx_opset/_impl/opset5.py @@ -7,7 +7,7 @@ # -------------------------------------------------------------------------- # pylint: disable=W0221,W0222,R0901,W0237 # mypy: disable-error-code=override -# ruff: noqa: D402 +# ruff: noqa: N801,E741,RUF036,D214,D402,D405,D411,D412,D416,D417 # -------------------------------------------------------------------------- from __future__ import annotations diff --git a/onnxscript/onnx_opset/_impl/opset6.py b/onnxscript/onnx_opset/_impl/opset6.py index b7b7981154..17dbe43582 100644 --- a/onnxscript/onnx_opset/_impl/opset6.py +++ b/onnxscript/onnx_opset/_impl/opset6.py @@ -7,7 +7,7 @@ # -------------------------------------------------------------------------- # pylint: disable=W0221,W0222,R0901,W0237 # mypy: disable-error-code=override -# ruff: noqa: D402 +# ruff: noqa: N801,E741,RUF036,D214,D402,D405,D411,D412,D416,D417 # -------------------------------------------------------------------------- from __future__ import annotations @@ -210,18 +210,7 @@ def BatchNormalization( ) T2_Cast: TypeAlias = Union[ - BOOL, - DOUBLE, - FLOAT, - FLOAT16, - INT16, - INT32, - INT64, - INT8, - UINT16, - UINT32, - UINT64, - UINT8, + BOOL, DOUBLE, FLOAT, FLOAT16, INT16, INT32, INT64, INT8, UINT16, UINT32, UINT64, UINT8 ] def Cast(self, input: T1_Cast, *, to: int) -> T2_Cast: diff --git a/onnxscript/onnx_opset/_impl/opset7.py b/onnxscript/onnx_opset/_impl/opset7.py index eed9bde7d2..01e82ae970 100644 --- a/onnxscript/onnx_opset/_impl/opset7.py +++ b/onnxscript/onnx_opset/_impl/opset7.py @@ -7,7 +7,7 @@ # -------------------------------------------------------------------------- # pylint: disable=W0221,W0222,R0901,W0237 # mypy: disable-error-code=override -# ruff: noqa: D402 +# ruff: noqa: N801,E741,RUF036,D214,D402,D405,D411,D412,D416,D417 # -------------------------------------------------------------------------- from __future__ import annotations diff --git a/onnxscript/onnx_opset/_impl/opset8.py b/onnxscript/onnx_opset/_impl/opset8.py index 6bedb39b86..2602b8a4d8 
100644 --- a/onnxscript/onnx_opset/_impl/opset8.py +++ b/onnxscript/onnx_opset/_impl/opset8.py @@ -7,7 +7,7 @@ # -------------------------------------------------------------------------- # pylint: disable=W0221,W0222,R0901,W0237 # mypy: disable-error-code=override -# ruff: noqa: D402 +# ruff: noqa: N801,E741,RUF036,D214,D402,D405,D411,D412,D416,D417 # -------------------------------------------------------------------------- from __future__ import annotations diff --git a/onnxscript/onnx_opset/_impl/opset9.py b/onnxscript/onnx_opset/_impl/opset9.py index be1cec969d..9b29ff5389 100644 --- a/onnxscript/onnx_opset/_impl/opset9.py +++ b/onnxscript/onnx_opset/_impl/opset9.py @@ -7,7 +7,7 @@ # -------------------------------------------------------------------------- # pylint: disable=W0221,W0222,R0901,W0237 # mypy: disable-error-code=override -# ruff: noqa: E741, D402 +# ruff: noqa: N801,E741,RUF036,D214,D402,D405,D411,D412,D416,D417 # -------------------------------------------------------------------------- from __future__ import annotations @@ -312,18 +312,7 @@ def Constant(self, *, value: TensorProto) -> T_Constant: T1_ConstantOfShape: TypeAlias = INT64 T2_ConstantOfShape: TypeAlias = Union[ - BOOL, - DOUBLE, - FLOAT, - FLOAT16, - INT16, - INT32, - INT64, - INT8, - UINT16, - UINT32, - UINT64, - UINT8, + BOOL, DOUBLE, FLOAT, FLOAT16, INT16, INT32, INT64, INT8, UINT16, UINT32, UINT64, UINT8 ] def ConstantOfShape( @@ -412,18 +401,7 @@ def Erf(self, input: T_Erf) -> T_Erf: ) T2_EyeLike: TypeAlias = Union[ - BOOL, - DOUBLE, - FLOAT, - FLOAT16, - INT16, - INT32, - INT64, - INT8, - UINT16, - UINT32, - UINT64, - UINT8, + BOOL, DOUBLE, FLOAT, FLOAT16, INT16, INT32, INT64, INT8, UINT16, UINT32, UINT64, UINT8 ] def EyeLike( @@ -1163,12 +1141,7 @@ def Scan( Tind_Scatter = TypeVar("Tind_Scatter", INT32, INT64) def Scatter( - self, - data: T_Scatter, - indices: Tind_Scatter, - updates: T_Scatter, - *, - axis: int = 0, + self, data: T_Scatter, indices: Tind_Scatter, updates: T_Scatter, *, axis: int = 0 ) -> T_Scatter: r"""[🌐 Scatter(9)](https://onnx.ai/onnx/operators/onnx__Scatter.html#scatter-9 "Online Documentation") diff --git a/onnxscript/onnx_opset/_impl/opset_ai_onnx_ml1.py b/onnxscript/onnx_opset/_impl/opset_ai_onnx_ml1.py index d69cc686a0..ae2ba21195 100644 --- a/onnxscript/onnx_opset/_impl/opset_ai_onnx_ml1.py +++ b/onnxscript/onnx_opset/_impl/opset_ai_onnx_ml1.py @@ -7,7 +7,7 @@ # -------------------------------------------------------------------------- # pylint: disable=W0221,W0222,R0901,W0237 # mypy: disable-error-code=override -# ruff: noqa: N801, D417 +# ruff: noqa: N801,E741,RUF036,D214,D402,D405,D411,D412,D416,D417 # -------------------------------------------------------------------------- from __future__ import annotations diff --git a/onnxscript/onnx_opset/_impl/opset_ai_onnx_ml2.py b/onnxscript/onnx_opset/_impl/opset_ai_onnx_ml2.py index 49b38d3344..fe990d72c9 100644 --- a/onnxscript/onnx_opset/_impl/opset_ai_onnx_ml2.py +++ b/onnxscript/onnx_opset/_impl/opset_ai_onnx_ml2.py @@ -7,7 +7,7 @@ # -------------------------------------------------------------------------- # pylint: disable=W0221,W0222,R0901,W0237 # mypy: disable-error-code=override -# ruff: noqa: N801 +# ruff: noqa: N801,E741,RUF036,D214,D402,D405,D411,D412,D416,D417 # -------------------------------------------------------------------------- from __future__ import annotations diff --git a/onnxscript/onnx_opset/_impl/opset_ai_onnx_ml3.py b/onnxscript/onnx_opset/_impl/opset_ai_onnx_ml3.py index 57c0d90a4e..b726dc3fc0 100644 
--- a/onnxscript/onnx_opset/_impl/opset_ai_onnx_ml3.py +++ b/onnxscript/onnx_opset/_impl/opset_ai_onnx_ml3.py @@ -7,7 +7,7 @@ # -------------------------------------------------------------------------- # pylint: disable=W0221,W0222,R0901,W0237 # mypy: disable-error-code=override -# ruff: noqa: N801 +# ruff: noqa: N801,E741,RUF036,D214,D402,D405,D411,D412,D416,D417 # -------------------------------------------------------------------------- from __future__ import annotations diff --git a/onnxscript/onnx_opset/_impl/opset_ai_onnx_ml4.py b/onnxscript/onnx_opset/_impl/opset_ai_onnx_ml4.py index 02dc271c6e..01866353e7 100644 --- a/onnxscript/onnx_opset/_impl/opset_ai_onnx_ml4.py +++ b/onnxscript/onnx_opset/_impl/opset_ai_onnx_ml4.py @@ -7,7 +7,7 @@ # -------------------------------------------------------------------------- # pylint: disable=W0221,W0222,R0901,W0237 # mypy: disable-error-code=override -# ruff: noqa: N801 +# ruff: noqa: N801,E741,RUF036,D214,D402,D405,D411,D412,D416,D417 # -------------------------------------------------------------------------- from __future__ import annotations diff --git a/onnxscript/onnx_opset/_impl/opset_ai_onnx_ml5.py b/onnxscript/onnx_opset/_impl/opset_ai_onnx_ml5.py index d3f3f0b5cc..b90824c5d9 100644 --- a/onnxscript/onnx_opset/_impl/opset_ai_onnx_ml5.py +++ b/onnxscript/onnx_opset/_impl/opset_ai_onnx_ml5.py @@ -7,7 +7,7 @@ # -------------------------------------------------------------------------- # pylint: disable=W0221,W0222,R0901,W0237 # mypy: disable-error-code=override -# ruff: noqa: N801 +# ruff: noqa: N801,E741,RUF036,D214,D402,D405,D411,D412,D416,D417 # -------------------------------------------------------------------------- from __future__ import annotations diff --git a/onnxscript/onnx_opset/_impl/opset_ai_onnx_preview_training1.py b/onnxscript/onnx_opset/_impl/opset_ai_onnx_preview_training1.py new file mode 100644 index 0000000000..05015d9b78 --- /dev/null +++ b/onnxscript/onnx_opset/_impl/opset_ai_onnx_preview_training1.py @@ -0,0 +1,576 @@ +# -------------------------------------------------------------------------- +# ⚠️ WARNING - AUTO-GENERATED CODE - DO NOT EDIT ⚠️ +# ⚙️ Generated by 'python -m opgen' +# -------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- +# pylint: disable=W0221,W0222,R0901,W0237 +# mypy: disable-error-code=override +# ruff: noqa: N801,E741,RUF036,D214,D402,D405,D411,D412,D416,D417 +# -------------------------------------------------------------------------- + +from __future__ import annotations + +from typing import Optional, Sequence, TypeVar, Union + +from onnx.defs import get_schema +from typing_extensions import TypeAlias + +from onnxscript.onnx_types import ( + BOOL, + COMPLEX64, + COMPLEX128, + DOUBLE, + FLOAT, + FLOAT16, + INT8, + INT16, + INT32, + INT64, + STRING, + UINT8, + UINT16, + UINT32, + UINT64, +) +from onnxscript.values import Op, Opset + + +class Opset_ai_onnx_preview_training1(Opset): + def __new__(cls): + return Opset.__new__(cls, "ai.onnx.preview.training", 1) + + T1_Adagrad = TypeVar("T1_Adagrad", DOUBLE, FLOAT) + + T2_Adagrad: TypeAlias = INT64 + + T3_Adagrad = TypeVar("T3_Adagrad", DOUBLE, FLOAT) + + def Adagrad( + self, + R: T1_Adagrad, + T: T2_Adagrad, + *inputs: T3_Adagrad, + decay_factor: float = 0.0, + epsilon: float = 9.999999974752427e-07, + norm_coefficient: float = 0.0, + ) -> T3_Adagrad: + r"""[🌐 ai.onnx.preview.training::Adagrad(1)](https://onnx.ai/onnx/operators/onnx_aionnxpreviewtraining_Adagrad.html#adagrad-1 "Online Documentation") + + + Compute one iteration of ADAGRAD, a stochastic gradient based optimization + algorithm. This operator can conduct the optimization of multiple tensor variables. + + Let's define the behavior of this operator. As you can imagine, ADAGRAD requires + some parameters: + + - The initial learning-rate "R". + - The update count "T". That is, the number of training iterations conducted. + - A L2-norm regularization coefficient "norm_coefficient". + - A learning-rate decay factor "decay_factor". + - A small constant "epsilon" to avoid dividing-by-zero. + + At each ADAGRAD iteration, the optimized tensors are moved along a direction + computed based on their estimated gradient and accumulated squared gradient. Assume + that only a single tensor "X" is updated by this operator. We need the value of "X", + its gradient "G", and its accumulated squared gradient "H". Therefore, variables in + this operator's input list are sequentially "R", "T", "X", "G", and "H". Other + parameters are given as attributes because they are usually constants. Also, the + corresponding output tensors are the new value of "X" (called "X_new"), and then + the new accumulated squared gradient (called "H_new"). Those outputs are computed + from the given inputs following the pseudo code below. + + Let "+", "-", "*", and "/" are all element-wise arithmetic operations with + numpy-style broadcasting support. The pseudo code to compute those outputs is: + + // Compute a scalar learning-rate factor. At the first update of X, T is generally + // 0 (0-based update index) or 1 (1-based update index). + r = R / (1 + T * decay_factor); + + // Add gradient of 0.5 * norm_coefficient * ||X||_2^2, where ||X||_2 is the 2-norm. + G_regularized = norm_coefficient * X + G; + + // Compute new accumulated squared gradient. + H_new = H + G_regularized * G_regularized; + + // Compute the adaptive part of per-coordinate learning rate. Note that Sqrt(...) + // computes element-wise square-root. + H_adaptive = Sqrt(H_new) + epsilon + + // Compute the new value of "X". 
+ X_new = X - r * G_regularized / H_adaptive; + + If one assign this operators to optimize multiple inputs, for example, "X_1" and "X_2", the same + pseudo code may be extended to handle all tensors jointly. More specifically, we can view "X" as a + concatenation of "X_1" and "X_2" (of course, their gradient and accumulate gradient should + be concatenated too) and then just reuse the entire pseudo code. + + Note that ADAGRAD was first proposed in http://jmlr.org/papers/volume12/duchi11a/duchi11a.pdf. + In that reference paper, this operator is a special case of the Figure 1's composite mirror + descent update. + + + Args: + R: The initial learning rate. + + T: The update count of "X". It should be a scalar. + + inputs: (variadic, heterogeneous) The current values of optimized tensors, + followed by their respective gradients, followed by their respective + accumulated squared gradients.For example, if two tensor "X_1" and "X_2" + are optimized, The input list would be ["X_1", "X_2", gradient of "X_1", + gradient of "X_2", accumulated squared gradient of "X_1", accumulated + squared gradient of "X_2"]. + + decay_factor: The decay factor of learning rate after one update.The + effective learning rate is computed by r = R / (1 + T * decay_factor). + Default to 0 so that increasing update counts doesn't reduce the + learning rate. + + epsilon: Small scalar to avoid dividing by zero. + + norm_coefficient: Regularization coefficient in 0.5 * norm_coefficient * + ||X||_2^2. Default to 0, which means no regularization. + """ + + schema = get_schema("Adagrad", 1, "ai.onnx.preview.training") + op = Op(self, "Adagrad", schema) + return op( + *self._prepare_inputs(schema, R, T, *inputs), + decay_factor=decay_factor, + epsilon=epsilon, + norm_coefficient=norm_coefficient, + ) + + T1_Adam = TypeVar("T1_Adam", DOUBLE, FLOAT) + + T2_Adam: TypeAlias = INT64 + + T3_Adam = TypeVar("T3_Adam", DOUBLE, FLOAT) + + def Adam( + self, + R: T1_Adam, + T: T2_Adam, + *inputs: T3_Adam, + alpha: float = 0.8999999761581421, + beta: float = 0.9990000128746033, + epsilon: float = 9.999999974752427e-07, + norm_coefficient: float = 0.0, + norm_coefficient_post: float = 0.0, + ) -> T3_Adam: + r"""[🌐 ai.onnx.preview.training::Adam(1)](https://onnx.ai/onnx/operators/onnx_aionnxpreviewtraining_Adam.html#adam-1 "Online Documentation") + + + Compute one iteration of Adam, a stochastic gradient based optimization + algorithm. This operator can conduct the optimization of multiple tensor variables. + + Let's define the behavior of this operator. First of all, Adam requires + some parameters: + + - The learning-rate "R". + - The update count "T". That is, the number of training iterations conducted. + - A L2-norm regularization coefficient "norm_coefficient". + - A small constant "epsilon" to avoid dividing-by-zero. + - Two coefficients, "alpha" and "beta". + + At each Adam iteration, the optimized tensors are moved along a direction + computed based on their exponentially-averaged historical gradient and + exponentially-averaged historical squared gradient. Assume that only a tensor + "X" is being optimized. The rest of required information is + + - the value of "X", + - "X"'s gradient (denoted by "G"), + - "X"'s exponentially-averaged historical gradient (denoted by "V"), and + - "X"'s exponentially-averaged historical squared gradient (denoted by "H"). + + Some of those parameters are passed into this operator as input tensors and others + are stored as this operator's attributes. 
Specifically, this operator's input tensor + list is ["R", "T", "X", "G", "V", "H"]. That is, "R" is the first input, "T" is + the second input, and so on. Other parameters are given as attributes because they + are constants. Moreover, the corresponding output tensors are + + - the new value of "X" (called "X_new"), + - the new exponentially-averaged historical gradient (denoted by "V_new"), and + - the new exponentially-averaged historical squared gradient (denoted by "H_new"). + + Those outputs are computed following the pseudo code below. + + Let "+", "-", "*", and "/" are all element-wise arithmetic operations with + numpy-style broadcasting support. The pseudo code to compute those outputs is: + + // Add gradient of 0.5 * norm_coefficient * ||X||_2^2, where ||X||_2 is the 2-norm. + G_regularized = norm_coefficient * X + G + + // Update exponentially-averaged historical gradient. + V_new = alpha * V + (1 - alpha) * G_regularized + + // Update exponentially-averaged historical squared gradient. + H_new = beta * H + (1 - beta) * G_regularized * G_regularized + + // Compute the element-wise square-root of H_new. V_new will be element-wisely + // divided by H_sqrt for a better update direction. + H_sqrt = Sqrt(H_new) + epsilon + + // Compute learning-rate. Note that "alpha**T"/"beta**T" is alpha's/beta's T-th power. + R_adjusted = T > 0 ? R * Sqrt(1 - beta**T) / (1 - alpha**T) : R + + // Compute new value of "X". + X_new = X - R_adjusted * V_new / H_sqrt + + // Post-update regularization. + X_final = (1 - norm_coefficient_post) * X_new + + If there are multiple inputs to be optimized, the pseudo code will be applied + independently to each of them. + + + Args: + R: The initial learning rate. + + T: The update count of "X". It should be a scalar. + + inputs: (variadic, heterogeneous) The tensors to be optimized, followed by + their respective gradients, followed by their respective accumulated + gradients (aka momentum), followed by their respective accumulated + squared gradients. For example, to optimize tensors "X_1" and "X_2,", + the input list would be ["X_1", "X_2", gradient of "X_1", gradient of + "X_2", accumulated gradient of "X_1", accumulated gradient of "X_2", + accumulated squared gradient of "X_1", accumulated squared gradient of + "X_2"]. + + alpha: Coefficient of previously accumulated gradient in running average. + Default to 0.9. + + beta: Coefficient of previously accumulated squared-gradient in running + average. Default to 0.999. + + epsilon: Small scalar to avoid dividing by zero. + + norm_coefficient: Regularization coefficient of 0.5 * norm_coefficient * + ||X||_2^2. Default to 0, which means no regularization. + + norm_coefficient_post: Regularization coefficient of 0.5 * norm_coefficient + * ||X||_2^2. Default to 0, which means no regularization. 
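+
+        For reference, a minimal NumPy transcription of the pseudo code above,
+        for a single optimized tensor (`adam_step` is a hypothetical helper
+        name, not part of this operator):
+
+        ::
+
+            import numpy as np
+
+            def adam_step(R, T, X, G, V, H, alpha=0.9, beta=0.999,
+                          epsilon=1e-6, norm_coefficient=0.0,
+                          norm_coefficient_post=0.0):
+                G_regularized = norm_coefficient * X + G
+                V_new = alpha * V + (1 - alpha) * G_regularized
+                H_new = beta * H + (1 - beta) * G_regularized * G_regularized
+                H_sqrt = np.sqrt(H_new) + epsilon
+                R_adjusted = R * np.sqrt(1 - beta**T) / (1 - alpha**T) if T > 0 else R
+                X_new = X - R_adjusted * V_new / H_sqrt
+                X_final = (1 - norm_coefficient_post) * X_new
+                return X_final, V_new, H_new
+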
+ """ + + schema = get_schema("Adam", 1, "ai.onnx.preview.training") + op = Op(self, "Adam", schema) + return op( + *self._prepare_inputs(schema, R, T, *inputs), + alpha=alpha, + beta=beta, + epsilon=epsilon, + norm_coefficient=norm_coefficient, + norm_coefficient_post=norm_coefficient_post, + ) + + T1_Gradient = TypeVar( + "T1_Gradient", + BOOL, + COMPLEX128, + COMPLEX64, + DOUBLE, + FLOAT, + FLOAT16, + INT16, + INT32, + INT64, + INT8, + STRING, + UINT16, + UINT32, + UINT64, + UINT8, + ) + + T2_Gradient: TypeAlias = Union[DOUBLE, FLOAT, FLOAT16] + + def Gradient( + self, + *Inputs: T1_Gradient, + xs: Sequence[str], + y: str, + zs: Optional[Sequence[str]] = None, + ) -> T2_Gradient: + r"""[🌐 ai.onnx.preview.training::Gradient(1)](https://onnx.ai/onnx/operators/onnx_aionnxpreviewtraining_Gradient.html#gradient-1 "Online Documentation") + + + Gradient operator computes the partial derivatives of a specific tensor w.r.t. + some other tensors. This operator is widely used in gradient-based training + algorithms. To illustrate its use, let's consider a computation graph, + + :: + + X -----. + | + v + W --> Conv --> H --> Gemm --> Y + ^ + | + Z + + + + , where W and Z are trainable tensors. Note that operators' attributes are + omitted for the sake of simplicity. Let dY/dW (dY/dZ) be the gradient of + Y with respect to W (Z). The user can compute gradient by inserting Gradient + operator to form another graph shown below. + + :: + + W --> Conv --> H --> Gemm --> Y + | ^ ^ + | | | + | X Z + | | | + | | .----------' + | | | (W/Z/X is the 1st/2nd/3rd input of Gradient as shown in + | | | "xs" followed by "zs") + | v v + '---> Gradient(xs=["W", "Z"], zs=["X"], y="Y") + | | + | '-----------------------------------> dY/dW (1st output of Gradient) + | + '---------------------------------------> dY/dZ (2nd output of Gradient) + + + + By definition, the tensor "y" is a function of independent variables in "xs" + and "zs". Since we only compute the gradient of "y" w.r.t. the differentiable + variables in "xs", this Gradient only outputs dY/dW and dY/dZ. Note that "H" + cannot appear in "xs" and "zs". The reason is that "H" can be determined by + tensors "W" and "X" and therefore "H" is not an independent variable. + + All outputs are optional. If needed, for example, user can assign an empty + string to the 1st output name of that Gradient to skip the generation of dY/dW. + Note that the concept of optional outputs can also be found in ONNX's RNN, GRU, + and LSTM. + + Gradient operator can compute derivative against intermediate tensors. For + example, the gradient of Y with respect to H can be done via + + :: + + W --> Conv --> H --> Gemm --> Y + ^ | ^ + | | | + X | Z + .-------' | + | .----------' + | | (H/Z is the 1st/2nd input of Gradient as shown in "xs") + v v + Gradient(xs=["H", "Z"], y="Y") + | | + | '-----------------------------------> dY/dH (1st output of Gradient) + | + '---------------------------------------> dY/dZ (2nd output of Gradient) + + + + It is possible to represent high-order differentiation using Gradient operators. 
+ For example, given the following linear model: + + :: + + W --> Gemm --> Y --> Loss --> O + ^ ^ + | | + X L + + + + To compute the 2nd order derivative of O with respect to W (denoted by + d^2O/dW^2), one can do + + :: + + W --> Gemm --> Y --> Loss --> O + | ^ ^ + | | | + | X .------------L + | | | | + | | | v + +------+-+> Gradient(xs=["X", "W"], zs=["L"], y="O") ---> dO/dX (1st output of Gradient) + | | | | + | | | '---> dO/dW (2nd output of Gradient) + | v v + '---> Gradient(xs=["X", "W"], zs=["L"], y="dO/dW") ---> d(dO/dW)dX (1st output of + | Gradient) + | + | + '---> d^2O/dW^2 (2nd output of Gradient) + + + + The tensors named in attributes "xs", "zs", and "y" define the differentiated + computation graph, and the inputs to Gradient node define the values at + which the gradient is computed. We can feed different tensors to the identified + graph. For example, one can compute the gradient of Y with respect to H at + a specific value of H, H_1, by providing that value as an input to the Gradient + node. + + :: + + W --> Conv --> H --> Gemm --> Y + ^ ^ + | | + X Z + + Z_1 (2nd input of Gradient) + | + v + H_1 --> Gradient(xs=["H", "Z"], y="Y") ---> dY/dH when H = H_1 and Y = Y_1. + | + '------------------------------> dY/dZ (2nd output of Gradient) + + + + When the inputs of Gradient are the tensors named in "xs" and "zs", the + computation can be optimized. More specifically, intermediate variables in + forward pass can be reused if the gradient is computed via reverse-mode + auto-differentiation. + + + + Args: + Inputs: (variadic, heterogeneous) The values fed into graph identified by + the attributes. The i-th input is the value of the i-th tensor specified + in the concatenated list of the attribute "xs" and the attribute "zs". + For example, if xs=["A", "B"] and zs=["C"], the first input is used as + the value of symbol "A" and the 3rd input is substituted for all the + occurrences of "C". + + xs: Input tensor names of the differentiated sub-graph. It contains only the + necessary differentiated inputs of a (sub-)graph. Variables (usually + called intermediate variables) that can be generated from inputs cannot + be included in this attribute. + + y: The targeted tensor. It can be viewed as the output of the differentiated + function. The attribute "xs" and attribute "zs" are the minimal + independent variable set that determines the value of "y". + + zs: Input tensor names of the differentiated sub-graph. It contains only the + necessary non-differentiated inputs of a (sub-)graph. Variables (usually + called intermediate variables) that can be generated from inputs cannot + be included in this attribute. + """ + + schema = get_schema("Gradient", 1, "ai.onnx.preview.training") + op = Op(self, "Gradient", schema) + return op(*self._prepare_inputs(schema, *Inputs), xs=xs, y=y, zs=zs) + + T1_Momentum = TypeVar("T1_Momentum", DOUBLE, FLOAT) + + T2_Momentum: TypeAlias = INT64 + + T3_Momentum = TypeVar("T3_Momentum", DOUBLE, FLOAT) + + def Momentum( + self, + R: T1_Momentum, + T: T2_Momentum, + *inputs: T3_Momentum, + alpha: float, + beta: float, + mode: str, + norm_coefficient: float, + ) -> T3_Momentum: + r"""[🌐 ai.onnx.preview.training::Momentum(1)](https://onnx.ai/onnx/operators/onnx_aionnxpreviewtraining_Momentum.html#momentum-1 "Online Documentation") + + + Compute one iteration of stochastic gradient update with momentum. + This operator can conduct the optimization of multiple tensor variables. + + Let's define the behavior of this operator. 
As you can imagine, SG with momentum requires + several parameters: + + - The learning-rate "R". + - The update count "T". That is, the number of conducted training iterations. It should + be zero in the first training iteration. + - A L2-norm regularization coefficient "norm_coefficient". + - A decay coefficient of previous accumulated gradient (i.e., momentum) "alpha". + - The scaling coefficient of current gradient "beta". + - An attribute to choose either standard momentum or Nesterov's momentum "mode" should + be used. + + For the sake of simplicity, assume that there is only one tensor (called "X") to be optimized. + Other necessary inputs are "X"'s gradient (called "G") and "X"'s momentum (called "V"). This + Momentum operator maps all these inputs to the new value of "X" (called "X_new") and its new + momentum (called "V_new"). + + This operator supports two different momentum algorithms. Set the attribute "mode" to + "nesterov" if Nesterov's momentum is desired. Otherwise, set the attribute "model" to + "standard" to use standard momentum. Computation details are described subsequently. + + Let "+", "-", "*", and "/" are all element-wise operations with numpy-style broadcasting. + + Pseudo code for SG with standard momentum: + + // Add gradient of 0.5 * norm_coefficient * ||X||^2, where ||X|| is the sum of squared + // values of all elements in X. + G_regularized = norm_coefficient * X + G + + // In the first training iteration, beta should always be 1. + beta_adjusted = T > 0 ? beta : 1 + + // Compute the current momentum based on previous momentum and the current gradient. + V_new = alpha * V + beta_adjusted * G_regularized + + // Update X. + X_new = X - R * V_new + + Pseudo code for SG with Nesterov's momentum: + + // Add gradient of 0.5 * norm_coefficient * ||X||^2, where ||X|| is the sum of squared + // values of all elements in X. + G_regularized = norm_coefficient * X + G; + + // In the first training iteration, beta should always be 1. + beta_adjusted = T > 0 ? beta : 1 + + // Compute the current momentum based on previous momentum and the current gradient. + V_new = alpha * V + beta_adjusted * G_regularized; + + // Compute final update direction and then update X. + X_new = X - R * (G_regularized + alpha * V_new) + + If one assign this operators to optimize multiple inputs, for example, "X_1" and "X_2". The same + pseudo code would be extended to handle all tensors jointly. More specifically, we can view "X" as a + concatenation of "X_1" and "X_2" (of course, their gradient and accumulate gradient should + be concatenated too) and then our pseudo code becomes applicable. + + + Args: + R: The learning rate. + + T: Update count of "X". It should be a scalar. + + inputs: (variadic, heterogeneous) It sequentially contains the current + values of optimized tensors, then their gradient tensors, and finally + their momentum tensors. For example, if two tensors "X_1" and "X_2" are + optimized, The expected input list would be ["X_1", "X_2", gradient of + "X_1", gradient of "X_2", momentum of "X_1", momentum of "X_2"]. + + alpha: The decay factor of momentum. It should be a scalar. + + beta: The coefficient of gradient in computing new momentum. It should be a + scalar. + + mode: Its value should be either "nesterov" or "standard". The value + "nesterov" leads to the use of Nesterov's momentum while "standard" + invokes stochastic gradient method using standard momentum + + norm_coefficient: Coefficient of 0.5 * norm_coefficient * ||X||^2. 
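+
+        For reference, a minimal transcription of the pseudo code above, for a
+        single optimized tensor (`momentum_step` is a hypothetical helper name,
+        not part of this operator; element-wise ops via NumPy broadcasting):
+
+        ::
+
+            def momentum_step(R, T, X, G, V, alpha, beta, norm_coefficient,
+                              mode="standard"):
+                G_regularized = norm_coefficient * X + G
+                beta_adjusted = beta if T > 0 else 1.0
+                V_new = alpha * V + beta_adjusted * G_regularized
+                if mode == "nesterov":
+                    X_new = X - R * (G_regularized + alpha * V_new)
+                else:  # "standard"
+                    X_new = X - R * V_new
+                return X_new, V_new
+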
+ """ + + schema = get_schema("Momentum", 1, "ai.onnx.preview.training") + op = Op(self, "Momentum", schema) + return op( + *self._prepare_inputs(schema, R, T, *inputs), + alpha=alpha, + beta=beta, + mode=mode, + norm_coefficient=norm_coefficient, + )