From 9f724c50e2ac3c395e1309498f2667599b95bd0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krsto=20Prorokovi=C4=87?= Date: Wed, 30 Oct 2024 16:48:46 +0100 Subject: [PATCH 01/13] Set default optimizer to :adam --- lib/scholar/linear/logistic_regression.ex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/scholar/linear/logistic_regression.ex b/lib/scholar/linear/logistic_regression.ex index c49887e2..2d8c2b60 100644 --- a/lib/scholar/linear/logistic_regression.ex +++ b/lib/scholar/linear/logistic_regression.ex @@ -34,7 +34,7 @@ defmodule Scholar.Linear.LogisticRegression do ], optimizer: [ type: {:custom, Scholar.Options, :optimizer, []}, - default: :sgd, + default: :adam, doc: """ The optimizer name or {init, update} pair of functions (see `Polaris.Optimizers` for more details). """ From 045684f32116c861f3474049fe0025ac21b87f67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krsto=20Prorokovi=C4=87?= Date: Wed, 30 Oct 2024 16:49:28 +0100 Subject: [PATCH 02/13] Remove :mode from docs --- lib/scholar/linear/logistic_regression.ex | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lib/scholar/linear/logistic_regression.ex b/lib/scholar/linear/logistic_regression.ex index 2d8c2b60..0b232c2f 100644 --- a/lib/scholar/linear/logistic_regression.ex +++ b/lib/scholar/linear/logistic_regression.ex @@ -68,10 +68,6 @@ defmodule Scholar.Linear.LogisticRegression do * `:bias` - Bias added to the decision function. - * `:mode` - Indicates whether the problem is binary classification (`:num_classes` set to 2) - or multinomial (`:num_classes` is bigger than 2). For binary classification set to `:binary`, otherwise - set to `:multinomial`. - ## Examples iex> x = Nx.tensor([[1.0, 2.0], [3.0, 2.0], [4.0, 7.0]]) From eec4ebf3d09ebaf1c7aab3d40bf5891d34c6c5db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krsto=20Prorokovi=C4=87?= Date: Thu, 31 Oct 2024 10:11:04 +0100 Subject: [PATCH 03/13] Add average reduction in loss computation (bug fix) --- lib/scholar/linear/logistic_regression.ex | 25 +++++++++++------------ 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/lib/scholar/linear/logistic_regression.ex b/lib/scholar/linear/logistic_regression.ex index 0b232c2f..92a84211 100644 --- a/lib/scholar/linear/logistic_regression.ex +++ b/lib/scholar/linear/logistic_regression.ex @@ -25,13 +25,6 @@ defmodule Scholar.Linear.LogisticRegression do regression. """ ], - learning_loop_unroll: [ - type: :boolean, - default: false, - doc: ~S""" - If `true`, the learning loop is unrolled. 
- """ - ], optimizer: [ type: {:custom, Scholar.Options, :optimizer, []}, default: :adam, @@ -91,8 +84,8 @@ defmodule Scholar.Linear.LogisticRegression do "expected x to have shape {n_samples, n_features}, got tensor with shape: #{inspect(Nx.shape(x))}" end - {n_samples, _} = Nx.shape(x) - y = LinearHelpers.validate_y_shape(y, n_samples, __MODULE__) + {num_samples, num_features} = Nx.shape(x) + y = LinearHelpers.validate_y_shape(y, num_samples, __MODULE__) opts = NimbleOptions.validate!(opts, @opts_schema) @@ -104,13 +97,12 @@ defmodule Scholar.Linear.LogisticRegression do {f1, f2} -> {f1, f2} end - n = Nx.axis_size(x, -1) num_classes = opts[:num_classes] coef = Nx.broadcast( Nx.tensor(1.0, type: to_float_type(x)), - {n, num_classes} + {num_features, num_classes} ) bias = Nx.broadcast(Nx.tensor(0, type: to_float_type(x)), {num_classes}) @@ -181,7 +173,14 @@ defmodule Scholar.Linear.LogisticRegression do defnp loss_and_grad(coeff, bias, xs, ys) do value_and_grad({coeff, bias}, fn {coeff, bias} -> - -Nx.sum(ys * log_softmax(Nx.dot(xs, coeff) + bias), axes: [-1]) + xs + |> Nx.dot(coeff) + |> Nx.add(bias) + |> log_softmax() + |> Nx.multiply(ys) + |> Nx.sum(axes: [1]) + |> Nx.negate() + |> Nx.mean() end) end @@ -242,6 +241,6 @@ defmodule Scholar.Linear.LogisticRegression do > """ defn predict_probability(%__MODULE__{coefficients: coeff, bias: bias} = _model, x) do - softmax(Nx.dot(x, [1], coeff, [0]) + bias) + softmax(Nx.dot(x, coeff) + bias) end end From b70631b797b4897cb70e6b7e770d778de8b3c361 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krsto=20Prorokovi=C4=87?= Date: Sun, 11 Jan 2026 21:56:03 +0100 Subject: [PATCH 04/13] Update --- lib/scholar/linear/logistic_regression.ex | 218 ++++++++++++------ .../linear/logistic_regression_test.exs | 49 +++- 2 files changed, 194 insertions(+), 73 deletions(-) diff --git a/lib/scholar/linear/logistic_regression.ex b/lib/scholar/linear/logistic_regression.ex index 92a84211..b8553787 100644 --- a/lib/scholar/linear/logistic_regression.ex +++ b/lib/scholar/linear/logistic_regression.ex @@ -1,12 +1,11 @@ defmodule Scholar.Linear.LogisticRegression do @moduledoc """ - Logistic regression in both binary and multinomial variants. + Multiclass logistic regression. Time complexity is $O(N * K * I)$ where $N$ is the number of samples, $K$ is the number of features, and $I$ is the number of iterations. """ import Nx.Defn import Scholar.Shared - alias Scholar.Linear.LinearHelpers @derive {Nx.Container, containers: [:coefficients, :bias]} defstruct [:coefficients, :bias] @@ -15,28 +14,44 @@ defmodule Scholar.Linear.LogisticRegression do num_classes: [ required: true, type: :pos_integer, - doc: "number of classes contained in the input tensors." + doc: "Number of output classes." ], - iterations: [ + max_iterations: [ type: :pos_integer, default: 1000, - doc: """ - number of iterations of gradient descent performed inside logistic - regression. - """ + doc: "Maximum number of gradient descent iterations to perform." ], optimizer: [ type: {:custom, Scholar.Options, :optimizer, []}, - default: :adam, + default: :sgd, + doc: """ + Optimizer name or {init, update} pair of functions (see `Polaris.Optimizers` for more details). + """ + ], + alpha: [ + type: {:custom, Scholar.Options, :non_negative_number, []}, + default: 1.0, doc: """ - The optimizer name or {init, update} pair of functions (see `Polaris.Optimizers` for more details). + Constant that multiplies the regularization term, controlling regularization strength. + If 0, no regularization is applied. 
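      How the penalty is split between L1 and L2 terms is controlled by the `:l1_ratio` option below.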
""" ], - eps: [ - type: :float, - default: 1.0e-8, - doc: - "The convergence tolerance. If the `abs(loss) < size(x) * :eps`, the algorithm is considered to have converged." + l1_ratio: [ + type: {:custom, Scholar.Options, :non_negative_number, []}, + default: 0.0, + doc: """ + The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. + Setting `l1_ratio` to 0 gives pure L2 regularization, and setting it to 1 gives pure L1 regularization. + For values between 0 and 1, a penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2` is used. + """ + ], + tol: [ + type: {:custom, Scholar.Options, :non_negative_number, []}, + default: 1.0e-4, + doc: """ + Convergence tolerance. If the infinity norm of the gradient is less than `:tol`, + the algorithm is considered to have converged. + """ ] ] @@ -46,9 +61,6 @@ defmodule Scholar.Linear.LogisticRegression do Fits a logistic regression model for sample inputs `x` and sample targets `y`. - Depending on number of classes the function chooses either binary - or multinomial logistic regression. - ## Options #{NimbleOptions.docs(@opts_schema)} @@ -69,26 +81,41 @@ defmodule Scholar.Linear.LogisticRegression do %Scholar.Linear.LogisticRegression{ coefficients: Nx.tensor( [ - [2.5531527996063232, -0.5531544089317322], - [-0.35652396082878113, 2.3565237522125244] + [0.09002052247524261, -0.09002052992582321], + [-0.1521512120962143, 0.1521512120962143] ] ), - bias: Nx.tensor( - [-0.28847914934158325, 0.28847917914390564] - ) + bias: Nx.tensor([-0.05300388112664223, 0.053003907203674316]) } """ deftransform fit(x, y, opts \\ []) do if Nx.rank(x) != 2 do raise ArgumentError, - "expected x to have shape {n_samples, n_features}, got tensor with shape: #{inspect(Nx.shape(x))}" + "expected x to have shape {num_samples, num_features}, got tensor with shape: #{inspect(Nx.shape(x))}" + end + + if Nx.rank(y) != 1 do + raise ArgumentError, + "expected y to have shape {num_samples}, got tensor with shape: #{inspect(Nx.shape(y))}" end {num_samples, num_features} = Nx.shape(x) - y = LinearHelpers.validate_y_shape(y, num_samples, __MODULE__) + + if Nx.axis_size(y, 0) != num_samples do + raise ArgumentError, + "expected x and y to have the same number of samples, got #{num_samples} and #{Nx.axis_size(y, 0)}" + end opts = NimbleOptions.validate!(opts, @opts_schema) + {l1_ratio, opts} = Keyword.pop!(opts, :l1_ratio) + + unless l1_ratio >= 0.0 and l1_ratio <= 1.0 do + raise ArgumentError, + "expected l1_ratio to be between 0 and 1, got: #{inspect(l1_ratio)}" + end + + type = to_float_type(x) {optimizer, opts} = Keyword.pop!(opts, :optimizer) {optimizer_init_fn, optimizer_update_fn} = @@ -101,18 +128,35 @@ defmodule Scholar.Linear.LogisticRegression do coef = Nx.broadcast( - Nx.tensor(1.0, type: to_float_type(x)), + Nx.tensor(0.0, type: type), {num_features, num_classes} ) - bias = Nx.broadcast(Nx.tensor(0, type: to_float_type(x)), {num_classes}) + bias = Nx.broadcast(Nx.tensor(0.0, type: type), {num_classes}) + + coef_optimizer_state = optimizer_init_fn.(coef) |> as_type(type) + bias_optimizer_state = optimizer_init_fn.(bias) |> as_type(type) - coef_optimizer_state = optimizer_init_fn.(coef) |> as_type(to_float_type(x)) - bias_optimizer_state = optimizer_init_fn.(bias) |> as_type(to_float_type(x)) + {alpha, opts} = Keyword.pop!(opts, :alpha) + {tol, opts} = Keyword.pop!(opts, :tol) + alpha = Nx.tensor(alpha, type: type) + l1_ratio = Nx.tensor(l1_ratio, type: type) + tol = Nx.tensor(tol, type: type) opts = Keyword.put(opts, :optimizer_update_fn, optimizer_update_fn) - fit_n(x, 
y, coef, bias, coef_optimizer_state, bias_optimizer_state, opts) + fit_n( + x, + y, + coef, + bias, + alpha, + l1_ratio, + tol, + coef_optimizer_state, + bias_optimizer_state, + opts + ) end deftransformp as_type(container, target_type) do @@ -127,11 +171,20 @@ defmodule Scholar.Linear.LogisticRegression do end) end - # Logistic Regression training loop - - defnp fit_n(x, y, coef, bias, coef_optimizer_state, bias_optimizer_state, opts) do + defnp fit_n( + x, + y, + coef, + bias, + alpha, + l1_ratio, + tol, + coef_optimizer_state, + bias_optimizer_state, + opts + ) do num_samples = Nx.axis_size(x, 0) - iterations = opts[:iterations] + max_iterations = opts[:max_iterations] num_classes = opts[:num_classes] optimizer_update_fn = opts[:optimizer_update_fn] @@ -141,12 +194,15 @@ defmodule Scholar.Linear.LogisticRegression do |> Nx.broadcast({num_samples, num_classes}) |> Nx.equal(Nx.iota({num_samples, num_classes}, axis: 1)) - {{final_coef, final_bias}, _} = - while {{coef, bias}, - {x, iterations, y_one_hot, coef_optimizer_state, bias_optimizer_state, - has_converged = Nx.u8(0), iter = 0}}, - iter < iterations and not has_converged do - {loss, {coef_grad, bias_grad}} = loss_and_grad(coef, bias, x, y_one_hot) + {final_coef, final_bias, _} = + while {coef, bias, + {x, y_one_hot, max_iterations, alpha, l1_ratio, tol, coef_optimizer_state, + bias_optimizer_state, converged? = Nx.u8(0), iter = Nx.u32(0)}}, + iter < max_iterations and not converged? do + {coef_grad, bias_grad} = + grad({coef, bias}, fn {coef, bias} -> + compute_loss(coef, bias, alpha, l1_ratio, x, y_one_hot) + end) {coef_updates, coef_optimizer_state} = optimizer_update_fn.(coef_grad, coef_optimizer_state, coef) @@ -158,11 +214,12 @@ defmodule Scholar.Linear.LogisticRegression do bias = Polaris.Updates.apply_updates(bias, bias_updates) - has_converged = Nx.sum(Nx.abs(loss)) < Nx.size(x) * opts[:eps] + converged? 
= + Nx.reduce_max(Nx.abs(coef_grad)) < tol and Nx.reduce_max(Nx.abs(bias_grad)) < tol - {{coef, bias}, - {x, iterations, y_one_hot, coef_optimizer_state, bias_optimizer_state, has_converged, - iter + 1}} + {coef, bias, + {x, y_one_hot, max_iterations, alpha, l1_ratio, tol, coef_optimizer_state, + bias_optimizer_state, converged?, iter + 1}} end %__MODULE__{ @@ -171,17 +228,42 @@ defmodule Scholar.Linear.LogisticRegression do } end - defnp loss_and_grad(coeff, bias, xs, ys) do - value_and_grad({coeff, bias}, fn {coeff, bias} -> - xs - |> Nx.dot(coeff) - |> Nx.add(bias) - |> log_softmax() - |> Nx.multiply(ys) - |> Nx.sum(axes: [1]) - |> Nx.negate() - |> Nx.mean() - end) + defnp compute_regularization(coeff, alpha, l1_ratio) do + if alpha > 0.0 do + reg = + cond do + l1_ratio == 0.0 -> + # L2 regularization + Nx.sum(coeff * coeff) + + l1_ratio == 1.0 -> + # L1 regularization + Nx.sum(Nx.abs(coeff)) + + # Elastic-Net regularization + true -> + l1_ratio * Nx.sum(Nx.abs(coeff)) + + (1 - l1_ratio) * Nx.sum(coeff * coeff) + end + + alpha * reg + else + 0.0 + end + end + + defnp compute_loss(coeff, bias, alpha, l1_ratio, xs, ys) do + reg = compute_regularization(coeff, alpha, l1_ratio) + + xs + |> Nx.dot(coeff) + |> Nx.add(bias) + |> log_softmax() + |> Nx.multiply(ys) + |> Nx.sum(axes: [1]) + |> Nx.negate() + |> Nx.mean() + |> Nx.add(reg) end defnp log_softmax(x) do @@ -214,14 +296,16 @@ defmodule Scholar.Linear.LogisticRegression do iex> y = Nx.tensor([1, 0, 1]) iex> model = Scholar.Linear.LogisticRegression.fit(x, y, num_classes: 2) iex> Scholar.Linear.LogisticRegression.predict(model, Nx.tensor([[-3.0, 5.0]])) - #Nx.Tensor< - s32[1] - [1] - > + Nx.tensor([1]) """ defn predict(%__MODULE__{coefficients: coeff, bias: bias} = _model, x) do - inter = Nx.dot(x, [1], coeff, [0]) + bias - Nx.argmax(inter, axis: 1) + if Nx.rank(x) != 2 do + raise ArgumentError, + "expected x to have shape {n_samples, n_features}, got tensor with shape: #{inspect(Nx.shape(x))}" + end + + logits = Nx.dot(x, coeff) + bias + Nx.argmax(logits, axis: 1) end @doc """ @@ -233,14 +317,14 @@ defmodule Scholar.Linear.LogisticRegression do iex> y = Nx.tensor([1, 0, 1]) iex> model = Scholar.Linear.LogisticRegression.fit(x, y, num_classes: 2) iex> Scholar.Linear.LogisticRegression.predict_probability(model, Nx.tensor([[-3.0, 5.0]])) - #Nx.Tensor< - f32[1][2] - [ - [6.470913388456623e-11, 1.0] - ] - > + Nx.tensor([[0.10269401967525482, 0.8973060250282288]]) """ defn predict_probability(%__MODULE__{coefficients: coeff, bias: bias} = _model, x) do + if Nx.rank(x) != 2 do + raise ArgumentError, + "expected x to have shape {n_samples, n_features}, got tensor with shape: #{inspect(Nx.shape(x))}" + end + softmax(Nx.dot(x, coeff) + bias) end end diff --git a/test/scholar/linear/logistic_regression_test.exs b/test/scholar/linear/logistic_regression_test.exs index 8fc2d374..e8a630a6 100644 --- a/test/scholar/linear/logistic_regression_test.exs +++ b/test/scholar/linear/logistic_regression_test.exs @@ -45,7 +45,11 @@ defmodule Scholar.Linear.LogisticRegressionTest do y = Nx.tensor([1, 2]) assert_raise NimbleOptions.ValidationError, - "invalid value for :optimizer option: expected :optimizer to be either a valid 0-arity function in Polaris.Optimizers or a valid {init_fn, update_fn} tuple", + """ + invalid value for :optimizer option: expected :optimizer to be either \ + a valid 0-arity function in Polaris.Optimizers or a valid {init_fn, update_fn} tuple + """, + # "invalid value for :optimizer option: expected :optimizer to be either a valid 
0-arity function in Polaris.Optimizers or a valid {init_fn, update_fn} tuple", fn -> LogisticRegression.fit(x, y, num_classes: 2, @@ -54,14 +58,14 @@ defmodule Scholar.Linear.LogisticRegressionTest do end end - test "when :iterations is not a positive integer" do + test "when :max_iterations is not a positive integer" do x = Nx.tensor([[1, 2], [3, 4]]) y = Nx.tensor([1, 2]) assert_raise NimbleOptions.ValidationError, - "invalid value for :iterations option: expected positive integer, got: 0", + "invalid value for :max_iterations option: expected positive integer, got: 0", fn -> - LogisticRegression.fit(x, y, num_classes: 2, iterations: 0) + LogisticRegression.fit(x, y, num_classes: 2, max_iterations: 0) end end @@ -70,7 +74,7 @@ defmodule Scholar.Linear.LogisticRegressionTest do y = Nx.tensor([1, 2]) assert_raise ArgumentError, - "expected x to have shape {n_samples, n_features}, got tensor with shape: {2}", + "expected x to have shape {num_samples, num_features}, got tensor with shape: {2}", fn -> LogisticRegression.fit(x, y, num_classes: 2) end end @@ -79,7 +83,10 @@ defmodule Scholar.Linear.LogisticRegressionTest do y = Nx.tensor([[0, 1], [1, 0]]) assert_raise ArgumentError, - "Scholar.Linear.LogisticRegression expected y to have shape {n_samples}, got tensor with shape: {2, 2}", + """ + Scholar.Linear.LogisticRegression expected y to have shape {num_samples}, \ + got tensor with shape: {2, 2} + """, fn -> LogisticRegression.fit(x, y, num_classes: 2) end end end @@ -97,4 +104,34 @@ defmodule Scholar.Linear.LogisticRegressionTest do assert pred == col_pred end end + + describe "linearly separable data" do + test "1D" do + key = Nx.Random.key(12) + {x1, key} = Nx.Random.uniform(key, -1.0, 0.0, shape: {1000, 1}) + {x2, _key} = Nx.Random.uniform(key, 0.0, 1.0, shape: {1000, 1}) + x = Nx.concatenate([x1, x2]) + y1 = Nx.broadcast(0, {1000}) + y2 = Nx.broadcast(1, {1000}) + y = Nx.concatenate([y1, y2]) + model = LogisticRegression.fit(x, y, num_classes: 2) + y_pred = LogisticRegression.predict(model, x) + accuracy = Scholar.Metrics.Classification.accuracy(y, y_pred) + assert Nx.equal(accuracy, 1) + end + + test "2D" do + key = Nx.Random.key(12) + {x1, key} = Nx.Random.uniform(key, -1.0, 0.0, shape: {1000, 2}) + {x2, _key} = Nx.Random.uniform(key, 0.0, 1.0, shape: {1000, 2}) + x = Nx.concatenate([x1, x2]) + y1 = Nx.broadcast(0, {1000}) + y2 = Nx.broadcast(1, {1000}) + y = Nx.concatenate([y1, y2]) + model = LogisticRegression.fit(x, y, num_classes: 2) + y_pred = LogisticRegression.predict(model, x) + accuracy = Scholar.Metrics.Classification.accuracy(y, y_pred) + assert Nx.equal(accuracy, 1) + end + end end From aff58fbe0b02aac103ff482bdb8f1adee94ccdd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krsto=20Prorokovi=C4=87?= Date: Sun, 11 Jan 2026 22:06:24 +0100 Subject: [PATCH 05/13] Update --- lib/scholar/linear/logistic_regression.ex | 70 +++++++++++------------ 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/lib/scholar/linear/logistic_regression.ex b/lib/scholar/linear/logistic_regression.ex index b8553787..0db6b75f 100644 --- a/lib/scholar/linear/logistic_regression.ex +++ b/lib/scholar/linear/logistic_regression.ex @@ -126,16 +126,16 @@ defmodule Scholar.Linear.LogisticRegression do num_classes = opts[:num_classes] - coef = + w = Nx.broadcast( Nx.tensor(0.0, type: type), {num_features, num_classes} ) - bias = Nx.broadcast(Nx.tensor(0.0, type: type), {num_classes}) + b = Nx.broadcast(Nx.tensor(0.0, type: type), {num_classes}) - coef_optimizer_state = optimizer_init_fn.(coef) |> 
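# Note: as_type/2 (added in PATCH 04) casts only the float leaves of the
# optimizer state to the input's float type; non-float leaves are left untouched.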
as_type(type) - bias_optimizer_state = optimizer_init_fn.(bias) |> as_type(type) + w_optimizer_state = optimizer_init_fn.(w) |> as_type(type) + b_optimizer_state = optimizer_init_fn.(b) |> as_type(type) {alpha, opts} = Keyword.pop!(opts, :alpha) {tol, opts} = Keyword.pop!(opts, :tol) @@ -148,13 +148,13 @@ defmodule Scholar.Linear.LogisticRegression do fit_n( x, y, - coef, - bias, + w, + b, alpha, l1_ratio, tol, - coef_optimizer_state, - bias_optimizer_state, + w_optimizer_state, + b_optimizer_state, opts ) end @@ -174,13 +174,13 @@ defmodule Scholar.Linear.LogisticRegression do defnp fit_n( x, y, - coef, - bias, + w, + b, alpha, l1_ratio, tol, - coef_optimizer_state, - bias_optimizer_state, + w_optimizer_state, + b_optimizer_state, opts ) do num_samples = Nx.axis_size(x, 0) @@ -194,37 +194,37 @@ defmodule Scholar.Linear.LogisticRegression do |> Nx.broadcast({num_samples, num_classes}) |> Nx.equal(Nx.iota({num_samples, num_classes}, axis: 1)) - {final_coef, final_bias, _} = - while {coef, bias, - {x, y_one_hot, max_iterations, alpha, l1_ratio, tol, coef_optimizer_state, - bias_optimizer_state, converged? = Nx.u8(0), iter = Nx.u32(0)}}, + {coef, bias, _} = + while {w, b, + {x, y_one_hot, max_iterations, alpha, l1_ratio, tol, w_optimizer_state, + b_optimizer_state, converged? = Nx.u8(0), iter = Nx.u32(0)}}, iter < max_iterations and not converged? do - {coef_grad, bias_grad} = - grad({coef, bias}, fn {coef, bias} -> - compute_loss(coef, bias, alpha, l1_ratio, x, y_one_hot) + {w_grad, b_grad} = + grad({w, b}, fn {w, b} -> + compute_loss(w, b, alpha, l1_ratio, x, y_one_hot) end) - {coef_updates, coef_optimizer_state} = - optimizer_update_fn.(coef_grad, coef_optimizer_state, coef) + {w_updates, w_optimizer_state} = + optimizer_update_fn.(w_grad, w_optimizer_state, w) - coef = Polaris.Updates.apply_updates(coef, coef_updates) + w = Polaris.Updates.apply_updates(w, w_updates) - {bias_updates, bias_optimizer_state} = - optimizer_update_fn.(bias_grad, bias_optimizer_state, bias) + {b_updates, b_optimizer_state} = + optimizer_update_fn.(b_grad, b_optimizer_state, b) - bias = Polaris.Updates.apply_updates(bias, bias_updates) + b = Polaris.Updates.apply_updates(b, bias_updates) converged? 
= - Nx.reduce_max(Nx.abs(coef_grad)) < tol and Nx.reduce_max(Nx.abs(bias_grad)) < tol + Nx.reduce_max(Nx.abs(w_grad)) < tol and Nx.reduce_max(Nx.abs(bias_grad)) < tol - {coef, bias, - {x, y_one_hot, max_iterations, alpha, l1_ratio, tol, coef_optimizer_state, - bias_optimizer_state, converged?, iter + 1}} + {w, b, + {x, y_one_hot, max_iterations, alpha, l1_ratio, tol, w_optimizer_state, + b_optimizer_state, converged?, iter + 1}} end %__MODULE__{ - coefficients: final_coef, - bias: final_bias + coefficients: coef, + bias: bias } end @@ -252,12 +252,12 @@ defmodule Scholar.Linear.LogisticRegression do end end - defnp compute_loss(coeff, bias, alpha, l1_ratio, xs, ys) do - reg = compute_regularization(coeff, alpha, l1_ratio) + defnp compute_loss(w, b, alpha, l1_ratio, xs, ys) do + reg = compute_regularization(w, alpha, l1_ratio) xs - |> Nx.dot(coeff) - |> Nx.add(bias) + |> Nx.dot(w) + |> Nx.add(b) |> log_softmax() |> Nx.multiply(ys) |> Nx.sum(axes: [1]) From 01e5b5d7dd6b9a671d7afffd9bf1ad2ada305f83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krsto=20Prorokovi=C4=87?= Date: Sun, 11 Jan 2026 22:23:10 +0100 Subject: [PATCH 06/13] Bug fix --- lib/scholar/linear/logistic_regression.ex | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/lib/scholar/linear/logistic_regression.ex b/lib/scholar/linear/logistic_regression.ex index 0db6b75f..d43c10d0 100644 --- a/lib/scholar/linear/logistic_regression.ex +++ b/lib/scholar/linear/logistic_regression.ex @@ -212,10 +212,10 @@ defmodule Scholar.Linear.LogisticRegression do {b_updates, b_optimizer_state} = optimizer_update_fn.(b_grad, b_optimizer_state, b) - b = Polaris.Updates.apply_updates(b, bias_updates) + b = Polaris.Updates.apply_updates(b, b_updates) converged? = - Nx.reduce_max(Nx.abs(w_grad)) < tol and Nx.reduce_max(Nx.abs(bias_grad)) < tol + Nx.reduce_max(Nx.abs(w_grad)) < tol and Nx.reduce_max(Nx.abs(b_grad)) < tol {w, b, {x, y_one_hot, max_iterations, alpha, l1_ratio, tol, w_optimizer_state, @@ -228,22 +228,22 @@ defmodule Scholar.Linear.LogisticRegression do } end - defnp compute_regularization(coeff, alpha, l1_ratio) do + defnp compute_regularization(w, alpha, l1_ratio) do if alpha > 0.0 do reg = cond do l1_ratio == 0.0 -> # L2 regularization - Nx.sum(coeff * coeff) + Nx.sum(w * w) l1_ratio == 1.0 -> # L1 regularization - Nx.sum(Nx.abs(coeff)) + Nx.sum(Nx.abs(w)) # Elastic-Net regularization true -> - l1_ratio * Nx.sum(Nx.abs(coeff)) + - (1 - l1_ratio) * Nx.sum(coeff * coeff) + l1_ratio * Nx.sum(Nx.abs(w)) + + (1 - l1_ratio) * Nx.sum(w * w) end alpha * reg From f16795dfe41c920eabb2a9b629c4266664fa0fe5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krsto=20Prorokovi=C4=87?= Date: Wed, 14 Jan 2026 15:07:35 +0100 Subject: [PATCH 07/13] Fix some unit tests --- test/scholar/linear/logistic_regression_test.exs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test/scholar/linear/logistic_regression_test.exs b/test/scholar/linear/logistic_regression_test.exs index e8a630a6..c6ac0b2b 100644 --- a/test/scholar/linear/logistic_regression_test.exs +++ b/test/scholar/linear/logistic_regression_test.exs @@ -47,9 +47,8 @@ defmodule Scholar.Linear.LogisticRegressionTest do assert_raise NimbleOptions.ValidationError, """ invalid value for :optimizer option: expected :optimizer to be either \ - a valid 0-arity function in Polaris.Optimizers or a valid {init_fn, update_fn} tuple + a valid 0-arity function in Polaris.Optimizers or a valid {init_fn, update_fn} tuple\ """, - # "invalid value for :optimizer option: expected 
:optimizer to be either a valid 0-arity function in Polaris.Optimizers or a valid {init_fn, update_fn} tuple", fn -> LogisticRegression.fit(x, y, num_classes: 2, @@ -84,7 +83,7 @@ defmodule Scholar.Linear.LogisticRegressionTest do assert_raise ArgumentError, """ - Scholar.Linear.LogisticRegression expected y to have shape {num_samples}, \ + expected y to have shape {num_samples}, \ got tensor with shape: {2, 2} """, fn -> LogisticRegression.fit(x, y, num_classes: 2) end From dc280a6c1327a0dda1260c6f4843aeaa48342a74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krsto=20Prorokovi=C4=87?= Date: Wed, 14 Jan 2026 19:32:03 +0100 Subject: [PATCH 08/13] Update tests --- test/scholar/linear/logistic_regression_test.exs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/scholar/linear/logistic_regression_test.exs b/test/scholar/linear/logistic_regression_test.exs index c6ac0b2b..69cda7be 100644 --- a/test/scholar/linear/logistic_regression_test.exs +++ b/test/scholar/linear/logistic_regression_test.exs @@ -10,7 +10,7 @@ defmodule Scholar.Linear.LogisticRegressionTest do res = LogisticRegression.predict(model, x_test) accuracy = Scholar.Metrics.Classification.accuracy(res, y_test) - assert Nx.greater_equal(accuracy, 0.96) == Nx.u8(1) + assert Nx.to_number(accuracy) >= 0.96 end describe "errors" do @@ -84,7 +84,7 @@ defmodule Scholar.Linear.LogisticRegressionTest do assert_raise ArgumentError, """ expected y to have shape {num_samples}, \ - got tensor with shape: {2, 2} + got tensor with shape: {2, 2}\ """, fn -> LogisticRegression.fit(x, y, num_classes: 2) end end @@ -116,7 +116,7 @@ defmodule Scholar.Linear.LogisticRegressionTest do model = LogisticRegression.fit(x, y, num_classes: 2) y_pred = LogisticRegression.predict(model, x) accuracy = Scholar.Metrics.Classification.accuracy(y, y_pred) - assert Nx.equal(accuracy, 1) + assert Nx.to_number(accuracy) == 1.0 end test "2D" do @@ -130,7 +130,7 @@ defmodule Scholar.Linear.LogisticRegressionTest do model = LogisticRegression.fit(x, y, num_classes: 2) y_pred = LogisticRegression.predict(model, x) accuracy = Scholar.Metrics.Classification.accuracy(y, y_pred) - assert Nx.equal(accuracy, 1) + assert Nx.to_number(accuracy) == 1.0 end end end From 524d163117f902934da3f97f19c959a2d3f5acfa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krsto=20Prorokovi=C4=87?= Date: Fri, 16 Jan 2026 12:55:14 +0100 Subject: [PATCH 09/13] Update --- lib/scholar/linear/logistic_regression.ex | 250 +++++++----------- .../linear/logistic_regression_test.exs | 10 +- 2 files changed, 106 insertions(+), 154 deletions(-) diff --git a/lib/scholar/linear/logistic_regression.ex b/lib/scholar/linear/logistic_regression.ex index d43c10d0..9c1f1301 100644 --- a/lib/scholar/linear/logistic_regression.ex +++ b/lib/scholar/linear/logistic_regression.ex @@ -21,30 +21,14 @@ defmodule Scholar.Linear.LogisticRegression do default: 1000, doc: "Maximum number of gradient descent iterations to perform." ], - optimizer: [ - type: {:custom, Scholar.Options, :optimizer, []}, - default: :sgd, - doc: """ - Optimizer name or {init, update} pair of functions (see `Polaris.Optimizers` for more details). - """ - ], alpha: [ type: {:custom, Scholar.Options, :non_negative_number, []}, default: 1.0, doc: """ - Constant that multiplies the regularization term, controlling regularization strength. + Constant that multiplies the L2 regularization term, controlling regularization strength. If 0, no regularization is applied. 
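      Concretely, `alpha * Nx.sum(w * w)` is added to the loss (see `compute_loss/5` below).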
""" ], - l1_ratio: [ - type: {:custom, Scholar.Options, :non_negative_number, []}, - default: 0.0, - doc: """ - The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. - Setting `l1_ratio` to 0 gives pure L2 regularization, and setting it to 1 gives pure L1 regularization. - For values between 0 and 1, a penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2` is used. - """ - ], tol: [ type: {:custom, Scholar.Options, :non_negative_number, []}, default: 1.0e-4, @@ -81,11 +65,10 @@ defmodule Scholar.Linear.LogisticRegression do %Scholar.Linear.LogisticRegression{ coefficients: Nx.tensor( [ - [0.09002052247524261, -0.09002052992582321], - [-0.1521512120962143, 0.1521512120962143] - ] + [0.0915902629494667, -0.09159023314714432], + [-0.1507941037416458, 0.1507941335439682] ), - bias: Nx.tensor([-0.05300388112664223, 0.053003907203674316]) + bias: Nx.tensor([-0.06566660106182098, 0.06566664576530457]) } """ deftransform fit(x, y, opts \\ []) do @@ -99,7 +82,7 @@ defmodule Scholar.Linear.LogisticRegression do "expected y to have shape {num_samples}, got tensor with shape: #{inspect(Nx.shape(y))}" end - {num_samples, num_features} = Nx.shape(x) + num_samples = Nx.axis_size(x, 0) if Nx.axis_size(y, 0) != num_samples do raise ArgumentError, @@ -108,24 +91,25 @@ defmodule Scholar.Linear.LogisticRegression do opts = NimbleOptions.validate!(opts, @opts_schema) - {l1_ratio, opts} = Keyword.pop!(opts, :l1_ratio) - - unless l1_ratio >= 0.0 and l1_ratio <= 1.0 do - raise ArgumentError, - "expected l1_ratio to be between 0 and 1, got: #{inspect(l1_ratio)}" - end - type = to_float_type(x) - {optimizer, opts} = Keyword.pop!(opts, :optimizer) - {optimizer_init_fn, optimizer_update_fn} = - case optimizer do - atom when is_atom(atom) -> apply(Polaris.Optimizers, atom, []) - {f1, f2} -> {f1, f2} - end + {alpha, opts} = Keyword.pop!(opts, :alpha) + alpha = Nx.tensor(alpha, type: type) + {tol, opts} = Keyword.pop!(opts, :tol) + tol = Nx.tensor(tol, type: type) + {max_iterations, opts} = Keyword.pop!(opts, :max_iterations) + max_iterations = Nx.tensor(max_iterations, type: :u32) + + fit_n(x, y, alpha, max_iterations, tol, opts) + end + defnp fit_n(x, y, alpha, max_iterations, tol, opts) do num_classes = opts[:num_classes] + {num_samples, num_features} = Nx.shape(x) + + type = to_float_type(x) + # Initialize weights and bias with zeros w = Nx.broadcast( Nx.tensor(0.0, type: type), @@ -134,92 +118,64 @@ defmodule Scholar.Linear.LogisticRegression do b = Nx.broadcast(Nx.tensor(0.0, type: type), {num_classes}) - w_optimizer_state = optimizer_init_fn.(w) |> as_type(type) - b_optimizer_state = optimizer_init_fn.(b) |> as_type(type) - - {alpha, opts} = Keyword.pop!(opts, :alpha) - {tol, opts} = Keyword.pop!(opts, :tol) - alpha = Nx.tensor(alpha, type: type) - l1_ratio = Nx.tensor(l1_ratio, type: type) - tol = Nx.tensor(tol, type: type) - - opts = Keyword.put(opts, :optimizer_update_fn, optimizer_update_fn) - - fit_n( - x, - y, - w, - b, - alpha, - l1_ratio, - tol, - w_optimizer_state, - b_optimizer_state, - opts - ) - end - - deftransformp as_type(container, target_type) do - Nx.Defn.Composite.traverse(container, fn t -> - type = Nx.type(t) - - if Nx.Type.float?(type) and not Nx.Type.complex?(type) do - Nx.as_type(t, target_type) - else - t - end - end) - end - - defnp fit_n( - x, - y, - w, - b, - alpha, - l1_ratio, - tol, - w_optimizer_state, - b_optimizer_state, - opts - ) do - num_samples = Nx.axis_size(x, 0) - max_iterations = opts[:max_iterations] - num_classes = opts[:num_classes] - optimizer_update_fn = 
opts[:optimizer_update_fn] - + # One-hot encoding of target labels y_one_hot = y |> Nx.new_axis(1) |> Nx.broadcast({num_samples, num_classes}) |> Nx.equal(Nx.iota({num_samples, num_classes}, axis: 1)) - {coef, bias, _} = - while {w, b, - {x, y_one_hot, max_iterations, alpha, l1_ratio, tol, w_optimizer_state, - b_optimizer_state, converged? = Nx.u8(0), iter = Nx.u32(0)}}, - iter < max_iterations and not converged? do - {w_grad, b_grad} = - grad({w, b}, fn {w, b} -> - compute_loss(w, b, alpha, l1_ratio, x, y_one_hot) - end) - - {w_updates, w_optimizer_state} = - optimizer_update_fn.(w_grad, w_optimizer_state, w) + # Define Armijo parameters + c = Nx.tensor(1.0e-4, type: type) + rho = Nx.tensor(0.5, type: type) - w = Polaris.Updates.apply_updates(w, w_updates) + eta_min = + case type do + {:f, 32} -> Nx.tensor(1.0e-6, type: type) + {:f, 64} -> Nx.tensor(1.0e-8, type: type) + _ -> Nx.tensor(1.0e-6, type: type) + end - {b_updates, b_optimizer_state} = - optimizer_update_fn.(b_grad, b_optimizer_state, b) + armijo_params = %{ + c: c, + rho: rho, + eta_min: eta_min + } - b = Polaris.Updates.apply_updates(b, b_updates) + {coef, bias, _} = + while {w, b, + {alpha, x, y_one_hot, max_iterations, tol, armijo_params, iter = Nx.u32(0), + converged? = Nx.u8(0)}}, + iter < max_iterations and not converged? do + logits = Nx.dot(x, w) + b + probabilities = softmax(logits) + residuals = probabilities - y_one_hot + + # Compute loss + loss = + logits + |> log_softmax() + |> Nx.multiply(y_one_hot) + |> Nx.sum(axes: [1]) + |> Nx.mean() + |> Nx.negate() + |> Nx.add(alpha * Nx.sum(w * w)) + + # Compute gradients + grad_w = Nx.dot(x, [0], residuals, [0]) / num_samples + 2 * alpha * w + grad_b = Nx.sum(residuals, axes: [0]) / num_samples + + # Perform line search to find step size + eta = + armijo_line_search(w, b, alpha, x, y_one_hot, loss, grad_w, grad_b, armijo_params) + + w = w - eta * grad_w + b = b - eta * grad_b converged? 
= - Nx.reduce_max(Nx.abs(w_grad)) < tol and Nx.reduce_max(Nx.abs(b_grad)) < tol + Nx.reduce_max(Nx.abs(grad_w)) < tol and Nx.reduce_max(Nx.abs(grad_b)) < tol - {w, b, - {x, y_one_hot, max_iterations, alpha, l1_ratio, tol, w_optimizer_state, - b_optimizer_state, converged?, iter + 1}} + {w, b, {alpha, x, y_one_hot, max_iterations, tol, armijo_params, iter + 1, converged?}} end %__MODULE__{ @@ -228,63 +184,59 @@ defmodule Scholar.Linear.LogisticRegression do } end - defnp compute_regularization(w, alpha, l1_ratio) do - if alpha > 0.0 do - reg = - cond do - l1_ratio == 0.0 -> - # L2 regularization - Nx.sum(w * w) - - l1_ratio == 1.0 -> - # L1 regularization - Nx.sum(Nx.abs(w)) - - # Elastic-Net regularization - true -> - l1_ratio * Nx.sum(Nx.abs(w)) + - (1 - l1_ratio) * Nx.sum(w * w) - end - - alpha * reg - else - 0.0 - end - end + defnp armijo_line_search(w, b, alpha, x, y, loss, grad_w, grad_b, armijo_params) do + c = armijo_params[:c] + rho = armijo_params[:rho] + eta_min = armijo_params[:eta_min] - defnp compute_loss(w, b, alpha, l1_ratio, xs, ys) do - reg = compute_regularization(w, alpha, l1_ratio) + type = to_float_type(x) + dir_w = -grad_w + dir_b = -grad_b + # Directional derivative + slope = Nx.sum(dir_w * grad_w) + Nx.sum(dir_b * grad_b) + + {eta, _} = + while {eta = Nx.tensor(1.0, type: type), + {w, b, alpha, x, y, loss, dir_w, dir_b, slope, c, rho, eta_min}}, + compute_loss(w + eta * dir_w, b + eta * dir_b, alpha, x, y) > loss + c * eta * slope and + eta > eta_min do + eta = eta * rho + + {eta, {w, b, alpha, x, y, loss, dir_w, dir_b, slope, c, rho, eta_min}} + end + + eta + end - xs + defnp compute_loss(w, b, alpha, x, y) do + x |> Nx.dot(w) |> Nx.add(b) |> log_softmax() - |> Nx.multiply(ys) + |> Nx.multiply(y) |> Nx.sum(axes: [1]) - |> Nx.negate() |> Nx.mean() - |> Nx.add(reg) + |> Nx.negate() + |> Nx.add(alpha * Nx.sum(w * w)) + end + + defnp softmax(logits) do + max = stop_grad(Nx.reduce_max(logits, axes: [1], keep_axes: true)) + normalized_exp = (logits - max) |> Nx.exp() + normalized_exp / Nx.sum(normalized_exp, axes: [1], keep_axes: true) end defnp log_softmax(x) do - shifted = x - stop_grad(Nx.reduce_max(x, axes: [-1], keep_axes: true)) + shifted = x - stop_grad(Nx.reduce_max(x, axes: [1], keep_axes: true)) shifted |> Nx.exp() - |> Nx.sum(axes: [-1], keep_axes: true) + |> Nx.sum(axes: [1], keep_axes: true) |> Nx.log() |> Nx.negate() |> Nx.add(shifted) end - # Normalized softmax - - defnp softmax(t) do - max = stop_grad(Nx.reduce_max(t, axes: [-1], keep_axes: true)) - normalized_exp = (t - max) |> Nx.exp() - normalized_exp / Nx.sum(normalized_exp, axes: [-1], keep_axes: true) - end - @doc """ Makes predictions with the given `model` on inputs `x`. 
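  For each sample, the predicted class is the argmax of the logits `Nx.dot(x, coefficients) + bias`.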
@@ -317,7 +269,7 @@ defmodule Scholar.Linear.LogisticRegression do iex> y = Nx.tensor([1, 0, 1]) iex> model = Scholar.Linear.LogisticRegression.fit(x, y, num_classes: 2) iex> Scholar.Linear.LogisticRegression.predict_probability(model, Nx.tensor([[-3.0, 5.0]])) - Nx.tensor([[0.10269401967525482, 0.8973060250282288]]) + Nx.tensor([[0.10075931251049042, 0.8992406725883484]]) """ defn predict_probability(%__MODULE__{coefficients: coeff, bias: bias} = _model, x) do if Nx.rank(x) != 2 do diff --git a/test/scholar/linear/logistic_regression_test.exs b/test/scholar/linear/logistic_regression_test.exs index 69cda7be..99767a1b 100644 --- a/test/scholar/linear/logistic_regression_test.exs +++ b/test/scholar/linear/logistic_regression_test.exs @@ -6,7 +6,7 @@ defmodule Scholar.Linear.LogisticRegressionTest do test "Iris Data Set - multinomial logistic regression test" do {x_train, x_test, y_train, y_test} = iris_data() - model = LogisticRegression.fit(x_train, y_train, num_classes: 3) + model = LogisticRegression.fit(x_train, y_train, num_classes: 3, alpha: 0.0) res = LogisticRegression.predict(model, x_test) accuracy = Scholar.Metrics.Classification.accuracy(res, y_test) @@ -107,8 +107,8 @@ defmodule Scholar.Linear.LogisticRegressionTest do describe "linearly separable data" do test "1D" do key = Nx.Random.key(12) - {x1, key} = Nx.Random.uniform(key, -1.0, 0.0, shape: {1000, 1}) - {x2, _key} = Nx.Random.uniform(key, 0.0, 1.0, shape: {1000, 1}) + {x1, key} = Nx.Random.uniform(key, -2, -1, shape: {1000, 1}) + {x2, _key} = Nx.Random.uniform(key, 1, 2, shape: {1000, 1}) x = Nx.concatenate([x1, x2]) y1 = Nx.broadcast(0, {1000}) y2 = Nx.broadcast(1, {1000}) @@ -121,8 +121,8 @@ defmodule Scholar.Linear.LogisticRegressionTest do test "2D" do key = Nx.Random.key(12) - {x1, key} = Nx.Random.uniform(key, -1.0, 0.0, shape: {1000, 2}) - {x2, _key} = Nx.Random.uniform(key, 0.0, 1.0, shape: {1000, 2}) + {x1, key} = Nx.Random.uniform(key, -2, -1, shape: {1000, 2}) + {x2, _key} = Nx.Random.uniform(key, 1, 2, shape: {1000, 2}) x = Nx.concatenate([x1, x2]) y1 = Nx.broadcast(0, {1000}) y2 = Nx.broadcast(1, {1000}) From 6283b8d22d180b5b701e8b75476b09ea3bd3d759 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krsto=20Prorokovi=C4=87?= Date: Fri, 16 Jan 2026 13:00:23 +0100 Subject: [PATCH 10/13] Fix tests --- lib/scholar/linear/logistic_regression.ex | 1 + .../linear/logistic_regression_test.exs | 31 ------------------- 2 files changed, 1 insertion(+), 31 deletions(-) diff --git a/lib/scholar/linear/logistic_regression.ex b/lib/scholar/linear/logistic_regression.ex index 9c1f1301..b6c8c79f 100644 --- a/lib/scholar/linear/logistic_regression.ex +++ b/lib/scholar/linear/logistic_regression.ex @@ -67,6 +67,7 @@ defmodule Scholar.Linear.LogisticRegression do [ [0.0915902629494667, -0.09159023314714432], [-0.1507941037416458, 0.1507941335439682] + ] ), bias: Nx.tensor([-0.06566660106182098, 0.06566664576530457]) } diff --git a/test/scholar/linear/logistic_regression_test.exs b/test/scholar/linear/logistic_regression_test.exs index 99767a1b..3248f0cd 100644 --- a/test/scholar/linear/logistic_regression_test.exs +++ b/test/scholar/linear/logistic_regression_test.exs @@ -40,23 +40,6 @@ defmodule Scholar.Linear.LogisticRegressionTest do fn -> LogisticRegression.fit(x, y) end end - test "when :optimizer is invalid" do - x = Nx.tensor([[1, 2], [3, 4]]) - y = Nx.tensor([1, 2]) - - assert_raise NimbleOptions.ValidationError, - """ - invalid value for :optimizer option: expected :optimizer to be either \ - a valid 0-arity function in 
Polaris.Optimizers or a valid {init_fn, update_fn} tuple\ - """, - fn -> - LogisticRegression.fit(x, y, - num_classes: 2, - optimizer: :invalid_optimizer - ) - end - end - test "when :max_iterations is not a positive integer" do x = Nx.tensor([[1, 2], [3, 4]]) y = Nx.tensor([1, 2]) @@ -90,20 +73,6 @@ defmodule Scholar.Linear.LogisticRegressionTest do end end - describe "column target tests" do - @tag :wip - test "column target" do - {x_train, _, y_train, _} = iris_data() - - model = LogisticRegression.fit(x_train, y_train, num_classes: 3) - pred = LogisticRegression.predict(model, x_train) - col_model = LogisticRegression.fit(x_train, y_train |> Nx.new_axis(-1), num_classes: 3) - col_pred = LogisticRegression.predict(col_model, x_train) - assert model == col_model - assert pred == col_pred - end - end - describe "linearly separable data" do test "1D" do key = Nx.Random.key(12) From 9e7d78450bf4dea74f69625a2ce817047e4d1aef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krsto=20Prorokovi=C4=87?= Date: Fri, 16 Jan 2026 13:05:21 +0100 Subject: [PATCH 11/13] Fix docstring in model_selection.ex --- lib/scholar/model_selection.ex | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/scholar/model_selection.ex b/lib/scholar/model_selection.ex index 6303971e..20ce653f 100644 --- a/lib/scholar/model_selection.ex +++ b/lib/scholar/model_selection.ex @@ -178,8 +178,8 @@ defmodule Scholar.ModelSelection do iex> y = Nx.tensor([0, 1, 2, 0, 1, 1, 0]) iex> opts = [ ...> num_classes: [3], - ...> iterations: [10, 20, 50], - ...> optimizer: [Polaris.Optimizers.adam(learning_rate: 0.005), Polaris.Optimizers.adam(learning_rate: 0.01)], + ...> max_iterations: [10, 20, 50], + ...> alpha: [0.0, 0.1, 1.0], ...> ] iex> Scholar.ModelSelection.grid_search(x, y, folding_fun, scoring_fun, opts) """ From dcc2629f7111a404c9bbcfc891f9a4fa8ca3db3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krsto=20Prorokovi=C4=87?= Date: Fri, 16 Jan 2026 13:46:12 +0100 Subject: [PATCH 12/13] Move max_iterations from arguments to options --- lib/scholar/linear/logistic_regression.ex | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/lib/scholar/linear/logistic_regression.ex b/lib/scholar/linear/logistic_regression.ex index b6c8c79f..9259f3f8 100644 --- a/lib/scholar/linear/logistic_regression.ex +++ b/lib/scholar/linear/logistic_regression.ex @@ -98,14 +98,13 @@ defmodule Scholar.Linear.LogisticRegression do alpha = Nx.tensor(alpha, type: type) {tol, opts} = Keyword.pop!(opts, :tol) tol = Nx.tensor(tol, type: type) - {max_iterations, opts} = Keyword.pop!(opts, :max_iterations) - max_iterations = Nx.tensor(max_iterations, type: :u32) - fit_n(x, y, alpha, max_iterations, tol, opts) + fit_n(x, y, alpha, tol, opts) end - defnp fit_n(x, y, alpha, max_iterations, tol, opts) do + defnp fit_n(x, y, alpha, tol, opts) do num_classes = opts[:num_classes] + max_iterations = opts[:max_iterations] {num_samples, num_features} = Nx.shape(x) type = to_float_type(x) @@ -145,7 +144,7 @@ defmodule Scholar.Linear.LogisticRegression do {coef, bias, _} = while {w, b, - {alpha, x, y_one_hot, max_iterations, tol, armijo_params, iter = Nx.u32(0), + {alpha, x, y_one_hot, tol, armijo_params, iter = Nx.u32(0), converged? = Nx.u8(0)}}, iter < max_iterations and not converged? do logits = Nx.dot(x, w) + b @@ -176,7 +175,7 @@ defmodule Scholar.Linear.LogisticRegression do converged? 
= Nx.reduce_max(Nx.abs(grad_w)) < tol and Nx.reduce_max(Nx.abs(grad_b)) < tol - {w, b, {alpha, x, y_one_hot, max_iterations, tol, armijo_params, iter + 1, converged?}} + {w, b, {alpha, x, y_one_hot, tol, armijo_params, iter + 1, converged?}} end %__MODULE__{ From 2e230db35525a51ac0b4750199583e0b69ecdf35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krsto=20Prorokovi=C4=87?= Date: Fri, 16 Jan 2026 13:49:06 +0100 Subject: [PATCH 13/13] mix format --- lib/scholar/linear/logistic_regression.ex | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/scholar/linear/logistic_regression.ex b/lib/scholar/linear/logistic_regression.ex index 9259f3f8..9230aee8 100644 --- a/lib/scholar/linear/logistic_regression.ex +++ b/lib/scholar/linear/logistic_regression.ex @@ -144,8 +144,7 @@ defmodule Scholar.Linear.LogisticRegression do {coef, bias, _} = while {w, b, - {alpha, x, y_one_hot, tol, armijo_params, iter = Nx.u32(0), - converged? = Nx.u8(0)}}, + {alpha, x, y_one_hot, tol, armijo_params, iter = Nx.u32(0), converged? = Nx.u8(0)}}, iter < max_iterations and not converged? do logits = Nx.dot(x, w) + b probabilities = softmax(logits)
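
After the full series is applied, training starts from zero-initialized weights,
adds an L2 penalty scaled by `:alpha`, and takes gradient steps sized by an
Armijo backtracking line search; the `:optimizer`, `:eps`, and `:iterations`
options are replaced by `:alpha`, `:tol`, and `:max_iterations`. A minimal
usage sketch of the resulting API (the predict/predict_probability outputs are
the doctest values from PATCH 09; all other values are illustrative):

    x = Nx.tensor([[1.0, 2.0], [3.0, 2.0], [4.0, 7.0]])
    y = Nx.tensor([1, 0, 1])

    # :num_classes is required; :alpha, :tol, and :max_iterations are shown
    # at their defaults and may be omitted.
    model =
      Scholar.Linear.LogisticRegression.fit(x, y,
        num_classes: 2,
        alpha: 1.0,
        tol: 1.0e-4,
        max_iterations: 1000
      )

    Scholar.Linear.LogisticRegression.predict(model, Nx.tensor([[-3.0, 5.0]]))
    #=> Nx.tensor([1])

    Scholar.Linear.LogisticRegression.predict_probability(model, Nx.tensor([[-3.0, 5.0]]))
    #=> Nx.tensor([[0.10075931251049042, 0.8992406725883484]])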