From 0e215d4a1361caa7589cef4d179b38ba50105afb Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Wed, 28 Jan 2026 09:55:45 -0700
Subject: [PATCH] Implement LWG4302; fix #320

* Remove vector_sum_of_squares from the public interface.
  This implements the proposed fix for LWG4302, that is now
  in the Working Draft for C++26.

* Fix Issue #320.  If the "init" argument of vector_two_norm
  is nonzero, return the square root of the ((square of init)
  and the (sum of squares of the absolute values of the
  elements of x)).
---
 examples/kokkos-based/CMakeLists.txt          |   1 -
 .../vector_sum_of_squares_kokkos.cpp          |  52 --------
 .../__p1673_bits/blas1_vector_norm2.hpp       |  31 +++--
 .../blas1_vector_sum_of_squares.hpp           |  23 ++--
 tests/kokkos-based/CMakeLists.txt             |   3 -
 .../vector_sum_of_squares_kokkos.cpp          | 111 ------------------
 tests/native/norm2.cpp                        |  14 +--
 .../blas1_vector_sum_of_squares_kk.hpp        |   2 +-
 8 files changed, 42 insertions(+), 195 deletions(-)
 delete mode 100644 examples/kokkos-based/vector_sum_of_squares_kokkos.cpp
 delete mode 100644 tests/kokkos-based/vector_sum_of_squares_kokkos.cpp
diff --git a/examples/kokkos-based/CMakeLists.txt b/examples/kokkos-based/CMakeLists.txt
index d88f634b..fc46076f 100644
--- a/examples/kokkos-based/CMakeLists.txt
+++ b/examples/kokkos-based/CMakeLists.txt
@@ -5,6 +5,5 @@ linalg_add_example(dotc_kokkos)
 linalg_add_example(idx_abs_max_kokkos)
 linalg_add_example(vector_norm2_kokkos)
 linalg_add_example(vector_abs_sum_kokkos)
-linalg_add_example(vector_sum_of_squares_kokkos)
 linalg_add_example(scale_kokkos)
 linalg_add_example(matrix_vector_product_kokkos)
diff --git a/examples/kokkos-based/vector_sum_of_squares_kokkos.cpp b/examples/kokkos-based/vector_sum_of_squares_kokkos.cpp
deleted file mode 100644
index 1820b89a..00000000
--- a/examples/kokkos-based/vector_sum_of_squares_kokkos.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 4.0
-//       Copyright (2022) National Technology & Engineering
-//               Solutions of Sandia, LLC (NTESS).
-//
-// Under the terms of Contract DE-NA0003525 with NTESS,
-// the U.S. Government retains certain rights in this software.
-//
-// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
-// See https://kokkos.org/LICENSE for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-// ************************************************************************
-//@HEADER
-
-#include <experimental/linalg>
-#include <iostream>
-
-int main(int argc, char* argv[])
-{
-  std::cout << "vector_sum_of_squares example: calling kokkos-kernels" << std::endl;
-
-  std::size_t N = 20;
-  Kokkos::initialize(argc,argv);
-  {
-    using value_type = double;
-    Kokkos::View<value_type*> x_view("x",N);
-    value_type* x_ptr = x_view.data();
-
-    using dyn_1d_ext_type = std::experimental::extents<std::experimental::dynamic_extent>;
-    using mdspan_type  = std::experimental::mdspan<value_type, dyn_1d_ext_type>;
-    mdspan_type x(x_ptr,N);
-    for(std::size_t i=0; i<x.extent(0); i++){
-      x(i) = i;
-    }
-
-    namespace stdla = std::experimental::linalg;
-    stdla::sum_of_squares_result<value_type> init_value{1., 1.};
-
-    const auto res = stdla::vector_sum_of_squares(x, init_value);
-    std::cout << "Default result: " << res.scaling_factor << " " << res.scaled_sum_of_squares << '\n';
-
-    // FRIZZI: Oct 27: kk currently not impl yet, just placeholder to ensure hook forwards correctly
-    const auto res_kk = stdla::vector_sum_of_squares(KokkosKernelsSTD::kokkos_exec<>(), x, init_value);
-    (void)res_kk;
-    //std::cout << "Kokkos result: " << res_kk.scaling_factor << " " << res_kk.scaled_sum_of_squares << '\n';
-
-  }
-  Kokkos::finalize();
-}
diff --git a/include/experimental/__p1673_bits/blas1_vector_norm2.hpp b/include/experimental/__p1673_bits/blas1_vector_norm2.hpp
index 61d657ac..147f523e 100644
--- a/include/experimental/__p1673_bits/blas1_vector_norm2.hpp
+++ b/include/experimental/__p1673_bits/blas1_vector_norm2.hpp
@@ -18,6 +18,7 @@
 #ifndef LINALG_INCLUDE_EXPERIMENTAL___P1673_BITS_BLAS1_VECTOR_NORM2_HPP_
 #define LINALG_INCLUDE_EXPERIMENTAL___P1673_BITS_BLAS1_VECTOR_NORM2_HPP_
 
+#include "abs_if_needed.hpp"
 #include "blas1_vector_sum_of_squares.hpp"
 #include <cmath>
 #include <cstdlib>
@@ -60,18 +61,26 @@ Scalar vector_two_norm(
   mdspan<ElementType, extents<SizeType, ext0>, Layout, Accessor> x,
   Scalar init)
 {
-  // Initialize the sum of squares result
-  sum_of_squares_result<Scalar> ssq_init;
-  ssq_init.scaling_factor = Scalar{};
-  // FIXME (Hoemmen 2021/05/27) We'll need separate versions of this
-  // for types whose "one" we don't know how to construct.
-  ssq_init.scaled_sum_of_squares = 1.0;
-
-  // Compute the sum of squares using an algorithm that avoids
-  // underflow and overflow by scaling.
-  auto ssq_res = vector_sum_of_squares(exec, x, ssq_init);
   using std::sqrt;
-  return init + ssq_res.scaling_factor * sqrt(ssq_res.scaled_sum_of_squares);
+
+  if constexpr (std::is_floating_point_v<Scalar> && std::is_floating_point_v<typename decltype(x)::value_type>) {
+    // Initialize the sum of squares result
+    detail::sum_of_squares_result<Scalar> ssq_init;
+    ssq_init.scaling_factor = init == Scalar{} ? Scalar{} : impl::abs_if_needed(init);
+    ssq_init.scaled_sum_of_squares = init == Scalar{} ? Scalar{} : Scalar{1.0};
+
+    // Compute the sum of squares using an algorithm that avoids
+    // underflow and overflow by scaling.
+    auto ssq_res = detail::vector_sum_of_squares(exec, x, ssq_init);
+    return ssq_res.scaling_factor * sqrt(ssq_res.scaled_sum_of_squares);
+  }
+  else {
+    Scalar result = impl::abs_if_needed(init) * impl::abs_if_needed(init);
+    for (SizeType i = 0; i < x.extent(0); ++i) {
+      result += impl::abs_if_needed(x(i)) * impl::abs_if_needed(x(i));
+    }
+    return sqrt(result);
+  }
 }
 
 template<class ExecutionPolicy,
diff --git a/include/experimental/__p1673_bits/blas1_vector_sum_of_squares.hpp b/include/experimental/__p1673_bits/blas1_vector_sum_of_squares.hpp
index b563decf..15e4a219 100644
--- a/include/experimental/__p1673_bits/blas1_vector_sum_of_squares.hpp
+++ b/include/experimental/__p1673_bits/blas1_vector_sum_of_squares.hpp
@@ -26,6 +26,8 @@ namespace MDSPAN_IMPL_PROPOSED_NAMESPACE {
 inline namespace __p1673_version_0 {
 namespace linalg {
 
+namespace detail {
+
 // Scaled sum of squares of a vector's elements
 template<class Scalar>
 struct sum_of_squares_result {
@@ -53,19 +55,23 @@ struct is_custom_vector_sum_of_squares_avail<
     && ! impl::is_inline_exec_v<Exec>
     >
   >
-  : std::true_type{};
+  : std::bool_constant<
+      std::is_floating_point_v<typename x_t::value_type> &&
+      std::is_floating_point_v<Scalar>
+    >
+  {};
 
 } // end anonymous namespace
 
 template<class ElementType,
-	 class SizeType,
+	 class IndexType,
          ::std::size_t ext0,
          class Layout,
          class Accessor,
          class Scalar>
 sum_of_squares_result<Scalar> vector_sum_of_squares(
   impl::inline_exec_t&& /* exec */,
-  mdspan<ElementType, extents<SizeType, ext0>, Layout, Accessor> x,
+  mdspan<ElementType, extents<IndexType, ext0>, Layout, Accessor> x,
   sum_of_squares_result<Scalar> init)
 {
   using std::abs;
@@ -79,7 +85,7 @@ sum_of_squares_result<Scalar> vector_sum_of_squares(
 
   Scalar scale = init.scaling_factor;
   Scalar ssq = init.scaled_sum_of_squares;
-  for (SizeType i = 0; i < x.extent(0); ++i) {
+  for (IndexType i = 0; i < x.extent(0); ++i) {
     if (abs(x(i)) != 0.0) {
       const auto absxi = abs(x(i));
       if (scale < absxi) {
@@ -102,14 +108,14 @@ sum_of_squares_result<Scalar> vector_sum_of_squares(
 
 template<class ExecutionPolicy,
          class ElementType,
-	 class SizeType,
+	 class IndexType,
          ::std::size_t ext0,
          class Layout,
          class Accessor,
          class Scalar>
 sum_of_squares_result<Scalar> vector_sum_of_squares(
   ExecutionPolicy&& exec,
-  mdspan<ElementType, extents<SizeType, ext0>, Layout, Accessor> v,
+  mdspan<ElementType, extents<IndexType, ext0>, Layout, Accessor> v,
   sum_of_squares_result<Scalar> init)
 {
   constexpr bool use_custom = is_custom_vector_sum_of_squares_avail<
@@ -125,18 +131,19 @@ sum_of_squares_result<Scalar> vector_sum_of_squares(
 }
 
 template<class ElementType,
-	 class SizeType,
+	 class IndexType,
          ::std::size_t ext0,
          class Layout,
          class Accessor,
          class Scalar>
 sum_of_squares_result<Scalar> vector_sum_of_squares(
-  mdspan<ElementType, extents<SizeType, ext0>, Layout, Accessor> v,
+  mdspan<ElementType, extents<IndexType, ext0>, Layout, Accessor> v,
   sum_of_squares_result<Scalar> init)
 {
   return vector_sum_of_squares(impl::default_exec_t{}, v, init);
 }
 
+} // end namespace detail
 
 } // end namespace linalg
 } // end inline namespace __p1673_version_0
diff --git a/tests/kokkos-based/CMakeLists.txt b/tests/kokkos-based/CMakeLists.txt
index aa247c04..28cfcd84 100644
--- a/tests/kokkos-based/CMakeLists.txt
+++ b/tests/kokkos-based/CMakeLists.txt
@@ -80,9 +80,6 @@ linalg_add_test_kokkos(
 linalg_add_test_kokkos(
   vector_norm2_kokkos
   "vector_norm2: kokkos impl")
-linalg_add_test_kokkos(
-  vector_sum_of_squares_kokkos
-  "vector_sum_of_squares: kokkos impl")
 
 linalg_add_test_kokkos(
   vector_abs_sum_kokkos
diff --git a/tests/kokkos-based/vector_sum_of_squares_kokkos.cpp b/tests/kokkos-based/vector_sum_of_squares_kokkos.cpp
deleted file mode 100644
index 15614147..00000000
--- a/tests/kokkos-based/vector_sum_of_squares_kokkos.cpp
+++ /dev/null
@@ -1,111 +0,0 @@
-
-#include "gtest_fixtures.hpp"
-#include "helpers.hpp"
-#include <cmath>
-
-namespace
-{
-
-template<class x_t, class T>
-std::experimental::linalg::sum_of_squares_result<T>
-vector_sum_of_squares_gold_solution(x_t x,
-				    std::experimental::linalg::sum_of_squares_result<T> init)
-{
-  using std::abs;
-
-  T scale = init.scaling_factor;
-  for (std::size_t i = 0; i < x.extent(0); ++i) {
-    scale = std::max(scale, abs(x(i)));
-  }
-
-  T ssq = (init.scaling_factor*init.scaling_factor*init.scaled_sum_of_squares)/(scale*scale);
-  T s=0.;
-  for (std::size_t i = 0; i < x.extent(0); ++i) {
-    const auto absxi = abs(x(i));
-    const auto quotient = absxi/scale;
-    ssq = ssq + quotient * quotient;
-    s += absxi*absxi;
-  }
-
-  std::experimental::linalg::sum_of_squares_result<T> result;
-  result.scaled_sum_of_squares = ssq;
-  result.scaling_factor = scale;
-
-  // verify that things are consistent according to definition
-  // scaled_sum_of_squares: is a value such that
-  // scaling_factor^2 * scaled_sum_of_squares equals the
-  // sum of squares of abs(x[i]) plus init.scaling_factor^2 * init.scaled_sum_of_squares.
-  //
-  const auto lhs = scale*scale*ssq;
-  const auto rhs = s+init.scaling_factor*init.scaling_factor*init.scaled_sum_of_squares;
-  std::cout << "Gold check : " << lhs << " " << rhs << std::endl;
-  if constexpr(std::is_same_v<T, float>){
-    EXPECT_NEAR(lhs, rhs, 1e-2);
-  }
-  if constexpr(std::is_same_v<T, double>){
-    EXPECT_NEAR(lhs, rhs, 1e-9);
-  }
-
-  return result;
-}
-
-template<class x_t, class T>
-void kokkos_blas1_vector_sum_of_squares_test_impl(x_t x,
-						  std::experimental::linalg::sum_of_squares_result<T> initValue)
-{
-  namespace stdla = std::experimental::linalg;
-
-  using value_type = typename x_t::value_type;
-  const std::size_t extent = x.extent(0);
-
-  // copy x to verify they are not changed after kernel
-  auto x_preKernel = kokkostesting::create_stdvector_and_copy(x);
-
-  const auto gold = vector_sum_of_squares_gold_solution(x, initValue);
-  auto result = stdla::vector_sum_of_squares(KokkosKernelsSTD::kokkos_exec<>(),
-					  x, initValue);
-
-  if constexpr(std::is_same_v<T, float>)
-  {
-    EXPECT_NEAR(result.scaled_sum_of_squares, gold.scaled_sum_of_squares, 1e-3);
-    EXPECT_NEAR(result.scaling_factor,	      gold.scaling_factor,	  1e-3);
-  }
-
-  if constexpr(std::is_same_v<T, double>)
-  {
-    EXPECT_NEAR(result.scaled_sum_of_squares, gold.scaled_sum_of_squares, 1e-9);
-    EXPECT_NEAR(result.scaling_factor,	      gold.scaling_factor,	  1e-9);
-  }
-
-  // x should not change after kernel
-  for (std::size_t i=0; i<extent; ++i){
-    EXPECT_TRUE(x(i) == x_preKernel[i]);
-  }
-
-}
-}//end anonym namespace
-
-TEST_F(blas1_signed_float_fixture, kokkos_vector_sum_of_squares)
-{
-  namespace stdla = std::experimental::linalg;
-  stdla::sum_of_squares_result<value_type> init_value{2.5f, 1.2f};
-  kokkos_blas1_vector_sum_of_squares_test_impl(x, init_value);
-}
-
-TEST_F(blas1_signed_double_fixture, kokkos_vector_sum_of_squares)
-{
-  namespace stdla = std::experimental::linalg;
-  stdla::sum_of_squares_result<value_type> init_value{3.0, 1.2};
-  kokkos_blas1_vector_sum_of_squares_test_impl(x, init_value);
-}
-
-TEST_F(blas1_signed_complex_double_fixture, kokkos_vector_sum_of_squares)
-{
-  namespace stdla = std::experimental::linalg;
-  using kc_t = Kokkos::complex<double>;
-  using stdc_t = value_type;
-  if constexpr (alignof(value_type) == alignof(kc_t)){
-    stdla::sum_of_squares_result<double> init_value{2.5, 1.2};
-    kokkos_blas1_vector_sum_of_squares_test_impl(x, init_value);
-  }
-}
diff --git a/tests/native/norm2.cpp b/tests/native/norm2.cpp
index 5c3877bb..f489a28b 100644
--- a/tests/native/norm2.cpp
+++ b/tests/native/norm2.cpp
@@ -31,15 +31,14 @@ namespace {
     std::vector<scalar_t> storage(vectorSize);
     vector_t x(storage.data(), vectorSize);
 
-    // Testing for absolute equality
     const auto normResult = vector_two_norm(x, mag_t{});
     static_assert( std::is_same_v<std::remove_const_t<decltype(normResult)>, mag_t> );
     const mag_t expectedNormResult{};
     EXPECT_EQ( expectedNormResult, normResult );
 
-    // Make sure that init always gets added to the result.
-    const mag_t normResultPlusOne = vector_two_norm(x, mag_t(1.0));
-    EXPECT_EQ( expectedNormResult + mag_t(1.0), normResultPlusOne );
+    const mag_t normResultPlusOne = vector_two_norm(x, mag_t(3.0));
+    const mag_t expectedNormResultPlusOne = mag_t(3.0);
+    EXPECT_EQ( expectedNormResultPlusOne, normResultPlusOne );
 
     // Test 'auto' overload.
     const auto normResultAuto = vector_two_norm(x);
@@ -63,15 +62,14 @@ namespace {
 
     x[0] = -3;
 
-    // Testing for absolute equality
     const auto normResult = vector_two_norm(x, mag_t{});
     static_assert( std::is_same_v<std::remove_const_t<decltype(normResult)>, mag_t> );
     const mag_t expectedNormResult = abs( x[0] );
     EXPECT_EQ( expectedNormResult, normResult );
 
-    // Make sure that init always gets added to the result.
-    const mag_t normResultPlusOne = vector_two_norm(x, mag_t(1.0));
-    EXPECT_EQ( expectedNormResult + mag_t(1.0), normResultPlusOne );
+    const mag_t normResultPlusOne = vector_two_norm(x, mag_t(4.0));
+    const mag_t expectedNormResultPlusOne = mag_t(5.0);
+    EXPECT_EQ( expectedNormResultPlusOne, normResultPlusOne );
 
     // Test 'auto' overload.
     const auto normResultAuto = vector_two_norm(x);
diff --git a/tpl-implementations/include/experimental/__p1673_bits/kokkos-kernels/blas1_vector_sum_of_squares_kk.hpp b/tpl-implementations/include/experimental/__p1673_bits/kokkos-kernels/blas1_vector_sum_of_squares_kk.hpp
index 413fda4c..2e8e8475 100644
--- a/tpl-implementations/include/experimental/__p1673_bits/kokkos-kernels/blas1_vector_sum_of_squares_kk.hpp
+++ b/tpl-implementations/include/experimental/__p1673_bits/kokkos-kernels/blas1_vector_sum_of_squares_kk.hpp
@@ -4,7 +4,7 @@
 
 #include "signal_kokkos_impl_called.hpp"
 
-namespace KokkosKernelsSTD {
+namespace KokkosKernelsSTD::detail {
 
 template<class ExecSpace,
          class ElementType,