kokkos · mhoemmen · Jan 28, 2026
diff --git a/examples/kokkos-based/CMakeLists.txt b/examples/kokkos-based/CMakeLists.txt
@@ -5,6 +5,5 @@ linalg_add_example(dotc_kokkos)
 linalg_add_example(idx_abs_max_kokkos)
 linalg_add_example(vector_norm2_kokkos)
 linalg_add_example(vector_abs_sum_kokkos)
-linalg_add_example(vector_sum_of_squares_kokkos)
 linalg_add_example(scale_kokkos)
 linalg_add_example(matrix_vector_product_kokkos)
diff --git a/examples/kokkos-based/vector_sum_of_squares_kokkos.cpp b/examples/kokkos-based/vector_sum_of_squares_kokkos.cpp
diff --git a/include/experimental/__p1673_bits/blas1_vector_norm2.hpp b/include/experimental/__p1673_bits/blas1_vector_norm2.hpp
@@ -18,6 +18,7 @@
 #ifndef LINALG_INCLUDE_EXPERIMENTAL___P1673_BITS_BLAS1_VECTOR_NORM2_HPP_
 #define LINALG_INCLUDE_EXPERIMENTAL___P1673_BITS_BLAS1_VECTOR_NORM2_HPP_
 
+#include "abs_if_needed.hpp"
 #include "blas1_vector_sum_of_squares.hpp"
 #include <cmath>
 #include <cstdlib>
@@ -60,18 +61,26 @@ Scalar vector_two_norm(
   mdspan<ElementType, extents<SizeType, ext0>, Layout, Accessor> x,
   Scalar init)
 {
-  // Initialize the sum of squares result
-  sum_of_squares_result<Scalar> ssq_init;
-  ssq_init.scaling_factor = Scalar{};
-  // FIXME (Hoemmen 2021/05/27) We'll need separate versions of this
-  // for types whose "one" we don't know how to construct.
-  ssq_init.scaled_sum_of_squares = 1.0;
-
-  // Compute the sum of squares using an algorithm that avoids
-  // underflow and overflow by scaling.
-  auto ssq_res = vector_sum_of_squares(exec, x, ssq_init);
   using std::sqrt;
-  return init + ssq_res.scaling_factor * sqrt(ssq_res.scaled_sum_of_squares);
+
+  if constexpr (std::is_floating_point_v<Scalar> && std::is_floating_point_v<typename decltype(x)::value_type>) {
+    // Initialize the sum of squares result
+    detail::sum_of_squares_result<Scalar> ssq_init;
+    ssq_init.scaling_factor = init == Scalar{} ? Scalar{} : impl::abs_if_needed(init);
+    ssq_init.scaled_sum_of_squares = init == Scalar{} ? Scalar{} : Scalar{1.0};
+
+    // Compute the sum of squares using an algorithm that avoids
+    // underflow and overflow by scaling.
+    auto ssq_res = detail::vector_sum_of_squares(exec, x, ssq_init);
+    return ssq_res.scaling_factor * sqrt(ssq_res.scaled_sum_of_squares);
+  }
+  else {
+    Scalar result = impl::abs_if_needed(init) * impl::abs_if_needed(init);
+    for (SizeType i = 0; i < x.extent(0); ++i) {
+      result += impl::abs_if_needed(x(i)) * impl::abs_if_needed(x(i));
+    }
+    return sqrt(result);
+  }
 }
 
 template<class ExecutionPolicy,

diff --git a/include/experimental/__p1673_bits/blas1_vector_sum_of_squares.hpp b/include/experimental/__p1673_bits/blas1_vector_sum_of_squares.hpp
@@ -26,6 +26,8 @@ namespace MDSPAN_IMPL_PROPOSED_NAMESPACE {
 inline namespace __p1673_version_0 {
 namespace linalg {
 
+namespace detail {
+
 // Scaled sum of squares of a vector's elements
 template<class Scalar>
 struct sum_of_squares_result {
@@ -53,19 +55,23 @@ struct is_custom_vector_sum_of_squares_avail<
     && ! impl::is_inline_exec_v<Exec>
     >
   >
-  : std::true_type{};
+  : std::bool_constant<
+      std::is_floating_point_v<typename x_t::value_type> &&
+      std::is_floating_point_v<Scalar>
+    >
+  {};
 
 } // end anonymous namespace
 
 template<class ElementType,
-	 class SizeType,
+	 class IndexType,
          ::std::size_t ext0,
          class Layout,
          class Accessor,
          class Scalar>
 sum_of_squares_result<Scalar> vector_sum_of_squares(
   impl::inline_exec_t&& /* exec */,
-  mdspan<ElementType, extents<SizeType, ext0>, Layout, Accessor> x,
+  mdspan<ElementType, extents<IndexType, ext0>, Layout, Accessor> x,
   sum_of_squares_result<Scalar> init)
 {
   using std::abs;
@@ -79,7 +85,7 @@ sum_of_squares_result<Scalar> vector_sum_of_squares(
 
   Scalar scale = init.scaling_factor;
   Scalar ssq = init.scaled_sum_of_squares;
-  for (SizeType i = 0; i < x.extent(0); ++i) {
+  for (IndexType i = 0; i < x.extent(0); ++i) {
     if (abs(x(i)) != 0.0) {
       const auto absxi = abs(x(i));
       if (scale < absxi) {
@@ -102,14 +108,14 @@ sum_of_squares_result<Scalar> vector_sum_of_squares(
 
 template<class ExecutionPolicy,
          class ElementType,
-	 class SizeType,
+	 class IndexType,
          ::std::size_t ext0,
          class Layout,
          class Accessor,
          class Scalar>
 sum_of_squares_result<Scalar> vector_sum_of_squares(
   ExecutionPolicy&& exec,
-  mdspan<ElementType, extents<SizeType, ext0>, Layout, Accessor> v,
+  mdspan<ElementType, extents<IndexType, ext0>, Layout, Accessor> v,
   sum_of_squares_result<Scalar> init)
 {
   constexpr bool use_custom = is_custom_vector_sum_of_squares_avail<
@@ -125,18 +131,19 @@ sum_of_squares_result<Scalar> vector_sum_of_squares(
 }
 
 template<class ElementType,
-	 class SizeType,
+	 class IndexType,
          ::std::size_t ext0,
          class Layout,
          class Accessor,
          class Scalar>
 sum_of_squares_result<Scalar> vector_sum_of_squares(
-  mdspan<ElementType, extents<SizeType, ext0>, Layout, Accessor> v,
+  mdspan<ElementType, extents<IndexType, ext0>, Layout, Accessor> v,
   sum_of_squares_result<Scalar> init)
 {
   return vector_sum_of_squares(impl::default_exec_t{}, v, init);
 }
 
+} // end namespace detail
 
 } // end namespace linalg
 } // end inline namespace __p1673_version_0

diff --git a/tests/kokkos-based/CMakeLists.txt b/tests/kokkos-based/CMakeLists.txt
@@ -80,9 +80,6 @@ linalg_add_test_kokkos(
 linalg_add_test_kokkos(
   vector_norm2_kokkos
   "vector_norm2: kokkos impl")
-linalg_add_test_kokkos(
-  vector_sum_of_squares_kokkos
-  "vector_sum_of_squares: kokkos impl")
 
 linalg_add_test_kokkos(
   vector_abs_sum_kokkos

diff --git a/tests/kokkos-based/vector_sum_of_squares_kokkos.cpp b/tests/kokkos-based/vector_sum_of_squares_kokkos.cpp
diff --git a/tests/native/norm2.cpp b/tests/native/norm2.cpp
@@ -31,15 +31,14 @@ namespace {
     std::vector<scalar_t> storage(vectorSize);
     vector_t x(storage.data(), vectorSize);
 
-    // Testing for absolute equality
     const auto normResult = vector_two_norm(x, mag_t{});
     static_assert( std::is_same_v<std::remove_const_t<decltype(normResult)>, mag_t> );
     const mag_t expectedNormResult{};
     EXPECT_EQ( expectedNormResult, normResult );
 
-    // Make sure that init always gets added to the result.
-    const mag_t normResultPlusOne = vector_two_norm(x, mag_t(1.0));
-    EXPECT_EQ( expectedNormResult + mag_t(1.0), normResultPlusOne );
+    const mag_t normResultPlusOne = vector_two_norm(x, mag_t(3.0));
+    const mag_t expectedNormResultPlusOne = mag_t(3.0);
+    EXPECT_EQ( expectedNormResultPlusOne, normResultPlusOne );
 
     // Test 'auto' overload.
     const auto normResultAuto = vector_two_norm(x);
@@ -63,15 +62,14 @@ namespace {
 
     x[0] = -3;
 
-    // Testing for absolute equality
     const auto normResult = vector_two_norm(x, mag_t{});
     static_assert( std::is_same_v<std::remove_const_t<decltype(normResult)>, mag_t> );
     const mag_t expectedNormResult = abs( x[0] );
     EXPECT_EQ( expectedNormResult, normResult );
 
-    // Make sure that init always gets added to the result.
-    const mag_t normResultPlusOne = vector_two_norm(x, mag_t(1.0));
-    EXPECT_EQ( expectedNormResult + mag_t(1.0), normResultPlusOne );
+    const mag_t normResultPlusOne = vector_two_norm(x, mag_t(4.0));
+    const mag_t expectedNormResultPlusOne = mag_t(5.0);
+    EXPECT_EQ( expectedNormResultPlusOne, normResultPlusOne );
 
     // Test 'auto' overload.
     const auto normResultAuto = vector_two_norm(x);

diff --git a/...tions/include/experimental/__p1673_bits/kokkos-kernels/blas1_vector_sum_of_squares_kk.hpp b/...tions/include/experimental/__p1673_bits/kokkos-kernels/blas1_vector_sum_of_squares_kk.hpp
@@ -4,7 +4,7 @@
 
 #include "signal_kokkos_impl_called.hpp"
 
-namespace KokkosKernelsSTD {
+namespace KokkosKernelsSTD::detail {
 
 template<class ExecSpace,
          class ElementType,