From 016dab05f57081fa6a650dcafeec592816bc12d1 Mon Sep 17 00:00:00 2001 From: Ivan Pleshkov Date: Mon, 6 May 2024 11:54:12 +0200 Subject: [PATCH] remove avx and sse c code --- Cargo.lock | 1 - quantization/Cargo.toml | 3 - quantization/build.rs | 41 ------- quantization/cpp/avx2.c | 122 --------------------- quantization/cpp/sse.c | 107 ------------------ quantization/src/encoded_vectors_binary.rs | 6 +- quantization/src/encoded_vectors_u8.rs | 13 +-- quantization/src/lib.rs | 1 + quantization/src/simd/avx2/dot_u8.rs | 40 +++++++ quantization/src/simd/avx2/manhattan_u8.rs | 56 ++++++++++ quantization/src/simd/avx2/mod.rs | 13 +++ quantization/src/simd/mod.rs | 8 ++ quantization/src/simd/neon/mod.rs | 0 quantization/src/simd/sse2/dot_u8.rs | 25 +++++ quantization/src/simd/sse2/manhattan_u8.rs | 39 +++++++ quantization/src/simd/sse2/mod.rs | 13 +++ quantization/src/simd/sse2/xor_popcnt.rs | 18 +++ 17 files changed, 218 insertions(+), 288 deletions(-) delete mode 100644 quantization/build.rs delete mode 100644 quantization/cpp/avx2.c delete mode 100644 quantization/cpp/sse.c create mode 100644 quantization/src/simd/avx2/dot_u8.rs create mode 100644 quantization/src/simd/avx2/manhattan_u8.rs create mode 100644 quantization/src/simd/avx2/mod.rs create mode 100644 quantization/src/simd/mod.rs create mode 100644 quantization/src/simd/neon/mod.rs create mode 100644 quantization/src/simd/sse2/dot_u8.rs create mode 100644 quantization/src/simd/sse2/manhattan_u8.rs create mode 100644 quantization/src/simd/sse2/mod.rs create mode 100644 quantization/src/simd/sse2/xor_popcnt.rs diff --git a/Cargo.lock b/Cargo.lock index 5a8ccb0..f439e5c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1478,7 +1478,6 @@ dependencies = [ name = "quantization" version = "0.1.0" dependencies = [ - "cc", "image", "num_threads", "permutation_iterator", diff --git a/quantization/Cargo.toml b/quantization/Cargo.toml index bd20b30..df49e60 100644 --- a/quantization/Cargo.toml +++ b/quantization/Cargo.toml @@ -13,9 +13,6 @@ edition = "2021" [features] dump_image = ["dep:image"] -[build-dependencies] -cc = "1.0" - [dependencies] serde = { version = "~1.0", features = ["derive"] } serde_json = "~1.0" diff --git a/quantization/build.rs b/quantization/build.rs deleted file mode 100644 index daa854b..0000000 --- a/quantization/build.rs +++ /dev/null @@ -1,41 +0,0 @@ -use std::env; - -fn main() { - let mut builder = cc::Build::new(); - - let target_arch = env::var("CARGO_CFG_TARGET_ARCH") - .expect("CARGO_CFG_TARGET_ARCH env-var is not defined or is not UTF-8"); - - // TODO: Is `CARGO_CFG_TARGET_FEATURE` *always* defined? - // - // Cargo docs says that "boolean configurations are present if they are set, - // and not present otherwise", so, what about "target features"? - // - // https://doc.rust-lang.org/cargo/reference/environment-variables.html (Ctrl-F CARGO_CFG_) - let target_feature = env::var("CARGO_CFG_TARGET_FEATURE") - .expect("CARGO_CFG_TARGET_FEATURE env-var is not defined or is not UTF-8"); - - if target_arch == "x86_64" { - builder.file("cpp/sse.c"); - builder.file("cpp/avx2.c"); - - if builder.get_compiler().is_like_msvc() { - builder.flag("/arch:AVX"); - builder.flag("/arch:AVX2"); - builder.flag("/arch:SSE"); - builder.flag("/arch:SSE2"); - } else { - builder.flag("-march=haswell"); - } - - // O3 optimization level - builder.flag("-O3"); - // Use popcnt instruction - builder.flag("-mpopcnt"); - } else if target_arch == "aarch64" && target_feature.split(',').any(|feat| feat == "neon") { - builder.file("cpp/neon.c"); - builder.flag("-O3"); - } - - builder.compile("simd_utils"); -} diff --git a/quantization/cpp/avx2.c b/quantization/cpp/avx2.c deleted file mode 100644 index 87dcc85..0000000 --- a/quantization/cpp/avx2.c +++ /dev/null @@ -1,122 +0,0 @@ -#include -#include -#include - -#include "export_macro.h" - -#define HSUM256_PS(X, R) \ - float R = 0.0f; \ - { \ - __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(X, 1), _mm256_castps256_ps128(X)); \ - __m128 x64 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128)); \ - __m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55)); \ - R = _mm_cvtss_f32(x32); \ - } - -#define HSUM256_EPI32(X, R) \ - int R = 0; \ - { \ - __m128i x128 = _mm_add_epi32(_mm256_extractf128_si256(X, 1), _mm256_castsi256_si128(X)); \ - __m128i x64 = _mm_add_epi32(x128, _mm_srli_si128(x128, 8)); \ - __m128i x32 = _mm_add_epi32(x64, _mm_srli_si128(x64, 4)); \ - R = _mm_cvtsi128_si32(x32); \ - } - -EXPORT float impl_score_dot_avx( - const uint8_t* query_ptr, - const uint8_t* vector_ptr, - uint32_t dim -) { - const __m256i* v_ptr = (const __m256i*)vector_ptr; - const __m256i* q_ptr = (const __m256i*)query_ptr; - - __m256i mul1 = _mm256_setzero_si256(); - __m256i mask_epu32 = _mm256_set1_epi32(0xFFFF); - for (uint32_t _i = 0; _i < dim / 32; _i++) { - __m256i v = _mm256_loadu_si256(v_ptr); - __m256i q = _mm256_loadu_si256(q_ptr); - v_ptr++; - q_ptr++; - - __m256i s = _mm256_maddubs_epi16(v, q); - __m256i s_low = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(s)); - __m256i s_high = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(s, 1)); - mul1 = _mm256_add_epi32(mul1, s_low); - mul1 = _mm256_add_epi32(mul1, s_high); - } - - // the vector sizes are assumed to be multiples of 16, check if one last 16-element part remaining - if (dim % 32 != 0) { - __m128i v_short = _mm_loadu_si128((const __m128i*)v_ptr); - __m128i q_short = _mm_loadu_si128((const __m128i*)q_ptr); - - __m256i v1 = _mm256_cvtepu8_epi16(v_short); - __m256i q1 = _mm256_cvtepu8_epi16(q_short); - - __m256i s = _mm256_mullo_epi16(v1, q1); - mul1 = _mm256_add_epi32(mul1, _mm256_and_si256(s, mask_epu32)); - mul1 = _mm256_add_epi32(mul1, _mm256_srli_epi32(s, 16)); - } - __m256 mul_ps = _mm256_cvtepi32_ps(mul1); - HSUM256_PS(mul_ps, mul_scalar); - return mul_scalar; -} - -EXPORT float impl_score_l1_avx( - const uint8_t* query_ptr, - const uint8_t* vector_ptr, - uint32_t dim -) { - const __m256i* v_ptr = (const __m256i*)vector_ptr; - const __m256i* q_ptr = (const __m256i*)query_ptr; - - uint32_t m = dim - (dim % 32); - __m256i sum256 = _mm256_setzero_si256(); - - for (uint32_t i = 0; i < m; i += 32) { - __m256i v = _mm256_loadu_si256(v_ptr); - __m256i q = _mm256_loadu_si256(q_ptr); - v_ptr++; - q_ptr++; - - // Compute the difference in both directions and take the maximum for abs - __m256i diff1 = _mm256_subs_epu8(v, q); - __m256i diff2 = _mm256_subs_epu8(q, v); - - __m256i abs_diff = _mm256_max_epu8(diff1, diff2); - - __m256i abs_diff16_lo = _mm256_unpacklo_epi8(abs_diff, _mm256_setzero_si256()); - __m256i abs_diff16_hi = _mm256_unpackhi_epi8(abs_diff, _mm256_setzero_si256()); - - sum256 = _mm256_add_epi16(sum256, abs_diff16_lo); - sum256 = _mm256_add_epi16(sum256, abs_diff16_hi); - } - - // the vector sizes are assumed to be multiples of 16, check if one last 16-element part remaining - if (m < dim) { - __m128i v_short = _mm_loadu_si128((const __m128i * ) v_ptr); - __m128i q_short = _mm_loadu_si128((const __m128i * ) q_ptr); - - __m128i diff1 = _mm_subs_epu8(v_short, q_short); - __m128i diff2 = _mm_subs_epu8(q_short, v_short); - - __m128i abs_diff = _mm_max_epu8(diff1, diff2); - - __m128i abs_diff16_lo_128 = _mm_unpacklo_epi8(abs_diff, _mm_setzero_si128()); - __m128i abs_diff16_hi_128 = _mm_unpackhi_epi8(abs_diff, _mm_setzero_si128()); - - __m256i abs_diff16_lo = _mm256_cvtepu16_epi32(abs_diff16_lo_128); - __m256i abs_diff16_hi = _mm256_cvtepu16_epi32(abs_diff16_hi_128); - - sum256 = _mm256_add_epi16(sum256, abs_diff16_lo); - sum256 = _mm256_add_epi16(sum256, abs_diff16_hi); - } - - __m256i sum_epi32 = _mm256_add_epi32( - _mm256_unpacklo_epi16(sum256, _mm256_setzero_si256()), - _mm256_unpackhi_epi16(sum256, _mm256_setzero_si256())); - - HSUM256_EPI32(sum_epi32, sum); - - return (float) sum; -} diff --git a/quantization/cpp/sse.c b/quantization/cpp/sse.c deleted file mode 100644 index f3dbd6f..0000000 --- a/quantization/cpp/sse.c +++ /dev/null @@ -1,107 +0,0 @@ -#include -#include -#include - -#include "export_macro.h" - -#define HSUM128_PS(X, R) \ - float R = 0.0f; \ - { \ - __m128 x64 = _mm_add_ps(X, _mm_movehl_ps(X, X)); \ - __m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55)); \ - R = _mm_cvtss_f32(x32); \ - } - -#define HSUM128_EPI16(X, R) \ - int R = 0; \ - { \ - __m128i x64 = _mm_add_epi16(X, _mm_srli_si128(X, 8)); \ - __m128i x32 = _mm_add_epi16(x64, _mm_srli_si128(x64, 4)); \ - R = _mm_extract_epi16(x32, 0) + _mm_extract_epi16(x32, 1); \ - } - -EXPORT float impl_score_dot_sse( - const uint8_t* query_ptr, - const uint8_t* vector_ptr, - uint32_t dim -) { - const __m128i* v_ptr = (const __m128i*)vector_ptr; - const __m128i* q_ptr = (const __m128i*)query_ptr; - - __m128i mul = _mm_setzero_si128(); - for (uint32_t _i = 0; _i < dim / 16; _i++) { - __m128i v = _mm_loadu_si128(v_ptr); - __m128i q = _mm_loadu_si128(q_ptr); - v_ptr++; - q_ptr++; - - __m128i s = _mm_maddubs_epi16(v, q); - __m128i s_low = _mm_cvtepi16_epi32(s); - __m128i s_high = _mm_cvtepi16_epi32(_mm_srli_si128(s, 8)); - mul = _mm_add_epi32(mul, s_low); - mul = _mm_add_epi32(mul, s_high); - } - __m128 mul_ps = _mm_cvtepi32_ps(mul); - HSUM128_PS(mul_ps, mul_scalar); - return mul_scalar; -} - -EXPORT uint64_t impl_xor_popcnt_sse( - const uint64_t* query_ptr, - const uint64_t* vector_ptr, - uint32_t count -) { - const int64_t* v_ptr = (const int64_t*)vector_ptr; - const int64_t* q_ptr = (const int64_t*)query_ptr; - int64_t result = 0; - for (uint32_t _i = 0; _i < 2 * count; _i++) { - uint64_t x = (*v_ptr) ^ (*q_ptr); - result += _mm_popcnt_u64(x); - v_ptr++; - q_ptr++; - } - return (uint32_t)result; -} - -EXPORT float impl_score_l1_sse( - const uint8_t* query_ptr, - const uint8_t* vector_ptr, - uint32_t dim -) { - const __m128i* v_ptr = (const __m128i*)vector_ptr; - const __m128i* q_ptr = (const __m128i*)query_ptr; - - uint32_t m = dim - (dim % 16); - __m128i sum128 = _mm_setzero_si128(); - - // the vector sizes are assumed to be multiples of 16, no remaining part here - for (uint32_t i = 0; i < m; i += 16) { - __m128i vec2 = _mm_loadu_si128(v_ptr); - __m128i vec1 = _mm_loadu_si128(q_ptr); - v_ptr++; - q_ptr++; - - // Compute the difference in both directions - __m128i diff1 = _mm_subs_epu8(vec1, vec2); - __m128i diff2 = _mm_subs_epu8(vec2, vec1); - - // Take the maximum - __m128i abs_diff = _mm_max_epu8(diff1, diff2); - - __m128i abs_diff16_low = _mm_unpacklo_epi8(abs_diff, _mm_setzero_si128()); - __m128i abs_diff16_high = _mm_unpackhi_epi8(abs_diff, _mm_setzero_si128()); - - sum128 = _mm_add_epi16(sum128, abs_diff16_low); - sum128 = _mm_add_epi16(sum128, abs_diff16_high); - } - - // Convert 16-bit sums to 32-bit and sum them up - __m128i sum_epi32 = _mm_add_epi32( - _mm_unpacklo_epi16(sum128, _mm_setzero_si128()), - _mm_unpackhi_epi16(sum128, _mm_setzero_si128())); - - // Horizontal sum using the macro - HSUM128_EPI16(sum_epi32, sum); - - return (float) sum; -} diff --git a/quantization/src/encoded_vectors_binary.rs b/quantization/src/encoded_vectors_binary.rs index 807b221..2070f2b 100644 --- a/quantization/src/encoded_vectors_binary.rs +++ b/quantization/src/encoded_vectors_binary.rs @@ -1,4 +1,5 @@ use crate::encoded_vectors::validate_vector_parameters; +use crate::simd::sse2::xor_popcnt::impl_xor_popcnt_sse; use crate::utils::{transmute_from_u8_to_slice, transmute_to_u8_slice}; use crate::{ DistanceType, EncodedStorage, EncodedStorageBuilder, EncodedVectors, EncodingError, @@ -214,11 +215,6 @@ impl EncodedVectors for EncodedVecto } } -#[cfg(target_arch = "x86_64")] -extern "C" { - fn impl_xor_popcnt_sse(query_ptr: *const u64, vector_ptr: *const u64, count: u32) -> u32; -} - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] extern "C" { fn impl_xor_popcnt_neon(query_ptr: *const u64, vector_ptr: *const u64, count: u32) -> u32; diff --git a/quantization/src/encoded_vectors_u8.rs b/quantization/src/encoded_vectors_u8.rs index fcc7935..26fc100 100644 --- a/quantization/src/encoded_vectors_u8.rs +++ b/quantization/src/encoded_vectors_u8.rs @@ -3,6 +3,10 @@ use std::path::Path; use crate::encoded_vectors::validate_vector_parameters; use crate::quantile::{find_min_max_from_iter, find_quantile_interval}; +use crate::simd::avx2::dot_u8::impl_score_dot_avx; +use crate::simd::avx2::manhattan_u8::impl_score_l1_avx; +use crate::simd::sse2::dot_u8::impl_score_dot_sse; +use crate::simd::sse2::manhattan_u8::impl_score_l1_sse; use crate::{ encoded_storage::{EncodedStorage, EncodedStorageBuilder}, encoded_vectors::{DistanceType, EncodedVectors, VectorParameters}, @@ -473,15 +477,6 @@ fn impl_score_l1(q_ptr: *const u8, v_ptr: *const u8, actual_dim: usize) -> i32 { } } -#[cfg(target_arch = "x86_64")] -extern "C" { - fn impl_score_dot_avx(query_ptr: *const u8, vector_ptr: *const u8, dim: u32) -> f32; - fn impl_score_l1_avx(query_ptr: *const u8, vector_ptr: *const u8, dim: u32) -> f32; - - fn impl_score_dot_sse(query_ptr: *const u8, vector_ptr: *const u8, dim: u32) -> f32; - fn impl_score_l1_sse(query_ptr: *const u8, vector_ptr: *const u8, dim: u32) -> f32; -} - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] extern "C" { fn impl_score_dot_neon(query_ptr: *const u8, vector_ptr: *const u8, dim: u32) -> f32; diff --git a/quantization/src/lib.rs b/quantization/src/lib.rs index ee49a4a..2c82e58 100644 --- a/quantization/src/lib.rs +++ b/quantization/src/lib.rs @@ -5,6 +5,7 @@ pub mod encoded_vectors_pq; pub mod encoded_vectors_u8; pub mod kmeans; pub mod quantile; +pub mod simd; mod utils; use std::fmt::Display; diff --git a/quantization/src/simd/avx2/dot_u8.rs b/quantization/src/simd/avx2/dot_u8.rs new file mode 100644 index 0000000..1934403 --- /dev/null +++ b/quantization/src/simd/avx2/dot_u8.rs @@ -0,0 +1,40 @@ +use std::arch::x86_64::*; + +use super::hsum256_epi32_avx; + +#[target_feature(enable = "avx2")] +#[allow(clippy::missing_safety_doc)] +pub unsafe fn impl_score_dot_avx(query_ptr: *const u8, vector_ptr: *const u8, dim: u32) -> f32 { + let mut v_ptr = vector_ptr as *const __m256i; + let mut q_ptr = query_ptr as *const __m256i; + + let mut mul1 = _mm256_setzero_si256(); + let mask_epu32 = _mm256_set1_epi32(0xFFFF); + for _ in 0..dim / 32 { + let v = _mm256_loadu_si256(v_ptr); + let q = _mm256_loadu_si256(q_ptr); + v_ptr = v_ptr.add(1); + q_ptr = q_ptr.add(1); + + let s = _mm256_maddubs_epi16(v, q); + let s_low = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(s)); + let s_high = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(s, 1)); + mul1 = _mm256_add_epi32(mul1, s_low); + mul1 = _mm256_add_epi32(mul1, s_high); + } + + // the vector sizes are assumed to be multiples of 16, check if one last 16-element part remaining + if dim % 32 != 0 { + let v_short = _mm_loadu_si128(v_ptr as *const __m128i); + let q_short = _mm_loadu_si128(q_ptr as *const __m128i); + + let v1 = _mm256_cvtepu8_epi16(v_short); + let q1 = _mm256_cvtepu8_epi16(q_short); + + let s = _mm256_mullo_epi16(v1, q1); + mul1 = _mm256_add_epi32(mul1, _mm256_and_si256(s, mask_epu32)); + mul1 = _mm256_add_epi32(mul1, _mm256_srli_epi32(s, 16)); + } + + hsum256_epi32_avx(mul1) as f32 +} diff --git a/quantization/src/simd/avx2/manhattan_u8.rs b/quantization/src/simd/avx2/manhattan_u8.rs new file mode 100644 index 0000000..0336716 --- /dev/null +++ b/quantization/src/simd/avx2/manhattan_u8.rs @@ -0,0 +1,56 @@ +use std::arch::x86_64::*; + +use super::hsum256_epi32_avx; + +#[target_feature(enable = "avx2")] +#[allow(clippy::missing_safety_doc)] +pub unsafe fn impl_score_l1_avx(query_ptr: *const u8, vector_ptr: *const u8, dim: u32) -> f32 { + let mut v_ptr = vector_ptr as *const __m256i; + let mut q_ptr = query_ptr as *const __m256i; + + let mut sum256 = _mm256_setzero_si256(); + for _ in 0..dim / 32 { + let v = _mm256_loadu_si256(v_ptr); + let q = _mm256_loadu_si256(q_ptr); + v_ptr = v_ptr.add(1); + q_ptr = q_ptr.add(1); + + // Compute the difference in both directions and take the maximum for abs + let diff1 = _mm256_subs_epu8(v, q); + let diff2 = _mm256_subs_epu8(q, v); + + let abs_diff = _mm256_max_epu8(diff1, diff2); + + let abs_diff16_lo = _mm256_unpacklo_epi8(abs_diff, _mm256_setzero_si256()); + let abs_diff16_hi = _mm256_unpackhi_epi8(abs_diff, _mm256_setzero_si256()); + + sum256 = _mm256_add_epi16(sum256, abs_diff16_lo); + sum256 = _mm256_add_epi16(sum256, abs_diff16_hi); + } + + // the vector sizes are assumed to be multiples of 16, check if one last 16-element part remaining + if dim % 32 != 0 { + let v_short = _mm_loadu_si128(v_ptr as *const __m128i); + let q_short = _mm_loadu_si128(q_ptr as *const __m128i); + + let diff1 = _mm_subs_epu8(v_short, q_short); + let diff2 = _mm_subs_epu8(q_short, v_short); + + let abs_diff = _mm_max_epu8(diff1, diff2); + + let abs_diff16_lo_128 = _mm_unpacklo_epi8(abs_diff, _mm_setzero_si128()); + let abs_diff16_hi_128 = _mm_unpackhi_epi8(abs_diff, _mm_setzero_si128()); + + let abs_diff16_lo = _mm256_cvtepu16_epi32(abs_diff16_lo_128); + let abs_diff16_hi = _mm256_cvtepu16_epi32(abs_diff16_hi_128); + + sum256 = _mm256_add_epi16(sum256, abs_diff16_lo); + sum256 = _mm256_add_epi16(sum256, abs_diff16_hi); + } + + let sum_epi32 = _mm256_add_epi32( + _mm256_unpacklo_epi16(sum256, _mm256_setzero_si256()), + _mm256_unpackhi_epi16(sum256, _mm256_setzero_si256())); + + hsum256_epi32_avx(sum_epi32) as f32 +} diff --git a/quantization/src/simd/avx2/mod.rs b/quantization/src/simd/avx2/mod.rs new file mode 100644 index 0000000..dec276f --- /dev/null +++ b/quantization/src/simd/avx2/mod.rs @@ -0,0 +1,13 @@ +pub mod dot_u8; +pub mod manhattan_u8; + +use std::arch::x86_64::*; + +#[target_feature(enable = "avx2")] +#[allow(clippy::missing_safety_doc)] +pub unsafe fn hsum256_epi32_avx(x: __m256i) -> i32 { + let x128: __m128i = _mm_add_epi32(_mm256_extractf128_si256(x, 1), _mm256_castsi256_si128(x)); + let x64: __m128i = _mm_add_epi32(x128, _mm_srli_si128(x128, 8)); + let x32: __m128i = _mm_add_epi32(x64, _mm_srli_si128(x64, 4)); + _mm_cvtsi128_si32(x32) as i32 +} diff --git a/quantization/src/simd/mod.rs b/quantization/src/simd/mod.rs new file mode 100644 index 0000000..bb1a97c --- /dev/null +++ b/quantization/src/simd/mod.rs @@ -0,0 +1,8 @@ +#[cfg(target_arch = "x86_64")] +pub mod avx2; + +#[cfg(target_arch = "aarch64")] +pub mod neon; + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub mod sse2; diff --git a/quantization/src/simd/neon/mod.rs b/quantization/src/simd/neon/mod.rs new file mode 100644 index 0000000..e69de29 diff --git a/quantization/src/simd/sse2/dot_u8.rs b/quantization/src/simd/sse2/dot_u8.rs new file mode 100644 index 0000000..7ea3737 --- /dev/null +++ b/quantization/src/simd/sse2/dot_u8.rs @@ -0,0 +1,25 @@ +use std::arch::x86_64::*; + +use super::hsum128_epi16_sse; + +#[target_feature(enable = "sse2")] +#[allow(clippy::missing_safety_doc)] +pub unsafe fn impl_score_dot_sse(query_ptr: *const u8, vector_ptr: *const u8, dim: u32) -> f32 { + let mut v_ptr = vector_ptr as *const __m128i; + let mut q_ptr = query_ptr as *const __m128i; + + let mut mul = _mm_setzero_si128(); + for _ in 0..dim / 16 { + let v = _mm_loadu_si128(v_ptr); + let q = _mm_loadu_si128(q_ptr); + v_ptr = v_ptr.add(1); + q_ptr = q_ptr.add(1); + + let s = _mm_maddubs_epi16(v, q); + let s_low = _mm_cvtepi16_epi32(s); + let s_high = _mm_cvtepi16_epi32(_mm_srli_si128(s, 8)); + mul = _mm_add_epi32(mul, s_low); + mul = _mm_add_epi32(mul, s_high); + } + hsum128_epi16_sse(mul) as f32 +} diff --git a/quantization/src/simd/sse2/manhattan_u8.rs b/quantization/src/simd/sse2/manhattan_u8.rs new file mode 100644 index 0000000..c0eff1e --- /dev/null +++ b/quantization/src/simd/sse2/manhattan_u8.rs @@ -0,0 +1,39 @@ +use std::arch::x86_64::*; + +use super::hsum128_epi16_sse; + +#[target_feature(enable = "sse2")] +#[allow(clippy::missing_safety_doc)] +pub unsafe fn impl_score_l1_sse(query_ptr: *const u8, vector_ptr: *const u8, dim: u32) -> f32 { + let mut v_ptr = vector_ptr as *const __m128i; + let mut q_ptr = query_ptr as *const __m128i; + + let mut sum128 = _mm_setzero_si128(); + // the vector sizes are assumed to be multiples of 16, no remaining part here + for _ in 0..dim / 16 { + let vec2 = _mm_loadu_si128(v_ptr); + let vec1 = _mm_loadu_si128(q_ptr); + v_ptr = v_ptr.add(1); + q_ptr = q_ptr.add(1); + + // Compute the difference in both directions + let diff1 = _mm_subs_epu8(vec1, vec2); + let diff2 = _mm_subs_epu8(vec2, vec1); + + // Take the maximum + let abs_diff = _mm_max_epu8(diff1, diff2); + + let abs_diff16_low = _mm_unpacklo_epi8(abs_diff, _mm_setzero_si128()); + let abs_diff16_high = _mm_unpackhi_epi8(abs_diff, _mm_setzero_si128()); + + sum128 = _mm_add_epi16(sum128, abs_diff16_low); + sum128 = _mm_add_epi16(sum128, abs_diff16_high); + } + + // Convert 16-bit sums to 32-bit and sum them up + let sum_epi32 = _mm_add_epi32( + _mm_unpacklo_epi16(sum128, _mm_setzero_si128()), + _mm_unpackhi_epi16(sum128, _mm_setzero_si128())); + + hsum128_epi16_sse(sum_epi32) as f32 +} diff --git a/quantization/src/simd/sse2/mod.rs b/quantization/src/simd/sse2/mod.rs new file mode 100644 index 0000000..1c61dc3 --- /dev/null +++ b/quantization/src/simd/sse2/mod.rs @@ -0,0 +1,13 @@ +pub mod dot_u8; +pub mod manhattan_u8; +pub mod xor_popcnt; + +use std::arch::x86_64::*; + +#[target_feature(enable = "sse2")] +#[allow(clippy::missing_safety_doc)] +pub unsafe fn hsum128_epi16_sse(x: __m128i) -> i32 { + let x64 = _mm_add_epi16(x, _mm_srli_si128(x, 8)); + let x32 = _mm_add_epi16(x64, _mm_srli_si128(x64, 4)); + _mm_extract_epi16(x32, 0) + _mm_extract_epi16(x32, 1) +} diff --git a/quantization/src/simd/sse2/xor_popcnt.rs b/quantization/src/simd/sse2/xor_popcnt.rs new file mode 100644 index 0000000..a2aa212 --- /dev/null +++ b/quantization/src/simd/sse2/xor_popcnt.rs @@ -0,0 +1,18 @@ +use std::arch::x86_64::*; + + +#[target_feature(enable = "popcnt")] +#[allow(clippy::missing_safety_doc)] +#[inline] +pub unsafe fn impl_xor_popcnt_sse(query_ptr: *const u64, vector_ptr: *const u64, count: u32) -> u32 { + let mut v_ptr = vector_ptr as *const i64; + let mut q_ptr = query_ptr as *const i64; + let mut result = 0; + for _ in 0..2 * count { + let x = (*v_ptr) ^ (*q_ptr); + result += _popcnt64(x); + v_ptr = v_ptr.add(1); + q_ptr = q_ptr.add(1); + } + result as u32 +}