From a47380b274e43de808b20f8518a9f76f5775df7d Mon Sep 17 00:00:00 2001 From: Tobias Pfeiffer Date: Mon, 12 Jan 2026 20:55:31 +0900 Subject: [PATCH] allow for higher-precision pHash computation --- README.md | 1 + crates/imgdd/README.md | 2 + crates/imgdd/benches/rust_benches.rs | 2 + crates/imgdd/src/lib.rs | 20 +++++-- crates/imgdd/tests/rust_tests.rs | 13 ++++- crates/imgddcore/benches/core_benches.rs | 21 ++++++-- crates/imgddcore/src/dedupe.rs | 29 +++++----- crates/imgddcore/src/hashing.rs | 63 ++++++++++++++-------- crates/imgddcore/tests/dedupe_tests.rs | 53 +++++++++++++++---- crates/imgddcore/tests/hashing_tests.rs | 67 +++++++++++++++++------- crates/imgddpy/comparison/compare.py | 24 ++++++--- crates/imgddpy/imgdd.pyi | 7 ++- crates/imgddpy/src/lib.rs | 40 +++++++++++--- 13 files changed, 252 insertions(+), 90 deletions(-) diff --git a/README.md b/README.md index 0903bfb..e953ec5 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,7 @@ duplicates = dd.dupes( path="path/to/images", algo="dhash", # Optional: default = dhash filter="triangle", # Optional: default = triangle + hash_size=8, # Optional: default = 8 (only used for pHash) remove=False # Optional: default = False ) print(duplicates) diff --git a/crates/imgdd/README.md b/crates/imgdd/README.md index 8d8e968..2a8a6b0 100644 --- a/crates/imgdd/README.md +++ b/crates/imgdd/README.md @@ -40,6 +40,7 @@ let result = hash( PathBuf::from("path/to/images"), Some("Triangle"), // Optional: default = "Triangle" Some("dHash"), // Optional: default = "dHash" + None, // Optional: default = 8 (only used for pHash) Some(false), // Optional: default = false ); println!("{:#?}", result); @@ -55,6 +56,7 @@ let result = dupes( PathBuf::from("path/to/images"), Some("Triangle"), // Optional: default = "Triangle" Some("dHash"), // Optional: default = "dHash" + None, // Optional: default = 8 (only used for pHash) false, ); println!("{:#?}", result); diff --git a/crates/imgdd/benches/rust_benches.rs 
b/crates/imgdd/benches/rust_benches.rs index 98ea33e..afec00b 100644 --- a/crates/imgdd/benches/rust_benches.rs +++ b/crates/imgdd/benches/rust_benches.rs @@ -27,6 +27,7 @@ fn benchmark_hash(c: &mut Criterion) { black_box(dir_path.clone()), Some("nearest"), Some("dhash"), + None, Some(false), ); let _ = black_box(result).is_ok(); // Ignore the result @@ -43,6 +44,7 @@ fn benchmark_dupes(c: &mut Criterion) { black_box(dir_path.clone()), Some("nearest"), Some("dhash"), + None, false, ); let _ = black_box(result).is_ok(); // Ignore the result diff --git a/crates/imgdd/src/lib.rs b/crates/imgdd/src/lib.rs index e5e0031..7faeff9 100644 --- a/crates/imgdd/src/lib.rs +++ b/crates/imgdd/src/lib.rs @@ -65,6 +65,10 @@ pub fn select_algo(algo: Option<&str>) -> &'static str { /// - **Default:** "dHash" /// - `sort` - Boolean to determine if the hashes should be sorted. /// - **Default:** false +/// - `hash_size` - Integer specifying the hash size to use for pHash. +/// The resulting hash will be hash_size^2 bits long. The value is +/// ignored for all hash methods other than pHash. 
+/// - **Default:** 8 /// /// # Returns /// @@ -79,6 +83,7 @@ pub fn select_algo(algo: Option<&str>) -> &'static str { /// PathBuf::from("path/to/images"), /// Some("Triangle"), // Optional: default = "Triangle" /// Some("dHash"), // Optional: default = "dHash" +/// None, // Optional: default = 8 /// Some(false), // Optional: default = false /// ); /// /// @@ -88,13 +93,14 @@ pub fn hash( path: PathBuf, filter: Option<&str>, algo: Option<&str>, + hash_size: Option<usize>, sort: Option<bool>, -) -> Result<Vec<(u64, PathBuf)>, Error> { +) -> Result<Vec<(ImageHash, PathBuf)>, Error> { let validated_path = validate_path(&path)?; let filter_type = select_filter_type(filter); let selected_algo = select_algo(algo); - let mut hash_paths = collect_hashes(validated_path, filter_type, selected_algo)?; + let mut hash_paths = collect_hashes(&validated_path, filter_type, selected_algo, hash_size)?; // Optionally sort hashes if sort.unwrap_or(false) { @@ -115,6 +121,10 @@ pub fn hash( /// - `algo` - String specifying the hashing algorithm to use. /// - **Options:** [`aHash`, `mHash`, `dHash`, `pHash`, `wHash`] /// - **Default:** "dhash" +/// - `hash_size` - Integer specifying the hash size to use for pHash. +/// The resulting hash will be hash_size^2 bits long. The value is +/// ignored for all hash methods other than pHash. +/// - **Default:** 8 /// - `remove` - Boolean indicating whether duplicate files should be removed. 
/// /// # Returns /// @@ -130,6 +140,7 @@ pub fn hash( /// PathBuf::from("path/to/images"), /// Some("Triangle"), // Optional: default = "Triangle" /// Some("dHash"), // Optional: default = "dHash" +/// None, // Optional: default = 8 /// false, /// ); /// /// @@ -139,13 +150,14 @@ pub fn dupes( path: PathBuf, filter: Option<&str>, algo: Option<&str>, + hash_size: Option<usize>, remove: bool, -) -> Result<HashMap<u64, Vec<PathBuf>>, Error> { +) -> Result<HashMap<ImageHash, Vec<PathBuf>>, Error> { let validated_path = validate_path(&path)?; let filter_type = select_filter_type(filter); let selected_algo = select_algo(algo); - let mut hash_paths = collect_hashes(validated_path, filter_type, selected_algo)?; + let mut hash_paths = collect_hashes(&validated_path, filter_type, selected_algo, hash_size)?; sort_hashes(&mut hash_paths); Ok(find_duplicates(&hash_paths, remove)?) diff --git a/crates/imgdd/tests/rust_tests.rs b/crates/imgdd/tests/rust_tests.rs index 2771532..73c8d21 100644 --- a/crates/imgdd/tests/rust_tests.rs +++ b/crates/imgdd/tests/rust_tests.rs @@ -50,6 +50,7 @@ mod tests { let result = hash( temp_dir.path().to_path_buf(), Some("nearest"), Some("dhash"), + None, Some(false), ); assert!(result.is_ok(), "Hash function failed: {:?}", result.err()); @@ -62,6 +63,7 @@ mod tests { let invalid_path = PathBuf::from("/non/existent/path"); let result = hash( invalid_path.clone(), Some("nearest"), Some("dhash"), + None, Some(false), ); assert!( @@ -74,7 +76,7 @@ mod tests { #[test] fn test_hash_with_sorting() { let img_dir = PathBuf::from("../../imgs/test/apple_pie"); - let result = hash(img_dir, Some("nearest"), Some("dhash"), Some(true)); + let result = hash(img_dir, Some("nearest"), Some("dhash"), None, Some(true)); assert!(result.is_ok(), "Hash function failed: {:?}", result.err()); @@ -109,6 +111,7 @@ mod tests { let result = dupes( temp_dir.path().to_path_buf(), Some("nearest"), Some("dhash"), + None, false, ); assert!(result.is_ok(), "Dupes function failed: {:?}", result.err()); @@ -124,7 +127,13 @@ mod tests { #[test] fn test_dupes_with_invalid_path() { let invalid_path = PathBuf::from("/non/existent/path"); let result = 
dupes(invalid_path.clone(), Some("nearest"), Some("dhash"), false); + let result = dupes( + invalid_path.clone(), + Some("nearest"), + Some("dhash"), + None, + false, + ); assert!( result.is_err(), "Expected error for invalid path: {:?}", diff --git a/crates/imgddcore/benches/core_benches.rs b/crates/imgddcore/benches/core_benches.rs index f9c7d6f..d351062 100644 --- a/crates/imgddcore/benches/core_benches.rs +++ b/crates/imgddcore/benches/core_benches.rs @@ -47,6 +47,7 @@ fn benchmark_collect_hashes(c: &mut Criterion) { black_box(&dir_path), black_box(image::imageops::FilterType::Triangle), black_box("dhash"), + black_box(None), ) .expect("Failed to collect hashes"); }); @@ -55,8 +56,13 @@ fn benchmark_collect_hashes(c: &mut Criterion) { fn benchmark_sort_hashes(c: &mut Criterion) { let dir_path = PathBuf::from("../../imgs/test"); - let mut hash_paths = collect_hashes(&dir_path, image::imageops::FilterType::Triangle, "dhash") - .expect("Failed to collect hashes"); + let mut hash_paths = collect_hashes( + &dir_path, + image::imageops::FilterType::Triangle, + "dhash", + None, + ) + .expect("Failed to collect hashes"); c.bench_function("sort_hashes", |b| { b.iter(|| { @@ -67,8 +73,13 @@ fn benchmark_sort_hashes(c: &mut Criterion) { fn benchmark_find_duplicates(c: &mut Criterion) { let dir_path = PathBuf::from("../../imgs/test"); - let mut hash_paths = collect_hashes(&dir_path, image::imageops::FilterType::Triangle, "dhash") - .expect("Failed to collect hashes"); + let mut hash_paths = collect_hashes( + &dir_path, + image::imageops::FilterType::Triangle, + "dhash", + None, + ) + .expect("Failed to collect hashes"); sort_hashes(&mut hash_paths); c.bench_function("find_duplicates", |b| { @@ -139,7 +150,7 @@ fn benchmark_phash(c: &mut Criterion) { c.bench_function("phash", |b| { b.iter(|| { // Compute pHash for the normalized image - ImageHash::phash(black_box(&normalized_image)).expect("Failed to compute phash"); + ImageHash::phash(black_box(&normalized_image), 
8).expect("Failed to compute phash"); }); }); } diff --git a/crates/imgddcore/src/dedupe.rs b/crates/imgddcore/src/dedupe.rs index 51ec3e4..f02261d 100644 --- a/crates/imgddcore/src/dedupe.rs +++ b/crates/imgddcore/src/dedupe.rs @@ -19,6 +19,7 @@ use walkdir::WalkDir; /// Options: `Nearest`, `Triangle`, `CatmullRom`, `Gaussian`, `Lanczos3`. /// * `algo` - The hashing algorithm to use. /// Options: `dhash`, `ahash`, `mhash`, `phash`, `whash`. +/// * `hash_size` - The hash size for phash (ignored by other algorithms). /// /// # Returns /// @@ -27,7 +28,8 @@ pub fn collect_hashes( path: &PathBuf, filter: FilterType, algo: &str, -) -> Result, Error> { + hash_size: Option, +) -> Result, Error> { let files: Vec = WalkDir::new(path) .into_iter() .filter_map(|entry| entry.ok()) @@ -35,30 +37,31 @@ pub fn collect_hashes( .map(|entry| entry.path().to_path_buf()) .collect(); - let hash_paths: Vec<(u64, PathBuf)> = files + let hash_paths: Vec<(ImageHash, PathBuf)> = files .par_iter() .filter_map(|file_path| match open_image(file_path) { Ok(image) => { let hash = match algo { "dhash" => { let normalized = normalize::proc(&image, filter, 9, 8).ok()?; - ImageHash::dhash(&normalized).ok()?.get_hash() + ImageHash::dhash(&normalized).ok()? } "ahash" => { let normalized = normalize::proc(&image, filter, 8, 8).ok()?; - ImageHash::ahash(&normalized).ok()?.get_hash() + ImageHash::ahash(&normalized).ok()? } "mhash" => { let normalized = normalize::proc(&image, filter, 8, 8).ok()?; - ImageHash::mhash(&normalized).ok()?.get_hash() + ImageHash::mhash(&normalized).ok()? } "phash" => { + let hash_size = hash_size.unwrap_or(8); let normalized = normalize::proc(&image, filter, 32, 32).ok()?; - ImageHash::phash(&normalized).ok()?.get_hash() + ImageHash::phash(&normalized, hash_size).ok()? } "whash" => { let normalized = normalize::proc(&image, filter, 8, 8).ok()?; - ImageHash::whash(&normalized).ok()?.get_hash() + ImageHash::whash(&normalized).ok()? 
} _ => panic!("Unsupported hashing algorithm: {}", algo), }; @@ -80,8 +83,8 @@ pub fn collect_hashes( /// /// * `hash_paths` - A mutable reference to a vector of hash-path tuples. #[inline] -pub fn sort_hashes(hash_paths: &mut Vec<(u64, PathBuf)>) { - hash_paths.sort_by_key(|(hash, _)| *hash); +pub fn sort_hashes(hash_paths: &mut Vec<(ImageHash, PathBuf)>) { + hash_paths.sort_by_key(|(hash, _)| hash.clone()); } /// Opens an image file and decodes it. @@ -120,16 +123,16 @@ pub fn open_image(file_path: &PathBuf) -> Result { /// /// Returns an error if a file fails to be removed when `remove` is set to `true`. pub fn find_duplicates( - hash_paths: &[(u64, PathBuf)], + hash_paths: &[(ImageHash, PathBuf)], remove: bool, -) -> Result>, Error> { - let mut duplicates_map: HashMap> = HashMap::new(); +) -> Result>, Error> { + let mut duplicates_map: HashMap> = HashMap::new(); for window in hash_paths.windows(2) { if let [(hash1, path1), (hash2, path2)] = window { if hash1 == hash2 { duplicates_map - .entry(*hash1) + .entry(hash1.clone()) .or_insert_with(Vec::new) .extend(vec![path1.clone(), path2.clone()]); } diff --git a/crates/imgddcore/src/hashing.rs b/crates/imgddcore/src/hashing.rs index e4105e2..7fe7273 100644 --- a/crates/imgddcore/src/hashing.rs +++ b/crates/imgddcore/src/hashing.rs @@ -5,15 +5,29 @@ use dwt::wavelet::Haar; use dwt::{Operation, Transform}; use rustdct::DctPlanner; -/// A structure representing the hash of an image as u64. +/// A structure representing the hash of an image. /// /// The `ImageHash` structure is used to store and compare the hash of an image for deduplication purposes. -#[derive(Eq, PartialEq, Hash, Clone)] +/// The hash is stored as a vector of bytes, allowing for variable-sized hashes. +#[derive(Eq, PartialEq, Hash, Clone, Ord, PartialOrd, Debug)] pub struct ImageHash { - hash: u64, + hash: Vec, } impl ImageHash { + /// Creates a new ImageHash from a u64 value (for 64-bit hashes). 
+ #[inline] + pub fn from_u64(hash: u64) -> Self { + let bytes = hash.to_be_bytes().to_vec(); + Self { hash: bytes } + } + + /// Returns the hash length in bytes. + #[inline] + pub fn num_bytes(&self) -> usize { + self.hash.len() + } + /// Computes the average hash (aHash) of a given image. /// /// # Arguments @@ -48,7 +62,7 @@ impl ImageHash { } } - Ok(Self { hash }) + Ok(Self::from_u64(hash)) } /// Computes the median hash (mHash) of a given image. @@ -88,7 +102,7 @@ impl ImageHash { } } - Ok(Self { hash }) + Ok(Self::from_u64(hash)) } /// Computes the difference hash (dHash) of a given image. @@ -116,13 +130,14 @@ impl ImageHash { } } - Ok(Self { hash }) + Ok(Self::from_u64(hash)) } /// Computes the perceptual hash (pHash) of a given image. /// /// # Arguments: /// * `image` - A reference to a `DynamicImage` for which the hash is to be calculated. + /// * `hash_size` - The size of the hash (e.g., 8 for 8x8, 16 for 16x16). Defaults to 8. /// /// # Returns: /// * An `ImageHash` instance containing the computed pHash value. @@ -132,9 +147,8 @@ impl ImageHash { /// - Analyzes the frequency domain using Discrete Cosine Transform (DCT). /// - Focuses on low-frequency components, which are less affected by resizing or compression. 
#[inline] - pub fn phash(image: &DynamicImage) -> Result { + pub fn phash(image: &DynamicImage, hash_size: usize) -> Result { const IMG_SIZE: usize = 32; - const HASH_SIZE: usize = 8; // Collect pixel values from normalized 32x32 grayscale image let mut pixels: Vec = image.pixels().map(|p| p.2[0] as f32).collect(); @@ -163,11 +177,12 @@ impl ImageHash { } } - // Extract top-left 8x8 DCT coefficients (low frequencies) - let mut dct_lowfreq = [0f32; HASH_SIZE * HASH_SIZE]; - for y in 0..HASH_SIZE { - for x in 0..HASH_SIZE { - dct_lowfreq[y * HASH_SIZE + x] = pixels[y * IMG_SIZE + x]; + // Extract top-left hash_size x hash_size DCT coefficients (low frequencies) + let hash_size_sq = hash_size * hash_size; + let mut dct_lowfreq = vec![0f32; hash_size_sq]; + for y in 0..hash_size { + for x in 0..hash_size { + dct_lowfreq[y * hash_size + x] = pixels[y * IMG_SIZE + x]; } } @@ -177,15 +192,21 @@ impl ImageHash { ac_coeffs.select_nth_unstable_by(mid, |a, b| a.partial_cmp(b).unwrap()); let median = ac_coeffs[mid]; - // Generate hash - let mut hash = 0u64; + // Generate hash as Vec + // Calculate number of bytes needed (hash_size * hash_size bits) + let num_bits = hash_size_sq; + let num_bytes = (num_bits + 7) / 8; // Round up to nearest byte + let mut hash_bytes = vec![0u8; num_bytes]; + for (i, &val) in dct_lowfreq.iter().enumerate() { if val > median { - hash |= 1 << (63 - i); + let byte_idx = i / 8; + let bit_idx = 7 - (i % 8); // MSB first (big-endian) + hash_bytes[byte_idx] |= 1 << bit_idx; } } - Ok(Self { hash }) + Ok(Self { hash: hash_bytes }) } /// Computes the wavelet hash (wHash) of a given image. @@ -242,16 +263,16 @@ impl ImageHash { } } - Ok(Self { hash }) + Ok(Self::from_u64(hash)) } /// Retrieves the computed hash value. /// /// # Returns /// - /// * Hash value as a `u64`. + /// * A reference to the hash as a slice (`&[u8]`). 
#[inline] - pub fn get_hash(&self) -> u64 { - self.hash + pub fn get_hash(&self) -> &[u8] { + &self.hash } } diff --git a/crates/imgddcore/tests/dedupe_tests.rs b/crates/imgddcore/tests/dedupe_tests.rs index d9dd978..7504b58 100644 --- a/crates/imgddcore/tests/dedupe_tests.rs +++ b/crates/imgddcore/tests/dedupe_tests.rs @@ -20,20 +20,30 @@ mod tests { let algorithms = ["dhash", "ahash", "mhash", "phash", "whash"]; for algo in algorithms { - let hashes = - collect_hashes(&temp_dir.path().to_path_buf(), FilterType::Nearest, algo).unwrap(); + let hashes = collect_hashes( + &temp_dir.path().to_path_buf(), + FilterType::Nearest, + algo, + None, + ) + .unwrap(); assert_eq!(hashes.len(), 1, "Algorithm {} failed", algo); } } #[test] fn test_sort_hashes() { - let mut hashes = vec![(2, PathBuf::from("b")), (1, PathBuf::from("a"))]; + use imgddcore::hashing::ImageHash; + let hash1 = ImageHash::from_u64(2); + let hash2 = ImageHash::from_u64(1); + let mut hashes = vec![ + (hash1.clone(), PathBuf::from("b")), + (hash2.clone(), PathBuf::from("a")), + ]; sort_hashes(&mut hashes); - assert_eq!( - hashes, - vec![(1, PathBuf::from("a")), (2, PathBuf::from("b"))] - ); + // After sorting, hash with value 1 should come before hash with value 2 + assert_eq!(hashes[0].0, hash2); + assert_eq!(hashes[1].0, hash1); } #[test] @@ -47,6 +57,7 @@ mod tests { &temp_dir.path().to_path_buf(), FilterType::Nearest, "unsupported_algo", + None, ) }); @@ -62,7 +73,12 @@ mod tests { let mut file = File::create(&invalid_image_path).unwrap(); file.write_all(b"not a valid image").unwrap(); - let result = collect_hashes(&temp_dir.path().to_path_buf(), FilterType::Nearest, "dhash"); + let result = collect_hashes( + &temp_dir.path().to_path_buf(), + FilterType::Nearest, + "dhash", + None, + ); assert!(result.is_ok()); // Valid path, but should log errors for invalid image } @@ -74,7 +90,12 @@ mod tests { // Create empty file that can't be decoded File::create(&invalid_image_path).unwrap(); - let result = 
collect_hashes(&temp_dir.path().to_path_buf(), FilterType::Nearest, "dhash"); + let result = collect_hashes( + &temp_dir.path().to_path_buf(), + FilterType::Nearest, + "dhash", + None, + ); assert!(result.is_ok()); // Valid path, but decode errors should be logged } @@ -107,7 +128,12 @@ mod tests { std::fs::write(&file_path_2, b"file 2 content").unwrap(); // Mock duplicate hash paths - let hash_paths = vec![(1, file_path_1.clone()), (1, file_path_2.clone())]; + use imgddcore::hashing::ImageHash; + let hash = ImageHash::from_u64(1); + let hash_paths = vec![ + (hash.clone(), file_path_1.clone()), + (hash, file_path_2.clone()), + ]; // Test with `remove = true` to trigger file deletion let result = find_duplicates(&hash_paths, true); @@ -133,7 +159,12 @@ mod tests { assert!(file_path_1.exists()); // Mock duplicate hash paths, including a non-existent file - let hash_paths = vec![(1, file_path_1.clone()), (1, file_path_2.clone())]; + use imgddcore::hashing::ImageHash; + let hash = ImageHash::from_u64(1); + let hash_paths = vec![ + (hash.clone(), file_path_1.clone()), + (hash, file_path_2.clone()), + ]; // Test with `remove = true` to trigger file deletion let result = find_duplicates(&hash_paths, true); diff --git a/crates/imgddcore/tests/hashing_tests.rs b/crates/imgddcore/tests/hashing_tests.rs index 048332d..dd2a634 100644 --- a/crates/imgddcore/tests/hashing_tests.rs +++ b/crates/imgddcore/tests/hashing_tests.rs @@ -21,13 +21,14 @@ mod tests { fn test_ahash() -> Result<()> { let test_image = create_mock_image((8, 8)); let hash = ImageHash::ahash(&test_image)?; - println!("aHash: {:064b}", hash.get_hash()); - let expected_hash = 0b1010101010101010101010101010101010101010101010101010101010101010; + let expected_hash = 0b1010101010101010101010101010101010101010101010101010101010101010u64; + let expected_bytes = expected_hash.to_be_bytes(); assert_eq!( hash.get_hash(), - expected_hash, + expected_bytes.as_slice(), "aHash does not match expected value" ); + 
assert_eq!(hash.num_bytes(), 8, "aHash size should be 8"); Ok(()) } @@ -36,14 +37,14 @@ mod tests { fn test_mhash() -> Result<()> { let test_image = create_mock_image((8, 8)); let hash = ImageHash::mhash(&test_image)?; - println!("mHash: {:064b}", hash.get_hash()); - let expected_hash = 0b1010101010101010101010101010101010101010101010101010101010101010; - + let expected_hash = 0b1010101010101010101010101010101010101010101010101010101010101010u64; + let expected_bytes = expected_hash.to_be_bytes(); assert_eq!( hash.get_hash(), - expected_hash, + expected_bytes.as_slice(), "mHash does not match expected value" ); + assert_eq!(hash.num_bytes(), 8, "mHash size should be 8"); Ok(()) } @@ -52,13 +53,14 @@ mod tests { fn test_dhash() -> Result<()> { let test_image = create_mock_image((9, 8)); let hash = ImageHash::dhash(&test_image)?; - println!("dHash: {:064b}", hash.get_hash()); - let expected_hash = 0b0101010101010101010101010101010101010101010101010101010101010101; + let expected_hash = 0b0101010101010101010101010101010101010101010101010101010101010101u64; + let expected_bytes = expected_hash.to_be_bytes(); assert_eq!( hash.get_hash(), - expected_hash, + expected_bytes.as_slice(), "dHash does not match expected value" ); + assert_eq!(hash.num_bytes(), 8, "dHash size should be 8"); Ok(()) } @@ -66,14 +68,33 @@ mod tests { #[test] fn test_phash() -> Result<()> { let test_image = create_mock_image((32, 32)); - let hash = ImageHash::phash(&test_image)?; - let expected_hash = 0b1101010100000000000000000000000000000000000000000000000000000000; - println!("pHash: {:064b}", hash.get_hash()); + let hash = ImageHash::phash(&test_image, 8)?; + let expected_hash = 0b1101010100000000000000000000000000000000000000000000000000000000u64; + let expected_bytes = expected_hash.to_be_bytes(); + assert_eq!( + hash.get_hash(), + expected_bytes.as_slice(), + "pHash does not match expected value" + ); + assert_eq!(hash.num_bytes(), 8, "pHash size should be 8"); + + Ok(()) + } + + #[test] + 
fn test_phash_size_16() -> Result<()> { + let test_image = create_mock_image((32, 32)); + let hash = ImageHash::phash(&test_image, 16)?; + let expected_bytes: [u8; 32] = [ + 0b11010101, 0b1010101, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + ]; assert_eq!( hash.get_hash(), - expected_hash, + expected_bytes.as_slice(), "pHash does not match expected value" ); + assert_eq!(hash.num_bytes(), 32, "pHash size should be 32"); Ok(()) } @@ -82,14 +103,24 @@ mod tests { fn test_whash() -> Result<()> { let test_image = create_mock_image((8, 8)); let hash = ImageHash::whash(&test_image)?; - println!("wHash: {:064b}", hash.get_hash()); - let expected_hash = 0b1010101010101010101010101010101010101010101010101010101010101010; - + let expected_hash = 0b1010101010101010101010101010101010101010101010101010101010101010u64; + let expected_bytes = expected_hash.to_be_bytes(); assert_eq!( hash.get_hash(), - expected_hash, + expected_bytes.as_slice(), "wHash does not match expected value" ); + assert_eq!(hash.num_bytes(), 8, "wHash size should be 8"); + + Ok(()) + } + + #[test] + fn test_hash_comparison() -> Result<()> { + let test_image = create_mock_image((8, 8)); + let hash1 = ImageHash::ahash(&test_image)?; + let hash2 = ImageHash::ahash(&test_image)?; + assert_eq!(hash1, hash2, "Same image should produce same hash"); Ok(()) } diff --git a/crates/imgddpy/comparison/compare.py b/crates/imgddpy/comparison/compare.py index 999e1d4..5f71e04 100644 --- a/crates/imgddpy/comparison/compare.py +++ b/crates/imgddpy/comparison/compare.py @@ -4,6 +4,7 @@ import imgdd as dd import imagehash import os +from typing import Optional def collect_image_count(path: str) -> int: @@ -33,10 +34,10 @@ def benchmark_function(func, num_runs=50, warmup=3, **kwargs): } -def imgdd_benchmark(path: str, algo: str, num_runs: int, num_images: int) -> dict: +def imgdd_benchmark(path: str, algo: str, hash_size: Optional[int], num_runs: int, num_images: int) -> dict: 
"""Benchmark imgdd library.""" def run_imgdd_hash(): - dd.hash(path=path, algo=algo, filter="Nearest", sort=False) + dd.hash(path=path, algo=algo, filter="Nearest", hash_size=hash_size, sort=False) results = benchmark_function(run_imgdd_hash, num_runs=num_runs) for key in results: @@ -44,7 +45,7 @@ def run_imgdd_hash(): return results -def imagehash_benchmark(path: str, algo: str, num_runs: int, num_images: int) -> dict: +def imagehash_benchmark(path: str, algo: str, hash_size: Optional[int], num_runs: int, num_images: int) -> dict: """Benchmark imagehash library.""" def run_imagehash(algo: str): for root, _, files in os.walk(path): @@ -56,7 +57,10 @@ def run_imagehash(algo: str): if algo == "ahash": imagehash.average_hash(image) elif algo == "phash": - imagehash.phash(image) + if hash_size is None: + imagehash.phash(image) + else: + imagehash.phash(image, hash_size) elif algo == "dhash": imagehash.dhash(image) elif algo == "whash": @@ -94,7 +98,7 @@ def calc_diff(imgdd_result: dict, imagehash_result: dict): if __name__ == "__main__": IMAGE_DIR = "../../../imgs/test/" - ALGORITHMS = ["dHash", "aHash", "pHash", "wHash"] # mHash has no equivalent in imagehash + ALGORITHMS = ["dHash", "aHash", "pHash", "pHash256", "wHash"] # mHash has no equivalent in imagehash NUM_RUNS = 100 WARM_UP = 5 @@ -106,12 +110,18 @@ def calc_diff(imgdd_result: dict, imagehash_result: dict): for algo in ALGORITHMS: print(f"Benchmarking {algo}...\n") + if algo == "pHash256": + coreAlgo = "pHash" + hash_size = 16 + else: + coreAlgo = algo + hash_size = None # Benchmark imgdd - imgdd_result = imgdd_benchmark(IMAGE_DIR, algo, NUM_RUNS, num_images) + imgdd_result = imgdd_benchmark(IMAGE_DIR, coreAlgo, hash_size, NUM_RUNS, num_images) # Benchmark imagehash - imagehash_result = imagehash_benchmark(IMAGE_DIR, algo, NUM_RUNS, num_images) + imagehash_result = imagehash_benchmark(IMAGE_DIR, coreAlgo, hash_size, NUM_RUNS, num_images) # Compare results compare_benchmarks(imgdd_result, imagehash_result, 
algo) diff --git a/crates/imgddpy/imgdd.pyi b/crates/imgddpy/imgdd.pyi index a5b6b74..9e11e9e 100644 --- a/crates/imgddpy/imgdd.pyi +++ b/crates/imgddpy/imgdd.pyi @@ -1,9 +1,10 @@ -from typing import Literal, Dict +from typing import Literal, Dict, Optional def hash( path: str, filter: Literal["Nearest", "Triangle", "CatmullRom", "Gaussian", "Lanczos3"] = "Nearest", algo: Literal["aHash", "mHash", "dHash", "pHash", "wHash"] = "dHash", + hash_size: Optional[int] = None, sort: bool = False, ) -> Dict[str, str]: """ @@ -13,6 +14,8 @@ def hash( path (str): Path to the directory containing images. filter (str): Resize filter to use. algo (str): Hashing algorithm. + hash_size (int): Hash size for pHash algorithm (e.g., 8, 16). Only used for pHash. + sort (bool): Whether to sort the results by hash values. Returns: Dict[str, str]: A dictionary mapping file paths to their hashes. @@ -23,6 +26,7 @@ def dupes( path: str, filter: Literal["Nearest", "Triangle", "CatmullRom", "Gaussian", "Lanczos3"] = "Nearest", algo: Literal["aHash", "mHash", "dHash", "pHash", "wHash"] = "dHash", + hash_size: Optional[int] = None, remove: bool = False, ) -> Dict[str, list[str]]: """ @@ -32,6 +36,7 @@ def dupes( path (str): Path to the directory containing images. filter (str): Resize filter to use. algo (str): Hashing algorithm. + hash_size (int): Hash size for pHash algorithm (e.g., 8, 16). Only used for pHash. remove (bool): Whether to remove duplicate files. Returns: diff --git a/crates/imgddpy/src/lib.rs b/crates/imgddpy/src/lib.rs index 4358333..ece2d20 100644 --- a/crates/imgddpy/src/lib.rs +++ b/crates/imgddpy/src/lib.rs @@ -30,7 +30,7 @@ fn select_algo(algo: Option<&str>) -> &'static str { } /// ```python -/// hash(path, filter="triangle", algo="dhash", sort=False) +/// hash(path, filter="triangle", algo="dhash", hash_size=None, sort=False) /// ``` /// /// Calculate the hash of images in a directory. 
@@ -43,6 +43,10 @@ fn select_algo(algo: Option<&str>) -> &'static str { /// - `algo (str)`: Hashing algorithm. /// - **Options:** [`aHash`, `mHash`, `dHash`, `pHash`, `wHash`] /// - **Default:** `dHash` +/// - `hash_size (int)`: Hash size for pHash algorithm (e.g., 8, 16). +/// The resulting hash will be hash_size^2 bits long. The value is ignored +/// for all hash methods other than pHash. +/// - **Default:** `8` /// - `sort (bool)`: Whether to sort the results by hash values. /// - **Default:** `False` /// @@ -61,11 +65,12 @@ fn select_algo(algo: Option<&str>) -> &'static str { /// ) /// print(results) /// ``` -#[pyfunction(signature = (path, filter = None, algo = None, sort = false))] +#[pyfunction(signature = (path, filter = None, algo = None, hash_size = None, sort = false))] pub fn hash( path: PathBuf, filter: Option<&str>, algo: Option<&str>, + hash_size: Option, sort: Option, ) -> PyResult> { let validated_path = validate_path(&path) @@ -73,7 +78,7 @@ pub fn hash( let filter_type = select_filter_type(filter); let algo = select_algo(algo); - let mut hash_paths = collect_hashes(&validated_path, filter_type, &algo) + let mut hash_paths = collect_hashes(&validated_path, filter_type, &algo, hash_size) .map_err(|e| PyErr::new::(format!("{}", e)))?; // Optionally sort hashes @@ -83,12 +88,19 @@ pub fn hash( Ok(hash_paths .into_iter() - .map(|(hash, path)| (path, format!("{:x}", hash))) + .map(|(hash, path)| { + let hash_hex = hash + .get_hash() + .iter() + .map(|b| format!("{:02x}", b)) + .collect::(); + (path, hash_hex) + }) .collect()) } /// ```python -/// dupes(path, filter="triangle", algo="dhash", remove=False) +/// dupes(path, filter="triangle", algo="dhash", hash_size=None, remove=False) /// ``` /// /// Find duplicate images in a directory. @@ -101,6 +113,10 @@ pub fn hash( /// - `algo (str)`: Hashing algorithm. 
/// - **Options:** [`aHash`, `mHash`, `dHash`, `pHash`, `wHash`] /// - **Default:** `dHash` +/// - `hash_size (int)`: Hash size for pHash algorithm (e.g., 8, 16). +/// The resulting hash will be hash_size^2 bits long. The value is ignored +/// for all hash methods other than pHash. +/// - **Default:** `8` /// - `remove (bool)`: Whether to remove duplicate files /// - **Default:** `False` /// @@ -119,11 +135,12 @@ pub fn hash( /// ) /// print(duplicates) /// ``` -#[pyfunction(signature = (path, filter = None, algo = None, remove = false))] +#[pyfunction(signature = (path, filter = None, algo = None, hash_size = None, remove = false))] pub fn dupes( path: PathBuf, filter: Option<&str>, algo: Option<&str>, + hash_size: Option, remove: bool, ) -> PyResult>> { let validated_path = validate_path(&path) @@ -131,7 +148,7 @@ pub fn dupes( let filter_type = select_filter_type(filter); let algo = select_algo(algo); - let mut hash_paths = collect_hashes(&validated_path, filter_type, &algo) + let mut hash_paths = collect_hashes(&validated_path, filter_type, &algo, hash_size) .map_err(|e| PyErr::new::(format!("{}", e)))?; sort_hashes(&mut hash_paths); @@ -140,7 +157,14 @@ pub fn dupes( Ok(duplicates .into_iter() - .map(|(hash, paths)| (format!("{:x}", hash), paths)) + .map(|(hash, paths)| { + let hash_hex = hash + .get_hash() + .iter() + .map(|b| format!("{:02x}", b)) + .collect::(); + (hash_hex, paths) + }) .collect()) }