From a47380b274e43de808b20f8518a9f76f5775df7d Mon Sep 17 00:00:00 2001 From: Tobias Pfeiffer Date: Mon, 12 Jan 2026 20:55:31 +0900 Subject: [PATCH] allow for higher-precision pHash computation --- README.md | 1 + crates/imgdd/README.md | 2 + crates/imgdd/benches/rust_benches.rs | 2 + crates/imgdd/src/lib.rs | 20 +++++-- crates/imgdd/tests/rust_tests.rs | 13 ++++- crates/imgddcore/benches/core_benches.rs | 21 ++++++-- crates/imgddcore/src/dedupe.rs | 29 +++++----- crates/imgddcore/src/hashing.rs | 63 ++++++++++++++-------- crates/imgddcore/tests/dedupe_tests.rs | 53 +++++++++++++++---- crates/imgddcore/tests/hashing_tests.rs | 67 +++++++++++++++++------- crates/imgddpy/comparison/compare.py | 24 ++++++--- crates/imgddpy/imgdd.pyi | 7 ++- crates/imgddpy/src/lib.rs | 40 +++++++++++--- 13 files changed, 252 insertions(+), 90 deletions(-) diff --git a/README.md b/README.md index 0903bfb..e953ec5 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,7 @@ duplicates = dd.dupes( path="path/to/images", algo="dhash", # Optional: default = dhash filter="triangle", # Optional: default = triangle + hash_size=8, # Optional: default = 8 (only used for pHash) remove=False # Optional: default = False ) print(duplicates) diff --git a/crates/imgdd/README.md b/crates/imgdd/README.md index 8d8e968..2a8a6b0 100644 --- a/crates/imgdd/README.md +++ b/crates/imgdd/README.md @@ -40,6 +40,7 @@ let result = hash( PathBuf::from("path/to/images"), Some("Triangle"), // Optional: default = "Triangle" Some("dHash"), // Optional: default = "dHash" + None, // Optional: default = 8 (only used for pHash) Some(false), // Optional: default = false ); println!("{:#?}", result); @@ -55,6 +56,7 @@ let result = dupes( PathBuf::from("path/to/images"), Some("Triangle"), // Optional: default = "Triangle" Some("dHash"), // Optional: default = "dHash" + None, // Optional: default = 8 (only used for pHash) false, ); println!("{:#?}", result); diff --git a/crates/imgdd/benches/rust_benches.rs 
b/crates/imgdd/benches/rust_benches.rs index 98ea33e..afec00b 100644 --- a/crates/imgdd/benches/rust_benches.rs +++ b/crates/imgdd/benches/rust_benches.rs @@ -27,6 +27,7 @@ fn benchmark_hash(c: &mut Criterion) { black_box(dir_path.clone()), Some("nearest"), Some("dhash"), + None, Some(false), ); let _ = black_box(result).is_ok(); // Ignore the result @@ -43,6 +44,7 @@ fn benchmark_dupes(c: &mut Criterion) { black_box(dir_path.clone()), Some("nearest"), Some("dhash"), + None, false, ); let _ = black_box(result).is_ok(); // Ignore the result diff --git a/crates/imgdd/src/lib.rs b/crates/imgdd/src/lib.rs index e5e0031..7faeff9 100644 --- a/crates/imgdd/src/lib.rs +++ b/crates/imgdd/src/lib.rs @@ -65,6 +65,10 @@ pub fn select_algo(algo: Option<&str>) -> &'static str { /// - **Default:** "dHash" /// - `sort` - Boolean to determine if the hashes should be sorted. /// - **Default:** false +/// - `hash_size` - Integer specifying the hash size to use for pHash. +/// The resulting hash will be hash_size^2 bits long. The value is +/// ignored for all hash methods other than pHash. 
+/// - **Default:** 8 /// /// # Returns /// @@ -79,6 +83,7 @@ pub fn select_algo(algo: Option<&str>) -> &'static str { /// PathBuf::from("path/to/images"), /// Some("Triangle"), // Optional: default = "Triangle" /// Some("dHash"), // Optional: default = "dHash" +/// None, // Optional: default = 8 /// Some(false), // Optional: default = false /// ); /// /// @@ -88,13 +93,14 @@ pub fn hash( path: PathBuf, filter: Option<&str>, algo: Option<&str>, + hash_size: Option<usize>, sort: Option<bool>, -) -> Result<Vec<(u64, PathBuf)>, Error> { +) -> Result<Vec<(ImageHash, PathBuf)>, Error> { let validated_path = validate_path(&path)?; let filter_type = select_filter_type(filter); let selected_algo = select_algo(algo); - let mut hash_paths = collect_hashes(validated_path, filter_type, selected_algo)?; + let mut hash_paths = collect_hashes(&validated_path, filter_type, selected_algo, hash_size)?; // Optionally sort hashes if sort.unwrap_or(false) { @@ -115,6 +121,10 @@ pub fn hash( /// - `algo` - String specifying the hashing algorithm to use. /// - **Options:** [`aHash`, `mHash`, `dHash`, `pHash`, `wHash`] /// - **Default:** "dhash" +/// - `hash_size` - Integer specifying the hash size to use for pHash. +/// The resulting hash will be hash_size^2 bits long. The value is +/// ignored for all hash methods other than pHash. +/// - **Default:** 8 /// - `remove` - Boolean indicating whether duplicate files should be removed. 
/// /// # Returns /// @@ -130,6 +140,7 @@ pub fn hash( /// PathBuf::from("path/to/images"), /// Some("Triangle"), // Optional: default = "Triangle" /// Some("dHash"), // Optional: default = "dHash" +/// None, // Optional: default = 8 /// false, /// ); /// /// @@ -139,13 +150,14 @@ pub fn dupes( path: PathBuf, filter: Option<&str>, algo: Option<&str>, + hash_size: Option<usize>, remove: bool, -) -> Result<HashMap<u64, Vec<PathBuf>>, Error> { +) -> Result<HashMap<ImageHash, Vec<PathBuf>>, Error> { let validated_path = validate_path(&path)?; let filter_type = select_filter_type(filter); let selected_algo = select_algo(algo); - let mut hash_paths = collect_hashes(validated_path, filter_type, selected_algo)?; + let mut hash_paths = collect_hashes(&validated_path, filter_type, selected_algo, hash_size)?; sort_hashes(&mut hash_paths); Ok(find_duplicates(&hash_paths, remove)?) diff --git a/crates/imgdd/tests/rust_tests.rs b/crates/imgdd/tests/rust_tests.rs index 2771532..73c8d21 100644 --- a/crates/imgdd/tests/rust_tests.rs +++ b/crates/imgdd/tests/rust_tests.rs @@ -50,6 +50,7 @@ mod tests { let result = hash( temp_dir.path().to_path_buf(), Some("nearest"), Some("dhash"), + None, Some(false), ); assert!(result.is_ok(), "Hash function failed: {:?}", result.err()); @@ -62,6 +63,7 @@ mod tests { let invalid_path = PathBuf::from("/non/existent/path"); let result = hash( invalid_path.clone(), Some("nearest"), Some("dhash"), + None, Some(false), ); assert!( @@ -74,7 +76,7 @@ mod tests { #[test] fn test_hash_with_sorting() { let img_dir = PathBuf::from("../../imgs/test/apple_pie"); - let result = hash(img_dir, Some("nearest"), Some("dhash"), Some(true)); + let result = hash(img_dir, Some("nearest"), Some("dhash"), None, Some(true)); assert!(result.is_ok(), "Hash function failed: {:?}", result.err()); @@ -109,6 +111,7 @@ mod tests { let result = dupes( temp_dir.path().to_path_buf(), Some("nearest"), Some("dhash"), + None, false, ); assert!(result.is_ok(), "Dupes function failed: {:?}", result.err()); @@ -124,7 +127,13 @@ mod tests { #[test] fn test_dupes_with_invalid_path() { let invalid_path = PathBuf::from("/non/existent/path"); let result = 
dupes(invalid_path.clone(), Some("nearest"), Some("dhash"), false); + let result = dupes( + invalid_path.clone(), + Some("nearest"), + Some("dhash"), + None, + false, + ); assert!( result.is_err(), "Expected error for invalid path: {:?}", diff --git a/crates/imgddcore/benches/core_benches.rs b/crates/imgddcore/benches/core_benches.rs index f9c7d6f..d351062 100644 --- a/crates/imgddcore/benches/core_benches.rs +++ b/crates/imgddcore/benches/core_benches.rs @@ -47,6 +47,7 @@ fn benchmark_collect_hashes(c: &mut Criterion) { black_box(&dir_path), black_box(image::imageops::FilterType::Triangle), black_box("dhash"), + black_box(None), ) .expect("Failed to collect hashes"); }); @@ -55,8 +56,13 @@ fn benchmark_collect_hashes(c: &mut Criterion) { fn benchmark_sort_hashes(c: &mut Criterion) { let dir_path = PathBuf::from("../../imgs/test"); - let mut hash_paths = collect_hashes(&dir_path, image::imageops::FilterType::Triangle, "dhash") - .expect("Failed to collect hashes"); + let mut hash_paths = collect_hashes( + &dir_path, + image::imageops::FilterType::Triangle, + "dhash", + None, + ) + .expect("Failed to collect hashes"); c.bench_function("sort_hashes", |b| { b.iter(|| { @@ -67,8 +73,13 @@ fn benchmark_sort_hashes(c: &mut Criterion) { fn benchmark_find_duplicates(c: &mut Criterion) { let dir_path = PathBuf::from("../../imgs/test"); - let mut hash_paths = collect_hashes(&dir_path, image::imageops::FilterType::Triangle, "dhash") - .expect("Failed to collect hashes"); + let mut hash_paths = collect_hashes( + &dir_path, + image::imageops::FilterType::Triangle, + "dhash", + None, + ) + .expect("Failed to collect hashes"); sort_hashes(&mut hash_paths); c.bench_function("find_duplicates", |b| { @@ -139,7 +150,7 @@ fn benchmark_phash(c: &mut Criterion) { c.bench_function("phash", |b| { b.iter(|| { // Compute pHash for the normalized image - ImageHash::phash(black_box(&normalized_image)).expect("Failed to compute phash"); + ImageHash::phash(black_box(&normalized_image), 
8).expect("Failed to compute phash"); }); }); } diff --git a/crates/imgddcore/src/dedupe.rs b/crates/imgddcore/src/dedupe.rs index 51ec3e4..f02261d 100644 --- a/crates/imgddcore/src/dedupe.rs +++ b/crates/imgddcore/src/dedupe.rs @@ -19,6 +19,7 @@ use walkdir::WalkDir; /// Options: `Nearest`, `Triangle`, `CatmullRom`, `Gaussian`, `Lanczos3`. /// * `algo` - The hashing algorithm to use. /// Options: `dhash`, `ahash`, `mhash`, `phash`, `whash`. +/// * `hash_size` - The hash size for phash (ignored by other algorithms). /// /// # Returns /// @@ -27,7 +28,8 @@ pub fn collect_hashes( path: &PathBuf, filter: FilterType, algo: &str, -) -> Result, Error> { + hash_size: Option, +) -> Result, Error> { let files: Vec = WalkDir::new(path) .into_iter() .filter_map(|entry| entry.ok()) @@ -35,30 +37,31 @@ pub fn collect_hashes( .map(|entry| entry.path().to_path_buf()) .collect(); - let hash_paths: Vec<(u64, PathBuf)> = files + let hash_paths: Vec<(ImageHash, PathBuf)> = files .par_iter() .filter_map(|file_path| match open_image(file_path) { Ok(image) => { let hash = match algo { "dhash" => { let normalized = normalize::proc(&image, filter, 9, 8).ok()?; - ImageHash::dhash(&normalized).ok()?.get_hash() + ImageHash::dhash(&normalized).ok()? } "ahash" => { let normalized = normalize::proc(&image, filter, 8, 8).ok()?; - ImageHash::ahash(&normalized).ok()?.get_hash() + ImageHash::ahash(&normalized).ok()? } "mhash" => { let normalized = normalize::proc(&image, filter, 8, 8).ok()?; - ImageHash::mhash(&normalized).ok()?.get_hash() + ImageHash::mhash(&normalized).ok()? } "phash" => { + let hash_size = hash_size.unwrap_or(8); let normalized = normalize::proc(&image, filter, 32, 32).ok()?; - ImageHash::phash(&normalized).ok()?.get_hash() + ImageHash::phash(&normalized, hash_size).ok()? } "whash" => { let normalized = normalize::proc(&image, filter, 8, 8).ok()?; - ImageHash::whash(&normalized).ok()?.get_hash() + ImageHash::whash(&normalized).ok()? 
} _ => panic!("Unsupported hashing algorithm: {}", algo), }; @@ -80,8 +83,8 @@ pub fn collect_hashes( /// /// * `hash_paths` - A mutable reference to a vector of hash-path tuples. #[inline] -pub fn sort_hashes(hash_paths: &mut Vec<(u64, PathBuf)>) { - hash_paths.sort_by_key(|(hash, _)| *hash); +pub fn sort_hashes(hash_paths: &mut Vec<(ImageHash, PathBuf)>) { + hash_paths.sort_by_key(|(hash, _)| hash.clone()); } /// Opens an image file and decodes it. @@ -120,16 +123,16 @@ pub fn open_image(file_path: &PathBuf) -> Result { /// /// Returns an error if a file fails to be removed when `remove` is set to `true`. pub fn find_duplicates( - hash_paths: &[(u64, PathBuf)], + hash_paths: &[(ImageHash, PathBuf)], remove: bool, -) -> Result>, Error> { - let mut duplicates_map: HashMap> = HashMap::new(); +) -> Result>, Error> { + let mut duplicates_map: HashMap> = HashMap::new(); for window in hash_paths.windows(2) { if let [(hash1, path1), (hash2, path2)] = window { if hash1 == hash2 { duplicates_map - .entry(*hash1) + .entry(hash1.clone()) .or_insert_with(Vec::new) .extend(vec![path1.clone(), path2.clone()]); } diff --git a/crates/imgddcore/src/hashing.rs b/crates/imgddcore/src/hashing.rs index e4105e2..7fe7273 100644 --- a/crates/imgddcore/src/hashing.rs +++ b/crates/imgddcore/src/hashing.rs @@ -5,15 +5,29 @@ use dwt::wavelet::Haar; use dwt::{Operation, Transform}; use rustdct::DctPlanner; -/// A structure representing the hash of an image as u64. +/// A structure representing the hash of an image. /// /// The `ImageHash` structure is used to store and compare the hash of an image for deduplication purposes. -#[derive(Eq, PartialEq, Hash, Clone)] +/// The hash is stored as a vector of bytes, allowing for variable-sized hashes. +#[derive(Eq, PartialEq, Hash, Clone, Ord, PartialOrd, Debug)] pub struct ImageHash { - hash: u64, + hash: Vec, } impl ImageHash { + /// Creates a new ImageHash from a u64 value (for 64-bit hashes). 
+ #[inline] + pub fn from_u64(hash: u64) -> Self { + let bytes = hash.to_be_bytes().to_vec(); + Self { hash: bytes } + } + + /// Returns the hash length in bytes. + #[inline] + pub fn num_bytes(&self) -> usize { + self.hash.len() + } + /// Computes the average hash (aHash) of a given image. /// /// # Arguments @@ -48,7 +62,7 @@ impl ImageHash { } } - Ok(Self { hash }) + Ok(Self::from_u64(hash)) } /// Computes the median hash (mHash) of a given image. @@ -88,7 +102,7 @@ impl ImageHash { } } - Ok(Self { hash }) + Ok(Self::from_u64(hash)) } /// Computes the difference hash (dHash) of a given image. @@ -116,13 +130,14 @@ impl ImageHash { } } - Ok(Self { hash }) + Ok(Self::from_u64(hash)) } /// Computes the perceptual hash (pHash) of a given image. /// /// # Arguments: /// * `image` - A reference to a `DynamicImage` for which the hash is to be calculated. + /// * `hash_size` - The size of the hash (e.g., 8 for 8x8, 16 for 16x16). Defaults to 8. /// /// # Returns: /// * An `ImageHash` instance containing the computed pHash value. @@ -132,9 +147,8 @@ impl ImageHash { /// - Analyzes the frequency domain using Discrete Cosine Transform (DCT). /// - Focuses on low-frequency components, which are less affected by resizing or compression. 
#[inline] - pub fn phash(image: &DynamicImage) -> Result { + pub fn phash(image: &DynamicImage, hash_size: usize) -> Result { const IMG_SIZE: usize = 32; - const HASH_SIZE: usize = 8; // Collect pixel values from normalized 32x32 grayscale image let mut pixels: Vec = image.pixels().map(|p| p.2[0] as f32).collect(); @@ -163,11 +177,12 @@ impl ImageHash { } } - // Extract top-left 8x8 DCT coefficients (low frequencies) - let mut dct_lowfreq = [0f32; HASH_SIZE * HASH_SIZE]; - for y in 0..HASH_SIZE { - for x in 0..HASH_SIZE { - dct_lowfreq[y * HASH_SIZE + x] = pixels[y * IMG_SIZE + x]; + // Extract top-left hash_size x hash_size DCT coefficients (low frequencies) + let hash_size_sq = hash_size * hash_size; + let mut dct_lowfreq = vec![0f32; hash_size_sq]; + for y in 0..hash_size { + for x in 0..hash_size { + dct_lowfreq[y * hash_size + x] = pixels[y * IMG_SIZE + x]; } } @@ -177,15 +192,21 @@ impl ImageHash { ac_coeffs.select_nth_unstable_by(mid, |a, b| a.partial_cmp(b).unwrap()); let median = ac_coeffs[mid]; - // Generate hash - let mut hash = 0u64; + // Generate hash as Vec + // Calculate number of bytes needed (hash_size * hash_size bits) + let num_bits = hash_size_sq; + let num_bytes = (num_bits + 7) / 8; // Round up to nearest byte + let mut hash_bytes = vec![0u8; num_bytes]; + for (i, &val) in dct_lowfreq.iter().enumerate() { if val > median { - hash |= 1 << (63 - i); + let byte_idx = i / 8; + let bit_idx = 7 - (i % 8); // MSB first (big-endian) + hash_bytes[byte_idx] |= 1 << bit_idx; } } - Ok(Self { hash }) + Ok(Self { hash: hash_bytes }) } /// Computes the wavelet hash (wHash) of a given image. @@ -242,16 +263,16 @@ impl ImageHash { } } - Ok(Self { hash }) + Ok(Self::from_u64(hash)) } /// Retrieves the computed hash value. /// /// # Returns /// - /// * Hash value as a `u64`. + /// * A reference to the hash as a slice (`&[u8]`). 
#[inline] - pub fn get_hash(&self) -> u64 { - self.hash + pub fn get_hash(&self) -> &[u8] { + &self.hash } } diff --git a/crates/imgddcore/tests/dedupe_tests.rs b/crates/imgddcore/tests/dedupe_tests.rs index d9dd978..7504b58 100644 --- a/crates/imgddcore/tests/dedupe_tests.rs +++ b/crates/imgddcore/tests/dedupe_tests.rs @@ -20,20 +20,30 @@ mod tests { let algorithms = ["dhash", "ahash", "mhash", "phash", "whash"]; for algo in algorithms { - let hashes = - collect_hashes(&temp_dir.path().to_path_buf(), FilterType::Nearest, algo).unwrap(); + let hashes = collect_hashes( + &temp_dir.path().to_path_buf(), + FilterType::Nearest, + algo, + None, + ) + .unwrap(); assert_eq!(hashes.len(), 1, "Algorithm {} failed", algo); } } #[test] fn test_sort_hashes() { - let mut hashes = vec![(2, PathBuf::from("b")), (1, PathBuf::from("a"))]; + use imgddcore::hashing::ImageHash; + let hash1 = ImageHash::from_u64(2); + let hash2 = ImageHash::from_u64(1); + let mut hashes = vec![ + (hash1.clone(), PathBuf::from("b")), + (hash2.clone(), PathBuf::from("a")), + ]; sort_hashes(&mut hashes); - assert_eq!( - hashes, - vec![(1, PathBuf::from("a")), (2, PathBuf::from("b"))] - ); + // After sorting, hash with value 1 should come before hash with value 2 + assert_eq!(hashes[0].0, hash2); + assert_eq!(hashes[1].0, hash1); } #[test] @@ -47,6 +57,7 @@ mod tests { &temp_dir.path().to_path_buf(), FilterType::Nearest, "unsupported_algo", + None, ) }); @@ -62,7 +73,12 @@ mod tests { let mut file = File::create(&invalid_image_path).unwrap(); file.write_all(b"not a valid image").unwrap(); - let result = collect_hashes(&temp_dir.path().to_path_buf(), FilterType::Nearest, "dhash"); + let result = collect_hashes( + &temp_dir.path().to_path_buf(), + FilterType::Nearest, + "dhash", + None, + ); assert!(result.is_ok()); // Valid path, but should log errors for invalid image } @@ -74,7 +90,12 @@ mod tests { // Create empty file that can't be decoded File::create(&invalid_image_path).unwrap(); - let result = 
collect_hashes(&temp_dir.path().to_path_buf(), FilterType::Nearest, "dhash"); + let result = collect_hashes( + &temp_dir.path().to_path_buf(), + FilterType::Nearest, + "dhash", + None, + ); assert!(result.is_ok()); // Valid path, but decode errors should be logged } @@ -107,7 +128,12 @@ mod tests { std::fs::write(&file_path_2, b"file 2 content").unwrap(); // Mock duplicate hash paths - let hash_paths = vec![(1, file_path_1.clone()), (1, file_path_2.clone())]; + use imgddcore::hashing::ImageHash; + let hash = ImageHash::from_u64(1); + let hash_paths = vec![ + (hash.clone(), file_path_1.clone()), + (hash, file_path_2.clone()), + ]; // Test with `remove = true` to trigger file deletion let result = find_duplicates(&hash_paths, true); @@ -133,7 +159,12 @@ mod tests { assert!(file_path_1.exists()); // Mock duplicate hash paths, including a non-existent file - let hash_paths = vec![(1, file_path_1.clone()), (1, file_path_2.clone())]; + use imgddcore::hashing::ImageHash; + let hash = ImageHash::from_u64(1); + let hash_paths = vec![ + (hash.clone(), file_path_1.clone()), + (hash, file_path_2.clone()), + ]; // Test with `remove = true` to trigger file deletion let result = find_duplicates(&hash_paths, true); diff --git a/crates/imgddcore/tests/hashing_tests.rs b/crates/imgddcore/tests/hashing_tests.rs index 048332d..dd2a634 100644 --- a/crates/imgddcore/tests/hashing_tests.rs +++ b/crates/imgddcore/tests/hashing_tests.rs @@ -21,13 +21,14 @@ mod tests { fn test_ahash() -> Result<()> { let test_image = create_mock_image((8, 8)); let hash = ImageHash::ahash(&test_image)?; - println!("aHash: {:064b}", hash.get_hash()); - let expected_hash = 0b1010101010101010101010101010101010101010101010101010101010101010; + let expected_hash = 0b1010101010101010101010101010101010101010101010101010101010101010u64; + let expected_bytes = expected_hash.to_be_bytes(); assert_eq!( hash.get_hash(), - expected_hash, + expected_bytes.as_slice(), "aHash does not match expected value" ); + 
assert_eq!(hash.num_bytes(), 8, "aHash size should be 8"); Ok(()) } @@ -36,14 +37,14 @@ mod tests { fn test_mhash() -> Result<()> { let test_image = create_mock_image((8, 8)); let hash = ImageHash::mhash(&test_image)?; - println!("mHash: {:064b}", hash.get_hash()); - let expected_hash = 0b1010101010101010101010101010101010101010101010101010101010101010; - + let expected_hash = 0b1010101010101010101010101010101010101010101010101010101010101010u64; + let expected_bytes = expected_hash.to_be_bytes(); assert_eq!( hash.get_hash(), - expected_hash, + expected_bytes.as_slice(), "mHash does not match expected value" ); + assert_eq!(hash.num_bytes(), 8, "mHash size should be 8"); Ok(()) } @@ -52,13 +53,14 @@ mod tests { fn test_dhash() -> Result<()> { let test_image = create_mock_image((9, 8)); let hash = ImageHash::dhash(&test_image)?; - println!("dHash: {:064b}", hash.get_hash()); - let expected_hash = 0b0101010101010101010101010101010101010101010101010101010101010101; + let expected_hash = 0b0101010101010101010101010101010101010101010101010101010101010101u64; + let expected_bytes = expected_hash.to_be_bytes(); assert_eq!( hash.get_hash(), - expected_hash, + expected_bytes.as_slice(), "dHash does not match expected value" ); + assert_eq!(hash.num_bytes(), 8, "dHash size should be 8"); Ok(()) } @@ -66,14 +68,33 @@ mod tests { #[test] fn test_phash() -> Result<()> { let test_image = create_mock_image((32, 32)); - let hash = ImageHash::phash(&test_image)?; - let expected_hash = 0b1101010100000000000000000000000000000000000000000000000000000000; - println!("pHash: {:064b}", hash.get_hash()); + let hash = ImageHash::phash(&test_image, 8)?; + let expected_hash = 0b1101010100000000000000000000000000000000000000000000000000000000u64; + let expected_bytes = expected_hash.to_be_bytes(); + assert_eq!( + hash.get_hash(), + expected_bytes.as_slice(), + "pHash does not match expected value" + ); + assert_eq!(hash.num_bytes(), 8, "pHash size should be 8"); + + Ok(()) + } + + #[test] + 
fn test_phash_size_16() -> Result<()> { + let test_image = create_mock_image((32, 32)); + let hash = ImageHash::phash(&test_image, 16)?; + let expected_bytes: [u8; 32] = [ + 0b11010101, 0b1010101, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + ]; assert_eq!( hash.get_hash(), - expected_hash, + expected_bytes.as_slice(), "pHash does not match expected value" ); + assert_eq!(hash.num_bytes(), 32, "pHash size should be 32"); Ok(()) } @@ -82,14 +103,24 @@ mod tests { fn test_whash() -> Result<()> { let test_image = create_mock_image((8, 8)); let hash = ImageHash::whash(&test_image)?; - println!("wHash: {:064b}", hash.get_hash()); - let expected_hash = 0b1010101010101010101010101010101010101010101010101010101010101010; - + let expected_hash = 0b1010101010101010101010101010101010101010101010101010101010101010u64; + let expected_bytes = expected_hash.to_be_bytes(); assert_eq!( hash.get_hash(), - expected_hash, + expected_bytes.as_slice(), "wHash does not match expected value" ); + assert_eq!(hash.num_bytes(), 8, "wHash size should be 8"); + + Ok(()) + } + + #[test] + fn test_hash_comparison() -> Result<()> { + let test_image = create_mock_image((8, 8)); + let hash1 = ImageHash::ahash(&test_image)?; + let hash2 = ImageHash::ahash(&test_image)?; + assert_eq!(hash1, hash2, "Same image should produce same hash"); Ok(()) } diff --git a/crates/imgddpy/comparison/compare.py b/crates/imgddpy/comparison/compare.py index 999e1d4..5f71e04 100644 --- a/crates/imgddpy/comparison/compare.py +++ b/crates/imgddpy/comparison/compare.py @@ -4,6 +4,7 @@ import imgdd as dd import imagehash import os +from typing import Optional def collect_image_count(path: str) -> int: @@ -33,10 +34,10 @@ def benchmark_function(func, num_runs=50, warmup=3, **kwargs): } -def imgdd_benchmark(path: str, algo: str, num_runs: int, num_images: int) -> dict: +def imgdd_benchmark(path: str, algo: str, hash_size: Optional[int], num_runs: int, num_images: int) -> dict: 
"""Benchmark imgdd library.""" def run_imgdd_hash(): - dd.hash(path=path, algo=algo, filter="Nearest", sort=False) + dd.hash(path=path, algo=algo, filter="Nearest", hash_size=hash_size, sort=False) results = benchmark_function(run_imgdd_hash, num_runs=num_runs) for key in results: @@ -44,7 +45,7 @@ def run_imgdd_hash(): return results -def imagehash_benchmark(path: str, algo: str, num_runs: int, num_images: int) -> dict: +def imagehash_benchmark(path: str, algo: str, hash_size: Optional[int], num_runs: int, num_images: int) -> dict: """Benchmark imagehash library.""" def run_imagehash(algo: str): for root, _, files in os.walk(path): @@ -56,7 +57,10 @@ def run_imagehash(algo: str): if algo == "ahash": imagehash.average_hash(image) elif algo == "phash": - imagehash.phash(image) + if hash_size is None: + imagehash.phash(image) + else: + imagehash.phash(image, hash_size) elif algo == "dhash": imagehash.dhash(image) elif algo == "whash": @@ -94,7 +98,7 @@ def calc_diff(imgdd_result: dict, imagehash_result: dict): if __name__ == "__main__": IMAGE_DIR = "../../../imgs/test/" - ALGORITHMS = ["dHash", "aHash", "pHash", "wHash"] # mHash has no equivalent in imagehash + ALGORITHMS = ["dHash", "aHash", "pHash", "pHash256", "wHash"] # mHash has no equivalent in imagehash NUM_RUNS = 100 WARM_UP = 5 @@ -106,12 +110,18 @@ def calc_diff(imgdd_result: dict, imagehash_result: dict): for algo in ALGORITHMS: print(f"Benchmarking {algo}...\n") + if algo == "pHash256": + coreAlgo = "pHash" + hash_size = 16 + else: + coreAlgo = algo + hash_size = None # Benchmark imgdd - imgdd_result = imgdd_benchmark(IMAGE_DIR, algo, NUM_RUNS, num_images) + imgdd_result = imgdd_benchmark(IMAGE_DIR, coreAlgo, hash_size, NUM_RUNS, num_images) # Benchmark imagehash - imagehash_result = imagehash_benchmark(IMAGE_DIR, algo, NUM_RUNS, num_images) + imagehash_result = imagehash_benchmark(IMAGE_DIR, coreAlgo, hash_size, NUM_RUNS, num_images) # Compare results compare_benchmarks(imgdd_result, imagehash_result, 
algo) diff --git a/crates/imgddpy/imgdd.pyi b/crates/imgddpy/imgdd.pyi index a5b6b74..9e11e9e 100644 --- a/crates/imgddpy/imgdd.pyi +++ b/crates/imgddpy/imgdd.pyi @@ -1,9 +1,10 @@ -from typing import Literal, Dict +from typing import Literal, Dict, Optional def hash( path: str, filter: Literal["Nearest", "Triangle", "CatmullRom", "Gaussian", "Lanczos3"] = "Nearest", algo: Literal["aHash", "mHash", "dHash", "pHash", "wHash"] = "dHash", + hash_size: Optional[int] = None, sort: bool = False, ) -> Dict[str, str]: """ @@ -13,6 +14,8 @@ def hash( path (str): Path to the directory containing images. filter (str): Resize filter to use. algo (str): Hashing algorithm. + hash_size (int): Hash size for pHash algorithm (e.g., 8, 16). Only used for pHash. + sort (bool): Whether to sort the results by hash values. Returns: Dict[str, str]: A dictionary mapping file paths to their hashes. @@ -23,6 +26,7 @@ def dupes( path: str, filter: Literal["Nearest", "Triangle", "CatmullRom", "Gaussian", "Lanczos3"] = "Nearest", algo: Literal["aHash", "mHash", "dHash", "pHash", "wHash"] = "dHash", + hash_size: Optional[int] = None, remove: bool = False, ) -> Dict[str, list[str]]: """ @@ -32,6 +36,7 @@ def dupes( path (str): Path to the directory containing images. filter (str): Resize filter to use. algo (str): Hashing algorithm. + hash_size (int): Hash size for pHash algorithm (e.g., 8, 16). Only used for pHash. remove (bool): Whether to remove duplicate files. Returns: diff --git a/crates/imgddpy/src/lib.rs b/crates/imgddpy/src/lib.rs index 4358333..ece2d20 100644 --- a/crates/imgddpy/src/lib.rs +++ b/crates/imgddpy/src/lib.rs @@ -30,7 +30,7 @@ fn select_algo(algo: Option<&str>) -> &'static str { } /// ```python -/// hash(path, filter="triangle", algo="dhash", sort=False) +/// hash(path, filter="triangle", algo="dhash", hash_size=None, sort=False) /// ``` /// /// Calculate the hash of images in a directory. 
@@ -43,6 +43,10 @@ fn select_algo(algo: Option<&str>) -> &'static str { /// - `algo (str)`: Hashing algorithm. /// - **Options:** [`aHash`, `mHash`, `dHash`, `pHash`, `wHash`] /// - **Default:** `dHash` +/// - `hash_size (int)`: Hash size for pHash algorithm (e.g., 8, 16). +/// The resulting hash will be hash_size^2 bits long. The value is ignored +/// for all hash methods other than pHash. +/// - **Default:** `8` /// - `sort (bool)`: Whether to sort the results by hash values. /// - **Default:** `False` /// @@ -61,11 +65,12 @@ fn select_algo(algo: Option<&str>) -> &'static str { /// ) /// print(results) /// ``` -#[pyfunction(signature = (path, filter = None, algo = None, sort = false))] +#[pyfunction(signature = (path, filter = None, algo = None, hash_size = None, sort = false))] pub fn hash( path: PathBuf, filter: Option<&str>, algo: Option<&str>, + hash_size: Option, sort: Option, ) -> PyResult> { let validated_path = validate_path(&path) @@ -73,7 +78,7 @@ pub fn hash( let filter_type = select_filter_type(filter); let algo = select_algo(algo); - let mut hash_paths = collect_hashes(&validated_path, filter_type, &algo) + let mut hash_paths = collect_hashes(&validated_path, filter_type, &algo, hash_size) .map_err(|e| PyErr::new::(format!("{}", e)))?; // Optionally sort hashes @@ -83,12 +88,19 @@ pub fn hash( Ok(hash_paths .into_iter() - .map(|(hash, path)| (path, format!("{:x}", hash))) + .map(|(hash, path)| { + let hash_hex = hash + .get_hash() + .iter() + .map(|b| format!("{:02x}", b)) + .collect::(); + (path, hash_hex) + }) .collect()) } /// ```python -/// dupes(path, filter="triangle", algo="dhash", remove=False) +/// dupes(path, filter="triangle", algo="dhash", hash_size=None, remove=False) /// ``` /// /// Find duplicate images in a directory. @@ -101,6 +113,10 @@ pub fn hash( /// - `algo (str)`: Hashing algorithm. 
/// - **Options:** [`aHash`, `mHash`, `dHash`, `pHash`, `wHash`] /// - **Default:** `dHash` +/// - `hash_size (int)`: Hash size for pHash algorithm (e.g., 8, 16). +/// The resulting hash will be hash_size^2 bits long. The value is ignored +/// for all hash methods other than pHash. +/// - **Default:** `8` /// - `remove (bool)`: Whether to remove duplicate files /// - **Default:** `False` /// @@ -119,11 +135,12 @@ pub fn hash( /// ) /// print(duplicates) /// ``` -#[pyfunction(signature = (path, filter = None, algo = None, remove = false))] +#[pyfunction(signature = (path, filter = None, algo = None, hash_size = None, remove = false))] pub fn dupes( path: PathBuf, filter: Option<&str>, algo: Option<&str>, + hash_size: Option, remove: bool, ) -> PyResult>> { let validated_path = validate_path(&path) @@ -131,7 +148,7 @@ pub fn dupes( let filter_type = select_filter_type(filter); let algo = select_algo(algo); - let mut hash_paths = collect_hashes(&validated_path, filter_type, &algo) + let mut hash_paths = collect_hashes(&validated_path, filter_type, &algo, hash_size) .map_err(|e| PyErr::new::(format!("{}", e)))?; sort_hashes(&mut hash_paths); @@ -140,7 +157,14 @@ pub fn dupes( Ok(duplicates .into_iter() - .map(|(hash, paths)| (format!("{:x}", hash), paths)) + .map(|(hash, paths)| { + let hash_hex = hash + .get_hash() + .iter() + .map(|b| format!("{:02x}", b)) + .collect::(); + (hash_hex, paths) + }) .collect()) }