diff --git a/.github/workflows/docs-python.yml b/.github/workflows/docs-python.yml
index 1cd4324..f044d3b 100644
--- a/.github/workflows/docs-python.yml
+++ b/.github/workflows/docs-python.yml
@@ -91,5 +91,5 @@ jobs:
           # Build and deploy using mike
           cd crates/imgddpy/
-          mike deploy --push --branch gh-pages "$VERSION" latest
+          mike deploy --push --update-aliases --branch gh-pages "$VERSION" latest
           mike set-default --push --branch gh-pages latest
\ No newline at end of file
diff --git a/.github/workflows/release-python.yml b/.github/workflows/release-python.yml
index c91d00e..038f199 100644
--- a/.github/workflows/release-python.yml
+++ b/.github/workflows/release-python.yml
@@ -188,7 +188,7 @@ jobs:
       - name: Upload wheel artifact
         uses: actions/upload-artifact@v4
         with:
-          name: wheels-${{ github.run_id }}
+          name: wheels-${{ github.run_id }}-${{ matrix.os }}-${{ matrix.compatibility }}
           path: target/wheels/*.whl
           overwrite: true
 
@@ -209,8 +209,9 @@ jobs:
       - name: Merge all wheels and source distributions into `output/`
         run: |
           mkdir -p output
-          mv root/wheels-${{ github.run_id }}/*.whl output/ || true
-          mv root/sdist-${{ github.run_id }}/*.tar.gz output/ || true
+          # Move all wheel and sdist files from any artifact subfolder into output/
+          find root -type f -name "*.whl" -exec mv {} output/ \;
+          find root -type f -name "*.tar.gz" -exec mv {} output/ \;
           ls -l output/
 
       - name: Publish to PyPI
@@ -233,11 +234,11 @@ jobs:
       - name: Merge all wheels and source distributions into `output/`
         run: |
           mkdir -p output
-          mv root/wheels-${{ github.run_id }}/*.whl output/ || true
-          mv root/sdist-${{ github.run_id }}/*.tar.gz output/ || true
+          find root -type f -name "*.whl" -exec mv {} output/ \;
+          find root -type f -name "*.tar.gz" -exec mv {} output/ \;
           ls -l output/
 
       - name: Dry run output
         run: |
           echo "Dry run completed. Artifacts are built and available:"
-          ls -l output/
\ No newline at end of file
+          ls -l output/
diff --git a/Cargo.lock b/Cargo.lock
index 25db9d1..0020bc3 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -501,7 +501,7 @@ dependencies = [
 
 [[package]]
 name = "imgdd"
-version = "0.1.2"
+version = "0.1.3"
 dependencies = [
  "anyhow",
  "criterion",
@@ -512,7 +512,7 @@ dependencies = [
 
 [[package]]
 name = "imgddcore"
-version = "0.1.2"
+version = "0.1.3"
 dependencies = [
  "anyhow",
  "codspeed-criterion-compat",
@@ -527,7 +527,7 @@ dependencies = [
 
 [[package]]
 name = "imgddpy"
-version = "0.1.4"
+version = "0.1.5"
 dependencies = [
  "image",
  "imgddcore",
diff --git a/Cargo.toml b/Cargo.toml
index 7345090..ab3c29c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,6 +5,7 @@ members = [
 ]
 
 [workspace.package]
+version = "0.1.3"
 edition = "2021"
 license = "GPL-3.0-or-later"
 authors = ["Aaron Stopher "]
diff --git a/README.md b/README.md
index 5afb584..0903bfb 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,8 @@
+[![Documentation Status](https://img.shields.io/badge/docs-online-brightgreen)](https://aastopher.github.io/imgdd/)
 [![imgdd pypi](https://img.shields.io/pypi/v/imgdd?label=imgdd%20pypi)](https://pypi.org/project/imgdd)
 [![imgdd crate](https://img.shields.io/crates/v/imgdd?label=imgdd)](https://crates.io/crates/imgdd)
 [![imgddcore crate](https://img.shields.io/crates/v/imgddcore?label=imgddcore)](https://crates.io/crates/imgddcore)
 [![codecov](https://codecov.io/gh/aastopher/imgdd/graph/badge.svg?token=XZ1O2X04SO)](https://codecov.io/gh/aastopher/imgdd)
-[![Documentation Status](https://img.shields.io/badge/docs-online-brightgreen)](https://aastopher.github.io/imgdd/)
 [![DeepSource](https://app.deepsource.com/gh/aastopher/imgdd.svg/?label=active+issues&show_trend=true&token=IiuhCO6n1pK-GAJ800k6Z_9t)](https://app.deepsource.com/gh/aastopher/imgdd/)
 
 # imgdd: Image DeDuplication
@@ -17,7 +17,7 @@
 
 ## Why imgdd?
 
-`imgdd` has been inspired by [imagehash](https://github.com/JohannesBuchner/imagehash) and aims to be a lightning-fast replacement with additional features. To ensure enhanced performance, `imgdd` has been benchmarked against `imagehash`. In Python, **imgdd consistently outperforms imagehash by ~60%–95%**, demonstrating a significant reduction in hashing time per image.
+`imgdd` has been inspired by [imagehash](https://github.com/JohannesBuchner/imagehash) and aims to be a lightning-fast replacement with additional features. To ensure enhanced performance, `imgdd` has been benchmarked against `imagehash`. In Python, [**imgdd consistently outperforms imagehash by ~60%–95%**](https://aastopher.github.io/imgdd/latest/benches), demonstrating a significant reduction in hashing time per image.
 
 ---
diff --git a/crates/imgdd/Cargo.toml b/crates/imgdd/Cargo.toml
index ea449f6..abc20da 100644
--- a/crates/imgdd/Cargo.toml
+++ b/crates/imgdd/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "imgdd"
-version = "0.1.2"
+version.workspace = true
 edition.workspace = true
 license.workspace = true
 authors.workspace = true
@@ -13,7 +13,7 @@ categories.workspace = true
 readme = "README.md"
 
 [dependencies]
-imgddcore = { path = "../imgddcore", version = "0.1.2" }
+imgddcore = { path = "../imgddcore", version = "0.1.3" }
 image.workspace = true
 anyhow.workspace = true
 criterion = { version = "0.5.1", optional = true }
diff --git a/crates/imgdd/README.md b/crates/imgdd/README.md
index e7c3dbb..8d8e968 100644
--- a/crates/imgdd/README.md
+++ b/crates/imgdd/README.md
@@ -1,7 +1,9 @@
-[![codecov](https://codecov.io/gh/aastopher/imgdd/graph/badge.svg?token=XZ1O2X04SO)](https://codecov.io/gh/aastopher/imgdd)
 [![Documentation Status](https://img.shields.io/badge/docs-online-brightgreen)](https://aastopher.github.io/imgdd/)
+[![codecov](https://codecov.io/gh/aastopher/imgdd/graph/badge.svg?token=XZ1O2X04SO)](https://codecov.io/gh/aastopher/imgdd)
 [![DeepSource](https://app.deepsource.com/gh/aastopher/imgdd.svg/?label=active+issues&show_trend=true&token=IiuhCO6n1pK-GAJ800k6Z_9t)](https://app.deepsource.com/gh/aastopher/imgdd/)
+
+
 # imgdd: Image DeDuplication
 
 `imgdd` is a performance-first perceptual hashing library that combines Rust's speed with Python's accessibility, making it perfect for handling large datasets. Designed to quickly process nested folder structures, commonly found in image datasets.
diff --git a/crates/imgdd/benches/rust_benches.rs b/crates/imgdd/benches/rust_benches.rs
index 4c03506..98ea33e 100644
--- a/crates/imgdd/benches/rust_benches.rs
+++ b/crates/imgdd/benches/rust_benches.rs
@@ -1,4 +1,4 @@
-use criterion::{criterion_group, criterion_main, Criterion, black_box};
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
 use imgdd::*;
 use std::path::PathBuf;
 
@@ -23,7 +23,12 @@ fn benchmark_hash(c: &mut Criterion) {
 
     c.bench_function("hash_function", |b| {
         b.iter(|| {
-            let result = hash(black_box(dir_path.clone()), Some("nearest"), Some("dhash"), Some(false));
+            let result = hash(
+                black_box(dir_path.clone()),
+                Some("nearest"),
+                Some("dhash"),
+                Some(false),
+            );
             let _ = black_box(result).is_ok(); // Ignore the result
         });
     });
diff --git a/crates/imgdd/src/lib.rs b/crates/imgdd/src/lib.rs
index 4795da6..e5e0031 100644
--- a/crates/imgdd/src/lib.rs
+++ b/crates/imgdd/src/lib.rs
@@ -1,12 +1,12 @@
 //! Rust interface for fast and efficient image deduplication.
 //! Leverages perceptual hashing algorithms to identify duplicate or visually similar images in a directory.
 
+use anyhow::Error;
+use image::imageops::FilterType;
 use imgddcore::dedupe::*;
 use imgddcore::validate::*;
-use image::imageops::FilterType;
 use std::collections::HashMap;
 use std::path::PathBuf;
-use anyhow::Error;
 
 /// Converts a string to a `FilterType`.
 ///
@@ -52,7 +52,6 @@ pub fn select_algo(algo: Option<&str>) -> &'static str {
     }
 }
 
-
 /// Calculates hashes for all images in a directory recursively.
 ///
 /// # Arguments
@@ -105,7 +104,6 @@ pub fn hash(
     Ok(hash_paths)
 }
 
-
 /// Finds duplicate images in a directory.
 ///
 /// # Arguments
diff --git a/crates/imgdd/tests/rust_tests.rs b/crates/imgdd/tests/rust_tests.rs
index 5c7af6f..2771532 100644
--- a/crates/imgdd/tests/rust_tests.rs
+++ b/crates/imgdd/tests/rust_tests.rs
@@ -1,23 +1,28 @@
 #[cfg(test)]
 mod tests {
+    use image::imageops::FilterType;
     use imgdd::*;
-    use tempfile::tempdir;
     use std::fs::File;
     use std::io::Write;
     use std::path::PathBuf;
-    use image::imageops::FilterType;
-
+    use tempfile::tempdir;
     #[test]
     fn test_select_filter_type() {
         assert_eq!(select_filter_type(Some("nearest")), FilterType::Nearest);
         assert_eq!(select_filter_type(Some("triangle")), FilterType::Triangle);
-        assert_eq!(select_filter_type(Some("catmullrom")), FilterType::CatmullRom);
+        assert_eq!(
+            select_filter_type(Some("catmullrom")),
+            FilterType::CatmullRom
+        );
         assert_eq!(select_filter_type(Some("gaussian")), FilterType::Gaussian);
         assert_eq!(select_filter_type(Some("lanczos3")), FilterType::Lanczos3);
 
         let result = std::panic::catch_unwind(|| select_filter_type(Some("unsupported")));
-        assert!(result.is_err(), "Expected panic for unsupported filter type");
+        assert!(
+            result.is_err(),
+            "Expected panic for unsupported filter type"
+        );
     }
 
     #[test]
@@ -41,15 +46,29 @@ mod tests {
         let mut file = File::create(&image_path).unwrap();
         file.write_all(b"not a valid image").unwrap();
 
-        let result = hash(temp_dir.path().to_path_buf(), Some("nearest"), Some("dhash"), Some(false));
+        let result = hash(
+            temp_dir.path().to_path_buf(),
+            Some("nearest"),
+            Some("dhash"),
+            Some(false),
+        );
         assert!(result.is_ok(), "Hash function failed: {:?}", result.err());
     }
 
     #[test]
     fn test_hash_with_invalid_path() {
         let invalid_path = PathBuf::from("/non/existent/path");
-        let result = hash(invalid_path.clone(), Some("nearest"), Some("dhash"), Some(false));
-        assert!(result.is_err(), "Expected error for invalid path: {:?}", invalid_path);
+        let result = hash(
+            invalid_path.clone(),
+            Some("nearest"),
+            Some("dhash"),
+            Some(false),
+        );
+        assert!(
+            result.is_err(),
+            "Expected error for invalid path: {:?}",
+            invalid_path
+        );
     }
 
     #[test]
@@ -73,7 +92,6 @@ mod tests {
         assert!(sorted, "Hashes are not sorted: {:?}", hash_paths);
     }
 
-
     #[test]
     fn test_dupes_with_valid_inputs() {
         let temp_dir = tempdir().unwrap();
@@ -87,18 +105,30 @@ mod tests {
         let mut file2 = File::create(&image_path_2).unwrap();
         file2.write_all(b"not a valid image").unwrap();
 
-        let result = dupes(temp_dir.path().to_path_buf(), Some("nearest"), Some("dhash"), false);
+        let result = dupes(
+            temp_dir.path().to_path_buf(),
+            Some("nearest"),
+            Some("dhash"),
+            false,
+        );
         assert!(result.is_ok(), "Dupes function failed: {:?}", result.err());
 
         let duplicates = result.unwrap();
-        assert_eq!(duplicates.len(), 0, "Expected no duplicates, but found some");
+        assert_eq!(
+            duplicates.len(),
+            0,
+            "Expected no duplicates, but found some"
+        );
     }
 
     #[test]
     fn test_dupes_with_invalid_path() {
         let invalid_path = PathBuf::from("/non/existent/path");
         let result = dupes(invalid_path.clone(), Some("nearest"), Some("dhash"), false);
-        assert!(result.is_err(), "Expected error for invalid path: {:?}", invalid_path);
+        assert!(
+            result.is_err(),
+            "Expected error for invalid path: {:?}",
+            invalid_path
+        );
     }
-
-}
\ No newline at end of file
+}
diff --git a/crates/imgddcore/Cargo.toml b/crates/imgddcore/Cargo.toml
index 9a8d0a2..e419c26 100644
--- a/crates/imgddcore/Cargo.toml
+++ b/crates/imgddcore/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "imgddcore"
-version = "0.1.2"
+version.workspace = true
 edition.workspace = true
 license.workspace = true
 authors.workspace = true
diff --git a/crates/imgddcore/benches/core_benches.rs b/crates/imgddcore/benches/core_benches.rs
index 871cebd..f9c7d6f 100644
--- a/crates/imgddcore/benches/core_benches.rs
+++ b/crates/imgddcore/benches/core_benches.rs
@@ -1,14 +1,14 @@
-use criterion::{criterion_group, criterion_main, Criterion, black_box};
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
 
-use imgddcore::dedupe::{open_image, collect_hashes, sort_hashes, find_duplicates};
+use imgddcore::dedupe::{collect_hashes, find_duplicates, open_image, sort_hashes};
 use imgddcore::hashing::ImageHash;
 use imgddcore::normalize::proc as normalize;
 use std::path::PathBuf;
 
-// WARNING! 
+// WARNING!
 // dd.hash function benchmarks will be inaccurate because; this metric relies heavily on system calls.
 // Since they cannot be consistently instrumented, those calls are not included in the final measure.
-// To resolve this we must use hosted codspeed macro runners which require a pro plan. 
+// To resolve this we must use hosted codspeed macro runners which require a pro plan.
 // For now I will just leave this warning here.
 
 fn open_image_bench(c: &mut Criterion) {
@@ -27,8 +27,13 @@ fn benchmark_normalize(c: &mut Criterion) {
 
     c.bench_function("normalize", |b| {
         b.iter(|| {
-            normalize(black_box(&image), black_box(image::imageops::FilterType::Triangle), black_box(9), black_box(8))
-                .expect("Failed to normalize image");
+            normalize(
+                black_box(&image),
+                black_box(image::imageops::FilterType::Triangle),
+                black_box(9),
+                black_box(8),
+            )
+            .expect("Failed to normalize image");
         });
     });
 }
@@ -50,12 +55,8 @@ fn benchmark_collect_hashes(c: &mut Criterion) {
 
 fn benchmark_sort_hashes(c: &mut Criterion) {
     let dir_path = PathBuf::from("../../imgs/test");
-    let mut hash_paths = collect_hashes(
-        &dir_path,
-        image::imageops::FilterType::Triangle,
-        "dhash",
-    )
-    .expect("Failed to collect hashes");
+    let mut hash_paths = collect_hashes(&dir_path, image::imageops::FilterType::Triangle, "dhash")
+        .expect("Failed to collect hashes");
 
     c.bench_function("sort_hashes", |b| {
         b.iter(|| {
@@ -66,18 +67,14 @@ fn benchmark_sort_hashes(c: &mut Criterion) {
 
 fn benchmark_find_duplicates(c: &mut Criterion) {
     let dir_path = PathBuf::from("../../imgs/test");
-    let mut hash_paths = collect_hashes(
-        &dir_path,
-        image::imageops::FilterType::Triangle,
-        "dhash",
-    )
-    .expect("Failed to collect hashes");
+    let mut hash_paths = collect_hashes(&dir_path, image::imageops::FilterType::Triangle, "dhash")
+        .expect("Failed to collect hashes");
     sort_hashes(&mut hash_paths);
 
     c.bench_function("find_duplicates", |b| {
         b.iter(|| {
-            let _ = find_duplicates(black_box(&hash_paths), false)
-                .expect("Failed to find duplicates");
+            let _ =
+                find_duplicates(black_box(&hash_paths), false).expect("Failed to find duplicates");
         });
     });
 }
@@ -163,7 +160,6 @@ fn benchmark_whash(c: &mut Criterion) {
     });
 }
 
-
 criterion_group! {
     name = group1;
     config = Criterion::default().sample_size(40);
diff --git a/crates/imgddcore/src/dedupe.rs b/crates/imgddcore/src/dedupe.rs
index c2160a7..51ec3e4 100644
--- a/crates/imgddcore/src/dedupe.rs
+++ b/crates/imgddcore/src/dedupe.rs
@@ -1,23 +1,23 @@
 use crate::hashing::ImageHash;
 use crate::normalize;
+use anyhow::Error;
+use anyhow::{anyhow, Result};
 use image::imageops::FilterType;
 use image::{DynamicImage, ImageReader};
 use rayon::prelude::*;
-use walkdir::WalkDir;
 use std::collections::HashMap;
 use std::fs;
 use std::path::PathBuf;
-use anyhow::{anyhow, Result};
-use anyhow::Error;
+use walkdir::WalkDir;
 
 /// Collects hashes for all image files in a directory recursively.
 ///
 /// # Arguments
 ///
 /// * `path` - The directory containing images to process.
-/// * `filter` - The resize filter to use. 
+/// * `filter` - The resize filter to use.
 ///   Options: `Nearest`, `Triangle`, `CatmullRom`, `Gaussian`, `Lanczos3`.
-/// * `algo` - The hashing algorithm to use. 
+/// * `algo` - The hashing algorithm to use.
 ///   Options: `dhash`, `ahash`, `mhash`, `phash`, `whash`.
 ///
 /// # Returns
@@ -35,40 +35,38 @@ pub fn collect_hashes(
         .map(|entry| entry.path().to_path_buf())
         .collect();
 
-    let hash_paths: Vec<(u64, PathBuf)> = files 
+    let hash_paths: Vec<(u64, PathBuf)> = files
         .par_iter()
-        .filter_map(|file_path| {
-            match open_image(file_path) {
-                Ok(image) => {
-                    let hash = match algo {
-                        "dhash" => {
-                            let normalized = normalize::proc(&image, filter, 9, 8).ok()?;
-                            ImageHash::dhash(&normalized).ok()?.get_hash()
-                        }
-                        "ahash" => {
-                            let normalized = normalize::proc(&image, filter, 8, 8).ok()?;
-                            ImageHash::ahash(&normalized).ok()?.get_hash()
-                        }
-                        "mhash" => {
-                            let normalized = normalize::proc(&image, filter, 8, 8).ok()?;
-                            ImageHash::mhash(&normalized).ok()?.get_hash()
-                        }
-                        "phash" => {
-                            let normalized = normalize::proc(&image, filter, 32, 32).ok()?;
-                            ImageHash::phash(&normalized).ok()?.get_hash()
-                        }
-                        "whash" => {
-                            let normalized = normalize::proc(&image, filter, 8, 8).ok()?;
-                            ImageHash::whash(&normalized).ok()?.get_hash()
-                        }
-                        _ => panic!("Unsupported hashing algorithm: {}", algo),
-                    };
-                    Some((hash, file_path.clone()))
-                }
-                Err(e) => {
-                    eprintln!("Failed to open image {}: {}", file_path.display(), e);
-                    None
-                }
+        .filter_map(|file_path| match open_image(file_path) {
+            Ok(image) => {
+                let hash = match algo {
+                    "dhash" => {
+                        let normalized = normalize::proc(&image, filter, 9, 8).ok()?;
+                        ImageHash::dhash(&normalized).ok()?.get_hash()
+                    }
+                    "ahash" => {
+                        let normalized = normalize::proc(&image, filter, 8, 8).ok()?;
+                        ImageHash::ahash(&normalized).ok()?.get_hash()
+                    }
+                    "mhash" => {
+                        let normalized = normalize::proc(&image, filter, 8, 8).ok()?;
+                        ImageHash::mhash(&normalized).ok()?.get_hash()
+                    }
+                    "phash" => {
+                        let normalized = normalize::proc(&image, filter, 32, 32).ok()?;
+                        ImageHash::phash(&normalized).ok()?.get_hash()
+                    }
+                    "whash" => {
+                        let normalized = normalize::proc(&image, filter, 8, 8).ok()?;
+                        ImageHash::whash(&normalized).ok()?.get_hash()
+                    }
+                    _ => panic!("Unsupported hashing algorithm: {}", algo),
+                };
+                Some((hash, file_path.clone()))
+            }
+            Err(e) => {
+                eprintln!("Failed to open image {}: {}", file_path.display(), e);
+                None
             }
         })
         .collect();
@@ -150,4 +148,3 @@ pub fn find_duplicates(
 
     Ok(duplicates_map)
 }
-
diff --git a/crates/imgddcore/src/hashing.rs b/crates/imgddcore/src/hashing.rs
index 0309494..e4105e2 100644
--- a/crates/imgddcore/src/hashing.rs
+++ b/crates/imgddcore/src/hashing.rs
@@ -1,10 +1,9 @@
-use image::{DynamicImage, GenericImageView};
 use anyhow::Result;
+use image::{DynamicImage, GenericImageView};
 
-use rustdct::DctPlanner;
-use dwt::{Transform, Operation};
 use dwt::wavelet::Haar;
-
+use dwt::{Operation, Transform};
+use rustdct::DctPlanner;
 
 /// A structure representing the hash of an image as u64.
 ///
@@ -31,13 +30,13 @@ impl ImageHash {
     pub fn ahash(image: &DynamicImage) -> Result<Self> {
         let mut sum = 0u64;
         let mut pixels = [0u8; 64];
-        
+
         // Collect pixel values and compute sum
         for (i, (_, _, pixel)) in image.pixels().enumerate().take(64) {
             pixels[i] = pixel[0]; // Grayscale value
             sum += pixels[i] as u64;
         }
-        
+
         // Collect average pixel value
         let avg = sum / 64;
 
@@ -48,11 +47,10 @@ impl ImageHash {
                 hash |= 1 << (63 - i); // reverse order
             }
         }
-        
+
         Ok(Self { hash })
     }
 
-
     /// Computes the median hash (mHash) of a given image.
     ///
     /// # Arguments
@@ -68,20 +66,20 @@ impl ImageHash {
     #[inline]
     pub fn mhash(image: &DynamicImage) -> Result<Self> {
         let mut pixels = [0u8; 64];
-        
+
         // Collect 64 pixel values
         for (i, pixel) in image.pixels().map(|p| p.2[0]).take(64).enumerate() {
             pixels[i] = pixel;
         }
-        
+
         // Copy pixels so we don't modify the original array
         let mut pixels_copy = pixels;
-        
+
         // Find median O(n)
         let mid = 32;
         let (low, median, _high) = pixels_copy.select_nth_unstable(mid);
         let median = (*median as u64 + low[mid - 1] as u64) / 2; // Compute true median
-        
+
         // Compute hash
         let mut hash = 0u64;
         for (i, &pixel) in pixels.iter().enumerate() {
@@ -89,11 +87,9 @@ impl ImageHash {
                 hash |= 1 << (63 - i); // reverse order
             }
         }
-        
+
         Ok(Self { hash })
     }
-
-
     /// Computes the difference hash (dHash) of a given image.
     ///
@@ -123,7 +119,6 @@ impl ImageHash {
 
         Ok(Self { hash })
     }
-
     /// Computes the perceptual hash (pHash) of a given image.
     ///
     /// # Arguments:
@@ -140,37 +135,34 @@ impl ImageHash {
     pub fn phash(image: &DynamicImage) -> Result<Self> {
         const IMG_SIZE: usize = 32;
         const HASH_SIZE: usize = 8;
-        
+
         // Collect pixel values from normalized 32x32 grayscale image
-        let mut pixels: Vec<f32> = image
-            .pixels()
-            .map(|p| p.2[0] as f32)
-            .collect();
-        
+        let mut pixels: Vec<f32> = image.pixels().map(|p| p.2[0] as f32).collect();
+
         // Plan DCT once for both rows and columns
         let mut planner = DctPlanner::new();
         let dct = planner.plan_dct2(IMG_SIZE);
-        
+
         // Apply DCT row-wise in-place
         for row in pixels.chunks_exact_mut(IMG_SIZE) {
             dct.process_dct2(row);
         }
-        
+
         // Apply DCT column-wise in-place
         for col in 0..IMG_SIZE {
             let mut col_values: [f32; IMG_SIZE] = [0.0; IMG_SIZE];
-            
+
             for row in 0..IMG_SIZE {
                 col_values[row] = pixels[row * IMG_SIZE + col];
             }
-            
+
             dct.process_dct2(&mut col_values);
-            
+
             for row in 0..IMG_SIZE {
                 pixels[row * IMG_SIZE + col] = col_values[row];
             }
         }
-        
+
         // Extract top-left 8x8 DCT coefficients (low frequencies)
         let mut dct_lowfreq = [0f32; HASH_SIZE * HASH_SIZE];
         for y in 0..HASH_SIZE {
@@ -178,13 +170,13 @@ impl ImageHash {
                 dct_lowfreq[y * HASH_SIZE + x] = pixels[y * IMG_SIZE + x];
             }
         }
-        
+
         // Compute median excluding DC coefficient
         let mut ac_coeffs = dct_lowfreq[1..].to_vec();
         let mid = ac_coeffs.len() / 2;
         ac_coeffs.select_nth_unstable_by(mid, |a, b| a.partial_cmp(b).unwrap());
         let median = ac_coeffs[mid];
-        
+
         // Generate hash
         let mut hash = 0u64;
         for (i, &val) in dct_lowfreq.iter().enumerate() {
@@ -192,10 +184,9 @@ impl ImageHash {
                 hash |= 1 << (63 - i);
             }
         }
-        
+
         Ok(Self { hash })
     }
-
     /// Computes the wavelet hash (wHash) of a given image.
     ///
@@ -213,7 +204,7 @@ impl ImageHash {
     pub fn whash(image: &DynamicImage) -> Result<Self> {
         const HASH_SIZE: u32 = 8;
         let ll_max_level: usize = 3;
-        
+
         // Allocate flat vector of normalized pixels (row–major order).
         let total_pixels = (HASH_SIZE * HASH_SIZE) as usize;
         let mut pixels = Vec::with_capacity(total_pixels);
@@ -223,17 +214,17 @@ impl ImageHash {
                 pixels.push(pixel[0] as f32 / 255.0);
             }
         }
-        
+
         // ---------- Remove low-level frequency (DC) component ---------- //
         // Perform a full forward Haar transform - 8×8 image (3 levels).
         pixels.transform(Operation::Forward, &Haar::new(), ll_max_level);
-        
+
         // Zero out the DC coefficient.
         pixels[0] = 0.0;
-        
+
         // Perform inverse Haar transform (reconstruct image).
         pixels.transform(Operation::Inverse, &Haar::new(), ll_max_level);
-        
+
         // ---------- Compute median O(n) ---------- //
         let mid: usize = 32;
         // Clone flat pixel vector.
@@ -242,7 +233,7 @@ impl ImageHash {
         flat.select_nth_unstable_by(mid, |a, b| a.partial_cmp(b).unwrap());
         // Compute median.
         let median = (flat[mid - 1] + flat[mid]) / 2.0;
-        
+
         // Generate hash.
         let mut hash = 0u64;
         for (i, &val) in pixels.iter().enumerate() {
@@ -250,7 +241,7 @@ impl ImageHash {
                 hash |= 1 << (63 - i);
             }
         }
-        
+
         Ok(Self { hash })
     }
diff --git a/crates/imgddcore/src/lib.rs b/crates/imgddcore/src/lib.rs
index 2ddff02..ed75035 100644
--- a/crates/imgddcore/src/lib.rs
+++ b/crates/imgddcore/src/lib.rs
@@ -1,5 +1,4 @@
-pub mod hashing;
 pub mod dedupe;
+pub mod hashing;
 pub mod normalize;
 pub mod validate;
-
diff --git a/crates/imgddcore/src/normalize.rs b/crates/imgddcore/src/normalize.rs
index b86e737..6228bfb 100644
--- a/crates/imgddcore/src/normalize.rs
+++ b/crates/imgddcore/src/normalize.rs
@@ -1,11 +1,11 @@
 use anyhow::Result;
-use image::{DynamicImage, imageops::FilterType};
+use image::{imageops::FilterType, DynamicImage};
 
 /// Normalizes an image by resizing it to a given resolution and converting it to grayscale.
 ///
 /// # Arguments
 /// * `image` - A reference to a `DynamicImage` to be normalized.
-/// * `filter` - The down sampling method to use during resizing. 
+/// * `filter` - The down sampling method to use during resizing.
 ///   - **Options:** [`Nearest`, `Triangle`, `CatmullRom`, `Gaussian`, `Lanczos3`]
 /// * `width` - The desired width of the resized image.
 /// * `height` - The desired height of the resized image.
@@ -13,6 +13,11 @@ use image::{DynamicImage, imageops::FilterType};
 /// # Returns
 /// * A `DynamicImage` that has been resized to the given dimensions and converted to grayscale.
 #[inline]
-pub fn proc(image: &DynamicImage, filter: FilterType, width: u32, height: u32) -> Result<DynamicImage> {
+pub fn proc(
+    image: &DynamicImage,
+    filter: FilterType,
+    width: u32,
+    height: u32,
+) -> Result<DynamicImage> {
     Ok(image.resize_exact(width, height, filter).grayscale())
 }
diff --git a/crates/imgddcore/src/validate.rs b/crates/imgddcore/src/validate.rs
index 85a7f36..9ba56f0 100644
--- a/crates/imgddcore/src/validate.rs
+++ b/crates/imgddcore/src/validate.rs
@@ -1,7 +1,6 @@
 use anyhow::{anyhow, Result};
 use std::path::PathBuf;
 
-
 /// Validates a given path to ensure it exists and is a directory.
 ///
 /// This function checks whether the provided path exists and is a directory,
diff --git a/crates/imgddcore/tests/dedupe_tests.rs b/crates/imgddcore/tests/dedupe_tests.rs
index 831f7f6..d9dd978 100644
--- a/crates/imgddcore/tests/dedupe_tests.rs
+++ b/crates/imgddcore/tests/dedupe_tests.rs
@@ -1,12 +1,12 @@
 #[cfg(test)]
 mod tests {
-    use imgddcore::dedupe::*;
     use image::imageops::FilterType;
     use image::{DynamicImage, Rgba};
-    use std::path::PathBuf;
+    use imgddcore::dedupe::*;
     use std::fs::File;
     use std::io::Write;
     use std::panic;
+    use std::path::PathBuf;
 
     fn create_mock_image() -> DynamicImage {
         DynamicImage::ImageRgba8(image::ImageBuffer::from_pixel(9, 8, Rgba([255, 0, 0, 255])))
@@ -17,21 +17,23 @@ mod tests {
         let temp_dir = tempfile::tempdir().unwrap();
         let image_path = temp_dir.path().join("test_image.png");
         create_mock_image().save(&image_path).unwrap();
-        
+
         let algorithms = ["dhash", "ahash", "mhash", "phash", "whash"];
 
         for algo in algorithms {
-            let hashes = collect_hashes(&temp_dir.path().to_path_buf(), FilterType::Nearest, algo)
-                .unwrap();
+            let hashes =
+                collect_hashes(&temp_dir.path().to_path_buf(), FilterType::Nearest, algo).unwrap();
             assert_eq!(hashes.len(), 1, "Algorithm {} failed", algo);
         }
     }
 
-
     #[test]
     fn test_sort_hashes() {
         let mut hashes = vec![(2, PathBuf::from("b")), (1, PathBuf::from("a"))];
         sort_hashes(&mut hashes);
-        assert_eq!(hashes, vec![(1, PathBuf::from("a")), (2, PathBuf::from("b"))]);
+        assert_eq!(
+            hashes,
+            vec![(1, PathBuf::from("a")), (2, PathBuf::from("b"))]
+        );
     }
 
     #[test]
@@ -41,7 +43,11 @@ mod tests {
         create_mock_image().save(&image_path).unwrap();
 
         let result = panic::catch_unwind(|| {
-            collect_hashes(&temp_dir.path().to_path_buf(), FilterType::Nearest, "unsupported_algo")
+            collect_hashes(
+                &temp_dir.path().to_path_buf(),
+                FilterType::Nearest,
+                "unsupported_algo",
+            )
         });
 
         assert!(result.is_err()); // Should panic due to unsupported algorithm
@@ -95,24 +101,21 @@ mod tests {
         let temp_dir = tempfile::tempdir().unwrap();
         let file_path_1 = temp_dir.path().join("test_file_1.txt");
         let file_path_2 = temp_dir.path().join("test_file_2.txt");
-        
+
         // Create two dummy files
         std::fs::write(&file_path_1, b"file 1 content").unwrap();
         std::fs::write(&file_path_2, b"file 2 content").unwrap();
-        
+
         // Mock duplicate hash paths
-        let hash_paths = vec![
-            (1, file_path_1.clone()),
-            (1, file_path_2.clone()),
-        ];
-        
+        let hash_paths = vec![(1, file_path_1.clone()), (1, file_path_2.clone())];
+
         // Test with `remove = true` to trigger file deletion
         let result = find_duplicates(&hash_paths, true);
         assert!(result.is_ok());
-        
+
         // First file should remain
         assert!(file_path_1.exists());
-        
+
         // Duplicate should be removed
         assert!(!file_path_2.exists());
     }
@@ -122,28 +125,28 @@ mod tests {
         let temp_dir = tempfile::tempdir().unwrap();
         let file_path_1 = temp_dir.path().join("test_file_1.txt");
         let file_path_2 = temp_dir.path().join("nonexistent_file.txt");
-        
+
         // Create single dummy file
         std::fs::write(&file_path_1, b"file 1 content").unwrap();
-        
+
         // Expect first file exists before test starts
         assert!(file_path_1.exists());
-        
+
         // Mock duplicate hash paths, including a non-existent file
-        let hash_paths = vec![
-            (1, file_path_1.clone()),
-            (1, file_path_2.clone()),
-        ];
-        
+        let hash_paths = vec![(1, file_path_1.clone()), (1, file_path_2.clone())];
+
         // Test with `remove = true` to trigger file deletion
         let result = find_duplicates(&hash_paths, true);
         assert!(result.is_ok());
-        
+
         // First file should remain untouched
        assert!(file_path_1.exists());
-        
+
         // Second file should not exist, and removal should fail gracefully
-        assert!(!file_path_2.exists(), "File {} should not exist.", file_path_2.display());
+        assert!(
+            !file_path_2.exists(),
+            "File {} should not exist.",
+            file_path_2.display()
+        );
     }
-
 }
diff --git a/crates/imgddcore/tests/hashing_tests.rs b/crates/imgddcore/tests/hashing_tests.rs
index 46fec83..048332d 100644
--- a/crates/imgddcore/tests/hashing_tests.rs
+++ b/crates/imgddcore/tests/hashing_tests.rs
@@ -1,8 +1,8 @@
 #[cfg(test)]
 mod tests {
-    use imgddcore::hashing::ImageHash;
-    use image::{DynamicImage, Rgba};
     use anyhow::Result;
+    use image::{DynamicImage, Rgba};
+    use imgddcore::hashing::ImageHash;
 
     /// Creates a mock image with alternating pixel values for testing.
     fn create_mock_image(size: (u32, u32)) -> DynamicImage {
@@ -13,17 +13,21 @@ mod tests {
             } else {
                 Rgba([0, 0, 0, 255]) // Black pixel
             }
-        })).grayscale()
+        }))
+        .grayscale()
     }
 
-
     #[test]
     fn test_ahash() -> Result<()> {
         let test_image = create_mock_image((8, 8));
         let hash = ImageHash::ahash(&test_image)?;
         println!("aHash: {:064b}", hash.get_hash());
         let expected_hash = 0b1010101010101010101010101010101010101010101010101010101010101010;
-        assert_eq!(hash.get_hash(), expected_hash, "aHash does not match expected value");
+        assert_eq!(
+            hash.get_hash(),
+            expected_hash,
+            "aHash does not match expected value"
+        );
         Ok(())
     }
 
@@ -34,8 +38,12 @@ mod tests {
         let hash = ImageHash::mhash(&test_image)?;
         println!("mHash: {:064b}", hash.get_hash());
         let expected_hash = 0b1010101010101010101010101010101010101010101010101010101010101010;
-        
-        assert_eq!(hash.get_hash(), expected_hash, "mHash does not match expected value");
+
+        assert_eq!(
+            hash.get_hash(),
+            expected_hash,
+            "mHash does not match expected value"
+        );
         Ok(())
     }
 
@@ -46,7 +54,11 @@ mod tests {
         let hash = ImageHash::dhash(&test_image)?;
         println!("dHash: {:064b}", hash.get_hash());
         let expected_hash = 0b0101010101010101010101010101010101010101010101010101010101010101;
-        assert_eq!(hash.get_hash(), expected_hash, "dHash does not match expected value");
+        assert_eq!(
+            hash.get_hash(),
+            expected_hash,
+            "dHash does not match expected value"
+        );
         Ok(())
     }
 
@@ -57,7 +69,11 @@ mod tests {
         let hash = ImageHash::phash(&test_image)?;
         let expected_hash = 0b1101010100000000000000000000000000000000000000000000000000000000;
         println!("pHash: {:064b}", hash.get_hash());
-        assert_eq!(hash.get_hash(), expected_hash, "pHash does not match expected value");
+        assert_eq!(
+            hash.get_hash(),
+            expected_hash,
+            "pHash does not match expected value"
+        );
         Ok(())
     }
 
@@ -69,9 +85,12 @@ mod tests {
         println!("wHash: {:064b}", hash.get_hash());
 
         let expected_hash = 0b1010101010101010101010101010101010101010101010101010101010101010;
-        assert_eq!(hash.get_hash(), expected_hash, "wHash does not match expected value");
+        assert_eq!(
+            hash.get_hash(),
+            expected_hash,
+            "wHash does not match expected value"
+        );
         Ok(())
     }
-
-}
\ No newline at end of file
+}
diff --git a/crates/imgddcore/tests/normalize_tests.rs b/crates/imgddcore/tests/normalize_tests.rs
index 3d4c04c..0a526fb 100644
--- a/crates/imgddcore/tests/normalize_tests.rs
+++ b/crates/imgddcore/tests/normalize_tests.rs
@@ -1,11 +1,15 @@
 #[cfg(test)]
 mod tests {
-    use imgddcore::normalize::*;
     use image::imageops::FilterType;
     use image::{DynamicImage, Rgba};
+    use imgddcore::normalize::*;
 
     fn create_mock_image() -> DynamicImage {
-        DynamicImage::ImageRgba8(image::ImageBuffer::from_pixel(16, 16, Rgba([255, 0, 0, 255])))
+        DynamicImage::ImageRgba8(image::ImageBuffer::from_pixel(
+            16,
+            16,
+            Rgba([255, 0, 0, 255]),
+        ))
     }
 
     #[test]
diff --git a/crates/imgddcore/tests/validate_tests.rs b/crates/imgddcore/tests/validate_tests.rs
index 7421fca..4796239 100644
--- a/crates/imgddcore/tests/validate_tests.rs
+++ b/crates/imgddcore/tests/validate_tests.rs
@@ -19,10 +19,7 @@ mod tests {
         assert!(result.is_err());
 
         if let Err(err) = result {
-            assert_eq!(
-                err.to_string(),
-                "Path does not exist: /non/existent/path"
-            );
+            assert_eq!(err.to_string(), "Path does not exist: /non/existent/path");
         }
     }
 
diff --git a/crates/imgddpy/Cargo.toml b/crates/imgddpy/Cargo.toml
index 89f8574..dd60f54 100644
--- a/crates/imgddpy/Cargo.toml
+++ b/crates/imgddpy/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "imgddpy"
-version = "0.1.4"
+version = "0.1.5"
 edition.workspace = true
 license.workspace = true
 authors.workspace = true
@@ -12,7 +12,7 @@ homepage.workspace = true
 readme = "README.md"
 
 [dependencies]
-imgddcore = { path = "../imgddcore", version = "0.1.2" }
+imgddcore = { path = "../imgddcore", version = "0.1.3" }
 pyo3 = { version = "0.23", features = ["extension-module", "abi3-py39"] }
 image.workspace = true
diff --git a/crates/imgddpy/README.md b/crates/imgddpy/README.md
index f55c593..930b612 100644
--- a/crates/imgddpy/README.md
+++ b/crates/imgddpy/README.md
@@ -1,8 +1,8 @@
+[![Documentation Status](https://img.shields.io/badge/docs-online-brightgreen)](https://aastopher.github.io/imgdd/)
 [![imgdd pypi](https://img.shields.io/pypi/v/imgdd?label=imgdd%20pypi)](https://pypi.org/project/imgdd)
 [![imgdd crate](https://img.shields.io/crates/v/imgdd?label=imgdd)](https://crates.io/crates/imgdd)
 [![imgddcore crate](https://img.shields.io/crates/v/imgddcore?label=imgddcore)](https://crates.io/crates/imgddcore)
 [![codecov](https://codecov.io/gh/aastopher/imgdd/graph/badge.svg?token=XZ1O2X04SO)](https://codecov.io/gh/aastopher/imgdd)
-[![Documentation Status](https://img.shields.io/badge/docs-online-brightgreen)](https://aastopher.github.io/imgdd/)
 [![DeepSource](https://app.deepsource.com/gh/aastopher/imgdd.svg/?label=active+issues&show_trend=true&token=IiuhCO6n1pK-GAJ800k6Z_9t)](https://app.deepsource.com/gh/aastopher/imgdd/)
 
 # imgdd: Image DeDuplication
diff --git a/crates/imgddpy/src/lib.rs b/crates/imgddpy/src/lib.rs
index 2952419..4358333 100644
--- a/crates/imgddpy/src/lib.rs
+++ b/crates/imgddpy/src/lib.rs
@@ -1,7 +1,7 @@
+use image::imageops::FilterType;
 use imgddcore::dedupe::*;
 use imgddcore::validate::*;
 use pyo3::prelude::*;
-use image::imageops::FilterType;
 use std::collections::HashMap;
 use std::path::PathBuf;
 
@@ -32,15 +32,15 @@ fn select_algo(algo: Option<&str>) -> &'static str {
 /// ```python
 /// hash(path, filter="triangle", algo="dhash", sort=False)
 /// ```
-/// 
+///
 /// Calculate the hash of images in a directory.
 ///
 /// # Arguments
 /// - `path (str)`: Path to the directory containing images.
-/// - `filter (str)`: Resize filter to use. 
+/// - `filter (str)`: Resize filter to use.
 ///   - **Options:** [`Nearest`, `Triangle`, `CatmullRom`, `Gaussian`, `Lanczos3`]
 ///   - **Default:** `Triangle`
-/// - `algo (str)`: Hashing algorithm. 
+/// - `algo (str)`: Hashing algorithm.
 ///   - **Options:** [`aHash`, `mHash`, `dHash`, `pHash`, `wHash`]
 ///   - **Default:** `dHash`
 /// - `sort (bool)`: Whether to sort the results by hash values.
@@ -90,15 +90,15 @@ pub fn hash(
 /// ```python
 /// dupes(path, filter="triangle", algo="dhash", remove=False)
 /// ```
-/// 
+///
 /// Find duplicate images in a directory.
 ///
 /// # Arguments
 /// - `path (str)`: Path to the directory containing images.
-/// - `filter (str)`: Resize filter to use. 
+/// - `filter (str)`: Resize filter to use.
 ///   - **Options:** [`Nearest`, `Triangle`, `CatmullRom`, `Gaussian`, `Lanczos3`]
 ///   - **Default:** `Triangle`
-/// - `algo (str)`: Hashing algorithm. 
+/// - `algo (str)`: Hashing algorithm.
 ///   - **Options:** [`aHash`, `mHash`, `dHash`, `pHash`, `wHash`]
 ///   - **Default:** `dHash`
 /// - `remove (bool)`: Whether to remove duplicate files
@@ -144,7 +144,6 @@ pub fn dupes(
         .collect())
 }
 
-
 #[pymodule]
 fn imgdd(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_function(wrap_pyfunction!(dupes, m)?)?;
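
The docstrings in `crates/imgddpy/src/lib.rs` above define the Python-facing API this release ships. A minimal usage sketch — assuming the published `imgdd` wheel is installed and that an `imgs/test/` directory exists (both assumptions; neither is part of this diff):

```python
import imgdd as dd

# Hash every image under the directory with the documented defaults
# (filter="triangle", algo="dhash"); sort=False skips ordering by hash value.
hashes = dd.hash("imgs/test", filter="triangle", algo="dhash", sort=False)

# Report duplicates without deleting anything; remove=True would delete files,
# mirroring the find_duplicates(..., remove) path in imgddcore.
duplicates = dd.dupes("imgs/test", filter="triangle", algo="dhash", remove=False)
print(duplicates)
```

The exact return shapes are defined outside the hunks shown here (both functions collect into a `HashMap` on the Rust side), so treat the printed values as illustrative.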
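Every hasher touched in `hashing.rs` packs its 64 pixel comparisons with `hash |= 1 << (63 - i)`, i.e. most-significant-bit first. A rough Python re-expression of the `ahash` logic, for illustration only — the comparison against the average sits outside the hunks shown, so it is inferred from the expected values in `hashing_tests.rs`:

```python
def ahash_bits(pixels: list[int]) -> int:
    """Average hash over exactly 64 grayscale values (0-255)."""
    assert len(pixels) == 64
    avg = sum(pixels) // 64        # integer mean, as in the Rust `sum / 64`
    bits = 0
    for i, p in enumerate(pixels):
        if p > avg:                # brighter than average -> set the bit
            bits |= 1 << (63 - i)  # MSB-first, matching `1 << (63 - i)`
    return bits

# Alternating white/black pixels reproduce the expected aHash from the tests:
print(f"{ahash_bits([255, 0] * 32):064b}")  # 1010...10
```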