2 changes: 1 addition & 1 deletion .github/workflows/docs-python.yml
@@ -91,5 +91,5 @@ jobs:

# Build and deploy using mike
cd crates/imgddpy/
mike deploy --push --branch gh-pages "$VERSION" latest
mike deploy --push --update-aliases --branch gh-pages "$VERSION" latest
mike set-default --push --branch gh-pages latest
13 changes: 7 additions & 6 deletions .github/workflows/release-python.yml
@@ -188,7 +188,7 @@ jobs:
- name: Upload wheel artifact
uses: actions/upload-artifact@v4
with:
name: wheels-${{ github.run_id }}
name: wheels-${{ github.run_id }}-${{ matrix.os }}-${{ matrix.compatibility }}
path: target/wheels/*.whl
overwrite: true

@@ -209,8 +209,9 @@ jobs:
- name: Merge all wheels and source distributions into `output/`
run: |
mkdir -p output
mv root/wheels-${{ github.run_id }}/*.whl output/ || true
mv root/sdist-${{ github.run_id }}/*.tar.gz output/ || true
# Move all wheel and sdist files from any artifact subfolder into output/
find root -type f -name "*.whl" -exec mv {} output/ \;
find root -type f -name "*.tar.gz" -exec mv {} output/ \;
ls -l output/

- name: Publish to PyPI
@@ -233,11 +234,11 @@ jobs:
- name: Merge all wheels and source distributions into `output/`
run: |
mkdir -p output
mv root/wheels-${{ github.run_id }}/*.whl output/ || true
mv root/sdist-${{ github.run_id }}/*.tar.gz output/ || true
find root -type f -name "*.whl" -exec mv {} output/ \;
find root -type f -name "*.tar.gz" -exec mv {} output/ \;
ls -l output/

- name: Dry run output
run: |
echo "Dry run completed. Artifacts are built and available:"
ls -l output/
ls -l output/
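The replacement of the hard-coded `mv` paths with `find` pairs with the artifact rename above: wheels now land in several `wheels-<run_id>-<os>-<compatibility>` subfolders under `root/`, so the merge step must sweep every subfolder instead of one fixed directory. A minimal Rust sketch of the same sweep, assuming only `std` and the `root/` and `output/` layout used by the workflow:

```rust
use std::fs;
use std::io;
use std::path::Path;

/// Recursively move every file whose name ends with one of `exts` from
/// `src` into `dest` -- the same effect as the two `find ... -exec mv` calls.
fn sweep(src: &Path, dest: &Path, exts: &[&str]) -> io::Result<()> {
    for entry in fs::read_dir(src)? {
        let path = entry?.path();
        if path.is_dir() {
            // Descend into per-matrix artifact subfolders.
            sweep(&path, dest, exts)?;
        } else if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
            if exts.iter().any(|ext| name.ends_with(ext)) {
                fs::rename(&path, dest.join(name))?;
            }
        }
    }
    Ok(())
}

fn main() -> io::Result<()> {
    fs::create_dir_all("output")?; // mirrors `mkdir -p output`
    sweep(Path::new("root"), Path::new("output"), &[".whl", ".tar.gz"])
}
```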
6 changes: 3 additions & 3 deletions Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions Cargo.toml
@@ -5,6 +5,7 @@ members = [
]

[workspace.package]
version = "0.1.3"
edition = "2021"
license = "GPL-3.0-or-later"
authors = ["Aaron Stopher <aaron.stopher@gmail.com>"]
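With `version` now declared once under `[workspace.package]`, member crates opt in via `version.workspace = true`, which is exactly what the two crate manifests changed below do. A condensed sketch of both sides of that inheritance, abridged from the manifests in this PR:

```toml
# Workspace root Cargo.toml
[workspace.package]
version = "0.1.3"
edition = "2021"

# Member crate, e.g. crates/imgddcore/Cargo.toml
[package]
name = "imgddcore"
version.workspace = true
edition.workspace = true
```

Bumping the workspace version then updates every member in one place; only the cross-crate dependency pin (`imgddcore = { path = "../imgddcore", version = "0.1.3" }`) still needs a manual edit.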
4 changes: 2 additions & 2 deletions README.md
@@ -1,8 +1,8 @@
[![Documentation Status](https://img.shields.io/badge/docs-online-brightgreen)](https://aastopher.github.io/imgdd/)
[![imgdd pypi](https://img.shields.io/pypi/v/imgdd?label=imgdd%20pypi)](https://pypi.org/project/imgdd)
[![imgdd crate](https://img.shields.io/crates/v/imgdd?label=imgdd)](https://crates.io/crates/imgdd)
[![imgddcore crate](https://img.shields.io/crates/v/imgddcore?label=imgddcore)](https://crates.io/crates/imgddcore)
[![codecov](https://codecov.io/gh/aastopher/imgdd/graph/badge.svg?token=XZ1O2X04SO)](https://codecov.io/gh/aastopher/imgdd)
[![Documentation Status](https://img.shields.io/badge/docs-online-brightgreen)](https://aastopher.github.io/imgdd/)
[![DeepSource](https://app.deepsource.com/gh/aastopher/imgdd.svg/?label=active+issues&show_trend=true&token=IiuhCO6n1pK-GAJ800k6Z_9t)](https://app.deepsource.com/gh/aastopher/imgdd/)

# imgdd: Image DeDuplication
@@ -17,7 +17,7 @@

## Why imgdd?

`imgdd` has been inspired by [imagehash](https://github.com/JohannesBuchner/imagehash) and aims to be a lightning-fast replacement with additional features. To ensure enhanced performance, `imgdd` has been benchmarked against `imagehash`. In Python, **imgdd consistently outperforms imagehash by ~60%–95%**, demonstrating a significant reduction in hashing time per image.
`imgdd` has been inspired by [imagehash](https://github.com/JohannesBuchner/imagehash) and aims to be a lightning-fast replacement with additional features. To ensure enhanced performance, `imgdd` has been benchmarked against `imagehash`. In Python, [**imgdd consistently outperforms imagehash by ~60%–95%**](https://aastopher.github.io/imgdd/latest/benches), demonstrating a significant reduction in hashing time per image.

---

4 changes: 2 additions & 2 deletions crates/imgdd/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "imgdd"
version = "0.1.2"
version.workspace = true
edition.workspace = true
license.workspace = true
authors.workspace = true
@@ -13,7 +13,7 @@ categories.workspace = true
readme = "README.md"

[dependencies]
imgddcore = { path = "../imgddcore", version = "0.1.2" }
imgddcore = { path = "../imgddcore", version = "0.1.3" }
image.workspace = true
anyhow.workspace = true
criterion = { version = "0.5.1", optional = true }
4 changes: 3 additions & 1 deletion crates/imgdd/README.md
@@ -1,7 +1,9 @@
[![codecov](https://codecov.io/gh/aastopher/imgdd/graph/badge.svg?token=XZ1O2X04SO)](https://codecov.io/gh/aastopher/imgdd)
[![Documentation Status](https://img.shields.io/badge/docs-online-brightgreen)](https://aastopher.github.io/imgdd/)
[![codecov](https://codecov.io/gh/aastopher/imgdd/graph/badge.svg?token=XZ1O2X04SO)](https://codecov.io/gh/aastopher/imgdd)
[![DeepSource](https://app.deepsource.com/gh/aastopher/imgdd.svg/?label=active+issues&show_trend=true&token=IiuhCO6n1pK-GAJ800k6Z_9t)](https://app.deepsource.com/gh/aastopher/imgdd/)



# imgdd: Image DeDuplication

`imgdd` is a performance-first perceptual hashing library that combines Rust's speed with Python's accessibility, making it perfect for handling large datasets. Designed to quickly process nested folder structures, commonly found in image datasets.
9 changes: 7 additions & 2 deletions crates/imgdd/benches/rust_benches.rs
@@ -1,4 +1,4 @@
use criterion::{criterion_group, criterion_main, Criterion, black_box};
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use imgdd::*;
use std::path::PathBuf;

@@ -23,7 +23,12 @@ fn benchmark_hash(c: &mut Criterion) {

c.bench_function("hash_function", |b| {
b.iter(|| {
let result = hash(black_box(dir_path.clone()), Some("nearest"), Some("dhash"), Some(false));
let result = hash(
black_box(dir_path.clone()),
Some("nearest"),
Some("dhash"),
Some(false),
);
let _ = black_box(result).is_ok(); // Ignore the result
});
});
6 changes: 2 additions & 4 deletions crates/imgdd/src/lib.rs
@@ -1,12 +1,12 @@
//! Rust interface for fast and efficient image deduplication.
//! Leverages perceptual hashing algorithms to identify duplicate or visually similar images in a directory.

use anyhow::Error;
use image::imageops::FilterType;
use imgddcore::dedupe::*;
use imgddcore::validate::*;
use image::imageops::FilterType;
use std::collections::HashMap;
use std::path::PathBuf;
use anyhow::Error;

/// Converts a string to a `FilterType`.
///
@@ -52,7 +52,6 @@ pub fn select_algo(algo: Option<&str>) -> &'static str {
}
}


/// Calculates hashes for all images in a directory recursively.
///
/// # Arguments
@@ -105,7 +104,6 @@ pub fn hash(
Ok(hash_paths)
}


/// Finds duplicate images in a directory.
///
/// # Arguments
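The two public entry points touched here, `hash` and `dupes`, take a directory plus optional filter/algorithm names (the tests below exercise `"nearest"` and `"dhash"`). A minimal usage sketch against those signatures, assuming `imgdd` and `anyhow` as dependencies; the `imgs/test` path is illustrative, and the final boolean mirrors the `Some(false)`/`false` passed in the tests:

```rust
use imgdd::*;
use std::path::PathBuf;

fn main() -> Result<(), anyhow::Error> {
    let dir = PathBuf::from("imgs/test"); // illustrative path

    // Hash every image under `dir` recursively.
    let hashes = hash(dir.clone(), Some("nearest"), Some("dhash"), Some(false))?;
    println!("hashed {} images", hashes.len());

    // Collect visually identical images.
    let duplicates = dupes(dir, Some("nearest"), Some("dhash"), false)?;
    println!("found {} duplicate entries", duplicates.len());

    Ok(())
}
```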
58 changes: 44 additions & 14 deletions crates/imgdd/tests/rust_tests.rs
@@ -1,23 +1,28 @@
#[cfg(test)]
mod tests {
use image::imageops::FilterType;
use imgdd::*;
use tempfile::tempdir;
use std::fs::File;
use std::io::Write;
use std::path::PathBuf;
use image::imageops::FilterType;

use tempfile::tempdir;

#[test]
fn test_select_filter_type() {
assert_eq!(select_filter_type(Some("nearest")), FilterType::Nearest);
assert_eq!(select_filter_type(Some("triangle")), FilterType::Triangle);
assert_eq!(select_filter_type(Some("catmullrom")), FilterType::CatmullRom);
assert_eq!(
select_filter_type(Some("catmullrom")),
FilterType::CatmullRom
);
assert_eq!(select_filter_type(Some("gaussian")), FilterType::Gaussian);
assert_eq!(select_filter_type(Some("lanczos3")), FilterType::Lanczos3);

let result = std::panic::catch_unwind(|| select_filter_type(Some("unsupported")));
assert!(result.is_err(), "Expected panic for unsupported filter type");
assert!(
result.is_err(),
"Expected panic for unsupported filter type"
);
}

#[test]
@@ -41,15 +46,29 @@ mod tests {
let mut file = File::create(&image_path).unwrap();
file.write_all(b"not a valid image").unwrap();

let result = hash(temp_dir.path().to_path_buf(), Some("nearest"), Some("dhash"), Some(false));
let result = hash(
temp_dir.path().to_path_buf(),
Some("nearest"),
Some("dhash"),
Some(false),
);
assert!(result.is_ok(), "Hash function failed: {:?}", result.err());
}

#[test]
fn test_hash_with_invalid_path() {
let invalid_path = PathBuf::from("/non/existent/path");
let result = hash(invalid_path.clone(), Some("nearest"), Some("dhash"), Some(false));
assert!(result.is_err(), "Expected error for invalid path: {:?}", invalid_path);
let result = hash(
invalid_path.clone(),
Some("nearest"),
Some("dhash"),
Some(false),
);
assert!(
result.is_err(),
"Expected error for invalid path: {:?}",
invalid_path
);
}

#[test]
@@ -73,7 +92,6 @@ mod tests {
assert!(sorted, "Hashes are not sorted: {:?}", hash_paths);
}


#[test]
fn test_dupes_with_valid_inputs() {
let temp_dir = tempdir().unwrap();
@@ -87,18 +105,30 @@ mod tests {
let mut file2 = File::create(&image_path_2).unwrap();
file2.write_all(b"not a valid image").unwrap();

let result = dupes(temp_dir.path().to_path_buf(), Some("nearest"), Some("dhash"), false);
let result = dupes(
temp_dir.path().to_path_buf(),
Some("nearest"),
Some("dhash"),
false,
);
assert!(result.is_ok(), "Dupes function failed: {:?}", result.err());

let duplicates = result.unwrap();
assert_eq!(duplicates.len(), 0, "Expected no duplicates, but found some");
assert_eq!(
duplicates.len(),
0,
"Expected no duplicates, but found some"
);
}

#[test]
fn test_dupes_with_invalid_path() {
let invalid_path = PathBuf::from("/non/existent/path");
let result = dupes(invalid_path.clone(), Some("nearest"), Some("dhash"), false);
assert!(result.is_err(), "Expected error for invalid path: {:?}", invalid_path);
assert!(
result.is_err(),
"Expected error for invalid path: {:?}",
invalid_path
);
}

}
}
2 changes: 1 addition & 1 deletion crates/imgddcore/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "imgddcore"
version = "0.1.2"
version.workspace = true
edition.workspace = true
license.workspace = true
authors.workspace = true
38 changes: 17 additions & 21 deletions crates/imgddcore/benches/core_benches.rs
@@ -1,14 +1,14 @@
use criterion::{criterion_group, criterion_main, Criterion, black_box};
use criterion::{black_box, criterion_group, criterion_main, Criterion};

use imgddcore::dedupe::{open_image, collect_hashes, sort_hashes, find_duplicates};
use imgddcore::dedupe::{collect_hashes, find_duplicates, open_image, sort_hashes};
use imgddcore::hashing::ImageHash;
use imgddcore::normalize::proc as normalize;
use std::path::PathBuf;

// WARNING!
// WARNING!
// dd.hash function benchmarks will be inaccurate because; this metric relies heavily on system calls.
// Since they cannot be consistently instrumented, those calls are not included in the final measure.
// To resolve this we must use hosted codspeed macro runners which require a pro plan.
// To resolve this we must use hosted codspeed macro runners which require a pro plan.
// For now I will just leave this warning here.

fn open_image_bench(c: &mut Criterion) {
@@ -27,8 +27,13 @@ fn benchmark_normalize(c: &mut Criterion) {

c.bench_function("normalize", |b| {
b.iter(|| {
normalize(black_box(&image), black_box(image::imageops::FilterType::Triangle), black_box(9), black_box(8))
.expect("Failed to normalize image");
normalize(
black_box(&image),
black_box(image::imageops::FilterType::Triangle),
black_box(9),
black_box(8),
)
.expect("Failed to normalize image");
});
});
}
@@ -50,12 +55,8 @@ fn benchmark_collect_hashes(c: &mut Criterion) {

fn benchmark_sort_hashes(c: &mut Criterion) {
let dir_path = PathBuf::from("../../imgs/test");
let mut hash_paths = collect_hashes(
&dir_path,
image::imageops::FilterType::Triangle,
"dhash",
)
.expect("Failed to collect hashes");
let mut hash_paths = collect_hashes(&dir_path, image::imageops::FilterType::Triangle, "dhash")
.expect("Failed to collect hashes");

c.bench_function("sort_hashes", |b| {
b.iter(|| {
@@ -66,18 +67,14 @@ fn benchmark_sort_hashes(c: &mut Criterion) {

fn benchmark_find_duplicates(c: &mut Criterion) {
let dir_path = PathBuf::from("../../imgs/test");
let mut hash_paths = collect_hashes(
&dir_path,
image::imageops::FilterType::Triangle,
"dhash",
)
.expect("Failed to collect hashes");
let mut hash_paths = collect_hashes(&dir_path, image::imageops::FilterType::Triangle, "dhash")
.expect("Failed to collect hashes");
sort_hashes(&mut hash_paths);

c.bench_function("find_duplicates", |b| {
b.iter(|| {
let _ = find_duplicates(black_box(&hash_paths), false)
.expect("Failed to find duplicates");
let _ =
find_duplicates(black_box(&hash_paths), false).expect("Failed to find duplicates");
});
});
}
@@ -163,7 +160,6 @@ fn benchmark_whash(c: &mut Criterion) {
});
}


criterion_group! {
name = group1;
config = Criterion::default().sample_size(40);
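The `imgddcore` pipeline these benches exercise composes in three steps: collect hashes, sort them, then scan for duplicates. A standalone sketch of that flow, lifted from the bench setup above (the `imgs/test` fixture path and the `image` dependency come from the benches themselves):

```rust
use imgddcore::dedupe::{collect_hashes, find_duplicates, sort_hashes};
use std::path::PathBuf;

fn main() {
    let dir = PathBuf::from("imgs/test");

    // 1. Hash every image in the directory tree.
    let mut hash_paths = collect_hashes(&dir, image::imageops::FilterType::Triangle, "dhash")
        .expect("Failed to collect hashes");

    // 2. Sort the hashes (the benches always sort before scanning).
    sort_hashes(&mut hash_paths);

    // 3. Scan the sorted list; `false` mirrors the flag used in the benches.
    let _duplicates = find_duplicates(&hash_paths, false).expect("Failed to find duplicates");
}
```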