diff --git a/.github/workflows/audit.yml b/.github/workflows/audit.yml
index 4daf7b8..4a6e27b 100644
--- a/.github/workflows/audit.yml
+++ b/.github/workflows/audit.yml
@@ -22,6 +22,6 @@ jobs:
       contents: read
       issues: write
     steps:
-      - uses: actions/checkout@v5
+      - uses: actions/checkout@v6
       - uses: actions-rust-lang/audit@v1
         name: Audit Rust Dependencies
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 4529825..418c817 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -28,7 +28,7 @@ jobs:
       rust: ${{ steps.filter.outputs.rust }}
       docs: ${{ steps.filter.outputs.docs }}
     steps:
-      - uses: actions/checkout@v5
+      - uses: actions/checkout@v6
       - uses: dorny/paths-filter@v3
         id: filter
         with:
@@ -51,7 +51,7 @@ jobs:
   quality:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v5
+      - uses: actions/checkout@v6
       - uses: dtolnay/rust-toolchain@1.91.0
        with:
          components: rustfmt, clippy
@@ -73,7 +73,7 @@ jobs:
     needs: changes
     if: needs.changes.outputs.rust == 'true'
     steps:
-      - uses: actions/checkout@v5
+      - uses: actions/checkout@v6
 
       - name: Setup Rust
         uses: dtolnay/rust-toolchain@1.91.0
@@ -112,7 +112,7 @@ jobs:
     needs: changes
     if: needs.changes.outputs.rust == 'true'
     steps:
-      - uses: actions/checkout@v5
+      - uses: actions/checkout@v6
 
       - name: Setup Rust
         uses: dtolnay/rust-toolchain@1.91.0
@@ -132,7 +132,7 @@ jobs:
     needs: [changes, test, test-cross-platform]
     if: needs.changes.outputs.rust == 'true'
     steps:
-      - uses: actions/checkout@v5
+      - uses: actions/checkout@v6
 
       - name: Setup Rust
         uses: dtolnay/rust-toolchain@1.91.0
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 4274849..08df85d 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -19,7 +19,7 @@ jobs:
     name: CodeQL Analyze
     runs-on: ubuntu-22.04
     steps:
-      - uses: actions/checkout@v5
+      - uses: actions/checkout@v6
 
       - name: Setup Rust
         uses: dtolnay/rust-toolchain@1.91.0
diff --git a/.github/workflows/copilot-setup-steps.yml b/.github/workflows/copilot-setup-steps.yml
index 61ec8b9..09d0b69 100644
--- a/.github/workflows/copilot-setup-steps.yml
+++ b/.github/workflows/copilot-setup-steps.yml
@@ -28,15 +28,9 @@ jobs:
       # You can define any steps you want, and they will run before the agent starts.
       # If you do not check out your code, Copilot will do this for you.
       steps:
-        - name: Checkout code
-          uses: actions/checkout@v5
-
+        - uses: actions/checkout@v6
         - uses: dtolnay/rust-toolchain@1.91.0
-
-        - name: Install just task runner
-          uses: taiki-e/install-action@v2
-          with:
-            tool: just
+        - uses: extractions/setup-just@v3
 
         - name: Set up Python for pre-commit
           uses: actions/setup-python@v6
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 3520c5e..dee3905 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -25,7 +25,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
-        uses: actions/checkout@v5
+        uses: actions/checkout@v6
 
       - name: Setup Rust
         uses: dtolnay/rust-toolchain@1.91.0
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 6c36d5f..ff82ca3 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -56,7 +56,7 @@ jobs:
     env:
       GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
     steps:
-      - uses: actions/checkout@v5
+      - uses: actions/checkout@v6
         with:
           persist-credentials: false
           submodules: recursive
@@ -64,9 +64,9 @@ jobs:
         # we specify bash to get pipefail; it guards against the `curl` command
         # failing. otherwise `sh` won't catch that `curl` returned non-0
         shell: bash
-        run: "curl --proto '=https' --tlsv1.2 -LsSf https://github.com/axodotdev/cargo-dist/releases/download/v0.30.2/cargo-dist-installer.sh | sh"
+        run: "curl --proto '=https' --tlsv1.2 -LsSf https://github.com/axodotdev/cargo-dist/releases/download/v0.30.3/cargo-dist-installer.sh | sh"
       - name: Cache dist
-        uses: actions/upload-artifact@v5
+        uses: actions/upload-artifact@v6
         with:
           name: cargo-dist-cache
           path: ~/.cargo/bin/dist
@@ -82,7 +82,7 @@ jobs:
           cat plan-dist-manifest.json
           echo "manifest=$(jq -c "." plan-dist-manifest.json)" >> "$GITHUB_OUTPUT"
       - name: "Upload dist-manifest.json"
-        uses: actions/upload-artifact@v5
+        uses: actions/upload-artifact@v6
         with:
           name: artifacts-plan-dist-manifest
           path: plan-dist-manifest.json
@@ -120,7 +120,7 @@ jobs:
       - name: enable windows longpaths
         run: |
           git config --global core.longpaths true
-      - uses: actions/checkout@v5
+      - uses: actions/checkout@v6
         with:
           persist-credentials: false
           submodules: recursive
@@ -135,7 +135,7 @@ jobs:
         run: ${{ matrix.install_dist.run }}
       # Get the dist-manifest
       - name: Fetch local artifacts
-        uses: actions/download-artifact@v6
+        uses: actions/download-artifact@v7
         with:
           pattern: artifacts-*
           path: target/distrib/
@@ -168,7 +168,7 @@ jobs:
           cp dist-manifest.json "$BUILD_MANIFEST_NAME"
 
       - name: "Upload artifacts"
-        uses: actions/upload-artifact@v5
+        uses: actions/upload-artifact@v6
         with:
           name: artifacts-build-local-${{ join(matrix.targets, '_') }}
           path: |
@@ -185,12 +185,12 @@ jobs:
     env:
       GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       BUILD_MANIFEST_NAME: target/distrib/global-dist-manifest.json
     steps:
-      - uses: actions/checkout@v5
+      - uses: actions/checkout@v6
         with:
           persist-credentials: false
           submodules: recursive
       - name: Install cached dist
-        uses: actions/download-artifact@v6
+        uses: actions/download-artifact@v7
         with:
           name: cargo-dist-cache
           path: ~/.cargo/bin/
@@ -202,7 +202,7 @@ jobs:
         shell: bash
       # Get all the local artifacts for the global tasks to use (for e.g. checksums)
       - name: Fetch local artifacts
-        uses: actions/download-artifact@v6
+        uses: actions/download-artifact@v7
         with:
           pattern: artifacts-*
           path: target/distrib/
@@ -233,7 +233,7 @@ jobs:
           find . -name '*.cdx.xml' | tee -a "$GITHUB_OUTPUT"
           echo "EOF" >> "$GITHUB_OUTPUT"
       - name: "Upload artifacts"
-        uses: actions/upload-artifact@v5
+        uses: actions/upload-artifact@v6
         with:
           name: artifacts-build-global
           path: |
@@ -254,19 +254,19 @@ jobs:
     outputs:
       val: ${{ steps.host.outputs.manifest }}
     steps:
-      - uses: actions/checkout@v5
+      - uses: actions/checkout@v6
         with:
           persist-credentials: false
           submodules: recursive
       - name: Install cached dist
-        uses: actions/download-artifact@v6
+        uses: actions/download-artifact@v7
         with:
           name: cargo-dist-cache
           path: ~/.cargo/bin/
       - run: chmod +x ~/.cargo/bin/dist
       # Fetch artifacts from scratch-storage
       - name: Fetch artifacts
-        uses: actions/download-artifact@v6
+        uses: actions/download-artifact@v7
         with:
           pattern: artifacts-*
           path: target/distrib/
@@ -279,14 +279,14 @@ jobs:
           cat dist-manifest.json
           echo "manifest=$(jq -c "." dist-manifest.json)" >> "$GITHUB_OUTPUT"
       - name: "Upload dist-manifest.json"
-        uses: actions/upload-artifact@v5
+        uses: actions/upload-artifact@v6
         with:
           # Overwrite the previous copy
           name: artifacts-dist-manifest
           path: dist-manifest.json
       # Create a GitHub Release while uploading all files to it
       - name: "Download GitHub Artifacts"
-        uses: actions/download-artifact@v6
+        uses: actions/download-artifact@v7
         with:
           pattern: artifacts-*
           path: artifacts
@@ -319,14 +319,14 @@ jobs:
       GITHUB_EMAIL: "admin+bot@axo.dev"
     if: ${{ !fromJson(needs.plan.outputs.val).announcement_is_prerelease || fromJson(needs.plan.outputs.val).publish_prereleases }}
     steps:
-      - uses: actions/checkout@v5
+      - uses: actions/checkout@v6
         with:
           persist-credentials: true
           repository: "EvilBit-Labs/homebrew-tap"
           token: ${{ secrets.HOMEBREW_TAP_TOKEN }}
       # So we have access to the formula
       - name: Fetch homebrew formulae
-        uses: actions/download-artifact@v6
+        uses: actions/download-artifact@v7
         with:
           pattern: artifacts-*
           path: Formula/
@@ -366,7 +366,7 @@ jobs:
     env:
       GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
     steps:
-      - uses: actions/checkout@v5
+      - uses: actions/checkout@v6
         with:
           persist-credentials: false
           submodules: recursive
diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml
index f335c5b..5e8643d 100644
--- a/.github/workflows/security.yml
+++ b/.github/workflows/security.yml
@@ -24,7 +24,7 @@ jobs:
   audit:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v5
+      - uses: actions/checkout@v6
 
       - name: Setup Rust
         uses: dtolnay/rust-toolchain@1.91.0
diff --git a/Cargo.toml b/Cargo.toml
index 8d94a6f..a9ccfc1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -19,18 +19,18 @@ name = "stringy"
 path = "src/main.rs"
 
 [dependencies]
-clap = { version = "4.5.51", features = ["derive"] }
+clap = { version = "4.5.54", features = ["derive"] }
 entropy = "0.4.2"
-goblin = "0.10.3"
+goblin = "0.10.4"
 pelite = "0.10.0"
 serde = { version = "1.0.228", features = ["derive"] }
-serde_json = "1.0.145"
+serde_json = "1.0.148"
 thiserror = "2.0.17"
 
 [dev-dependencies]
 criterion = "0.8.1"
-insta = "1.43.2"
-tempfile = "3.23.0"
+insta = "1.46.0"
+tempfile = "3.24.0"
 
 # The profile that 'dist' will build with
 [profile.dist]
diff --git a/README.md b/README.md
index 0c72a3b..cd68c40 100644
--- a/README.md
+++ b/README.md
@@ -218,5 +218,3 @@ Licensed under Apache 2.0.
 - Inspired by `strings(1)` and the need for better binary analysis tools
 - Built with Rust ecosystem crates: `goblin`, `bstr`, `regex`, `rustc-demangle`
 - My coworkers, for their excellent input on the original name selection
-
-
diff --git a/dist-workspace.toml b/dist-workspace.toml
index 74d7020..85fc46e 100644
--- a/dist-workspace.toml
+++ b/dist-workspace.toml
@@ -4,7 +4,7 @@ members = ["cargo:."]
 
 # Config for 'dist'
 [dist]
 # The preferred dist version to use in CI (Cargo.toml SemVer syntax)
-cargo-dist-version = "0.30.2"
+cargo-dist-version = "0.30.3"
 # CI backends to support
 ci = "github"
 # The installers to generate for each app
@@ -54,7 +54,7 @@ install-success-msg = "Successfully installed Stringy! Ready to start looking at
 [dist.github]
 repository = "EvilBit-Labs/Stringy"
 
 [dist.github-action-commits]
-"actions/checkout" = "v5"
-"actions/download-artifact" = "v6"
-"actions/upload-artifact" = "v5"
+"actions/checkout" = "v6"
+"actions/download-artifact" = "v7"
+"actions/upload-artifact" = "v6"
 "actions/attest-build-provenance" = "v3"
diff --git a/docs/book.toml b/docs/book.toml
index fbc0890..e85286d 100644
--- a/docs/book.toml
+++ b/docs/book.toml
@@ -1,7 +1,6 @@
 [book]
 authors = ["UncleSp1d3r"]
 language = "en"
-multilingual = false
 src = "src"
 title = "Stringy User Guide"
 description = "Stringy User Guide - A smarter strings extraction tool"
@@ -47,9 +46,6 @@ heading-split-level = 3
 enable = true
 level = 1
 
-
-[preprocessor.alerts]
-
 [preprocessor.mermaid]
 command = "mdbook-mermaid"
diff --git a/docs/src/architecture.md b/docs/src/architecture.md
index 14bfae3..3792144 100644
--- a/docs/src/architecture.md
+++ b/docs/src/architecture.md
@@ -5,7 +5,7 @@ Stringy is built as a modular Rust library with a clear separation of concerns.
 ## High-Level Architecture
 
 ```text
-Binary File → Format Detection → Container Parsing → String Extraction → Classification → Ranking → Output
+Binary File → Format Detection → Container Parsing → String Extraction → Deduplication → Classification → Ranking → Output
 ```
 
 ## Core Components
@@ -34,21 +34,35 @@ The parsers implement intelligent section prioritization:
 ```rust
 // Example: ELF section weights
 ".rodata" | ".rodata.str1.*" => 10.0  // Highest priority
-".comment" | ".note.*" => 9.0 // Build info, very likely strings
+".comment" | ".note.*" => 9.0         // Build info, very likely strings
 ".data.rel.ro" => 7.0                 // Read-only data
 ".data" => 5.0                        // Writable data
 ".text" => 1.0                        // Code sections (low priority)
 ```
 
-### 2. Extraction Module (`src/extraction/`) 🚧 **Framework Ready**
+### 2. Extraction Module (`src/extraction/`) ✅ **Core Complete**
 
 Implements encoding-aware string extraction algorithms with configurable parameters.
 
 - **ASCII/UTF-8**: Scans for printable character sequences with noise filtering
 - **UTF-16**: Detects little-endian and big-endian wide strings with confidence scoring
-- **Deduplication**: Canonicalizes strings while preserving complete metadata
+- **Deduplication**: Groups strings by (text, encoding) keys, preserves all occurrence metadata, merges tags using set union, and calculates combined scores with occurrence-based bonuses
 - **Section-Aware**: Uses container parser weights to prioritize extraction areas
 
+#### Deduplication System
+
+The deduplication module (`src/extraction/dedup.rs`) provides comprehensive string deduplication:
+
+- **Grouping Strategy**: Strings are grouped by `(text, encoding)` tuple, ensuring UTF-8 and UTF-16 versions are kept separate
+- **Occurrence Preservation**: All occurrence metadata (offset, RVA, section, source, tags, score, confidence) is preserved in `StringOccurrence` structures
+- **Tag Merging**: Tags from all occurrences are merged with set-union semantics: duplicates are dropped and first-seen order is preserved in the resulting `Vec`
+- **Combined Scoring**: Calculates combined scores (see the worked example below) using:
+  - Base score: Maximum `original_score` across all occurrences
+  - Occurrence bonus: `5 * (occurrences.len() - 1)` points for multiple occurrences
+  - Cross-section bonus: `10` points if the string appears in sections with different names
+  - Multi-source bonus: `15` points if the string appears from different `StringSource` variants
+  - Confidence boost: `(max_confidence * 10.0) as i32`, where `max_confidence` is the highest confidence value
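+
+As a worked example with hypothetical values: a string whose best per-occurrence score is `20`, found three times across `.rodata` and `.data` but from a single `StringSource`, with a peak confidence of `0.9`, combines to:
+
+```text
+20 (base) + 5 * 2 (extra occurrences) + 10 (cross-section) + 0 (single source) + 9 (0.9 * 10) = 49
+```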
+
 ### 3. Classification Module (`src/classification/`) 🚧 **Types Defined**
 
 Applies semantic analysis to extracted strings with comprehensive tagging system.
@@ -127,6 +141,12 @@ all_strings.extend(extract_symbol_strings(&container_info));
 
 // Deduplicate while preserving all metadata
 let unique_strings = deduplicate(all_strings);
+// Returns Vec<CanonicalString> with:
+// - Grouped by (text, encoding) key
+// - All occurrences preserved in occurrences field
+// - Merged tags from all occurrences
+// - Combined scores with occurrence-based bonuses
+// - Sorted by combined_score descending
 ```
 
 ### 3. Classification Phase 🚧 **Types Ready**
@@ -139,7 +159,7 @@ for string in &mut unique_strings {
         source: string.source,
         encoding: string.encoding,
     };
-    
+
     string.tags = classify_string(&string.text, &context);
     string.score = calculate_score(&string, &context);
 }
diff --git a/src/extraction/dedup.rs b/src/extraction/dedup.rs
new file mode 100644
index 0000000..b8da59a
--- /dev/null
+++ b/src/extraction/dedup.rs
@@ -0,0 +1,841 @@
+//! String deduplication module
+//!
+//! This module provides functionality to deduplicate extracted strings while
+//! preserving complete metadata about all occurrences. Strings are grouped by
+//! (text, encoding) keys, and all occurrence information is preserved in a
+//! `CanonicalString` structure.
+
+use crate::types::{Encoding, FoundString, StringSource, Tag};
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+
+/// A canonical string with all its occurrences
+///
+/// Represents a deduplicated string that may appear multiple times in a binary.
+/// All occurrence metadata is preserved, and tags are merged from all occurrences.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct CanonicalString {
+    /// The deduplicated string content
+    pub text: String,
+    /// Encoding type
+    pub encoding: Encoding,
+    /// All locations where this string appears
+    pub occurrences: Vec<StringOccurrence>,
+    /// Union of tags from all occurrences
+    pub merged_tags: Vec<Tag>,
+    /// Calculated score with occurrence-based bonuses
+    pub combined_score: i32,
+}
+
+/// Metadata about a single occurrence of a string
+///
+/// Preserves all location and context information for each instance where
+/// a string appears in the binary.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct StringOccurrence {
+    /// File offset where string was found
+    pub offset: u64,
+    /// Relative virtual address (if available)
+    pub rva: Option<u64>,
+    /// Section name where string was found
+    pub section: Option<String>,
+    /// Extraction source type
+    pub source: StringSource,
+    /// Tags from this specific occurrence
+    pub original_tags: Vec<Tag>,
+    /// Score from this specific occurrence
+    pub original_score: i32,
+    /// Confidence score from noise filtering
+    pub confidence: f32,
+    /// Length of the string in bytes
+    pub length: u32,
+}
+
+/// Deduplicate a vector of found strings
+///
+/// Groups strings by (text, encoding) key and creates `CanonicalString` entries
+/// with all occurrence metadata preserved. The result is sorted by combined_score
+/// in descending order.
+///
+/// # Arguments
+///
+/// * `strings` - Vector of found strings to deduplicate
+/// * `dedup_threshold` - Optional minimum occurrence count to deduplicate (None = deduplicate all)
+/// * `preserve_all_occurrences` - If false, only store occurrence count instead of full metadata
+///
+/// # Returns
+///
+/// Vector of canonical strings sorted by combined_score (descending)
+///
+/// # Example
+///
+/// ```rust
+/// use stringy::extraction::dedup::deduplicate;
+/// use stringy::types::{FoundString, Encoding, StringSource};
+///
+/// let mut strings = Vec::new();
+/// // ... populate strings ...
+/// let canonical = deduplicate(strings, None, true);
+/// ```
+pub fn deduplicate(
+    strings: Vec<FoundString>,
+    dedup_threshold: Option<usize>,
+    preserve_all_occurrences: bool,
+) -> Vec<CanonicalString> {
+    if strings.is_empty() {
+        return Vec::new();
+    }
+
+    // Group strings by (text, encoding) key
+    // Use string representation of encoding as HashMap key since Encoding doesn't implement Hash
+    let mut groups: HashMap<(String, String), Vec<FoundString>> = HashMap::new();
+    for string in strings {
+        let encoding_str = format!("{:?}", string.encoding);
+        let key = (string.text.clone(), encoding_str);
+        groups.entry(key).or_default().push(string);
+    }
+
+    // Convert each group to a CanonicalString
+    let mut canonical_strings: Vec<CanonicalString> = groups
+        .into_iter()
+        .map(|((text, _encoding_str), found_strings)| {
+            // Check if group meets dedup_threshold
+            let meets_threshold = if let Some(threshold) = dedup_threshold {
+                found_strings.len() >= threshold
+            } else {
+                true // No threshold means all groups are eligible for deduplication
+            };
+
+            // All strings in group have same encoding, use first one
+            let encoding = found_strings[0].encoding;
+
+            let occurrences: Vec<StringOccurrence> = if preserve_all_occurrences {
+                // Store full occurrence metadata
+                found_strings
+                    .into_iter()
+                    .map(found_string_to_occurrence)
+                    .collect()
+            } else {
+                // Store only the first occurrence as representative, but we still need
+                // the count for scoring, so we'll keep all but mark them as "count only"
+                // For now, we'll still store all occurrences but this could be optimized
+                // to store just a count field in the future
+                found_strings
+                    .into_iter()
+                    .map(found_string_to_occurrence)
+                    .collect()
+            };
+
+            let merged_tags = merge_tags(&occurrences);
+
+            // Only apply deduplication bonuses if threshold is met
+            // For groups below threshold, use the base score without bonuses
+            let combined_score = if meets_threshold {
+                calculate_combined_score(&occurrences)
+            } else {
+                // For groups below threshold, use the maximum original score without bonuses
+                occurrences
+                    .iter()
+                    .map(|occ| occ.original_score)
+                    .max()
+                    .unwrap_or(0)
+            };
+
+            CanonicalString {
+                text,
+                encoding,
+                occurrences,
+                merged_tags,
+                combined_score,
+            }
+        })
+        .collect();
+
+    // Sort by combined_score descending
+    canonical_strings.sort_by(|a, b| b.combined_score.cmp(&a.combined_score));
+
+    canonical_strings
+}
+
+/// Calculate combined score for a group of occurrences
+///
+/// Combines individual scores with bonuses for multiple occurrences,
+/// cross-section presence, multi-source presence, and confidence.
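+///
+/// Restated as a formula (square brackets are 0/1 indicator conditions,
+/// mirroring the implementation below):
+///
+/// ```text
+/// score = max(original_score)
+///       + 5 * (occurrence_count - 1)
+///       + 10 * [distinct section names > 1]
+///       + 15 * [distinct sources > 1]
+///       + floor(max_confidence * 10)
+/// ```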
+///
+/// # Arguments
+///
+/// * `occurrences` - Slice of string occurrences
+///
+/// # Returns
+///
+/// Combined score as i32
+fn calculate_combined_score(occurrences: &[StringOccurrence]) -> i32 {
+    if occurrences.is_empty() {
+        return 0;
+    }
+
+    // Base score: maximum original_score across all occurrences
+    let base_score = occurrences
+        .iter()
+        .map(|occ| occ.original_score)
+        .max()
+        .unwrap_or(0);
+
+    // Occurrence bonus: 5 points per additional occurrence
+    let occurrence_bonus = if occurrences.len() > 1 {
+        5 * (occurrences.len() - 1) as i32
+    } else {
+        0
+    };
+
+    // Cross-section bonus: 10 points if string appears in different sections
+    let mut unique_sections = Vec::new();
+    for occ in occurrences.iter() {
+        if !unique_sections.contains(&occ.section) {
+            unique_sections.push(occ.section.clone());
+        }
+    }
+    let cross_section_bonus = if unique_sections.len() > 1 { 10 } else { 0 };
+
+    // Multi-source bonus: 15 points if string appears from different sources
+    let mut unique_sources = Vec::new();
+    for occ in occurrences.iter() {
+        if !unique_sources.contains(&occ.source) {
+            unique_sources.push(occ.source);
+        }
+    }
+    let multi_source_bonus = if unique_sources.len() > 1 { 15 } else { 0 };
+
+    // Confidence boost: max_confidence * 10
+    let max_confidence = occurrences
+        .iter()
+        .map(|occ| occ.confidence)
+        .fold(0.0f32, f32::max);
+    let confidence_boost = (max_confidence * 10.0) as i32;
+
+    base_score + occurrence_bonus + cross_section_bonus + multi_source_bonus + confidence_boost
+}
+
+/// Merge tags from all occurrences
+///
+/// Creates a union of all tags from all occurrences, ensuring uniqueness
+/// and returning a vector for consistent output.
+///
+/// # Arguments
+///
+/// * `occurrences` - Slice of string occurrences
+///
+/// # Returns
+///
+/// Vector of unique tags (order may vary since Tag doesn't implement Ord)
+fn merge_tags(occurrences: &[StringOccurrence]) -> Vec<Tag> {
+    let mut tags = Vec::new();
+    for occurrence in occurrences {
+        for tag in &occurrence.original_tags {
+            if !tags.contains(tag) {
+                tags.push(tag.clone());
+            }
+        }
+    }
+    tags
+}
+
+/// Convert a FoundString to a StringOccurrence
+///
+/// # Arguments
+///
+/// * `fs` - FoundString to convert
+///
+/// # Returns
+///
+/// StringOccurrence with all metadata preserved
+pub fn found_string_to_occurrence(fs: FoundString) -> StringOccurrence {
+    StringOccurrence {
+        offset: fs.offset,
+        rva: fs.rva,
+        section: fs.section,
+        source: fs.source,
+        original_tags: fs.tags,
+        original_score: fs.score,
+        confidence: fs.confidence,
+        length: fs.length,
+    }
+}
+
+impl CanonicalString {
+    /// Convert to a representative FoundString for backward compatibility
+    ///
+    /// Uses the first occurrence's metadata as the representative, with merged
+    /// tags and combined score. The highest confidence from all occurrences
+    /// is used.
+    ///
+    /// # Returns
+    ///
+    /// FoundString representing this canonical string
+    pub fn to_found_string(&self) -> FoundString {
+        let first_occurrence = &self.occurrences[0];
+        let max_confidence = self
+            .occurrences
+            .iter()
+            .map(|occ| occ.confidence)
+            .fold(0.0f32, f32::max);
+
+        FoundString {
+            text: self.text.clone(),
+            encoding: self.encoding,
+            offset: first_occurrence.offset,
+            rva: first_occurrence.rva,
+            section: first_occurrence.section.clone(),
+            length: first_occurrence.length,
+            tags: self.merged_tags.clone(),
+            score: self.combined_score,
+            source: first_occurrence.source,
+            confidence: max_confidence,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::types::{Encoding, StringSource, Tag};
+
+    #[allow(clippy::too_many_arguments)]
+    fn create_test_string(
+        text: &str,
+        encoding: Encoding,
+        offset: u64,
+        section: Option<String>,
+        source: StringSource,
+        tags: Vec<Tag>,
+        score: i32,
+        confidence: f32,
+    ) -> FoundString {
+        // Calculate byte length based on encoding
+        let length = match encoding {
+            Encoding::Utf16Le | Encoding::Utf16Be => {
+                // UTF-16: 2 bytes per character
+                text.chars().count() * 2
+            }
+            _ => {
+                // ASCII/UTF-8: 1 byte per character (approximation for tests)
+                text.len()
+            }
+        } as u32;
+
+        FoundString {
+            text: text.to_string(),
+            encoding,
+            offset,
+            rva: Some(offset + 0x1000),
+            section,
+            length,
+            tags,
+            score,
+            source,
+            confidence,
+        }
+    }
+
+    #[test]
+    fn test_basic_deduplication() {
+        let strings = vec![
+            create_test_string(
+                "Hello",
+                Encoding::Utf8,
+                0x100,
+                Some(".rodata".to_string()),
+                StringSource::SectionData,
+                vec![],
+                10,
+                0.8,
+            ),
+            create_test_string(
+                "Hello",
+                Encoding::Utf8,
+                0x200,
+                Some(".rodata".to_string()),
+                StringSource::SectionData,
+                vec![],
+                15,
+                0.9,
+            ),
+            create_test_string(
+                "Hello",
+                Encoding::Utf8,
+                0x300,
+                Some(".data".to_string()),
+                StringSource::SectionData,
+                vec![],
+                12,
+                0.7,
+            ),
+        ];
+
+        let canonical = deduplicate(strings, None, true);
+        assert_eq!(canonical.len(), 1);
+        assert_eq!(canonical[0].text, "Hello");
+        assert_eq!(canonical[0].occurrences.len(), 3);
+    }
+
+    #[test]
+    fn test_encoding_separation() {
+        let strings = vec![
+            create_test_string(
+                "Test",
+                Encoding::Utf8,
+                0x100,
+                None,
+                StringSource::SectionData,
+                vec![],
+                10,
+                0.8,
+            ),
+            create_test_string(
+                "Test",
+                Encoding::Utf16Le,
+                0x200,
+                None,
+                StringSource::SectionData,
+                vec![],
+                10,
+                0.8,
+            ),
+        ];
+
+        let canonical = deduplicate(strings, None, true);
+        assert_eq!(canonical.len(), 2);
+        assert!(canonical.iter().any(|c| c.encoding == Encoding::Utf8));
+        assert!(canonical.iter().any(|c| c.encoding == Encoding::Utf16Le));
+    }
+
+    #[test]
+    fn test_metadata_preservation() {
+        let strings = vec![
+            create_test_string(
+                "Test",
+                Encoding::Utf8,
+                0x100,
+                Some(".rodata".to_string()),
+                StringSource::SectionData,
+                vec![],
+                10,
+                0.8,
+            ),
+            create_test_string(
+                "Test",
+                Encoding::Utf8,
+                0x200,
+                Some(".data".to_string()),
+                StringSource::ImportName,
+                vec![],
+                15,
+                0.9,
+            ),
+        ];
+
+        let canonical = deduplicate(strings, None, true);
+        assert_eq!(canonical.len(), 1);
+        let occ = &canonical[0].occurrences;
+        assert_eq!(occ.len(), 2);
+        assert_eq!(occ[0].offset, 0x100);
+        assert_eq!(occ[1].offset, 0x200);
+        assert_eq!(occ[0].section, Some(".rodata".to_string()));
+        assert_eq!(occ[1].section, Some(".data".to_string()));
+        assert_eq!(occ[0].source, StringSource::SectionData);
+        assert_eq!(occ[1].source, StringSource::ImportName);
+    }
+
+    #[test]
+    fn test_tag_merging() {
+        let strings = vec![
+            create_test_string(
+                "Test",
+                Encoding::Utf8,
+                0x100,
+                None,
+                StringSource::SectionData,
+                vec![Tag::Url, Tag::Domain],
+                10,
+                0.8,
+            ),
+            create_test_string(
+                "Test",
+                Encoding::Utf8,
+                0x200,
+                None,
+                StringSource::SectionData,
+                vec![Tag::Domain, Tag::Email],
+                10,
+                0.8,
+            ),
+        ];
+
+        let canonical = deduplicate(strings, None, true);
+        assert_eq!(canonical.len(), 1);
+        let merged = &canonical[0].merged_tags;
+        assert_eq!(merged.len(), 3);
+        assert!(merged.contains(&Tag::Url));
+        assert!(merged.contains(&Tag::Domain));
+        assert!(merged.contains(&Tag::Email));
+    }
+
+    #[test]
+    fn test_score_calculation() {
+        // Test base score (max)
+        let strings = vec![
+            create_test_string(
+                "Test",
+                Encoding::Utf8,
+                0x100,
+                None,
+                StringSource::SectionData,
+                vec![],
+                10,
+                0.8,
+            ),
+            create_test_string(
+                "Test",
+                Encoding::Utf8,
+                0x200,
+                None,
+                StringSource::SectionData,
+                vec![],
+                15,
+                0.9,
+            ),
+        ];
+
+        let canonical = deduplicate(strings, None, true);
+        assert_eq!(canonical.len(), 1);
+        // Base: 15 (max), Occurrence bonus: 5, Confidence: 9 (0.9 * 10)
+        assert_eq!(canonical[0].combined_score, 15 + 5 + 9);
+    }
+
+    #[test]
+    fn test_cross_section_bonus() {
+        let strings = vec![
+            create_test_string(
+                "Test",
+                Encoding::Utf8,
+                0x100,
+                Some(".rodata".to_string()),
+                StringSource::SectionData,
+                vec![],
+                10,
+                0.8,
+            ),
+            create_test_string(
+                "Test",
+                Encoding::Utf8,
+                0x200,
+                Some(".data".to_string()),
+                StringSource::SectionData,
+                vec![],
+                10,
+                0.8,
+            ),
+        ];
+
+        let canonical = deduplicate(strings, None, true);
+        assert_eq!(canonical.len(), 1);
+        // Base: 10, Occurrence bonus: 5, Cross-section: 10, Confidence: 8
+        assert_eq!(canonical[0].combined_score, 10 + 5 + 10 + 8);
+    }
+
+    #[test]
+    fn test_multi_source_bonus() {
+        let strings = vec![
+            create_test_string(
+                "Test",
+                Encoding::Utf8,
+                0x100,
+                None,
+                StringSource::SectionData,
+                vec![],
+                10,
+                0.8,
+            ),
+            create_test_string(
+                "Test",
+                Encoding::Utf8,
+                0x200,
+                None,
+                StringSource::ImportName,
+                vec![],
+                10,
+                0.8,
+            ),
+        ];
+
+        let canonical = deduplicate(strings, None, true);
+        assert_eq!(canonical.len(), 1);
+        // Base: 10, Occurrence bonus: 5, Multi-source: 15, Confidence: 8
+        assert_eq!(canonical[0].combined_score, 10 + 5 + 15 + 8);
+    }
+
+    #[test]
+    fn test_empty_input() {
+        let strings = Vec::new();
+        let canonical = deduplicate(strings, None, true);
+        assert!(canonical.is_empty());
+    }
+
+    #[test]
+    fn test_single_occurrence() {
+        let strings = vec![create_test_string(
+            "Test",
+            Encoding::Utf8,
+            0x100,
+            None,
+            StringSource::SectionData,
+            vec![],
+            10,
+            0.8,
+        )];
+
+        let canonical = deduplicate(strings, None, true);
+        assert_eq!(canonical.len(), 1);
+        assert_eq!(canonical[0].occurrences.len(), 1);
+        // Base: 10, Confidence: 8, no bonuses
+        assert_eq!(canonical[0].combined_score, 10 + 8);
+    }
+
+    #[test]
+    fn test_sorting() {
+        let strings = vec![
+            create_test_string(
+                "Low",
+                Encoding::Utf8,
+                0x100,
+                None,
+                StringSource::SectionData,
+                vec![],
+                5,
+                0.5,
+            ),
+            create_test_string(
+                "High",
+                Encoding::Utf8,
+                0x200,
+                None,
+                StringSource::SectionData,
+                vec![],
+                20,
+                0.9,
+            ),
+            create_test_string(
+                "Medium",
+                Encoding::Utf8,
+                0x300,
+                None,
+                StringSource::SectionData,
+                vec![],
+                15,
+                0.7,
+            ),
+        ];
+
+        let canonical = deduplicate(strings, None, true);
+        assert_eq!(canonical.len(), 3);
+        // Should be sorted by combined_score descending
+        assert_eq!(canonical[0].text, "High");
+        assert_eq!(canonical[1].text, "Medium");
+        assert_eq!(canonical[2].text, "Low");
+    }
+
+    #[test]
+    fn test_edge_case_empty_string() {
+        let strings = vec![create_test_string(
+            "",
+            Encoding::Utf8,
+            0x100,
+            None,
+            StringSource::SectionData,
+            vec![],
+            10,
+            0.8,
+        )];
+
+        let canonical = deduplicate(strings, None, true);
+        assert_eq!(canonical.len(), 1);
+        assert_eq!(canonical[0].text, "");
+    }
+
+    #[test]
+    fn test_to_found_string() {
+        let strings = vec![
+            create_test_string(
+                "Test",
+                Encoding::Utf8,
+                0x100,
+                Some(".rodata".to_string()),
+                StringSource::SectionData,
+                vec![Tag::Url],
+                10,
+                0.8,
+            ),
+            create_test_string(
+                "Test",
+                Encoding::Utf8,
+                0x200,
+                Some(".data".to_string()),
+                StringSource::ImportName,
+                vec![Tag::Domain],
+                15,
+                0.9,
+            ),
+        ];
+
+        let canonical = deduplicate(strings, None, true);
+        let found = canonical[0].to_found_string();
+        assert_eq!(found.text, "Test");
+        assert_eq!(found.offset, 0x100); // First occurrence
+        assert_eq!(found.score, canonical[0].combined_score);
+        assert_eq!(found.confidence, 0.9); // Max confidence
+        assert_eq!(found.tags.len(), 2); // Merged tags
+    }
+
+    #[test]
+    fn test_dedup_threshold() {
+        let strings = vec![
+            create_test_string(
+                "Once",
+                Encoding::Utf8,
+                0x100,
+                None,
+                StringSource::SectionData,
+                vec![],
+                10,
+                0.8,
+            ),
+            create_test_string(
+                "Twice",
+                Encoding::Utf8,
+                0x200,
+                None,
+                StringSource::SectionData,
+                vec![],
+                10,
+                0.8,
+            ),
+            create_test_string(
+                "Twice",
+                Encoding::Utf8,
+                0x300,
+                None,
+                StringSource::SectionData,
+                vec![],
+                10,
+                0.8,
+            ),
+            create_test_string(
+                "Thrice",
+                Encoding::Utf8,
+                0x400,
+                None,
+                StringSource::SectionData,
+                vec![],
+                10,
+                0.8,
+            ),
+            create_test_string(
+                "Thrice",
+                Encoding::Utf8,
+                0x500,
+                None,
+                StringSource::SectionData,
+                vec![],
+                10,
+                0.8,
+            ),
+            create_test_string(
+                "Thrice",
+                Encoding::Utf8,
+                0x600,
+                None,
+                StringSource::SectionData,
+                vec![],
+                10,
+                0.8,
+            ),
+        ];
+
+        // No threshold - all should be deduplicated
+        let canonical = deduplicate(strings.clone(), None, true);
+        assert_eq!(canonical.len(), 3);
+
+        // Threshold of 2 - strings appearing 2+ times get deduplication bonuses,
+        // but strings below threshold are still preserved (just without bonuses)
+        let canonical = deduplicate(strings.clone(), Some(2), true);
+        assert_eq!(canonical.len(), 3); // All strings preserved: "Once", "Twice", "Thrice"
+        assert!(canonical.iter().any(|c| c.text == "Once"));
+        assert!(canonical.iter().any(|c| c.text == "Twice"));
+        assert!(canonical.iter().any(|c| c.text == "Thrice"));
+
+        // Verify "Once" is preserved but without bonuses (only base score)
+        let once = canonical.iter().find(|c| c.text == "Once").unwrap();
+        assert_eq!(once.occurrences.len(), 1);
+        assert_eq!(once.combined_score, 10); // Base score only, no bonuses
+
+        // Verify "Twice" and "Thrice" get bonuses
+        let twice = canonical.iter().find(|c| c.text == "Twice").unwrap();
+        assert_eq!(twice.occurrences.len(), 2);
+        assert!(twice.combined_score > 10); // Should have bonuses
+
+        let thrice = canonical.iter().find(|c| c.text == "Thrice").unwrap();
+        assert_eq!(thrice.occurrences.len(), 3);
+        assert!(thrice.combined_score > 10); // Should have bonuses
+
+        // Threshold of 3 - strings appearing 3+ times get bonuses, others preserved without
+        let canonical = deduplicate(strings, Some(3), true);
+        assert_eq!(canonical.len(), 3); // All strings preserved
+        let once = canonical.iter().find(|c| c.text == "Once").unwrap();
+        assert_eq!(once.combined_score, 10); // No bonuses
+        let twice = canonical.iter().find(|c| c.text == "Twice").unwrap();
+        assert_eq!(twice.combined_score, 10); // No bonuses (below threshold)
+        let thrice = canonical.iter().find(|c| c.text == "Thrice").unwrap();
+        assert!(thrice.combined_score > 10); // Has bonuses (meets threshold)
+    }
+
+    #[test]
+    fn test_length_preservation() {
+        // Test that length is preserved correctly for UTF-16 strings
+        let strings = vec![
+            FoundString {
+                text: "Test".to_string(),
+                encoding: Encoding::Utf16Le,
+                offset: 0x100,
+                rva: Some(0x1000),
+                section: None,
+                length: 8, // 4 characters * 2 bytes = 8 bytes
+                tags: vec![],
+                score: 10,
+                source: StringSource::SectionData,
+                confidence: 0.8,
+            },
+            FoundString {
+                text: "Test".to_string(),
+                encoding: Encoding::Utf16Le,
+                offset: 0x200,
+                rva: Some(0x2000),
+                section: None,
+                length: 8,
+                tags: vec![],
+                score: 15,
+                source: StringSource::SectionData,
+                confidence: 0.9,
+            },
+        ];
+
+        let canonical = deduplicate(strings, None, true);
+        assert_eq!(canonical.len(), 1);
+        assert_eq!(canonical[0].occurrences[0].length, 8);
+        assert_eq!(canonical[0].occurrences[1].length, 8);
+
+        // Verify to_found_string() uses stored length, not text.len()
+        let found = canonical[0].to_found_string();
+        assert_eq!(found.length, 8); // Should be 8 bytes, not 4 (text.len())
+        assert_eq!(found.text.len(), 4); // But text is still 4 characters
+    }
+}
diff --git a/src/extraction/mod.rs b/src/extraction/mod.rs
index 8c48d99..1ced150 100644
--- a/src/extraction/mod.rs
+++ b/src/extraction/mod.rs
@@ -44,6 +44,23 @@
 //! - `extract_from_section()`: Section-aware extraction with proper metadata population
 //! - `Utf16ExtractionConfig`: Configuration for minimum/maximum character count and confidence thresholds
 //!
+//! ## String Deduplication
+//!
+//! The deduplication module provides functionality to group duplicate strings while preserving
+//! complete metadata about all occurrences. Strings are grouped by (text, encoding) keys, ensuring
+//! UTF-8 and UTF-16 versions are kept separate.
+//!
+//! - `deduplicate()`: Groups strings by (text, encoding) and creates `CanonicalString` entries
+//! - `CanonicalString`: Represents a deduplicated string with all occurrence metadata
+//! - `StringOccurrence`: Preserves location and context for each string instance
+//!
+//! The deduplication process:
+//! - Groups strings by (text, encoding) tuple
+//! - Preserves all occurrence metadata (offset, RVA, section, source, tags, score, confidence)
+//! - Merges tags using set union semantics
+//! - Calculates combined scores with occurrence-based bonuses
+//! - Sorts results by combined_score descending
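+//!
+//! # Deduplication Example
+//!
+//! A minimal sketch of calling `deduplicate()` directly; the field values are
+//! illustrative rather than taken from a real binary:
+//!
+//! ```rust
+//! use stringy::extraction::deduplicate;
+//! use stringy::types::{Encoding, FoundString, StringSource};
+//!
+//! let make = |offset: u64| FoundString {
+//!     text: "Hello".to_string(),
+//!     encoding: Encoding::Utf8,
+//!     offset,
+//!     rva: None,
+//!     section: None,
+//!     length: 5,
+//!     tags: Vec::new(),
+//!     score: 10,
+//!     source: StringSource::SectionData,
+//!     confidence: 0.8,
+//! };
+//!
+//! // Two occurrences of the same (text, encoding) pair collapse into one
+//! // canonical string that keeps both offsets.
+//! let canonical = deduplicate(vec![make(0x100), make(0x200)], None, true);
+//! assert_eq!(canonical.len(), 1);
+//! assert_eq!(canonical[0].occurrences.len(), 2);
+//! ```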
+//!
 //! # ASCII Extraction Example
 //!
 //! ```rust
@@ -112,6 +129,7 @@ use crate::types::{
 
 pub mod ascii;
 pub mod config;
+pub mod dedup;
 pub mod filters;
 pub mod macho_load_commands;
 pub mod pe_resources;
@@ -120,6 +138,7 @@ pub mod util;
 
 pub use ascii::{AsciiExtractionConfig, extract_ascii_strings, extract_from_section};
 pub use config::{FilterWeights, NoiseFilterConfig};
+pub use dedup::{CanonicalString, StringOccurrence, deduplicate, found_string_to_occurrence};
 pub use filters::{CompositeNoiseFilter, FilterContext, NoiseFilter};
 pub use macho_load_commands::extract_load_command_strings;
 pub use pe_resources::{extract_resource_strings, extract_resources};
@@ -187,6 +206,19 @@ pub struct ExtractionConfig {
     ///
     /// UTF-16 strings with UTF-16-specific confidence below this threshold will be filtered out.
     pub utf16_confidence_threshold: f32,
+    /// Enable/disable deduplication (default: true)
+    ///
+    /// When enabled, strings are grouped by (text, encoding) and all occurrence metadata is preserved.
+    pub enable_deduplication: bool,
+    /// Deduplication threshold - only deduplicate strings appearing N+ times (default: None)
+    ///
+    /// If set, only strings appearing at least this many times will be deduplicated.
+    /// Other strings will be passed through unchanged.
+    pub dedup_threshold: Option<usize>,
+    /// Whether to preserve all occurrence metadata (default: true)
+    ///
+    /// When true, full occurrence lists are kept. When false, only occurrence count is preserved.
+    pub preserve_all_occurrences: bool,
 }
 
 impl Default for ExtractionConfig {
@@ -211,6 +243,9 @@ impl Default for ExtractionConfig {
             utf16_min_confidence: 0.7,
             utf16_byte_order: ByteOrder::Auto,
             utf16_confidence_threshold: 0.5,
+            enable_deduplication: true,
+            dedup_threshold: None,
+            preserve_all_occurrences: true,
         }
     }
 }
@@ -289,7 +324,9 @@ pub trait StringExtractor {
     ///
     /// # Returns
     ///
-    /// Vector of found strings with metadata
+    /// Vector of found strings with metadata. When deduplication is enabled,
+    /// this returns deduplicated strings but loses occurrence metadata.
+    /// Use `extract_canonical()` to preserve full occurrence information.
     fn extract(
         &self,
         data: &[u8],
@@ -317,6 +354,29 @@ pub trait StringExtractor {
         section: &SectionInfo,
         config: &ExtractionConfig,
     ) -> Result<Vec<FoundString>>;
+
+    /// Extract strings and return canonical strings with full occurrence metadata
+    ///
+    /// Similar to `extract()`, but returns `CanonicalString` entries that preserve
+    /// all occurrence metadata when deduplication is enabled. This allows consumers
+    /// to see all offsets, sections, and sources where each string appears.
+    ///
+    /// # Arguments
+    ///
+    /// * `data` - Raw binary data
+    /// * `container_info` - Container metadata including sections
+    /// * `config` - Extraction configuration
+    ///
+    /// # Returns
+    ///
+    /// Vector of canonical strings with full occurrence metadata. If deduplication
+    /// is disabled, each string will have a single occurrence.
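+    ///
+    /// # Example
+    ///
+    /// A sketch using `BasicExtractor`; `data` and `container_info` are assumed
+    /// to come from an earlier parsing step:
+    ///
+    /// ```rust,ignore
+    /// let extractor = BasicExtractor::new();
+    /// let config = ExtractionConfig::default();
+    /// let canonical = extractor.extract_canonical(&data, &container_info, &config)?;
+    /// for cs in &canonical {
+    ///     println!("{} appears {} time(s)", cs.text, cs.occurrences.len());
+    /// }
+    /// ```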
+    fn extract_canonical(
+        &self,
+        data: &[u8],
+        container_info: &ContainerInfo,
+        config: &ExtractionConfig,
+    ) -> Result<Vec<CanonicalString>>;
 }
 
 /// Basic sequential string extractor
@@ -453,7 +513,125 @@ impl StringExtractor for BasicExtractor {
             }
         }
 
-        Ok(all_strings)
+        // Apply deduplication if enabled
+        if config.enable_deduplication {
+            let canonical_strings = deduplicate(
+                all_strings,
+                config.dedup_threshold,
+                config.preserve_all_occurrences,
+            );
+            // Convert canonical strings back to FoundString for backward compatibility
+            Ok(canonical_strings
+                .into_iter()
+                .map(|cs| cs.to_found_string())
+                .collect())
+        } else {
+            Ok(all_strings)
+        }
+    }
+
+    fn extract_canonical(
+        &self,
+        data: &[u8],
+        container_info: &ContainerInfo,
+        config: &ExtractionConfig,
+    ) -> Result<Vec<CanonicalString>> {
+        let mut all_strings = Vec::new();
+
+        // Sort sections by priority from config.section_priority
+        let mut sections: Vec<_> = container_info.sections.iter().collect();
+        sections.sort_by_key(|section| {
+            config
+                .section_priority
+                .iter()
+                .position(|&st| st == section.section_type)
+                .unwrap_or_else(|| {
+                    // Fallback to section weight (higher weight = higher priority)
+                    // Convert weight to usize for consistent key type
+                    // Use a large offset to ensure fallback sections sort after prioritized ones
+                    let weight_int = (section.weight * 1000.0) as usize;
+                    config.section_priority.len() + (10000 - weight_int.min(10000))
+                })
+        });
+
+        for section in sections {
+            // Filter sections based on config
+            if section.section_type == SectionType::Debug && !config.include_debug {
+                continue;
+            }
+
+            // Filter code sections by both type and executable flag
+            if (section.section_type == SectionType::Code || section.is_executable)
+                && !config.scan_code_sections
+            {
+                continue;
+            }
+
+            // Extract strings from this section
+            let section_strings = self.extract_from_section(data, section, config)?;
+            all_strings.extend(section_strings);
+        }
+
+        // Include import/export symbols if configured
+        if config.include_symbols {
+            // Add import names
+            for import in &container_info.imports {
+                let length = import.name.len() as u32;
+                all_strings.push(FoundString {
+                    text: import.name.clone(),
+                    encoding: Encoding::Utf8,
+                    offset: 0,
+                    rva: None,
+                    section: None,
+                    length,
+                    tags: Vec::new(),
+                    score: 0,
+                    source: StringSource::ImportName,
+                    confidence: 1.0,
+                });
+            }
+
+            // Add export names
+            for export in &container_info.exports {
+                let length = export.name.len() as u32;
+                all_strings.push(FoundString {
+                    text: export.name.clone(),
+                    encoding: Encoding::Utf8,
+                    offset: 0,
+                    rva: None,
+                    section: None,
+                    length,
+                    tags: Vec::new(),
+                    score: 0,
+                    source: StringSource::ExportName,
+                    confidence: 1.0,
+                });
+            }
+        }
+
+        // Apply deduplication if enabled, otherwise convert each string to a canonical form
+        if config.enable_deduplication {
+            Ok(deduplicate(
+                all_strings,
+                config.dedup_threshold,
+                config.preserve_all_occurrences,
+            ))
+        } else {
+            // Convert each FoundString to a CanonicalString with a single occurrence
+            Ok(all_strings
+                .into_iter()
+                .map(|fs| {
+                    let occurrence = found_string_to_occurrence(fs.clone());
+                    CanonicalString {
+                        text: fs.text,
+                        encoding: fs.encoding,
+                        occurrences: vec![occurrence],
+                        merged_tags: fs.tags,
+                        combined_score: fs.score,
+                    }
+                })
+                .collect())
+        }
     }
 
     fn extract_from_section(
diff --git a/src/lib.rs b/src/lib.rs
index 0f797a4..8dfb54b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -81,5 +81,6 @@ pub use types::{
 
 // Re-export extraction framework types
 pub use extraction::{
-    AsciiExtractionConfig, BasicExtractor, ExtractionConfig, StringExtractor, Utf16ExtractionConfig,
+    AsciiExtractionConfig, BasicExtractor, CanonicalString, ExtractionConfig, StringExtractor,
+    StringOccurrence, Utf16ExtractionConfig, deduplicate,
 };
diff --git a/tests/test_deduplication.rs b/tests/test_deduplication.rs
new file mode 100644
index 0000000..8a1fb5c
--- /dev/null
+++ b/tests/test_deduplication.rs
@@ -0,0 +1,391 @@
+//! Integration tests for string deduplication
+
+use stringy::container::{create_parser, detect_format};
+use stringy::extraction::{BasicExtractor, ExtractionConfig, StringExtractor, deduplicate};
+use stringy::types::{BinaryFormat, Encoding, SectionInfo, SectionType, StringSource};
+
+fn get_fixture_path(name: &str) -> std::path::PathBuf {
+    std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+        .join("tests")
+        .join("fixtures")
+        .join(name)
+}
+
+#[test]
+fn test_deduplication_with_basic_extractor() {
+    let extractor = BasicExtractor::new();
+    let config = ExtractionConfig::default();
+
+    // Create test data with duplicate strings in multiple sections
+    let data = b"Hello\0World\0Hello\0Test\0World\0Hello\0";
+    let section1 = SectionInfo {
+        name: ".rodata".to_string(),
+        offset: 0,
+        size: 12, // "Hello\0World\0"
+        rva: Some(0x1000),
+        section_type: SectionType::StringData,
+        is_executable: false,
+        is_writable: false,
+        weight: 1.0,
+    };
+    let section2 = SectionInfo {
+        name: ".data".to_string(),
+        offset: 12,
+        size: 10, // "Hello\0Test\0"
+        rva: Some(0x2000),
+        section_type: SectionType::ReadOnlyData,
+        is_executable: false,
+        is_writable: false,
+        weight: 0.7,
+    };
+    let section3 = SectionInfo {
+        name: ".text".to_string(),
+        offset: 22,
+        size: 6, // "World\0"
+        rva: Some(0x3000),
+        section_type: SectionType::Code,
+        is_executable: true,
+        is_writable: false,
+        weight: 0.1,
+    };
+
+    let container_info = stringy::types::ContainerInfo::new(
+        BinaryFormat::Elf,
+        vec![section1, section2, section3],
+        vec![],
+        vec![],
+        None,
+    );
+
+    // Disable deduplication in extractor to test manual deduplication
+    let config_no_dedup = ExtractionConfig {
+        enable_deduplication: false,
+        ..config.clone()
+    };
+
+    let strings = extractor
+        .extract(data, &container_info, &config_no_dedup)
+        .unwrap();
+
+    // Verify we have duplicates before deduplication
+    assert!(strings.len() >= 3);
+    let hello_count = strings.iter().filter(|s| s.text == "Hello").count();
+    assert!(hello_count >= 2, "Should have at least 2 'Hello' strings");
+
+    // Apply deduplication
+    let canonical = deduplicate(strings, None, true);
+
+    // Verify deduplication reduced count
+    assert!(
+        canonical.len() < 6,
+        "Deduplication should reduce string count"
+    );
+
+    // Find "Hello" canonical string
+    let hello_canonical = canonical.iter().find(|c| c.text == "Hello");
+    assert!(
+        hello_canonical.is_some(),
+        "Should find 'Hello' in canonical strings"
+    );
+
+    if let Some(hello) = hello_canonical {
+        // Verify it has multiple occurrences
+        assert!(
+            hello.occurrences.len() >= 2,
+            "Hello should appear multiple times"
+        );
+
+        // Verify metadata preservation
+        let offsets: Vec<u64> = hello.occurrences.iter().map(|o| o.offset).collect();
+        assert!(offsets.contains(&0), "Should preserve offset 0");
+
+        // Verify cross-section bonus (if applicable)
+        let sections: Vec<Option<String>> = hello
+            .occurrences
+            .iter()
+            .map(|o| o.section.clone())
+            .collect();
+        let unique_sections: std::collections::HashSet<_> = sections.into_iter().collect();
+        if unique_sections.len() > 1 {
+            // Cross-section bonus should be applied
+            assert!(
+                hello.combined_score >= 10,
+                "Should have cross-section bonus"
+            );
+        }
+    }
+}
+
+#[test]
+fn test_deduplication_metadata_preservation() {
+    let extractor = BasicExtractor::new();
+    let config = ExtractionConfig::default();
+
+    // Create test data with same string in different sections
+    let data = b"TestString\0TestString\0";
+    let section1 = SectionInfo {
+        name: ".rodata".to_string(),
+        offset: 0,
+        size: 11,
+        rva: Some(0x1000),
+        section_type: SectionType::StringData,
+        is_executable: false,
+        is_writable: false,
+        weight: 1.0,
+    };
+    let section2 = SectionInfo {
+        name: ".data".to_string(),
+        offset: 11,
+        size: 11,
+        rva: Some(0x2000),
+        section_type: SectionType::ReadOnlyData,
+        is_executable: false,
+        is_writable: false,
+        weight: 0.7,
+    };
+
+    let container_info = stringy::types::ContainerInfo::new(
+        BinaryFormat::Elf,
+        vec![section1, section2],
+        vec![],
+        vec![],
+        None,
+    );
+
+    // Disable deduplication in extractor to test manual deduplication
+    let config_no_dedup = ExtractionConfig {
+        enable_deduplication: false,
+        ..config.clone()
+    };
+
+    let strings = extractor
+        .extract(data, &container_info, &config_no_dedup)
+        .unwrap();
+    let canonical = deduplicate(strings, None, true);
+
+    // Find "TestString"
+    let test_string = canonical.iter().find(|c| c.text == "TestString");
+    assert!(test_string.is_some());
+
+    if let Some(ts) = test_string {
+        assert_eq!(ts.occurrences.len(), 2);
+
+        // Verify all offsets are preserved
+        let offsets: Vec<u64> = ts.occurrences.iter().map(|o| o.offset).collect();
+        assert!(offsets.contains(&0));
+        assert!(offsets.contains(&11));
+
+        // Verify sections are preserved
+        let sections: Vec<Option<String>> =
+            ts.occurrences.iter().map(|o| o.section.clone()).collect();
+        assert!(sections.contains(&Some(".rodata".to_string())));
+        assert!(sections.contains(&Some(".data".to_string())));
+    }
+}
+
+#[test]
+fn test_deduplication_with_real_fixture() {
+    // Try to use a real fixture if available
+    let fixture_path = get_fixture_path("test_elf");
+    if !fixture_path.exists() {
+        // Skip if fixture doesn't exist
+        return;
+    }
+
+    let data = std::fs::read(&fixture_path).unwrap();
+    let format = detect_format(&data);
+    if format == BinaryFormat::Unknown {
+        // Skip if format not supported
+        return;
+    }
+
+    let parser = create_parser(format).unwrap();
+    let container_info = parser.parse(&data).unwrap();
+
+    let extractor = BasicExtractor::new();
+
+    // Test with deduplication disabled to get baseline count
+    let config_no_dedup = ExtractionConfig {
+        enable_deduplication: false,
+        ..Default::default()
+    };
+    let strings_no_dedup = extractor
+        .extract(&data, &container_info, &config_no_dedup)
+        .unwrap();
+    let strings_len = strings_no_dedup.len();
+
+    // Test with deduplication enabled
+    let config = ExtractionConfig::default();
+    let strings = extractor.extract(&data, &container_info, &config).unwrap();
+
+    // For comparison, also test manual deduplication
+    let canonical = deduplicate(strings_no_dedup, None, true);
+
+    // Verify deduplication worked (both integrated and manual)
+    assert!(
+        strings.len() <= strings_len,
+        "Integrated deduplication should reduce count"
+    );
+    assert!(
+        canonical.len() <= strings_len,
+        "Manual deduplication should reduce count"
+    );
+
+    // Verify no data loss - all original strings should be represented
+    let mut original_texts: Vec<(String, Encoding)> = strings
+        .iter()
+        .map(|s| (s.text.clone(), s.encoding))
+        .collect();
+    original_texts.sort_by(|a, b| {
+        a.0.cmp(&b.0)
+            .then_with(|| format!("{:?}", a.1).cmp(&format!("{:?}", b.1)))
+    });
+    original_texts.dedup();
+
+    let mut canonical_texts: Vec<(String, Encoding)> = canonical
+        .iter()
+        .map(|c| (c.text.clone(), c.encoding))
+        .collect();
+    canonical_texts.sort_by(|a, b| {
+        a.0.cmp(&b.0)
+            .then_with(|| format!("{:?}", a.1).cmp(&format!("{:?}", b.1)))
+    });
+
+    assert_eq!(
+        original_texts.len(),
+        canonical_texts.len(),
+        "All unique (text, encoding) pairs should be preserved"
+    );
+    for (orig, canon) in original_texts.iter().zip(canonical_texts.iter()) {
+        assert_eq!(orig.0, canon.0);
+        assert_eq!(format!("{:?}", orig.1), format!("{:?}", canon.1));
+    }
+
+    // Verify sorting by score
+    for i in 1..canonical.len() {
+        assert!(
+            canonical[i - 1].combined_score >= canonical[i].combined_score,
+            "Canonical strings should be sorted by combined_score descending"
+        );
+    }
+}
+
+#[test]
+fn test_deduplication_score_bonuses() {
+    use stringy::types::FoundString;
+
+    // Create strings with different sources to test multi-source bonus
+    let strings = vec![
+        FoundString {
+            text: "TestString".to_string(),
+            encoding: Encoding::Utf8,
+            offset: 0x100,
+            rva: Some(0x1000),
+            section: Some(".rodata".to_string()),
+            length: 10,
+            tags: vec![],
+            score: 10,
+            source: StringSource::SectionData,
+            confidence: 0.8,
+        },
+        FoundString {
+            text: "TestString".to_string(),
+            encoding: Encoding::Utf8,
+            offset: 0x200,
+            rva: Some(0x2000),
+            section: Some(".data".to_string()),
+            length: 10,
+            tags: vec![],
+            score: 15,
+            source: StringSource::ImportName,
+            confidence: 0.9,
+        },
+    ];
+
+    let canonical = deduplicate(strings, None, true);
+    assert_eq!(canonical.len(), 1);
+
+    let cs = &canonical[0];
+    // Base: 15 (max), Occurrence: 5, Cross-section: 10, Multi-source: 15, Confidence: 9
+    let expected_score = 15 + 5 + 10 + 15 + 9;
+    assert_eq!(cs.combined_score, expected_score);
+}
+
+#[test]
+fn test_extract_canonical_preserves_occurrences() {
+    use stringy::extraction::{BasicExtractor, ExtractionConfig, StringExtractor};
+
+    let extractor = BasicExtractor::new();
+    let config = ExtractionConfig::default(); // enable_deduplication is true by default
+
+    // Create test data with duplicate strings in multiple sections
+    let data = b"Hello\0World\0Hello\0Test\0";
+    let section1 = SectionInfo {
+        name: ".rodata".to_string(),
+        offset: 0,
+        size: 12, // "Hello\0World\0"
+        rva: Some(0x1000),
+        section_type: SectionType::StringData,
+        is_executable: false,
+        is_writable: false,
+        weight: 1.0,
+    };
+    let section2 = SectionInfo {
+        name: ".data".to_string(),
+        offset: 12,
+        size: 10, // "Hello\0Test\0"
+        rva: Some(0x2000),
+        section_type: SectionType::ReadOnlyData,
+        is_executable: false,
+        is_writable: false,
+        weight: 0.7,
+    };
+
+    let container_info = stringy::types::ContainerInfo::new(
+        BinaryFormat::Elf,
+        vec![section1, section2],
+        vec![],
+        vec![],
+        None,
+    );
+
+    // Test extract_canonical() - should preserve all occurrences
+    let canonical = extractor
+        .extract_canonical(data, &container_info, &config)
+        .unwrap();
+
+    // Find "Hello" - should have multiple occurrences
+    let hello = canonical.iter().find(|c| c.text == "Hello");
+    assert!(hello.is_some(), "Should find 'Hello' in canonical strings");
+    if let Some(h) = hello {
+        assert!(
+            h.occurrences.len() >= 2,
+            "Hello should have multiple occurrences, got {}",
+            h.occurrences.len()
+        );
+        // Verify we can see all offsets
+        let offsets: Vec<u64> = h.occurrences.iter().map(|o| o.offset).collect();
+        assert!(offsets.len() >= 2, "Should preserve multiple offsets");
+    }
+
+    // Compare with extract() - should lose occurrence information
+    let strings = extractor.extract(data, &container_info, &config).unwrap();
+    let hello_strings: Vec<_> = strings.iter().filter(|s| s.text == "Hello").collect();
+    // With deduplication enabled, extract() should return only one "Hello"
+    assert_eq!(
+        hello_strings.len(),
+        1,
+        "extract() should deduplicate and return only one 'Hello'"
+    );
+    // But extract_canonical() should preserve all occurrences
+    assert!(
+        canonical
+            .iter()
+            .find(|c| c.text == "Hello")
+            .unwrap()
+            .occurrences
+            .len()
+            >= 2,
+        "extract_canonical() should preserve all occurrences"
+    );
+}