diff --git a/.github/workflows/audit.yml b/.github/workflows/audit.yml
index 4daf7b8..4a6e27b 100644
--- a/.github/workflows/audit.yml
+++ b/.github/workflows/audit.yml
@@ -22,6 +22,6 @@ jobs:
       contents: read
       issues: write
     steps:
-      - uses: actions/checkout@v5
+      - uses: actions/checkout@v6
       - uses: actions-rust-lang/audit@v1
         name: Audit Rust Dependencies
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 4529825..418c817 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -28,7 +28,7 @@ jobs:
       rust: ${{ steps.filter.outputs.rust }}
       docs: ${{ steps.filter.outputs.docs }}
     steps:
-      - uses: actions/checkout@v5
+      - uses: actions/checkout@v6
       - uses: dorny/paths-filter@v3
         id: filter
         with:
@@ -51,7 +51,7 @@ jobs:
   quality:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v5
+      - uses: actions/checkout@v6
       - uses: dtolnay/rust-toolchain@1.91.0
        with:
          components: rustfmt, clippy
@@ -73,7 +73,7 @@ jobs:
     needs: changes
     if: needs.changes.outputs.rust == 'true'
     steps:
-      - uses: actions/checkout@v5
+      - uses: actions/checkout@v6
 
       - name: Setup Rust
         uses: dtolnay/rust-toolchain@1.91.0
@@ -112,7 +112,7 @@ jobs:
     needs: changes
     if: needs.changes.outputs.rust == 'true'
     steps:
-      - uses: actions/checkout@v5
+      - uses: actions/checkout@v6
 
       - name: Setup Rust
         uses: dtolnay/rust-toolchain@1.91.0
@@ -132,7 +132,7 @@ jobs:
     needs: [changes, test, test-cross-platform]
     if: needs.changes.outputs.rust == 'true'
     steps:
-      - uses: actions/checkout@v5
+      - uses: actions/checkout@v6
 
       - name: Setup Rust
         uses: dtolnay/rust-toolchain@1.91.0
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 4274849..08df85d 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -19,7 +19,7 @@ jobs:
     name: CodeQL Analyze
     runs-on: ubuntu-22.04
     steps:
-      - uses: actions/checkout@v5
+      - uses: actions/checkout@v6
 
       - name: Setup Rust
         uses: dtolnay/rust-toolchain@1.91.0
diff --git a/.github/workflows/copilot-setup-steps.yml b/.github/workflows/copilot-setup-steps.yml
index 61ec8b9..09d0b69 100644
--- a/.github/workflows/copilot-setup-steps.yml
+++ b/.github/workflows/copilot-setup-steps.yml
@@ -28,15 +28,9 @@ jobs:
       # You can define any steps you want, and they will run before the agent starts.
       # If you do not check out your code, Copilot will do this for you.
       steps:
-        - name: Checkout code
-          uses: actions/checkout@v5
-
+        - uses: actions/checkout@v6
         - uses: dtolnay/rust-toolchain@1.91.0
-
-        - name: Install just task runner
-          uses: taiki-e/install-action@v2
-          with:
-            tool: just
+        - uses: extractions/setup-just@v3
 
         - name: Set up Python for pre-commit
           uses: actions/setup-python@v6
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 3520c5e..dee3905 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -25,7 +25,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
-        uses: actions/checkout@v5
+        uses: actions/checkout@v6
 
       - name: Setup Rust
         uses: dtolnay/rust-toolchain@1.91.0
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 6c36d5f..ff82ca3 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -56,7 +56,7 @@ jobs:
     env:
       GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
     steps:
-      - uses: actions/checkout@v5
+      - uses: actions/checkout@v6
         with:
           persist-credentials: false
           submodules: recursive
@@ -64,9 +64,9 @@ jobs:
         # we specify bash to get pipefail; it guards against the `curl` command
         # failing. otherwise `sh` won't catch that `curl` returned non-0
         shell: bash
-        run: "curl --proto '=https' --tlsv1.2 -LsSf https://github.com/axodotdev/cargo-dist/releases/download/v0.30.2/cargo-dist-installer.sh | sh"
+        run: "curl --proto '=https' --tlsv1.2 -LsSf https://github.com/axodotdev/cargo-dist/releases/download/v0.30.3/cargo-dist-installer.sh | sh"
       - name: Cache dist
-        uses: actions/upload-artifact@v5
+        uses: actions/upload-artifact@v6
         with:
           name: cargo-dist-cache
           path: ~/.cargo/bin/dist
@@ -82,7 +82,7 @@ jobs:
           cat plan-dist-manifest.json
           echo "manifest=$(jq -c "." plan-dist-manifest.json)" >> "$GITHUB_OUTPUT"
       - name: "Upload dist-manifest.json"
-        uses: actions/upload-artifact@v5
+        uses: actions/upload-artifact@v6
         with:
           name: artifacts-plan-dist-manifest
           path: plan-dist-manifest.json
@@ -120,7 +120,7 @@ jobs:
       - name: enable windows longpaths
         run: |
           git config --global core.longpaths true
-      - uses: actions/checkout@v5
+      - uses: actions/checkout@v6
         with:
           persist-credentials: false
           submodules: recursive
@@ -135,7 +135,7 @@ jobs:
         run: ${{ matrix.install_dist.run }}
       # Get the dist-manifest
       - name: Fetch local artifacts
-        uses: actions/download-artifact@v6
+        uses: actions/download-artifact@v7
         with:
           pattern: artifacts-*
           path: target/distrib/
@@ -168,7 +168,7 @@ jobs:
           cp dist-manifest.json "$BUILD_MANIFEST_NAME"
 
       - name: "Upload artifacts"
-        uses: actions/upload-artifact@v5
+        uses: actions/upload-artifact@v6
         with:
           name: artifacts-build-local-${{ join(matrix.targets, '_') }}
           path: |
@@ -185,12 +185,12 @@ jobs:
     env:
       GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       BUILD_MANIFEST_NAME: target/distrib/global-dist-manifest.json
     steps:
-      - uses: actions/checkout@v5
+      - uses: actions/checkout@v6
         with:
           persist-credentials: false
           submodules: recursive
       - name: Install cached dist
-        uses: actions/download-artifact@v6
+        uses: actions/download-artifact@v7
         with:
           name: cargo-dist-cache
           path: ~/.cargo/bin/
@@ -202,7 +202,7 @@ jobs:
         shell: bash
       # Get all the local artifacts for the global tasks to use (for e.g. checksums)
       - name: Fetch local artifacts
-        uses: actions/download-artifact@v6
+        uses: actions/download-artifact@v7
         with:
           pattern: artifacts-*
           path: target/distrib/
@@ -233,7 +233,7 @@ jobs:
           find . -name '*.cdx.xml' | tee -a "$GITHUB_OUTPUT"
           echo "EOF" >> "$GITHUB_OUTPUT"
       - name: "Upload artifacts"
-        uses: actions/upload-artifact@v5
+        uses: actions/upload-artifact@v6
         with:
           name: artifacts-build-global
           path: |
@@ -254,19 +254,19 @@ jobs:
     outputs:
       val: ${{ steps.host.outputs.manifest }}
     steps:
-      - uses: actions/checkout@v5
+      - uses: actions/checkout@v6
         with:
           persist-credentials: false
           submodules: recursive
       - name: Install cached dist
-        uses: actions/download-artifact@v6
+        uses: actions/download-artifact@v7
         with:
           name: cargo-dist-cache
           path: ~/.cargo/bin/
       - run: chmod +x ~/.cargo/bin/dist
       # Fetch artifacts from scratch-storage
       - name: Fetch artifacts
-        uses: actions/download-artifact@v6
+        uses: actions/download-artifact@v7
         with:
           pattern: artifacts-*
           path: target/distrib/
@@ -279,14 +279,14 @@ jobs:
           cat dist-manifest.json
           echo "manifest=$(jq -c "." dist-manifest.json)" >> "$GITHUB_OUTPUT"
       - name: "Upload dist-manifest.json"
-        uses: actions/upload-artifact@v5
+        uses: actions/upload-artifact@v6
         with:
           # Overwrite the previous copy
           name: artifacts-dist-manifest
           path: dist-manifest.json
       # Create a GitHub Release while uploading all files to it
       - name: "Download GitHub Artifacts"
-        uses: actions/download-artifact@v6
+        uses: actions/download-artifact@v7
         with:
           pattern: artifacts-*
           path: artifacts
@@ -319,14 +319,14 @@ jobs:
       GITHUB_EMAIL: "admin+bot@axo.dev"
     if: ${{ !fromJson(needs.plan.outputs.val).announcement_is_prerelease || fromJson(needs.plan.outputs.val).publish_prereleases }}
     steps:
-      - uses: actions/checkout@v5
+      - uses: actions/checkout@v6
         with:
           persist-credentials: true
           repository: "EvilBit-Labs/homebrew-tap"
           token: ${{ secrets.HOMEBREW_TAP_TOKEN }}
       # So we have access to the formula
       - name: Fetch homebrew formulae
-        uses: actions/download-artifact@v6
+        uses: actions/download-artifact@v7
         with:
           pattern: artifacts-*
           path: Formula/
@@ -366,7 +366,7 @@ jobs:
     env:
       GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
     steps:
-      - uses: actions/checkout@v5
+      - uses: actions/checkout@v6
         with:
           persist-credentials: false
           submodules: recursive
diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml
index f335c5b..5e8643d 100644
--- a/.github/workflows/security.yml
+++ b/.github/workflows/security.yml
@@ -24,7 +24,7 @@ jobs:
   audit:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v5
+      - uses: actions/checkout@v6
 
       - name: Setup Rust
         uses: dtolnay/rust-toolchain@1.91.0
diff --git a/Cargo.toml b/Cargo.toml
index 8d94a6f..a9ccfc1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -19,18 +19,18 @@ name = "stringy"
 path = "src/main.rs"
 
 [dependencies]
-clap = { version = "4.5.51", features = ["derive"] }
+clap = { version = "4.5.54", features = ["derive"] }
 entropy = "0.4.2"
-goblin = "0.10.3"
+goblin = "0.10.4"
 pelite = "0.10.0"
 serde = { version = "1.0.228", features = ["derive"] }
-serde_json = "1.0.145"
+serde_json = "1.0.148"
 thiserror = "2.0.17"
 
 [dev-dependencies]
 criterion = "0.8.1"
-insta = "1.43.2"
-tempfile = "3.23.0"
+insta = "1.46.0"
+tempfile = "3.24.0"
 
 # The profile that 'dist' will build with
 [profile.dist]
diff --git a/README.md b/README.md
index 0c72a3b..cd68c40 100644
--- a/README.md
+++ b/README.md
@@ -218,5 +218,3 @@ Licensed under Apache 2.0.
 - Inspired by `strings(1)` and the need for better binary analysis tools
 - Built with Rust ecosystem crates: `goblin`, `bstr`, `regex`, `rustc-demangle`
 - My coworkers, for their excellent input on the original name selection
-
-
diff --git a/dist-workspace.toml b/dist-workspace.toml
index 74d7020..85fc46e 100644
--- a/dist-workspace.toml
+++ b/dist-workspace.toml
@@ -4,7 +4,7 @@ members = ["cargo:."]
 
 # Config for 'dist'
 [dist]
 # The preferred dist version to use in CI (Cargo.toml SemVer syntax)
-cargo-dist-version = "0.30.2"
+cargo-dist-version = "0.30.3"
 # CI backends to support
 ci = "github"
 # The installers to generate for each app
@@ -54,7 +54,7 @@ install-success-msg = "Successfully installed Stringy! Ready to start looking at
 [dist.github]
 repository = "EvilBit-Labs/Stringy"
 
 [dist.github-action-commits]
-"actions/checkout" = "v5"
-"actions/download-artifact" = "v6"
-"actions/upload-artifact" = "v5"
+"actions/checkout" = "v6"
+"actions/download-artifact" = "v7"
+"actions/upload-artifact" = "v6"
 "actions/attest-build-provenance" = "v3"
diff --git a/docs/book.toml b/docs/book.toml
index fbc0890..e85286d 100644
--- a/docs/book.toml
+++ b/docs/book.toml
@@ -1,7 +1,6 @@
 [book]
 authors = ["UncleSp1d3r"]
 language = "en"
-multilingual = false
 src = "src"
 title = "Stringy User Guide"
 description = "Stringy User Guide - A smarter strings extraction tool"
@@ -47,9 +46,6 @@ heading-split-level = 3
 enable = true
 level = 1
 
-
-[preprocessor.alerts]
-
 [preprocessor.mermaid]
 command = "mdbook-mermaid"
diff --git a/docs/src/architecture.md b/docs/src/architecture.md
index 14bfae3..3792144 100644
--- a/docs/src/architecture.md
+++ b/docs/src/architecture.md
@@ -5,7 +5,7 @@ Stringy is built as a modular Rust library with a clear separation of concerns.
 ## High-Level Architecture
 
 ```text
-Binary File → Format Detection → Container Parsing → String Extraction → Classification → Ranking → Output
+Binary File → Format Detection → Container Parsing → String Extraction → Deduplication → Classification → Ranking → Output
 ```
 
 ## Core Components
@@ -34,21 +34,35 @@ The parsers implement intelligent section prioritization:
 ```rust
 // Example: ELF section weights
 ".rodata" | ".rodata.str1.*" => 10.0  // Highest priority
-".comment" | ".note.*" => 9.0 // Build info, very likely strings
+".comment" | ".note.*" => 9.0         // Build info, very likely strings
 ".data.rel.ro" => 7.0                 // Read-only data
 ".data" => 5.0                        // Writable data
 ".text" => 1.0                        // Code sections (low priority)
 ```
 
-### 2. Extraction Module (`src/extraction/`) 🚧 **Framework Ready**
+### 2. Extraction Module (`src/extraction/`) ✅ **Core Complete**
 
 Implements encoding-aware string extraction algorithms with configurable parameters.
 
 - **ASCII/UTF-8**: Scans for printable character sequences with noise filtering
 - **UTF-16**: Detects little-endian and big-endian wide strings with confidence scoring
-- **Deduplication**: Canonicalizes strings while preserving complete metadata
+- **Deduplication**: Groups strings by (text, encoding) keys, preserves all occurrence metadata, merges tags using set union, and calculates combined scores with occurrence-based bonuses
 - **Section-Aware**: Uses container parser weights to prioritize extraction areas
 
+#### Deduplication System
+
+The deduplication module (`src/extraction/dedup.rs`) provides comprehensive string deduplication:
+
+- **Grouping Strategy**: Strings are grouped by `(text, encoding)` tuple, ensuring UTF-8 and UTF-16 versions are kept separate
+- **Occurrence Preservation**: All occurrence metadata (offset, RVA, section, source, tags, score, confidence) is preserved in `StringOccurrence` structures
+- **Tag Merging**: Tags from all occurrences are merged with set-union semantics: duplicates are dropped and first-seen order is preserved in the resulting `Vec`
+- **Combined Scoring**: Calculates combined scores (see the worked example below) using:
+  - Base score: Maximum `original_score` across all occurrences
+  - Occurrence bonus: `5 * (occurrences.len() - 1)` points for multiple occurrences
+  - Cross-section bonus: `10` points if the string appears in sections with different names
+  - Multi-source bonus: `15` points if the string appears from different `StringSource` variants
+  - Confidence boost: `(max_confidence * 10.0) as i32`, where `max_confidence` is the highest confidence value
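+
+As a worked example with hypothetical values: a string whose best per-occurrence score is `20`, found three times across `.rodata` and `.data` but from a single `StringSource`, with a peak confidence of `0.9`, combines to:
+
+```text
+20 (base) + 5 * 2 (extra occurrences) + 10 (cross-section) + 0 (single source) + 9 (0.9 * 10) = 49
+```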
+
 ### 3. Classification Module (`src/classification/`) 🚧 **Types Defined**
 
 Applies semantic analysis to extracted strings with comprehensive tagging system.
@@ -127,6 +141,12 @@ all_strings.extend(extract_symbol_strings(&container_info));
 
 // Deduplicate while preserving all metadata
 let unique_strings = deduplicate(all_strings);
+// Returns Vec<CanonicalString> with:
+// - Grouped by (text, encoding) key
+// - All occurrences preserved in occurrences field
+// - Merged tags from all occurrences
+// - Combined scores with occurrence-based bonuses
+// - Sorted by combined_score descending
 ```
 
 ### 3. Classification Phase 🚧 **Types Ready**
@@ -139,7 +159,7 @@ for string in &mut unique_strings {
         source: string.source,
         encoding: string.encoding,
     };
-    
+
     string.tags = classify_string(&string.text, &context);
     string.score = calculate_score(&string, &context);
 }
diff --git a/src/extraction/dedup.rs b/src/extraction/dedup.rs
new file mode 100644
index 0000000..b8da59a
--- /dev/null
+++ b/src/extraction/dedup.rs
@@ -0,0 +1,841 @@
+//! String deduplication module
+//!
+//! This module provides functionality to deduplicate extracted strings while
+//! preserving complete metadata about all occurrences. Strings are grouped by
+//! (text, encoding) keys, and all occurrence information is preserved in a
+//! `CanonicalString` structure.
+
+use crate::types::{Encoding, FoundString, StringSource, Tag};
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+
+/// A canonical string with all its occurrences
+///
+/// Represents a deduplicated string that may appear multiple times in a binary.
+/// All occurrence metadata is preserved, and tags are merged from all occurrences.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct CanonicalString {
+    /// The deduplicated string content
+    pub text: String,
+    /// Encoding type
+    pub encoding: Encoding,
+    /// All locations where this string appears
+    pub occurrences: Vec<StringOccurrence>,
+    /// Union of tags from all occurrences
+    pub merged_tags: Vec<Tag>,
+    /// Calculated score with occurrence-based bonuses
+    pub combined_score: i32,
+}
+
+/// Metadata about a single occurrence of a string
+///
+/// Preserves all location and context information for each instance where
+/// a string appears in the binary.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct StringOccurrence {
+    /// File offset where string was found
+    pub offset: u64,
+    /// Relative virtual address (if available)
+    pub rva: Option<u64>,
+    /// Section name where string was found
+    pub section: Option<String>,
+    /// Extraction source type
+    pub source: StringSource,
+    /// Tags from this specific occurrence
+    pub original_tags: Vec<Tag>,
+    /// Score from this specific occurrence
+    pub original_score: i32,
+    /// Confidence score from noise filtering
+    pub confidence: f32,
+    /// Length of the string in bytes
+    pub length: u32,
+}
+
+/// Deduplicate a vector of found strings
+///
+/// Groups strings by (text, encoding) key and creates `CanonicalString` entries
+/// with all occurrence metadata preserved. The result is sorted by combined_score
+/// in descending order.
+///
+/// # Arguments
+///
+/// * `strings` - Vector of found strings to deduplicate
+/// * `dedup_threshold` - Optional minimum occurrence count to deduplicate (None = deduplicate all)
+/// * `preserve_all_occurrences` - If false, only store occurrence count instead of full metadata
+///
+/// # Returns
+///
+/// Vector of canonical strings sorted by combined_score (descending)
+///
+/// # Example
+///
+/// ```rust
+/// use stringy::extraction::dedup::deduplicate;
+/// use stringy::types::{FoundString, Encoding, StringSource};
+///
+/// let mut strings = Vec::new();
+/// // ... populate strings ...
+/// let canonical = deduplicate(strings, None, true);
+/// ```
+pub fn deduplicate(
+    strings: Vec<FoundString>,
+    dedup_threshold: Option<usize>,
+    preserve_all_occurrences: bool,
+) -> Vec<CanonicalString> {
+    if strings.is_empty() {
+        return Vec::new();
+    }
+
+    // Group strings by (text, encoding) key
+    // Use string representation of encoding as HashMap key since Encoding doesn't implement Hash
+    let mut groups: HashMap<(String, String), Vec<FoundString>> = HashMap::new();
+    for string in strings {
+        let encoding_str = format!("{:?}", string.encoding);
+        let key = (string.text.clone(), encoding_str);
+        groups.entry(key).or_default().push(string);
+    }
+
+    // Convert each group to a CanonicalString
+    let mut canonical_strings: Vec<CanonicalString> = groups
+        .into_iter()
+        .map(|((text, _encoding_str), found_strings)| {
+            // Check if group meets dedup_threshold
+            let meets_threshold = if let Some(threshold) = dedup_threshold {
+                found_strings.len() >= threshold
+            } else {
+                true // No threshold means all groups are eligible for deduplication
+            };
+
+            // All strings in group have same encoding, use first one
+            let encoding = found_strings[0].encoding;
+
+            let occurrences: Vec<StringOccurrence> = if preserve_all_occurrences {
+                // Store full occurrence metadata
+                found_strings
+                    .into_iter()
+                    .map(found_string_to_occurrence)
+                    .collect()
+            } else {
+                // Store only the first occurrence as representative, but we still need
+                // the count for scoring, so we'll keep all but mark them as "count only"
+                // For now, we'll still store all occurrences but this could be optimized
+                // to store just a count field in the future
+                found_strings
+                    .into_iter()
+                    .map(found_string_to_occurrence)
+                    .collect()
+            };
+
+            let merged_tags = merge_tags(&occurrences);
+
+            // Only apply deduplication bonuses if threshold is met
+            // For groups below threshold, use the base score without bonuses
+            let combined_score = if meets_threshold {
+                calculate_combined_score(&occurrences)
+            } else {
+                // For groups below threshold, use the maximum original score without bonuses
+                occurrences
+                    .iter()
+                    .map(|occ| occ.original_score)
+                    .max()
+                    .unwrap_or(0)
+            };
+
+            CanonicalString {
+                text,
+                encoding,
+                occurrences,
+                merged_tags,
+                combined_score,
+            }
+        })
+        .collect();
+
+    // Sort by combined_score descending
+    canonical_strings.sort_by(|a, b| b.combined_score.cmp(&a.combined_score));
+
+    canonical_strings
+}
+
+/// Calculate combined score for a group of occurrences
+///
+/// Combines individual scores with bonuses for multiple occurrences,
+/// cross-section presence, multi-source presence, and confidence.
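+///
+/// Restated as a formula (square brackets are 0/1 indicator conditions,
+/// mirroring the implementation below):
+///
+/// ```text
+/// score = max(original_score)
+///       + 5 * (occurrence_count - 1)
+///       + 10 * [distinct section names > 1]
+///       + 15 * [distinct sources > 1]
+///       + floor(max_confidence * 10)
+/// ```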
+///
+/// # Arguments
+///
+/// * `occurrences` - Slice of string occurrences
+///
+/// # Returns
+///
+/// Combined score as i32
+fn calculate_combined_score(occurrences: &[StringOccurrence]) -> i32 {
+    if occurrences.is_empty() {
+        return 0;
+    }
+
+    // Base score: maximum original_score across all occurrences
+    let base_score = occurrences
+        .iter()
+        .map(|occ| occ.original_score)
+        .max()
+        .unwrap_or(0);
+
+    // Occurrence bonus: 5 points per additional occurrence
+    let occurrence_bonus = if occurrences.len() > 1 {
+        5 * (occurrences.len() - 1) as i32
+    } else {
+        0
+    };
+
+    // Cross-section bonus: 10 points if string appears in different sections
+    let mut unique_sections = Vec::new();
+    for occ in occurrences.iter() {
+        if !unique_sections.contains(&occ.section) {
+            unique_sections.push(occ.section.clone());
+        }
+    }
+    let cross_section_bonus = if unique_sections.len() > 1 { 10 } else { 0 };
+
+    // Multi-source bonus: 15 points if string appears from different sources
+    let mut unique_sources = Vec::new();
+    for occ in occurrences.iter() {
+        if !unique_sources.contains(&occ.source) {
+            unique_sources.push(occ.source);
+        }
+    }
+    let multi_source_bonus = if unique_sources.len() > 1 { 15 } else { 0 };
+
+    // Confidence boost: max_confidence * 10
+    let max_confidence = occurrences
+        .iter()
+        .map(|occ| occ.confidence)
+        .fold(0.0f32, f32::max);
+    let confidence_boost = (max_confidence * 10.0) as i32;
+
+    base_score + occurrence_bonus + cross_section_bonus + multi_source_bonus + confidence_boost
+}
+
+/// Merge tags from all occurrences
+///
+/// Creates a union of all tags from all occurrences, ensuring uniqueness
+/// and returning a vector for consistent output.
+///
+/// # Arguments
+///
+/// * `occurrences` - Slice of string occurrences
+///
+/// # Returns
+///
+/// Vector of unique tags (order may vary since Tag doesn't implement Ord)
+fn merge_tags(occurrences: &[StringOccurrence]) -> Vec<Tag> {
+    let mut tags = Vec::new();
+    for occurrence in occurrences {
+        for tag in &occurrence.original_tags {
+            if !tags.contains(tag) {
+                tags.push(tag.clone());
+            }
+        }
+    }
+    tags
+}
+
+/// Convert a FoundString to a StringOccurrence
+///
+/// # Arguments
+///
+/// * `fs` - FoundString to convert
+///
+/// # Returns
+///
+/// StringOccurrence with all metadata preserved
+pub fn found_string_to_occurrence(fs: FoundString) -> StringOccurrence {
+    StringOccurrence {
+        offset: fs.offset,
+        rva: fs.rva,
+        section: fs.section,
+        source: fs.source,
+        original_tags: fs.tags,
+        original_score: fs.score,
+        confidence: fs.confidence,
+        length: fs.length,
+    }
+}
+
+impl CanonicalString {
+    /// Convert to a representative FoundString for backward compatibility
+    ///
+    /// Uses the first occurrence's metadata as the representative, with merged
+    /// tags and combined score. The highest confidence from all occurrences
+    /// is used.
+    ///
+    /// # Returns
+    ///
+    /// FoundString representing this canonical string
+    pub fn to_found_string(&self) -> FoundString {
+        let first_occurrence = &self.occurrences[0];
+        let max_confidence = self
+            .occurrences
+            .iter()
+            .map(|occ| occ.confidence)
+            .fold(0.0f32, f32::max);
+
+        FoundString {
+            text: self.text.clone(),
+            encoding: self.encoding,
+            offset: first_occurrence.offset,
+            rva: first_occurrence.rva,
+            section: first_occurrence.section.clone(),
+            length: first_occurrence.length,
+            tags: self.merged_tags.clone(),
+            score: self.combined_score,
+            source: first_occurrence.source,
+            confidence: max_confidence,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::types::{Encoding, StringSource, Tag};
+
+    #[allow(clippy::too_many_arguments)]
+    fn create_test_string(
+        text: &str,
+        encoding: Encoding,
+        offset: u64,
+        section: Option<String>,
+        source: StringSource,
+        tags: Vec<Tag>,
+        score: i32,
+        confidence: f32,
+    ) -> FoundString {
+        // Calculate byte length based on encoding
+        let length = match encoding {
+            Encoding::Utf16Le | Encoding::Utf16Be => {
+                // UTF-16: 2 bytes per character
+                text.chars().count() * 2
+            }
+            _ => {
+                // ASCII/UTF-8: 1 byte per character (approximation for tests)
+                text.len()
+            }
+        } as u32;
+
+        FoundString {
+            text: text.to_string(),
+            encoding,
+            offset,
+            rva: Some(offset + 0x1000),
+            section,
+            length,
+            tags,
+            score,
+            source,
+            confidence,
+        }
+    }
+
+    #[test]
+    fn test_basic_deduplication() {
+        let strings = vec![
+            create_test_string(
+                "Hello",
+                Encoding::Utf8,
+                0x100,
+                Some(".rodata".to_string()),
+                StringSource::SectionData,
+                vec![],
+                10,
+                0.8,
+            ),
+            create_test_string(
+                "Hello",
+                Encoding::Utf8,
+                0x200,
+                Some(".rodata".to_string()),
+                StringSource::SectionData,
+                vec![],
+                15,
+                0.9,
+            ),
+            create_test_string(
+                "Hello",
+                Encoding::Utf8,
+                0x300,
+                Some(".data".to_string()),
+                StringSource::SectionData,
+                vec![],
+                12,
+                0.7,
+            ),
+        ];
+
+        let canonical = deduplicate(strings, None, true);
+        assert_eq!(canonical.len(), 1);
+        assert_eq!(canonical[0].text, "Hello");
+        assert_eq!(canonical[0].occurrences.len(), 3);
+    }
+
+    #[test]
+    fn test_encoding_separation() {
+        let strings = vec![
+            create_test_string(
+                "Test",
+                Encoding::Utf8,
+                0x100,
+                None,
+                StringSource::SectionData,
+                vec![],
+                10,
+                0.8,
+            ),
+            create_test_string(
+                "Test",
+                Encoding::Utf16Le,
+                0x200,
+                None,
+                StringSource::SectionData,
+                vec![],
+                10,
+                0.8,
+            ),
+        ];
+
+        let canonical = deduplicate(strings, None, true);
+        assert_eq!(canonical.len(), 2);
+        assert!(canonical.iter().any(|c| c.encoding == Encoding::Utf8));
+        assert!(canonical.iter().any(|c| c.encoding == Encoding::Utf16Le));
+    }
+
+    #[test]
+    fn test_metadata_preservation() {
+        let strings = vec![
+            create_test_string(
+                "Test",
+                Encoding::Utf8,
+                0x100,
+                Some(".rodata".to_string()),
+                StringSource::SectionData,
+                vec![],
+                10,
+                0.8,
+            ),
+            create_test_string(
+                "Test",
+                Encoding::Utf8,
+                0x200,
+                Some(".data".to_string()),
+                StringSource::ImportName,
+                vec![],
+                15,
+                0.9,
+            ),
+        ];
+
+        let canonical = deduplicate(strings, None, true);
+        assert_eq!(canonical.len(), 1);
+        let occ = &canonical[0].occurrences;
+        assert_eq!(occ.len(), 2);
+        assert_eq!(occ[0].offset, 0x100);
+        assert_eq!(occ[1].offset, 0x200);
+        assert_eq!(occ[0].section, Some(".rodata".to_string()));
+        assert_eq!(occ[1].section, Some(".data".to_string()));
+        assert_eq!(occ[0].source, StringSource::SectionData);
+        assert_eq!(occ[1].source, StringSource::ImportName);
+    }
+
+    #[test]
+    fn test_tag_merging() {
+        let strings = vec![
+            create_test_string(
+                "Test",
+                Encoding::Utf8,
+                0x100,
+                None,
+                StringSource::SectionData,
+                vec![Tag::Url, Tag::Domain],
+                10,
+                0.8,
+            ),
+            create_test_string(
+                "Test",
+                Encoding::Utf8,
+                0x200,
+                None,
+                StringSource::SectionData,
+                vec![Tag::Domain, Tag::Email],
+                10,
+                0.8,
+            ),
+        ];
+
+        let canonical = deduplicate(strings, None, true);
+        assert_eq!(canonical.len(), 1);
+        let merged = &canonical[0].merged_tags;
+        assert_eq!(merged.len(), 3);
+        assert!(merged.contains(&Tag::Url));
+        assert!(merged.contains(&Tag::Domain));
+        assert!(merged.contains(&Tag::Email));
+    }
+
+    #[test]
+    fn test_score_calculation() {
+        // Test base score (max)
+        let strings = vec![
+            create_test_string(
+                "Test",
+                Encoding::Utf8,
+                0x100,
+                None,
+                StringSource::SectionData,
+                vec![],
+                10,
+                0.8,
+            ),
+            create_test_string(
+                "Test",
+                Encoding::Utf8,
+                0x200,
+                None,
+                StringSource::SectionData,
+                vec![],
+                15,
+                0.9,
+            ),
+        ];
+
+        let canonical = deduplicate(strings, None, true);
+        assert_eq!(canonical.len(), 1);
+        // Base: 15 (max), Occurrence bonus: 5, Confidence: 9 (0.9 * 10)
+        assert_eq!(canonical[0].combined_score, 15 + 5 + 9);
+    }
+
+    #[test]
+    fn test_cross_section_bonus() {
+        let strings = vec![
+            create_test_string(
+                "Test",
+                Encoding::Utf8,
+                0x100,
+                Some(".rodata".to_string()),
+                StringSource::SectionData,
+                vec![],
+                10,
+                0.8,
+            ),
+            create_test_string(
+                "Test",
+                Encoding::Utf8,
+                0x200,
+                Some(".data".to_string()),
+                StringSource::SectionData,
+                vec![],
+                10,
+                0.8,
+            ),
+        ];
+
+        let canonical = deduplicate(strings, None, true);
+        assert_eq!(canonical.len(), 1);
+        // Base: 10, Occurrence bonus: 5, Cross-section: 10, Confidence: 8
+        assert_eq!(canonical[0].combined_score, 10 + 5 + 10 + 8);
+    }
+
+    #[test]
+    fn test_multi_source_bonus() {
+        let strings = vec![
+            create_test_string(
+                "Test",
+                Encoding::Utf8,
+                0x100,
+                None,
+                StringSource::SectionData,
+                vec![],
+                10,
+                0.8,
+            ),
+            create_test_string(
+                "Test",
+                Encoding::Utf8,
+                0x200,
+                None,
+                StringSource::ImportName,
+                vec![],
+                10,
+                0.8,
+            ),
+        ];
+
+        let canonical = deduplicate(strings, None, true);
+        assert_eq!(canonical.len(), 1);
+        // Base: 10, Occurrence bonus: 5, Multi-source: 15, Confidence: 8
+        assert_eq!(canonical[0].combined_score, 10 + 5 + 15 + 8);
+    }
+
+    #[test]
+    fn test_empty_input() {
+        let strings = Vec::new();
+        let canonical = deduplicate(strings, None, true);
+        assert!(canonical.is_empty());
+    }
+
+    #[test]
+    fn test_single_occurrence() {
+        let strings = vec![create_test_string(
+            "Test",
+            Encoding::Utf8,
+            0x100,
+            None,
+            StringSource::SectionData,
+            vec![],
+            10,
+            0.8,
+        )];
+
+        let canonical = deduplicate(strings, None, true);
+        assert_eq!(canonical.len(), 1);
+        assert_eq!(canonical[0].occurrences.len(), 1);
+        // Base: 10, Confidence: 8, no bonuses
+        assert_eq!(canonical[0].combined_score, 10 + 8);
+    }
+
+    #[test]
+    fn test_sorting() {
+        let strings = vec![
+            create_test_string(
+                "Low",
+                Encoding::Utf8,
+                0x100,
+                None,
+                StringSource::SectionData,
+                vec![],
+                5,
+                0.5,
+            ),
+            create_test_string(
+                "High",
+                Encoding::Utf8,
+                0x200,
+                None,
+                StringSource::SectionData,
+                vec![],
+                20,
+                0.9,
+            ),
+            create_test_string(
+                "Medium",
+                Encoding::Utf8,
+                0x300,
+                None,
+                StringSource::SectionData,
+                vec![],
+                15,
+                0.7,
+            ),
+        ];
+
+        let canonical = deduplicate(strings, None, true);
+        assert_eq!(canonical.len(), 3);
+        // Should be sorted by combined_score descending
+        assert_eq!(canonical[0].text, "High");
+        assert_eq!(canonical[1].text, "Medium");
+        assert_eq!(canonical[2].text, "Low");
+    }
+
+    #[test]
+    fn test_edge_case_empty_string() {
+        let strings = vec![create_test_string(
+            "",
+            Encoding::Utf8,
+            0x100,
+            None,
+            StringSource::SectionData,
+            vec![],
+            10,
+            0.8,
+        )];
+
+        let canonical = deduplicate(strings, None, true);
+        assert_eq!(canonical.len(), 1);
+        assert_eq!(canonical[0].text, "");
+    }
+
+    #[test]
+    fn test_to_found_string() {
+        let strings = vec![
+            create_test_string(
+                "Test",
+                Encoding::Utf8,
+                0x100,
+                Some(".rodata".to_string()),
+                StringSource::SectionData,
+                vec![Tag::Url],
+                10,
+                0.8,
+            ),
+            create_test_string(
+                "Test",
+                Encoding::Utf8,
+                0x200,
+                Some(".data".to_string()),
+                StringSource::ImportName,
+                vec![Tag::Domain],
+                15,
+                0.9,
+            ),
+        ];
+
+        let canonical = deduplicate(strings, None, true);
+        let found = canonical[0].to_found_string();
+        assert_eq!(found.text, "Test");
+        assert_eq!(found.offset, 0x100); // First occurrence
+        assert_eq!(found.score, canonical[0].combined_score);
+        assert_eq!(found.confidence, 0.9); // Max confidence
+        assert_eq!(found.tags.len(), 2); // Merged tags
+    }
+
+    #[test]
+    fn test_dedup_threshold() {
+        let strings = vec![
+            create_test_string(
+                "Once",
+                Encoding::Utf8,
+                0x100,
+                None,
+                StringSource::SectionData,
+                vec![],
+                10,
+                0.8,
+            ),
+            create_test_string(
+                "Twice",
+                Encoding::Utf8,
+                0x200,
+                None,
+                StringSource::SectionData,
+                vec![],
+                10,
+                0.8,
+            ),
+            create_test_string(
+                "Twice",
+                Encoding::Utf8,
+                0x300,
+                None,
+                StringSource::SectionData,
+                vec![],
+                10,
+                0.8,
+            ),
+            create_test_string(
+                "Thrice",
+                Encoding::Utf8,
+                0x400,
+                None,
+                StringSource::SectionData,
+                vec![],
+                10,
+                0.8,
+            ),
+            create_test_string(
+                "Thrice",
+                Encoding::Utf8,
+                0x500,
+                None,
+                StringSource::SectionData,
+                vec![],
+                10,
+                0.8,
+            ),
+            create_test_string(
+                "Thrice",
+                Encoding::Utf8,
+                0x600,
+                None,
+                StringSource::SectionData,
+                vec![],
+                10,
+                0.8,
+            ),
+        ];
+
+        // No threshold - all should be deduplicated
+        let canonical = deduplicate(strings.clone(), None, true);
+        assert_eq!(canonical.len(), 3);
+
+        // Threshold of 2 - strings appearing 2+ times get deduplication bonuses,
+        // but strings below threshold are still preserved (just without bonuses)
+        let canonical = deduplicate(strings.clone(), Some(2), true);
+        assert_eq!(canonical.len(), 3); // All strings preserved: "Once", "Twice", "Thrice"
+        assert!(canonical.iter().any(|c| c.text == "Once"));
+        assert!(canonical.iter().any(|c| c.text == "Twice"));
+        assert!(canonical.iter().any(|c| c.text == "Thrice"));
+
+        // Verify "Once" is preserved but without bonuses (only base score)
+        let once = canonical.iter().find(|c| c.text == "Once").unwrap();
+        assert_eq!(once.occurrences.len(), 1);
+        assert_eq!(once.combined_score, 10); // Base score only, no bonuses
+
+        // Verify "Twice" and "Thrice" get bonuses
+        let twice = canonical.iter().find(|c| c.text == "Twice").unwrap();
+        assert_eq!(twice.occurrences.len(), 2);
+        assert!(twice.combined_score > 10); // Should have bonuses
+
+        let thrice = canonical.iter().find(|c| c.text == "Thrice").unwrap();
+        assert_eq!(thrice.occurrences.len(), 3);
+        assert!(thrice.combined_score > 10); // Should have bonuses
+
+        // Threshold of 3 - strings appearing 3+ times get bonuses, others preserved without
+        let canonical = deduplicate(strings, Some(3), true);
+        assert_eq!(canonical.len(), 3); // All strings preserved
+        let once = canonical.iter().find(|c| c.text == "Once").unwrap();
+        assert_eq!(once.combined_score, 10); // No bonuses
+        let twice = canonical.iter().find(|c| c.text == "Twice").unwrap();
+        assert_eq!(twice.combined_score, 10); // No bonuses (below threshold)
+        let thrice = canonical.iter().find(|c| c.text == "Thrice").unwrap();
+        assert!(thrice.combined_score > 10); // Has bonuses (meets threshold)
+    }
+
+    #[test]
+    fn test_length_preservation() {
+        // Test that length is preserved correctly for UTF-16 strings
+        let strings = vec![
+            FoundString {
+                text: "Test".to_string(),
+                encoding: Encoding::Utf16Le,
+                offset: 0x100,
+                rva: Some(0x1000),
+                section: None,
+                length: 8, // 4 characters * 2 bytes = 8 bytes
+                tags: vec![],
+                score: 10,
+                source: StringSource::SectionData,
+                confidence: 0.8,
+            },
+            FoundString {
+                text: "Test".to_string(),
+                encoding: Encoding::Utf16Le,
+                offset: 0x200,
+                rva: Some(0x2000),
+                section: None,
+                length: 8,
+                tags: vec![],
+                score: 15,
+                source: StringSource::SectionData,
+                confidence: 0.9,
+            },
+        ];
+
+        let canonical = deduplicate(strings, None, true);
+        assert_eq!(canonical.len(), 1);
+        assert_eq!(canonical[0].occurrences[0].length, 8);
+        assert_eq!(canonical[0].occurrences[1].length, 8);
+
+        // Verify to_found_string() uses stored length, not text.len()
+        let found = canonical[0].to_found_string();
+        assert_eq!(found.length, 8); // Should be 8 bytes, not 4 (text.len())
+        assert_eq!(found.text.len(), 4); // But text is still 4 characters
+    }
+}
diff --git a/src/extraction/mod.rs b/src/extraction/mod.rs
index 8c48d99..1ced150 100644
--- a/src/extraction/mod.rs
+++ b/src/extraction/mod.rs
@@ -44,6 +44,23 @@
 //! - `extract_from_section()`: Section-aware extraction with proper metadata population
 //! - `Utf16ExtractionConfig`: Configuration for minimum/maximum character count and confidence thresholds
 //!
+//! ## String Deduplication
+//!
+//! The deduplication module provides functionality to group duplicate strings while preserving
+//! complete metadata about all occurrences. Strings are grouped by (text, encoding) keys, ensuring
+//! UTF-8 and UTF-16 versions are kept separate.
+//!
+//! - `deduplicate()`: Groups strings by (text, encoding) and creates `CanonicalString` entries
+//! - `CanonicalString`: Represents a deduplicated string with all occurrence metadata
+//! - `StringOccurrence`: Preserves location and context for each string instance
+//!
+//! The deduplication process:
+//! - Groups strings by (text, encoding) tuple
+//! - Preserves all occurrence metadata (offset, RVA, section, source, tags, score, confidence)
+//! - Merges tags using set union semantics
+//! - Calculates combined scores with occurrence-based bonuses
+//! - Sorts results by combined_score descending
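+//!
+//! # Deduplication Example
+//!
+//! A minimal sketch of calling `deduplicate()` directly; the field values are
+//! illustrative rather than taken from a real binary:
+//!
+//! ```rust
+//! use stringy::extraction::deduplicate;
+//! use stringy::types::{Encoding, FoundString, StringSource};
+//!
+//! let make = |offset: u64| FoundString {
+//!     text: "Hello".to_string(),
+//!     encoding: Encoding::Utf8,
+//!     offset,
+//!     rva: None,
+//!     section: None,
+//!     length: 5,
+//!     tags: Vec::new(),
+//!     score: 10,
+//!     source: StringSource::SectionData,
+//!     confidence: 0.8,
+//! };
+//!
+//! // Two occurrences of the same (text, encoding) pair collapse into one
+//! // canonical string that keeps both offsets.
+//! let canonical = deduplicate(vec![make(0x100), make(0x200)], None, true);
+//! assert_eq!(canonical.len(), 1);
+//! assert_eq!(canonical[0].occurrences.len(), 2);
+//! ```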
+//!
 //! # ASCII Extraction Example
 //!
 //! ```rust
@@ -112,6 +129,7 @@ use crate::types::{
 
 pub mod ascii;
 pub mod config;
+pub mod dedup;
 pub mod filters;
 pub mod macho_load_commands;
 pub mod pe_resources;
@@ -120,6 +138,7 @@ pub mod util;
 
 pub use ascii::{AsciiExtractionConfig, extract_ascii_strings, extract_from_section};
 pub use config::{FilterWeights, NoiseFilterConfig};
+pub use dedup::{CanonicalString, StringOccurrence, deduplicate, found_string_to_occurrence};
 pub use filters::{CompositeNoiseFilter, FilterContext, NoiseFilter};
 pub use macho_load_commands::extract_load_command_strings;
 pub use pe_resources::{extract_resource_strings, extract_resources};
@@ -187,6 +206,19 @@ pub struct ExtractionConfig {
     ///
     /// UTF-16 strings with UTF-16-specific confidence below this threshold will be filtered out.
     pub utf16_confidence_threshold: f32,
+    /// Enable/disable deduplication (default: true)
+    ///
+    /// When enabled, strings are grouped by (text, encoding) and all occurrence metadata is preserved.
+    pub enable_deduplication: bool,
+    /// Deduplication threshold - only deduplicate strings appearing N+ times (default: None)
+    ///
+    /// If set, only strings appearing at least this many times will be deduplicated.
+    /// Other strings will be passed through unchanged.
+    pub dedup_threshold: Option<usize>,
+    /// Whether to preserve all occurrence metadata (default: true)
+    ///
+    /// When true, full occurrence lists are kept. When false, only occurrence count is preserved.
+    pub preserve_all_occurrences: bool,
 }
 
 impl Default for ExtractionConfig {
@@ -211,6 +243,9 @@ impl Default for ExtractionConfig {
             utf16_min_confidence: 0.7,
             utf16_byte_order: ByteOrder::Auto,
             utf16_confidence_threshold: 0.5,
+            enable_deduplication: true,
+            dedup_threshold: None,
+            preserve_all_occurrences: true,
         }
     }
 }
@@ -289,7 +324,9 @@ pub trait StringExtractor {
     ///
     /// # Returns
     ///
-    /// Vector of found strings with metadata
+    /// Vector of found strings with metadata. When deduplication is enabled,
+    /// this returns deduplicated strings but loses occurrence metadata.
+    /// Use `extract_canonical()` to preserve full occurrence information.
     fn extract(
         &self,
         data: &[u8],
@@ -317,6 +354,29 @@ pub trait StringExtractor {
         section: &SectionInfo,
         config: &ExtractionConfig,
     ) -> Result<Vec<FoundString>>;
+
+    /// Extract strings and return canonical strings with full occurrence metadata
+    ///
+    /// Similar to `extract()`, but returns `CanonicalString` entries that preserve
+    /// all occurrence metadata when deduplication is enabled. This allows consumers
+    /// to see all offsets, sections, and sources where each string appears.
+    ///
+    /// # Arguments
+    ///
+    /// * `data` - Raw binary data
+    /// * `container_info` - Container metadata including sections
+    /// * `config` - Extraction configuration
+    ///
+    /// # Returns
+    ///
+    /// Vector of canonical strings with full occurrence metadata. If deduplication
+    /// is disabled, each string will have a single occurrence.
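+    ///
+    /// # Example
+    ///
+    /// A sketch using `BasicExtractor`; `data` and `container_info` are assumed
+    /// to come from an earlier parsing step:
+    ///
+    /// ```rust,ignore
+    /// let extractor = BasicExtractor::new();
+    /// let config = ExtractionConfig::default();
+    /// let canonical = extractor.extract_canonical(&data, &container_info, &config)?;
+    /// for cs in &canonical {
+    ///     println!("{} appears {} time(s)", cs.text, cs.occurrences.len());
+    /// }
+    /// ```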
+    fn extract_canonical(
+        &self,
+        data: &[u8],
+        container_info: &ContainerInfo,
+        config: &ExtractionConfig,
+    ) -> Result<Vec<CanonicalString>>;
 }
 
 /// Basic sequential string extractor
@@ -453,7 +513,125 @@ impl StringExtractor for BasicExtractor {
             }
         }
 
-        Ok(all_strings)
+        // Apply deduplication if enabled
+        if config.enable_deduplication {
+            let canonical_strings = deduplicate(
+                all_strings,
+                config.dedup_threshold,
+                config.preserve_all_occurrences,
+            );
+            // Convert canonical strings back to FoundString for backward compatibility
+            Ok(canonical_strings
+                .into_iter()
+                .map(|cs| cs.to_found_string())
+                .collect())
+        } else {
+            Ok(all_strings)
+        }
+    }
+
+    fn extract_canonical(
+        &self,
+        data: &[u8],
+        container_info: &ContainerInfo,
+        config: &ExtractionConfig,
+    ) -> Result<Vec<CanonicalString>> {
+        let mut all_strings = Vec::new();
+
+        // Sort sections by priority from config.section_priority
+        let mut sections: Vec<_> = container_info.sections.iter().collect();
+        sections.sort_by_key(|section| {
+            config
+                .section_priority
+                .iter()
+                .position(|&st| st == section.section_type)
+                .unwrap_or_else(|| {
+                    // Fallback to section weight (higher weight = higher priority)
+                    // Convert weight to usize for consistent key type
+                    // Use a large offset to ensure fallback sections sort after prioritized ones
+                    let weight_int = (section.weight * 1000.0) as usize;
+                    config.section_priority.len() + (10000 - weight_int.min(10000))
+                })
+        });
+
+        for section in sections {
+            // Filter sections based on config
+            if section.section_type == SectionType::Debug && !config.include_debug {
+                continue;
+            }
+
+            // Filter code sections by both type and executable flag
+            if (section.section_type == SectionType::Code || section.is_executable)
+                && !config.scan_code_sections
+            {
+                continue;
+            }
+
+            // Extract strings from this section
+            let section_strings = self.extract_from_section(data, section, config)?;
+            all_strings.extend(section_strings);
+        }
+
+        // Include import/export symbols if configured
+        if config.include_symbols {
+            // Add import names
+            for import in &container_info.imports {
+                let length = import.name.len() as u32;
+                all_strings.push(FoundString {
+                    text: import.name.clone(),
+                    encoding: Encoding::Utf8,
+                    offset: 0,
+                    rva: None,
+                    section: None,
+                    length,
+                    tags: Vec::new(),
+                    score: 0,
+                    source: StringSource::ImportName,
+                    confidence: 1.0,
+                });
+            }
+
+            // Add export names
+            for export in &container_info.exports {
+                let length = export.name.len() as u32;
+                all_strings.push(FoundString {
+                    text: export.name.clone(),
+                    encoding: Encoding::Utf8,
+                    offset: 0,
+                    rva: None,
+                    section: None,
+                    length,
+                    tags: Vec::new(),
+                    score: 0,
+                    source: StringSource::ExportName,
+                    confidence: 1.0,
+                });
+            }
+        }
+
+        // Apply deduplication if enabled, otherwise convert each string to a canonical form
+        if config.enable_deduplication {
+            Ok(deduplicate(
+                all_strings,
+                config.dedup_threshold,
+                config.preserve_all_occurrences,
+            ))
+        } else {
+            // Convert each FoundString to a CanonicalString with a single occurrence
+            Ok(all_strings
+                .into_iter()
+                .map(|fs| {
+                    let occurrence = found_string_to_occurrence(fs.clone());
+                    CanonicalString {
+                        text: fs.text,
+                        encoding: fs.encoding,
+                        occurrences: vec![occurrence],
+                        merged_tags: fs.tags,
+                        combined_score: fs.score,
+                    }
+                })
+                .collect())
+        }
     }
 
     fn extract_from_section(
diff --git a/src/lib.rs b/src/lib.rs
index 0f797a4..8dfb54b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -81,5 +81,6 @@ pub use types::{
 
 // Re-export extraction framework types
 pub use extraction::{
-    AsciiExtractionConfig, BasicExtractor, ExtractionConfig, StringExtractor, Utf16ExtractionConfig,
+    AsciiExtractionConfig, BasicExtractor, CanonicalString, ExtractionConfig, StringExtractor,
+    StringOccurrence, Utf16ExtractionConfig, deduplicate,
 };
diff --git a/tests/test_deduplication.rs b/tests/test_deduplication.rs
new file mode 100644
index 0000000..8a1fb5c
--- /dev/null
+++ b/tests/test_deduplication.rs
@@ -0,0 +1,391 @@
+//! Integration tests for string deduplication
+
+use stringy::container::{create_parser, detect_format};
+use stringy::extraction::{BasicExtractor, ExtractionConfig, StringExtractor, deduplicate};
+use stringy::types::{BinaryFormat, Encoding, SectionInfo, SectionType, StringSource};
+
+fn get_fixture_path(name: &str) -> std::path::PathBuf {
+    std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+        .join("tests")
+        .join("fixtures")
+        .join(name)
+}
+
+#[test]
+fn test_deduplication_with_basic_extractor() {
+    let extractor = BasicExtractor::new();
+    let config = ExtractionConfig::default();
+
+    // Create test data with duplicate strings in multiple sections
+    let data = b"Hello\0World\0Hello\0Test\0World\0Hello\0";
+    let section1 = SectionInfo {
+        name: ".rodata".to_string(),
+        offset: 0,
+        size: 12, // "Hello\0World\0"
+        rva: Some(0x1000),
+        section_type: SectionType::StringData,
+        is_executable: false,
+        is_writable: false,
+        weight: 1.0,
+    };
+    let section2 = SectionInfo {
+        name: ".data".to_string(),
+        offset: 12,
+        size: 10, // "Hello\0Test\0"
+        rva: Some(0x2000),
+        section_type: SectionType::ReadOnlyData,
+        is_executable: false,
+        is_writable: false,
+        weight: 0.7,
+    };
+    let section3 = SectionInfo {
+        name: ".text".to_string(),
+        offset: 22,
+        size: 6, // "World\0"
+        rva: Some(0x3000),
+        section_type: SectionType::Code,
+        is_executable: true,
+        is_writable: false,
+        weight: 0.1,
+    };
+
+    let container_info = stringy::types::ContainerInfo::new(
+        BinaryFormat::Elf,
+        vec![section1, section2, section3],
+        vec![],
+        vec![],
+        None,
+    );
+
+    // Disable deduplication in extractor to test manual deduplication
+    let config_no_dedup = ExtractionConfig {
+        enable_deduplication: false,
+        ..config.clone()
+    };
+
+    let strings = extractor
+        .extract(data, &container_info, &config_no_dedup)
+        .unwrap();
+
+    // Verify we have duplicates before deduplication
+    assert!(strings.len() >= 3);
+    let hello_count = strings.iter().filter(|s| s.text == "Hello").count();
+    assert!(hello_count >= 2, "Should have at least 2 'Hello' strings");
+
+    // Apply deduplication
+    let canonical = deduplicate(strings, None, true);
+
+    // Verify deduplication reduced count
+    assert!(
+        canonical.len() < 6,
+        "Deduplication should reduce string count"
+    );
+
+    // Find "Hello" canonical string
+    let hello_canonical = canonical.iter().find(|c| c.text == "Hello");
+    assert!(
+        hello_canonical.is_some(),
+        "Should find 'Hello' in canonical strings"
+    );
+
+    if let Some(hello) = hello_canonical {
+        // Verify it has multiple occurrences
+        assert!(
+            hello.occurrences.len() >= 2,
+            "Hello should appear multiple times"
+        );
+
+        // Verify metadata preservation
+        let offsets: Vec<u64> = hello.occurrences.iter().map(|o| o.offset).collect();
+        assert!(offsets.contains(&0), "Should preserve offset 0");
+
+        // Verify cross-section bonus (if applicable)
+        let sections: Vec<Option<String>> = hello
+            .occurrences
+            .iter()
+            .map(|o| o.section.clone())
+            .collect();
+        let unique_sections: std::collections::HashSet<_> = sections.into_iter().collect();
+        if unique_sections.len() > 1 {
+            // Cross-section bonus should be applied
+            assert!(
+                hello.combined_score >= 10,
+                "Should have cross-section bonus"
+            );
+        }
+    }
+}
+
+#[test]
+fn test_deduplication_metadata_preservation() {
+    let extractor = BasicExtractor::new();
+    let config = ExtractionConfig::default();
+
+    // Create test data with same string in different sections
+    let data = b"TestString\0TestString\0";
+    let section1 = SectionInfo {
+        name: ".rodata".to_string(),
+        offset: 0,
+        size: 11,
+        rva: Some(0x1000),
+        section_type: SectionType::StringData,
+        is_executable: false,
+        is_writable: false,
+        weight: 1.0,
+    };
+    let section2 = SectionInfo {
+        name: ".data".to_string(),
+        offset: 11,
+        size: 11,
+        rva: Some(0x2000),
+        section_type: SectionType::ReadOnlyData,
+        is_executable: false,
+        is_writable: false,
+        weight: 0.7,
+    };
+
+    let container_info = stringy::types::ContainerInfo::new(
+        BinaryFormat::Elf,
+        vec![section1, section2],
+        vec![],
+        vec![],
+        None,
+    );
+
+    // Disable deduplication in extractor to test manual deduplication
+    let config_no_dedup = ExtractionConfig {
+        enable_deduplication: false,
+        ..config.clone()
+    };
+
+    let strings = extractor
+        .extract(data, &container_info, &config_no_dedup)
+        .unwrap();
+    let canonical = deduplicate(strings, None, true);
+
+    // Find "TestString"
+    let test_string = canonical.iter().find(|c| c.text == "TestString");
+    assert!(test_string.is_some());
+
+    if let Some(ts) = test_string {
+        assert_eq!(ts.occurrences.len(), 2);
+
+        // Verify all offsets are preserved
+        let offsets: Vec<u64> = ts.occurrences.iter().map(|o| o.offset).collect();
+        assert!(offsets.contains(&0));
+        assert!(offsets.contains(&11));
+
+        // Verify sections are preserved
+        let sections: Vec<Option<String>> =
+            ts.occurrences.iter().map(|o| o.section.clone()).collect();
+        assert!(sections.contains(&Some(".rodata".to_string())));
+        assert!(sections.contains(&Some(".data".to_string())));
+    }
+}
+
+#[test]
+fn test_deduplication_with_real_fixture() {
+    // Try to use a real fixture if available
+    let fixture_path = get_fixture_path("test_elf");
+    if !fixture_path.exists() {
+        // Skip if fixture doesn't exist
+        return;
+    }
+
+    let data = std::fs::read(&fixture_path).unwrap();
+    let format = detect_format(&data);
+    if format == BinaryFormat::Unknown {
+        // Skip if format not supported
+        return;
+    }
+
+    let parser = create_parser(format).unwrap();
+    let container_info = parser.parse(&data).unwrap();
+
+    let extractor = BasicExtractor::new();
+
+    // Test with deduplication disabled to get baseline count
+    let config_no_dedup = ExtractionConfig {
+        enable_deduplication: false,
+        ..Default::default()
+    };
+    let strings_no_dedup = extractor
+        .extract(&data, &container_info, &config_no_dedup)
+        .unwrap();
+    let strings_len = strings_no_dedup.len();
+
+    // Test with deduplication enabled
+    let config = ExtractionConfig::default();
+    let strings = extractor.extract(&data, &container_info, &config).unwrap();
+
+    // For comparison, also test manual deduplication
+    let canonical = deduplicate(strings_no_dedup, None, true);
+
+    // Verify deduplication worked (both integrated and manual)
+    assert!(
+        strings.len() <= strings_len,
+        "Integrated deduplication should reduce count"
+    );
+    assert!(
+        canonical.len() <= strings_len,
+        "Manual deduplication should reduce count"
+    );
+
+    // Verify no data loss - all original strings should be represented
+    let mut original_texts: Vec<(String, Encoding)> = strings
+        .iter()
+        .map(|s| (s.text.clone(), s.encoding))
+        .collect();
+    original_texts.sort_by(|a, b| {
+        a.0.cmp(&b.0)
+            .then_with(|| format!("{:?}", a.1).cmp(&format!("{:?}", b.1)))
+    });
+    original_texts.dedup();
+
+    let mut canonical_texts: Vec<(String, Encoding)> = canonical
+        .iter()
+        .map(|c| (c.text.clone(), c.encoding))
+        .collect();
+    canonical_texts.sort_by(|a, b| {
+        a.0.cmp(&b.0)
+            .then_with(|| format!("{:?}", a.1).cmp(&format!("{:?}", b.1)))
+    });
+
+    assert_eq!(
+        original_texts.len(),
+        canonical_texts.len(),
+        "All unique (text, encoding) pairs should be preserved"
+    );
+    for (orig, canon) in original_texts.iter().zip(canonical_texts.iter()) {
+        assert_eq!(orig.0, canon.0);
+        assert_eq!(format!("{:?}", orig.1), format!("{:?}", canon.1));
+    }
+
+    // Verify sorting by score
+    for i in 1..canonical.len() {
+        assert!(
+            canonical[i - 1].combined_score >= canonical[i].combined_score,
+            "Canonical strings should be sorted by combined_score descending"
+        );
+    }
+}
+
+#[test]
+fn test_deduplication_score_bonuses() {
+    use stringy::types::FoundString;
+
+    // Create strings with different sources to test multi-source bonus
+    let strings = vec![
+        FoundString {
+            text: "TestString".to_string(),
+            encoding: Encoding::Utf8,
+            offset: 0x100,
+            rva: Some(0x1000),
+            section: Some(".rodata".to_string()),
+            length: 10,
+            tags: vec![],
+            score: 10,
+            source: StringSource::SectionData,
+            confidence: 0.8,
+        },
+        FoundString {
+            text: "TestString".to_string(),
+            encoding: Encoding::Utf8,
+            offset: 0x200,
+            rva: Some(0x2000),
+            section: Some(".data".to_string()),
+            length: 10,
+            tags: vec![],
+            score: 15,
+            source: StringSource::ImportName,
+            confidence: 0.9,
+        },
+    ];
+
+    let canonical = deduplicate(strings, None, true);
+    assert_eq!(canonical.len(), 1);
+
+    let cs = &canonical[0];
+    // Base: 15 (max), Occurrence: 5, Cross-section: 10, Multi-source: 15, Confidence: 9
+    let expected_score = 15 + 5 + 10 + 15 + 9;
+    assert_eq!(cs.combined_score, expected_score);
+}
+
+#[test]
+fn test_extract_canonical_preserves_occurrences() {
+    use stringy::extraction::{BasicExtractor, ExtractionConfig, StringExtractor};
+
+    let extractor = BasicExtractor::new();
+    let config = ExtractionConfig::default(); // enable_deduplication is true by default
+
+    // Create test data with duplicate strings in multiple sections
+    let data = b"Hello\0World\0Hello\0Test\0";
+    let section1 = SectionInfo {
+        name: ".rodata".to_string(),
+        offset: 0,
+        size: 12, // "Hello\0World\0"
+        rva: Some(0x1000),
+        section_type: SectionType::StringData,
+        is_executable: false,
+        is_writable: false,
+        weight: 1.0,
+    };
+    let section2 = SectionInfo {
+        name: ".data".to_string(),
+        offset: 12,
+        size: 10, // "Hello\0Test\0"
+        rva: Some(0x2000),
+        section_type: SectionType::ReadOnlyData,
+        is_executable: false,
+        is_writable: false,
+        weight: 0.7,
+    };
+
+    let container_info = stringy::types::ContainerInfo::new(
+        BinaryFormat::Elf,
+        vec![section1, section2],
+        vec![],
+        vec![],
+        None,
+    );
+
+    // Test extract_canonical() - should preserve all occurrences
+    let canonical = extractor
+        .extract_canonical(data, &container_info, &config)
+        .unwrap();
+
+    // Find "Hello" - should have multiple occurrences
+    let hello = canonical.iter().find(|c| c.text == "Hello");
+    assert!(hello.is_some(), "Should find 'Hello' in canonical strings");
+    if let Some(h) = hello {
+        assert!(
+            h.occurrences.len() >= 2,
+            "Hello should have multiple occurrences, got {}",
+            h.occurrences.len()
+        );
+        // Verify we can see all offsets
+        let offsets: Vec<u64> = h.occurrences.iter().map(|o| o.offset).collect();
+        assert!(offsets.len() >= 2, "Should preserve multiple offsets");
+    }
+
+    // Compare with extract() - should lose occurrence information
+    let strings = extractor.extract(data, &container_info, &config).unwrap();
+    let hello_strings: Vec<_> = strings.iter().filter(|s| s.text == "Hello").collect();
+    // With deduplication enabled, extract() should return only one "Hello"
+    assert_eq!(
+        hello_strings.len(),
+        1,
+        "extract() should deduplicate and return only one 'Hello'"
+    );
+    // But extract_canonical() should preserve all occurrences
+    assert!(
+        canonical
+            .iter()
+            .find(|c| c.text == "Hello")
+            .unwrap()
+            .occurrences
+            .len()
+            >= 2,
+        "extract_canonical() should preserve all occurrences"
+    );
+}