diff --git a/.cursor/commands/simplicity_review.md b/.cursor/commands/simplicity_review.md new file mode 100644 index 0000000..527d914 --- /dev/null +++ b/.cursor/commands/simplicity_review.md @@ -0,0 +1 @@ +Follow the steps in the .github/prompt/simplicity-review.prompt.md file to review the code for simplicity. diff --git a/.github/prompt/simplicity-review.prompt.md b/.github/prompt/simplicity-review.prompt.md new file mode 100644 index 0000000..7464254 --- /dev/null +++ b/.github/prompt/simplicity-review.prompt.md @@ -0,0 +1,36 @@ +CODE SIMPLIFICATION REVIEW + +Start by examining the uncommitted changes in the current codebase. + +ANALYSIS STEPS: + +1. Identify what files have been modified or added +2. Review the actual code changes +3. Apply simplification principles below +4. Refactor directly, then show what you changed + +SIMPLIFICATION PRINCIPLES: + +Complexity Reduction: + +- Remove abstraction layers that don't provide clear value +- Replace complex patterns with straightforward implementations +- Use language idioms over custom abstractions +- If a simple function/lambda works, use itβ€”don't create classes + +Test Proportionality: + +- Keep only tests for critical functionality and real edge cases +- Delete tests for trivial operations, framework behavior, or hypothetical scenarios +- For small projects: aim for \<10 meaningful tests per feature +- Test code should be shorter than implementation + +Idiomatic Code: + +- Use conventional patterns for the language +- Prioritize readability and maintainability +- Apply the principle of least surprise + +Ask yourself: "What's the simplest version that actually works reliably?" + +Make the refactoring changes, then summarize what you simplified and why. Always finish by running `just ci-check` and ensuring that all checks and tests remain green. diff --git a/.kiro/hooks/code-review-refactor.kiro.hook b/.kiro/hooks/code-review-refactor.kiro.hook new file mode 100644 index 0000000..7ccacf3 --- /dev/null +++ b/.kiro/hooks/code-review-refactor.kiro.hook @@ -0,0 +1,13 @@ +{ + "enabled": true, + "name": "Code Simplicity Checker", + "description": "When the agent finishes its work, automatically trigger a comprehensive code review to eliminate unnecessary complexity, refactor for simplicity, reduce test bloat, and verify idiomatic style before finalizing any code changes", + "version": "1", + "when": { + "type": "agentStop" + }, + "then": { + "type": "askAgent", + "prompt": "CODE SIMPLIFICATION REVIEW\n\nStart by examining the uncommitted changes in the current codebase.\n\nANALYSIS STEPS:\n1. Identify what files have been modified or added\n2. Review the actual code changes\n3. Apply simplification principles below\n4. 
Refactor directly, then show what you changed\n\nSIMPLIFICATION PRINCIPLES:\n\nComplexity Reduction:\n- Remove abstraction layers that don't provide clear value\n- Replace complex patterns with straightforward implementations\n- Use language idioms over custom abstractions\n- If a simple function/lambda works, use itβ€”don't create classes\n\nTest Proportionality:\n- Keep only tests for critical functionality and real edge cases\n- Delete tests for trivial operations, framework behavior, or hypothetical scenarios\n- For small projects: aim for <10 meaningful tests per feature\n- Test code should be shorter than implementation\n\nIdiomatic Code:\n- Use conventional patterns for the language\n- Prioritize readability and maintainability\n- Apply the principle of least surprise\n\nAsk yourself: \"What's the simplest version that actually works reliably?\"\n\nMake the refactoring changes, then summarize what you simplified and why. Always finish by running `just ci-check` and ensuring that all checks and tests remain green." + } +} \ No newline at end of file diff --git a/.kiro/steering/rust/cargo-toml.md b/.kiro/steering/rust/cargo-toml.md new file mode 100644 index 0000000..1843b93 --- /dev/null +++ b/.kiro/steering/rust/cargo-toml.md @@ -0,0 +1,84 @@ +--- +inclusion: fileMatch +fileMatchPattern: [Cargo.toml] +--- + +# Cargo.toml Standards for Stringy + +## Package Configuration + +- Use **Rust 2024 Edition** (MSRV: 1.91+) as specified in the package +- Single crate structure (not a workspace) +- Enforce lint policy via `[lints.rust]` configuration + - Forbid unsafe code globally + - Deny all warnings to preserve code quality + +Example `Cargo.toml` structure: + +```toml +[package] +name = "stringy" +version = "0.1.0" +edition = "2024" +authors = ["UncleSp1d3r "] +description = "A smarter alternative to the strings command that leverages format-specific knowledge" +license = "Apache-2.0" +repository = "https://github.com/EvilBit-Labs/StringyMcStringFace" +homepage = "http://evilbitlabs.io/StringyMcStringFace/" +keywords = ["binary", "strings", "analysis", "reverse-engineering", "malware"] +categories = ["command-line-utilities", "development-tools"] + +[lib] +name = "stringy" +path = "src/lib.rs" + +[[bin]] +name = "stringy" +path = "src/main.rs" + +[lints.rust] +unsafe_code = "forbid" +warnings = "deny" +``` + +## Dependencies + +- **Core dependencies**: + + - `clap = { version = "4.5", features = ["derive"] }` - CLI argument parsing + - `goblin = "0.10"` - Binary format parsing (ELF, PE, Mach-O) + - `serde = { version = "1.0", features = ["derive"] }` - Serialization + - `serde_json = "1.0"` - JSON output + - `thiserror = "2.0"` - Structured error types + +- **Dev dependencies**: + + - `criterion = "0.7"` - Benchmarking + - `insta = "1.0"` - Snapshot testing + - `tempfile = "3.8"` - Temporary file handling in tests + +## Build Profiles + +- Use `[profile.dist]` for distribution builds with LTO: + +```toml +[profile.dist] +inherits = "release" +lto = "thin" +``` + +## Benchmarks + +- Define benchmarks in `[[bench]]` sections: + +```toml +[[bench]] +name = "elf" +harness = false +``` + +## Package Metadata + +- Include proper license (Apache-2.0) +- Provide clear description for binary analysis tool +- Include relevant keywords for discoverability diff --git a/.kiro/steering/rust/configuration-management.md b/.kiro/steering/rust/configuration-management.md new file mode 100644 index 0000000..985b721 --- /dev/null +++ b/.kiro/steering/rust/configuration-management.md @@ -0,0 +1,80 @@ +--- 
+inclusion: fileMatch +fileMatchPattern: ['**/config*.rs', '**/*config*.rs'] +--- + +# Configuration Management Standards for Stringy + +## Configuration Architecture + +Stringy uses **CLI arguments only** for configuration via `clap`: + +- **Command-line flags** (only source of configuration) +- **No configuration files** - all options specified via CLI +- **No environment variables** - use CLI flags instead +- **No hierarchical configuration** - simple argument parsing + +## CLI Configuration + +Define CLI arguments using `clap` with derive macros: + +```rust +use clap::Parser; +use std::path::PathBuf; + +#[derive(Parser)] +#[command(name = "stringy")] +#[command(about = "Extract meaningful strings from binary files")] +struct Cli { + /// Input binary file to analyze + #[arg(value_name = "FILE")] + input: PathBuf, + + /// Minimum string length + #[arg(short, long, default_value_t = 4)] + min_len: usize, + + /// Output format (json, text, yara) + #[arg(short, long, default_value = "text")] + format: String, + + /// Only extract strings matching specific tags + #[arg(long)] + only: Option<Vec<String>>, +} +``` + +## Configuration Validation + +Validate CLI arguments: + +```rust +impl Cli { + pub fn validate(&self) -> Result<(), StringyError> { + if self.min_len < 1 { + return Err(StringyError::ConfigError( + "Minimum string length must be at least 1".to_string(), + )); + } + + let valid_formats = ["json", "text", "yara"]; + if !valid_formats.contains(&self.format.as_str()) { + return Err(StringyError::ConfigError(format!( + "Invalid output format: {}. Must be one of: {:?}", + self.format, valid_formats + ))); + } + + Ok(()) + } +} +``` + +## No File-Based Configuration + +Stringy intentionally does not support configuration files to keep it simple and portable: + +- All configuration comes from CLI arguments +- No need to manage config file locations +- Works the same way across all environments +- Easier to use in scripts and pipelines diff --git a/.kiro/steering/rust/error-handling-patterns.md b/.kiro/steering/rust/error-handling-patterns.md new file mode 100644 index 0000000..0262e2a --- /dev/null +++ b/.kiro/steering/rust/error-handling-patterns.md @@ -0,0 +1,251 @@ +--- +inclusion: fileMatch +fileMatchPattern: ['**/*.rs'] +--- + +# Error Handling Patterns for Stringy + +## Error Handling Philosophy + +Stringy uses structured error handling with clear error boundaries: + +- **Structured Errors**: Use `thiserror` for all error types with derive macros +- **Error Context**: Provide detailed context in error messages (offsets, section names, file paths) +- **Error Boundaries**: Implement proper error boundaries for different components (parsing, extraction, classification) +- **Recovery Strategies**: Continue processing when possible, provide partial results + +## Error Type Definition + +Define structured error types using thiserror: + +```rust +use thiserror::Error; + +#[derive(Debug, Error)] +pub enum StringyError { + #[error("Unsupported file format")] + UnsupportedFormat, + + #[error("File I/O error: {0}")] + IoError(#[from] std::io::Error), + + #[error("Binary parsing error: {0}")] + ParseError(String), + + #[error("Invalid encoding in string at offset {offset}")] + EncodingError { offset: u64 }, + + #[error("Configuration error: {0}")] + ConfigError(String), + + #[error("Memory mapping error: {0}")] + MemoryMapError(String), +} + +// Convert from goblin errors +impl From<goblin::error::Error> for StringyError { + fn from(err: goblin::error::Error) -> Self { + StringyError::ParseError(err.to_string()) + } +} +``` + 
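+With the `#[from]` attribute and the manual `From` impl in place, any function returning `Result<_, StringyError>` can propagate both I/O and goblin errors with `?` alone. A minimal sketch of the pattern (the `binary_kind` helper is illustrative, not an existing API): + +```rust +use std::path::Path; + +fn binary_kind(path: &Path) -> Result<&'static str, StringyError> { + let data = std::fs::read(path)?; // std::io::Error -> StringyError::IoError via #[from] + let object = goblin::Object::parse(&data)?; // goblin::error::Error -> StringyError::ParseError via From + Ok(match object { + goblin::Object::Elf(_) => "ELF", + goblin::Object::PE(_) => "PE", + goblin::Object::Mach(_) => "Mach-O", + _ => "other", + }) +} +``` + 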
+## Error Context + +Provide detailed error context: + +```rust +fn parse_elf_section(data: &[u8], section_name: &str) -> Result<SectionInfo, StringyError> { + let elf = goblin::elf::Elf::parse(data).map_err(|e| { + StringyError::ParseError(format!( + "Failed to parse ELF section '{}': {}", + section_name, e + )) + })?; + + // ... parsing logic deriving `section_info` from `elf` + Ok(section_info) +} +``` + +## Component-Specific Error Handling + +Implement error boundaries for different components: + +```rust +// Container parsing errors +#[derive(Debug, Error)] +pub enum ContainerError { + #[error("Failed to detect binary format")] + FormatDetectionFailed, + + #[error("Unsupported binary format: {format:?}")] + UnsupportedFormat { format: BinaryFormat }, + + #[error("Section parsing failed: {section}: {error}")] + SectionParseFailed { section: String, error: String }, +} + +// String extraction errors +#[derive(Debug, Error)] +pub enum ExtractionError { + #[error("Invalid encoding at offset {offset}")] + InvalidEncoding { offset: u64 }, + + #[error("String extraction failed: {0}")] + ExtractionFailed(String), +} + +// Classification errors +#[derive(Debug, Error)] +pub enum ClassificationError { + #[error("Tagging failed: {0}")] + TaggingFailed(String), +} +``` + +## Error Recovery Patterns + +Implement graceful degradation: + +```rust +fn extract_strings_from_sections(data: &[u8], sections: &[SectionInfo]) -> Vec<FoundString> { + let mut found_strings = Vec::new(); + + for section in sections { + match extract_strings_from_section(data, section) { + Ok(strings) => found_strings.extend(strings), + Err(e) => { + eprintln!( + "Warning: Failed to extract strings from section '{}': {}", + section.name, e + ); + // Continue with other sections + } + } + } + + found_strings +} +``` + +## Error Logging + +Implement structured error logging: + +```rust +fn handle_parsing_error(error: StringyError, context: &str) { + match error { + StringyError::UnsupportedFormat => { + eprintln!("Error: {} - Unsupported binary format", context); + } + StringyError::ParseError(msg) => { + eprintln!("Error: {} - Parse error: {}", context, msg); + } + StringyError::EncodingError { offset } => { + eprintln!( + "Error: {} - Invalid encoding at offset 0x{:x}", + context, offset + ); + } + StringyError::IoError(e) => { + eprintln!("Error: {} - I/O error: {}", context, e); + } + _ => { + eprintln!("Error: {} - {}", context, error); + } + } +} +``` + +## Error Propagation + +Use proper error propagation patterns: + +```rust +// Use ? 
operator for early returns +fn parse_binary_file(path: &Path) -> Result<ContainerInfo, StringyError> { + let data = std::fs::read(path)?; // io::Error converts via #[from] + + let format = detect_format(&data); + let parser = create_parser(format)?; + let container_info = parser.parse(&data)?; + + Ok(container_info) +} + +// Use map_err for error transformation +fn extract_from_section(data: &[u8], section: &SectionInfo) -> Result<Vec<FoundString>, StringyError> { + let section_data = &data[section.offset as usize..(section.offset + section.size) as usize]; + + // Assumes a `StringyError::ExtractionError { section, error }` variant; add one + // alongside the variants above if this pattern is adopted. + extract_strings(section_data).map_err(|e| StringyError::ExtractionError { + section: section.name.clone(), + error: e.to_string(), + }) +} +``` + +## Error Testing + +Test error conditions thoroughly: + +```rust +#[test] +fn test_unsupported_format() { + let data = b"NOT_A_BINARY_FORMAT"; + let format = detect_format(data); + assert_eq!(format, BinaryFormat::Unknown); + + let result = create_parser(format); + assert!(matches!(result, Err(StringyError::UnsupportedFormat))); +} + +#[test] +fn test_malformed_binary() { + let data = b"\x7fELF\x01\x01\x01"; // Invalid ELF header + let parser = ElfParser::new(); + let result = parser.parse(data); + + assert!(result.is_err()); + if let Err(StringyError::ParseError(_)) = result { + // Expected + } else { + panic!("Expected ParseError"); + } +} + +#[test] +fn test_error_recovery() { + // Test that errors in one section don't stop processing of others + let result = extract_strings_from_sections(&data, &sections); + // Should return partial results even if some sections fail + assert!(!result.is_empty()); +} +``` + +## Error Documentation + +Document error conditions in rustdoc: + +````rust +/// Parses a binary file and extracts container information. +/// +/// # Errors +/// +/// Returns [`StringyError`] for: +/// - Unsupported binary formats +/// - I/O errors reading the file +/// - Binary parsing errors (malformed headers, invalid structures) +/// +/// # Examples +/// +/// ```rust,no_run +/// use std::path::Path; +/// use stringy::container::parse_binary_file; +/// +/// let info = parse_binary_file(Path::new("binary.exe"))?; +/// println!("Format: {:?}", info.format); +/// ``` +pub fn parse_binary_file(path: &Path) -> Result<ContainerInfo, StringyError> { + // Implementation +} +```` diff --git a/.kiro/steering/rust/error-handling.md b/.kiro/steering/rust/error-handling.md new file mode 100644 index 0000000..a0729f6 --- /dev/null +++ b/.kiro/steering/rust/error-handling.md @@ -0,0 +1,63 @@ +--- +inclusion: fileMatch +fileMatchPattern: ['**/*.rs'] +--- + +# Error Handling Standards for Stringy + +## Error Types + +Use `thiserror` for structured error types: + +```rust +#[derive(Debug, thiserror::Error)] +pub enum StringyError { + #[error("Unsupported file format")] + UnsupportedFormat, + + #[error("File I/O error: {0}")] + IoError(#[from] std::io::Error), + + #[error("Binary parsing error: {0}")] + ParseError(String), + + #[error("Invalid encoding in string at offset {offset}")] + EncodingError { offset: u64 }, + + #[error("Configuration error: {0}")] + ConfigError(String), + + #[error("Memory mapping error: {0}")] + MemoryMapError(String), +} +``` + +## Error Context + +- Provide detailed error messages with actionable suggestions +- Include relevant context information (file paths, offsets, section names, etc.) 
+- Convert `goblin` errors to `StringyError` using `From` implementations + +## Error Propagation + +- Use `?` operator for error propagation +- Convert between error types using `From` implementations +- Avoid `unwrap()` and `expect()` in production code + +## Binary Parsing Errors + +- Handle malformed binary files gracefully +- Provide clear error messages indicating what went wrong +- Include file offset information when available + +## Error Recovery + +- Continue processing other sections when one section fails +- Provide partial results when possible +- Log errors but don't crash on non-critical failures + +## Error Testing + +- Test all error conditions (invalid formats, malformed binaries, I/O errors) +- Validate error messages and context +- Test error propagation through the parsing pipeline diff --git a/.kiro/steering/rust/linting-rules.md b/.kiro/steering/rust/linting-rules.md new file mode 100644 index 0000000..ab7a881 --- /dev/null +++ b/.kiro/steering/rust/linting-rules.md @@ -0,0 +1,370 @@ +--- +inclusion: fileMatch +fileMatchPattern: ['**/*.rs'] +--- + +# Rust Linting Rules for Stringy + +This document explains the intent behind each clippy lint rule in Stringy's Cargo.toml. These rules are carefully chosen for a binary analysis tool and should not be disabled without understanding their purpose. + +## Critical Security Rules (FORBIDDEN) + +### `panic = "forbid"` + +**Intent**: Panics crash the binary analysis tool, leaving users without results. In a CLI tool context, this is unacceptable. **Why not to disable**: A crashed tool provides no value to users analyzing binaries. + +### `unwrap_used = "forbid"` + +**Intent**: Unwraps can cause unexpected crashes. Production CLI tools must handle all error cases gracefully. **Why not to disable**: Silent failures in error handling can mask parsing errors or provide incomplete results. + +### `await_holding_lock = "deny"` + +**Intent**: Prevents deadlocks in async code (though Stringy is synchronous, this rule still applies if async is added). **Why not to disable**: Deadlocks freeze the tool, providing no value to users. + +## Memory Safety Rules + +### `as_conversions = "warn"` + +**Intent**: Prevents potentially lossy type conversions that could corrupt data or cause security bypasses. **Why not to disable**: Lossy conversions of offsets or lengths can silently corrupt analysis results. + +### `as_ptr_cast_mut = "warn"` + +**Intent**: Prevents dangerous mutable pointer casts that could lead to memory corruption or use-after-free. **Why not to disable**: Memory corruption in security-critical code is a major vulnerability. + +### `cast_ptr_alignment = "warn"` + +**Intent**: Ensures pointer alignment is correct to prevent undefined behavior and potential crashes. **Why not to disable**: Misaligned pointers can cause crashes or security issues. + +### `indexing_slicing = "warn"` + +**Intent**: Prevents out-of-bounds array access that could cause crashes when parsing binary data. **Why not to disable**: Buffer overflows when parsing binary sections can crash the tool or produce incorrect results. + +## Arithmetic Safety Rules + +### `arithmetic_side_effects = "warn"` + +**Intent**: Catches unchecked arithmetic that can overflow and produce incorrect calculations. **Why not to disable**: Overflowing offset and length arithmetic can produce wrong or out-of-bounds reads. + +### `integer_division = "warn"` + +**Intent**: Warns about integer division, which silently truncates the fractional part of the result. 
**Why not to disable**: Silent truncation in offset or score arithmetic produces subtly wrong results. + +### `modulo_arithmetic = "warn"` + +**Intent**: Flags modulo operations, whose results differ in sign for negative operands and are easy to get wrong. **Why not to disable**: Sign-dependent modulo results can silently skew offset and bucketing calculations. + +### `float_cmp = "warn"` + +**Intent**: Ensures safe floating-point comparisons to prevent incorrect decisions. **Why not to disable**: Incorrect float comparisons can skew string scoring and ranking. + +## Performance Rules + +### `clone_on_ref_ptr = "warn"` + +**Intent**: Prevents unnecessary cloning of reference-counted types that wastes memory and CPU. **Why not to disable**: In a tool that processes large binaries, needless clones directly slow analysis. + +### `rc_buffer = "warn"` + +**Intent**: Optimizes reference-counted buffer usage for better performance. **Why not to disable**: Poor buffer management causes memory pressure when processing large binaries. + +### `rc_mutex = "warn"` + +**Intent**: Warns about inefficient reference-counted mutex usage that can cause contention. **Why not to disable**: Lock contention would slow processing if concurrency is ever introduced. + +### `large_stack_arrays = "warn"` + +**Intent**: Prevents stack overflow from large arrays that could crash the tool. **Why not to disable**: Stack overflows crash the tool mid-analysis. + +### `str_to_string = "warn"` + +**Intent**: Avoids unnecessary string allocations that waste memory. **Why not to disable**: Allocation overhead adds up in hot string-extraction loops. + +### `string_add = "warn"` + +**Intent**: Prevents inefficient string concatenation that can cause performance issues. **Why not to disable**: Concatenation performance matters when assembling large result sets. + +### `string_add_assign = "warn"` + +**Intent**: Optimizes string building operations for better performance. **Why not to disable**: String building is common in output formatting. + +### `unused_async = "warn"` + +**Intent**: Removes unnecessary async overhead that wastes resources. **Why not to disable**: Unnecessary async adds complexity to a deliberately synchronous tool. + +## Correctness Rules + +### `correctness = { level = "deny", priority = -1 }` + +**Intent**: Denies all correctness issues that could lead to bugs or security vulnerabilities. **Why not to disable**: Correctness is fundamental to trustworthy analysis results. + +### `suspicious = { level = "warn", priority = -1 }` + +**Intent**: Warns about suspicious patterns that might indicate bugs or security issues. **Why not to disable**: Suspicious patterns often indicate real problems. + +### `perf = { level = "warn", priority = -1 }` + +**Intent**: Optimizes performance-critical code paths. **Why not to disable**: Performance determines how quickly large binaries can be analyzed. + +## Error Handling Rules + +### `expect_used = "warn"` + +**Intent**: Prefers proper error handling over expect() for better error messages and handling. **Why not to disable**: Proper error handling is crucial for diagnosing parsing failures. + +### `map_err_ignore = "warn"` + +**Intent**: Ensures error transformations are meaningful and not ignored. **Why not to disable**: Ignored errors can mask parsing problems. + +### `let_underscore_must_use = "warn"` + +**Intent**: Prevents ignoring important return values that might indicate errors. **Why not to disable**: Ignored return values can hide failures that matter to the analysis. 
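+Taken together, the panic, unwrap, indexing, and arithmetic rules push parsing code toward one shape: fallible lookups plus structured errors. A minimal sketch of a compliant section read (an illustrative helper, not an existing API, reusing the project's `StringyError::ParseError`): + +```rust +fn section_bytes(data: &[u8], offset: usize, len: usize) -> Result<&[u8], StringyError> { + // checked_add instead of `offset + len` satisfies arithmetic_side_effects + let end = offset.checked_add(len).ok_or_else(|| { + StringyError::ParseError(format!("section range overflows at offset {offset}")) + })?; + // `get` instead of `data[offset..end]` satisfies indexing_slicing and never panics + data.get(offset..end).ok_or_else(|| { + StringyError::ParseError(format!("section out of bounds: {offset}..{end}")) + }) +} +``` + 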
+ +## Code Organization Rules + +### `missing_docs_in_private_items = "allow"` + +**Intent**: Private items don't need documentation to reduce noise. **Why this exception**: Private implementation details don't need public documentation. + +### `redundant_type_annotations = "warn"` + +**Intent**: Removes unnecessary type annotations that clutter code. **Why not to disable**: Clean code is easier to audit for security issues. + +### `ref_binding_to_reference = "warn"` + +**Intent**: Prevents unnecessary reference binding that can hide ownership issues. **Why not to disable**: Ownership issues can lead to use-after-free vulnerabilities. + +### `pattern_type_mismatch = "warn"` + +**Intent**: Ensures pattern matching is type-safe to prevent runtime errors. **Why not to disable**: Type mismatches can cause crashes or security bypasses. + +## Additional Security Rules + +### `dbg_macro = "warn"` + +**Intent**: Prevents debug output from accidentally reaching production logs. **Why not to disable**: Debug output in production can leak sensitive information. + +### `todo = "warn"` + +**Intent**: Ensures TODO comments are addressed before production deployment. **Why not to disable**: Unfinished code paths in a released analysis tool produce wrong or missing results. + +### `unimplemented = "warn"` + +**Intent**: Prevents unimplemented code from reaching production. **Why not to disable**: Unimplemented code will panic at runtime. + +### `unreachable = "warn"` + +**Intent**: Identifies unreachable code that might indicate logic errors. **Why not to disable**: Unreachable code often indicates security bypasses or bugs. + +## Performance and Resource Rules + +### `create_dir = "warn"` + +**Intent**: Ensures directory creation is handled properly to prevent race conditions. **Why not to disable**: Race conditions in file operations can cause security issues. + +### `exit = "warn"` + +**Intent**: Prevents unexpected program termination that could end an analysis run without results. **Why not to disable**: Unexpected exits discard partial results and skip cleanup. + +### `filetype_is_file = "warn"` + +**Intent**: Ensures proper file type checking to prevent security bypasses. **Why not to disable**: Incorrect file type checks can lead to security vulnerabilities. + +### `float_equality_without_abs = "warn"` + +**Intent**: Prevents incorrect floating-point comparisons that could affect scoring calculations. **Why not to disable**: Incorrect comparisons can lead to wrong classification decisions. + +### `if_then_some_else_none = "warn"` + +**Intent**: Identifies potentially confusing conditional logic that might hide bugs. **Why not to disable**: Confusing logic can hide security vulnerabilities. + +### `lossy_float_literal = "warn"` + +**Intent**: Prevents precision loss in floating-point calculations that could affect scoring metrics. **Why not to disable**: Precision loss can lead to incorrect ranking. + +### `match_same_arms = "warn"` + +**Intent**: Identifies duplicate match arms that might indicate copy-paste errors or logic bugs. **Why not to disable**: Duplicate arms can hide security logic errors. + +### `missing_assert_message = "warn"` + +**Intent**: Ensures assertions have meaningful messages for debugging. **Why not to disable**: Good assertion messages make test failures much faster to diagnose. + +### `mixed_read_write_in_expression = "warn"` + +**Intent**: Prevents confusing read/write operations that could hide race conditions. **Why not to disable**: Race conditions can lead to security vulnerabilities. 
+ +### `mutex_atomic = "warn"` + +**Intent**: Suggests using atomic operations instead of mutexes for better performance. **Why not to disable**: Performance matters when scanning large binaries. + +### `mutex_integer = "warn"` + +**Intent**: Suggests using atomic integers instead of mutex-protected integers. **Why not to disable**: Atomic operations are more efficient for simple data. + +### `non_ascii_literal = "warn"` + +**Intent**: Ensures non-ASCII literals are intentional and properly handled. **Why not to disable**: Improper handling of non-ASCII can lead to security bypasses. + +### `non_send_fields_in_send_ty = "warn"` + +**Intent**: Ensures thread safety if concurrent processing is ever introduced. **Why not to disable**: Thread-safety bugs are notoriously hard to diagnose after the fact. + +### `partial_pub_fields = "warn"` + +**Intent**: Prevents partially public structs that can break encapsulation. **Why not to disable**: Encapsulation is important for security-critical data structures. + +### `same_name_method = "warn"` + +**Intent**: Prevents method name conflicts that could lead to confusion or bugs. **Why not to disable**: Confusing method names can hide security logic errors. + +### `self_named_module_files = "warn"` + +**Intent**: Ensures consistent module naming that aids in code organization and security auditing. **Why not to disable**: Consistent naming helps with security code reviews. + +### `semicolon_inside_block = "warn"` + +**Intent**: Prevents confusing semicolon usage that could change code behavior. **Why not to disable**: Incorrect semicolons can change security logic. + +### `semicolon_outside_block = "warn"` + +**Intent**: Ensures proper semicolon usage for clear code structure. **Why not to disable**: Clear code structure aids in security auditing. + +### `shadow_reuse = "warn"` + +**Intent**: Prevents variable shadowing that can hide bugs or security issues. **Why not to disable**: Variable shadowing can hide security logic errors. + +### `shadow_same = "warn"` + +**Intent**: Prevents shadowing variables with the same name. **Why not to disable**: Same-name shadowing can hide bugs. + +### `shadow_unrelated = "warn"` + +**Intent**: Prevents shadowing unrelated variables that can cause confusion. **Why not to disable**: Unrelated shadowing can hide security bugs. + +### `string_lit_as_bytes = "warn"` + +**Intent**: Prevents unnecessary string literal to bytes conversion. **Why not to disable**: Unnecessary conversions waste resources in hot extraction paths. + +### `string_slice = "warn"` + +**Intent**: Optimizes string slicing operations for better performance. **Why not to disable**: String operations are common in extraction and output formatting. + +### `suspicious_operation_groupings = "warn"` + +**Intent**: Identifies suspicious operation groupings that might indicate bugs. **Why not to disable**: Suspicious patterns often indicate real bugs. + +### `trailing_empty_array = "warn"` + +**Intent**: Prevents trailing empty arrays that can cause confusion or bugs. **Why not to disable**: Confusing array structures can hide security bugs. + +### `transmute_undefined_repr = "warn"` + +**Intent**: Prevents undefined behavior from transmute operations. **Why not to disable**: Undefined behavior can lead to security vulnerabilities. + +### `trivial_regex = "warn"` + +**Intent**: Identifies trivial regex patterns that could be simplified. **Why not to disable**: Simple patterns are easier to audit for security issues. 
+ +### `undocumented_unsafe_blocks = "warn"` + +**Intent**: Ensures unsafe blocks are documented to explain their necessity (unsafe code is forbidden crate-wide, but this rule guards any future, explicitly approved exception). **Why not to disable**: Unsafe code must be carefully documented for security auditing. + +### `unnecessary_self_imports = "warn"` + +**Intent**: Removes unnecessary self imports that clutter code. **Why not to disable**: Clean code is easier to audit for security issues. + +### `unseparated_literal_suffix = "warn"` + +**Intent**: Ensures proper literal suffix formatting for readability. **Why not to disable**: Readable code aids in security auditing. + +### `unused_peekable = "warn"` + +**Intent**: Removes unused peekable iterators that waste resources. **Why not to disable**: Wasted work adds up in extraction loops. + +### `unused_rounding = "warn"` + +**Intent**: Removes unused rounding operations that waste CPU cycles. **Why not to disable**: CPU cycles matter when scanning large binaries. + +### `use_debug = "warn"` + +**Intent**: Prevents debug formatting in production code that can leak information. **Why not to disable**: Debug formatting can leak sensitive information in output. + +### `verbose_file_reads = "warn"` + +**Intent**: Optimizes file reading operations for better performance. **Why not to disable**: File I/O performance matters when reading large binaries. + +### `wildcard_enum_match_arm = "warn"` + +**Intent**: Prevents wildcard enum matching that can silently swallow new cases. **Why not to disable**: Wildcard matching can hide unhandled cases, such as newly added binary formats or tags. + +### `zero_sized_map_values = "warn"` + +**Intent**: Identifies zero-sized map values that might indicate inefficient data structures. **Why not to disable**: Inefficient data structures can impact performance. + +## Pragmatic Exceptions + +These exceptions are allowed currently while the project is in early development. + +### `missing_errors_doc = "allow"` + +**Intent**: Error documentation can be verbose and obvious from context. **Why this exception**: Reduces noise while maintaining code clarity. + +### `missing_panics_doc = "allow"` + +**Intent**: Panic documentation is often obvious from the panic message. **Why this exception**: Reduces documentation overhead for obvious cases. + +### `must_use_candidate = "allow"` + +**Intent**: Some must-use candidates are too noisy for this project. **Why this exception**: Balances safety with developer productivity. + +### `cast_possible_truncation = "allow"` + +**Intent**: Some truncation warnings are too noisy for this project. **Why this exception**: Reduces noise while maintaining type safety. + +### `cast_precision_loss = "allow"` + +**Intent**: Some precision loss warnings are acceptable in this context. **Why this exception**: Balances precision with practical considerations. + +### `cast_sign_loss = "allow"` + +**Intent**: Some sign loss warnings are acceptable in this context. **Why this exception**: Reduces noise while maintaining correctness. + +### `module_name_repetitions = "allow"` + +**Intent**: Module name repetitions are sometimes necessary for clarity. **Why this exception**: Allows clear module organization. + +### `similar_names = "allow"` + +**Intent**: Similar names are sometimes necessary for related functionality. **Why this exception**: Reduces noise while maintaining clear naming. + +### `too_many_lines = "allow"` + +**Intent**: Some modules are naturally large due to their complexity. **Why this exception**: Allows complex modules when necessary. 
+ +### `type_complexity = "allow"` + +**Intent**: Complex types are sometimes necessary for security-critical code. **Why this exception**: Balances complexity with functionality. + +### `async_yields_async = "allow"` + +**Intent**: Some async yields are necessary for proper async patterns. **Why this exception**: Allows necessary async patterns. + +### `large_futures = "allow"` + +**Intent**: Some futures are naturally large due to their complexity. **Why this exception**: Allows complex futures when necessary. + +### `result_large_err = "allow"` + +**Intent**: Some error types are naturally large due to their complexity. **Why this exception**: Allows complex error types when necessary. + +### `cargo_common_metadata = "allow"` + +**Intent**: Common metadata warnings are too noisy for this project. **Why this exception**: Reduces noise while maintaining package metadata. + +## Summary + +These linting rules are carefully chosen for a binary analysis tool. Each rule serves a specific purpose in preventing crashes, ensuring performance, and maintaining code quality. They should not be disabled without understanding their intent and the implications of doing so. + +## AI Assistant Restrictions + +**CRITICAL**: AI assistants are explicitly prohibited from removing clippy restrictions or allowing linters marked as `deny` without explicit permission. All `-D warnings` and `deny` attributes must be preserved. Any changes to linting configuration require explicit user approval. diff --git a/.kiro/steering/rust/performance-optimization.md b/.kiro/steering/rust/performance-optimization.md new file mode 100644 index 0000000..942933b --- /dev/null +++ b/.kiro/steering/rust/performance-optimization.md @@ -0,0 +1,273 @@ +--- +inclusion: fileMatch +fileMatchPattern: ['**/benches/**/*.rs', '**/*bench*.rs', '**/*performance*.rs'] +--- + +# Performance Optimization Standards for Stringy + +## High-Performance Binary Processing + +Stringy uses idiomatic best practices for high-performance binary analysis: + +- **Zero-Copy Parsing**: Use `goblin` for efficient binary format parsing without unnecessary allocations +- **Memory-Mapped Files**: Consider memory-mapped I/O for large binary files +- **Lazy Evaluation**: Process sections on-demand rather than loading everything into memory +- **Efficient String Extraction**: Use slice-based operations for string extraction to avoid allocations + +## Performance Targets + +Stringy must meet strict performance requirements: + +- **Large Binary Processing**: Handle binaries up to several GB efficiently +- **Memory Usage**: Minimize memory footprint, especially for large binaries +- **Processing Speed**: Process typical binaries (10-100 MB) in < 1 second +- **String Extraction**: Extract strings from sections efficiently without excessive allocations +- **Format Detection**: Detect binary format in < 10ms + +## Benchmarking with Criterion + +Use Criterion for performance benchmarking: + +```rust +use criterion::{Criterion, black_box, criterion_group, criterion_main}; +use std::fs; + +fn benchmark_format_detection(c: &mut Criterion) { + let mut group = c.benchmark_group("format_detection"); + + let elf_data = fs::read("test_data/sample.elf").unwrap(); + let pe_data = fs::read("test_data/sample.exe").unwrap(); + let macho_data = fs::read("test_data/sample.macho").unwrap(); + + group.bench_function("detect_elf", |b| { + b.iter(|| black_box(stringy::container::detect_format(&elf_data))) + }); + + group.bench_function("detect_pe", |b| { + b.iter(|| 
black_box(stringy::container::detect_format(&pe_data))) + }); + + group.bench_function("detect_macho", |b| { + b.iter(|| black_box(stringy::container::detect_format(&macho_data))) + }); + + group.finish(); +} + +fn benchmark_string_extraction(c: &mut Criterion) { + let mut group = c.benchmark_group("string_extraction"); + + let binary_data = fs::read("test_data/large_binary").unwrap(); + let container_info = stringy::container::parse_binary(&binary_data).unwrap(); + + group.bench_function("extract_strings", |b| { + b.iter(|| { + black_box(stringy::extraction::extract_strings( + &binary_data, + &container_info.sections, + )) + }) + }); + + group.finish(); +} + +criterion_group!( + benches, + benchmark_format_detection, + benchmark_string_extraction +); +criterion_main!(benches); +``` + +## Memory Management + +Implement efficient memory usage patterns for binary processing: + +```rust +use memmap2::MmapOptions; +use std::fs::File; + +// Memory-mapped file for large binaries +// NOTE: requires the `memmap2` crate (not yet in the dependency list), and +// `map` is an `unsafe` call, which conflicts with `unsafe_code = "forbid"`; +// adopting this path would need an explicitly approved, documented exception. +fn process_large_binary(path: &Path) -> Result<ContainerInfo, StringyError> { + let file = File::open(path)?; + let mmap = unsafe { MmapOptions::new().map(&file)? }; + + // Process memory-mapped data without loading entire file + let format = detect_format(&mmap); + let parser = create_parser(format)?; + let container_info = parser.parse(&mmap)?; + + Ok(container_info) +} + +// Slice-based string extraction to avoid allocations +fn extract_strings_efficient(data: &[u8]) -> Vec<FoundString> { + let mut strings = Vec::new(); + let mut i = 0; + + while i < data.len() { + if let Some(string) = find_string_at_offset(data, i) { + // Advance before pushing (push moves `string`); max(1) guarantees progress + i += (string.length as usize).max(1); + strings.push(string); + } else { + i += 1; + } + } + + strings +} +``` + +## Binary Parsing Optimization + +Optimize binary format parsing: + +```rust +// Use goblin's zero-copy parsing +fn parse_elf_efficient(data: &[u8]) -> Result<ContainerInfo, StringyError> { + let elf = goblin::elf::Elf::parse(data)?; + + // Process sections without cloning + let sections: Vec<SectionInfo> = elf + .section_headers + .iter() + .enumerate() + .filter_map(|(idx, header)| { + // Only process sections that are likely to contain strings + if is_string_section(&elf, idx) { + Some(parse_section_info(&elf, header, idx)) + } else { + None + } + }) + .collect(); + + Ok(ContainerInfo { + format: BinaryFormat::Elf, + sections, + imports: extract_imports(&elf)?, + exports: extract_exports(&elf)?, + }) +} +``` + +## String Extraction Optimization + +Optimize string extraction algorithms: + +```rust +// Efficient UTF-8 string extraction +fn extract_utf8_strings(data: &[u8], min_len: usize) -> Vec<FoundString> { + let mut strings = Vec::new(); + let mut start = None; + + for (i, &byte) in data.iter().enumerate() { + if byte.is_ascii() && byte >= 0x20 && byte < 0x7F { + if start.is_none() { + start = Some(i); + } + } else if byte == 0 { + if let Some(s) = start { + let len = i - s; + if len >= min_len { + if let Ok(text) = std::str::from_utf8(&data[s..i]) { + strings.push(FoundString { + text: text.to_string(), + offset: s as u64, + length: len as u32, + // ... 
other fields + }); + } + } + start = None; + } else { + start = None; + } + } + + strings +} +``` + +## Section Processing Optimization + +Process sections efficiently: + +```rust +// Process sections in priority order (highest weight first) +fn process_sections_prioritized(data: &[u8], sections: &[SectionInfo]) -> Vec<FoundString> { + let mut sections = sections.to_vec(); + + // Sort by weight (descending) to process high-value sections first; + // total_cmp avoids the forbidden `unwrap()` on partial_cmp + sections.sort_by(|a, b| b.weight.total_cmp(&a.weight)); + + let mut all_strings = Vec::new(); + + for section in sections { + if let Ok(strings) = extract_strings_from_section(data, &section) { + all_strings.extend(strings); + } + } + + all_strings +} +``` + +## Performance Testing + +Include performance regression tests: + +```rust +use std::fs; +use std::time::{Duration, Instant}; + +#[test] +fn test_format_detection_performance() { + let data = include_bytes!("../test_data/sample.elf"); + let start = Instant::now(); + + for _ in 0..1000 { + let _format = detect_format(data); + } + + let duration = start.elapsed(); + + // Must complete 1000 detections in < 100ms + assert!(duration < Duration::from_millis(100)); +} + +#[test] +fn test_large_binary_processing() { + let data = fs::read("test_data/large_binary").unwrap(); + let start = Instant::now(); + + let format = detect_format(&data); + let parser = create_parser(format).unwrap(); + let _container_info = parser.parse(&data).unwrap(); + + let duration = start.elapsed(); + + // Must process 100MB binary in < 2 seconds + assert!(duration < Duration::from_secs(2)); +} +``` + +## Memory Usage Testing + +Test memory efficiency: + +```rust +use std::fs; + +#[test] +fn test_memory_efficiency() { + let data = fs::read("test_data/large_binary").unwrap(); + + // Process binary multiple times + for _ in 0..10 { + let format = detect_format(&data); + let parser = create_parser(format).unwrap(); + let _container_info = parser.parse(&data).unwrap(); + } + + // Memory should not grow unbounded + // (In a real test, you'd measure actual memory usage) +} +``` diff --git a/.kiro/steering/rust/rust-standards.md b/.kiro/steering/rust/rust-standards.md new file mode 100644 index 0000000..245a4c7 --- /dev/null +++ b/.kiro/steering/rust/rust-standards.md @@ -0,0 +1,43 @@ +--- +inclusion: fileMatch +fileMatchPattern: ['**/*.rs'] +--- + +# Rust Coding Standards for Stringy + +## Language and Edition + +- Always use **Rust 2024 Edition** (MSRV: 1.91+) as specified in [Cargo.toml](mdc:Cargo.toml) +- Follow the package configuration in [Cargo.toml](mdc:Cargo.toml) with `unsafe_code = "forbid"` and `warnings = "deny"` + +## Code Quality Requirements + +- **Zero warnings policy**: All code must pass `cargo clippy -- -D warnings` +- **No unsafe code**: `unsafe_code = "forbid"` is enforced at package level +- **Formatting**: Use standard `rustfmt` with project-specific line length +- **Error Handling**: Use `thiserror` for structured errors +- **Synchronous Design**: This is a synchronous CLI tool - no async runtime needed +- **Focused and Manageable Files**: Keep source files focused and manageable; split large files into smaller, more focused ones of no more than 500-600 lines where possible. +- **Strictness**: `warnings = "deny"` enforced at package level; any use of `allow` **MUST** be accompanied by a justification in the code and cannot be applied to entire files or modules (see the sketch below). 
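+A sketch of the expected shape of such an exception (the lint choice and function are illustrative, not existing code): + +```rust +// Justification: enumerating every ELF section kind is naturally long and is +// clearer as a single match than split across helpers. Scoped to this function only. +#[allow(clippy::too_many_lines)] +fn classify_elf_sections(elf: &goblin::elf::Elf) -> Vec<SectionInfo> { + let sections = Vec::new(); + // ... one classification arm per section kind ... + sections +} +``` + 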
+ +## Code Organization + +- Use trait-based interfaces for format parsers (`ContainerParser` trait) +- Implement comprehensive error handling with `thiserror` +- Use strongly-typed structures with `serde` for serialization +- Organize by domain: `container/`, `extraction/`, `classification/`, `output/`, `types/` + +## Module Structure + +- **container/**: Binary format detection and parsing (ELF, PE, Mach-O) +- **extraction/**: String extraction algorithms +- **classification/**: Semantic analysis and tagging +- **output/**: Result formatting (JSON, human-readable, YARA-friendly) +- **types/**: Core data structures and error handling + +## Testing Requirements + +- Include comprehensive tests with `insta` for snapshot testing +- Test binary format detection and parsing +- Test string extraction from various formats +- Use `tempfile` for temporary binary files in tests diff --git a/README.md b/README.md index cd68c40..86f13ae 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ Presents the most relevant strings first using a scoring algorithm. - **Section targeting**: `.rodata`, `.rdata`, `__cstring`, resources, manifests - **Encoding support**: ASCII, UTF-8, UTF-16LE/BE with confidence scoring - **Smart classification**: - - URLs, domains, IPs + - URLs, domains, IPv4/IPv6 addresses (implemented) - Filepaths & registry keys - GUIDs & user agents - Format strings (`%s`, `%d`, etc.) @@ -172,7 +172,7 @@ This project is in active development. Current implementation status: - βœ… **Section Analysis**: Smart classification of string-rich sections - βœ… **PE Resource Enumeration**: VERSIONINFO, STRINGTABLE, and MANIFEST resource detection (Phase 1 complete) - 🚧 **String Extraction**: ASCII/UTF-8 and UTF-16 extraction engines (framework ready) -- 🚧 **Semantic Classification**: URL, domain, path, GUID pattern matching (types defined) +- 🚧 **Semantic Classification**: IPv4/IPv6 detection implemented; URL, domain, path, GUID pattern matching in progress (types defined) - 🚧 **Ranking System**: Section-aware scoring algorithm (framework in place) - 🚧 **Output Formats**: JSONL, human-readable, and YARA-friendly output (types ready) - 🚧 **CLI Interface**: Basic argument parsing implemented, main pipeline in progress diff --git a/docs/src/classification.md b/docs/src/classification.md index fb033e9..8507b17 100644 --- a/docs/src/classification.md +++ b/docs/src/classification.md @@ -28,10 +28,14 @@ Raw String β†’ Pattern Matching β†’ Context Analysis β†’ Tag Assignment β†’ Conf #### IP Addresses -- **IPv4 Pattern**: `\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b` -- **IPv6 Pattern**: `\b(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\b` -- **Examples**: `192.168.1.1`, `2001:db8::1` -- **Validation**: Range checking, reserved address detection +- **IPv4 Pattern**: `\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b` +- **IPv6 Pattern**: Comprehensive pattern supporting full notation, compressed notation (`::1`), and mixed notation (`::ffff:192.0.2.1`) +- **Examples**: `192.168.1.1`, `2001:db8::1`, `[::1]:8080` +- **Validation**: Two-stage validation using regex pre-filter followed by `std::net::IpAddr` parsing for correctness +- **Port Handling**: IP addresses with ports (e.g., `192.168.1.1:8080`) are supported by automatically stripping the port suffix before validation +- **IPv6 Bracket Handling**: Bracketed IPv6 addresses (e.g., `[::1]` and `[::1]:8080`) are supported +- **Version Numbers**: Dotted-quad strings such as `1.2.3.4` that parse as valid IPv4 addresses are still tagged as IPv4; disambiguating IP addresses from version numbers is left to the caller based on context
+- **Implementation**: See `src/classification/semantic.rs` for the complete implementation - **Security relevance**: High - infrastructure indicators ### File System Indicators diff --git a/src/classification/mod.rs b/src/classification/mod.rs index 09b5a75..f7c7a98 100644 --- a/src/classification/mod.rs +++ b/src/classification/mod.rs @@ -1,4 +1,49 @@ -// String analysis and tagging +//! String analysis and tagging +//! +//! This module provides semantic analysis capabilities to identify and tag +//! extracted strings based on their content patterns. The classification system +//! uses pattern matching (regex) combined with validation to reduce false positives. +//! +//! ## Current Capabilities +//! +//! - **IPv4/IPv6 Address Detection**: Identifies IP addresses with support for +//! ports, bracketed IPv6 notation, and false positive mitigation for version numbers +//! - **URL Detection**: Identifies HTTP/HTTPS URLs +//! - **Domain Detection**: Identifies domain names with TLD validation +//! +//! ## Future Capabilities +//! +//! - File paths (POSIX and Windows) +//! - Registry paths +//! - GUIDs/UUIDs +//! - Email addresses +//! - Base64 data +//! - Format strings +//! - User agents +//! +//! ## Usage +//! +//! ```rust +//! use stringy::classification::SemanticClassifier; +//! use stringy::types::{FoundString, Encoding, StringSource, Tag}; +//! +//! let classifier = SemanticClassifier::new(); +//! let found_string = FoundString { +//! text: "192.168.1.1:8080".to_string(), +//! encoding: Encoding::Ascii, +//! offset: 0, +//! rva: None, +//! section: None, +//! length: 16, +//! tags: Vec::new(), +//! score: 0, +//! source: StringSource::SectionData, +//! confidence: 1.0, +//! }; +//! +//! let tags = classifier.classify(&found_string); +//! assert!(tags.contains(&Tag::IPv4)); +//! ``` pub mod semantic; pub use semantic::SemanticClassifier; diff --git a/src/classification/semantic.rs b/src/classification/semantic.rs index 4e04e09..da3143a 100644 --- a/src/classification/semantic.rs +++ b/src/classification/semantic.rs @@ -41,6 +41,8 @@ use crate::types::{FoundString, Tag}; use lazy_static::lazy_static; use regex::Regex; +use std::net::{Ipv4Addr, Ipv6Addr}; +use std::str::FromStr; lazy_static! { /// Regular expression for matching HTTP/HTTPS URLs @@ -55,6 +57,35 @@ lazy_static! { /// It ensures domains start and end with alphanumeric characters, allows hyphens /// in the middle, and requires at least a 2-character TLD. static ref DOMAIN_REGEX: Regex = Regex::new(r"\b(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,}\b").unwrap(); + + /// Regular expression for matching IPv4 addresses + /// + /// Pattern matches IPv4 addresses with proper octet validation (0-255). + /// Matches the entire string (used after port stripping). + static ref IPV4_REGEX: Regex = Regex::new(r"^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$").unwrap(); + + /// Regular expression for matching IPv6 addresses + /// + /// Pattern matches IPv6 addresses including: + /// - Full notation: 2001:0db8:85a3:0000:0000:8a2e:0370:7334 + /// - Compressed notation: 2001:db8::1, ::1, fe80::1 + /// - Mixed notation: ::ffff:192.0.2.1, 64:ff9b::192.0.2.1 + /// This is a permissive pattern that checks for basic IPv6 structure (colons and hex digits). + /// Actual validation is performed by std::net::Ipv6Addr::from_str. 
+ static ref IPV6_REGEX: Regex = Regex::new(r"(?i)^(?:[0-9a-f]{1,4}:){1,7}[0-9a-f]{1,4}$|^(?:[0-9a-f]{1,4}:){1,7}:$|^(?:[0-9a-f]{1,4}:){1,6}:[0-9a-f]{1,4}$|^(?:[0-9a-f]{1,4}:){1,5}(?::[0-9a-f]{1,4}){1,2}$|^(?:[0-9a-f]{1,4}:){1,4}(?::[0-9a-f]{1,4}){1,3}$|^(?:[0-9a-f]{1,4}:){1,3}(?::[0-9a-f]{1,4}){1,4}$|^(?:[0-9a-f]{1,4}:){1,2}(?::[0-9a-f]{1,4}){1,5}$|^[0-9a-f]{1,4}:(?::[0-9a-f]{1,4}){1,6}$|^:(?::[0-9a-f]{1,4}){1,7}$|^::$|^::ffff:(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$").unwrap(); + + /// Regular expression for detecting and stripping port suffixes + /// + /// Matches :port where port is in the valid range 0-65535. + /// Pattern: :[0-9]{1,4} matches 0-9999, |[1-5][0-9]{4} matches 10000-59999, + /// |6[0-4][0-9]{3} matches 60000-64999, |65[0-4][0-9]{2} matches 65000-65499, + /// |655[0-2][0-9] matches 65500-65529, |6553[0-5] matches 65530-65535. + static ref PORT_SUFFIX_REGEX: Regex = Regex::new(r":(?:[0-9]{1,4}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])$").unwrap(); + + /// Regular expression for handling bracketed IPv6 addresses + /// + /// Matches [IPv6] format used in URLs like [::1]:8080. + static ref IPV6_BRACKETS_REGEX: Regex = Regex::new(r"^\[(.+)\]").unwrap(); } /// Semantic classifier for identifying network indicators in extracted strings @@ -153,7 +184,8 @@ impl SemanticClassifier { /// /// This method analyzes a `FoundString` and returns a vector of semantic /// tags that apply to the string. URLs are checked first, then domains - /// (which automatically excludes URLs to prevent double-tagging). + /// (which automatically excludes URLs to prevent double-tagging), then + /// IP addresses (IPv4 and IPv6). /// /// # Arguments /// @@ -201,6 +233,10 @@ impl SemanticClassifier { tags.push(tag); } + // Check for IP addresses (IPv4 and IPv6) + let ip_tags = self.classify_ip_addresses(&string.text); + tags.extend(ip_tags); + tags } @@ -304,6 +340,206 @@ impl SemanticClassifier { valid_tlds.contains(&tld_lower.as_str()) } + + /// Strips port suffix from an IP address string + /// + /// Removes `:port` suffix if present (e.g., `192.168.1.1:8080` β†’ `192.168.1.1`). + /// + /// # Arguments + /// + /// * `text` - The text that may contain a port suffix + /// + /// # Returns + /// + /// Returns a string slice without the port suffix. + fn strip_port<'a>(&self, text: &'a str) -> &'a str { + PORT_SUFFIX_REGEX + .find(text) + .map_or(text, |m| &text[..m.start()]) + } + + /// Strips bracketed notation from IPv6 addresses + /// + /// Removes `[` and `]` from bracketed IPv6 addresses (e.g., `[::1]` β†’ `::1`). + /// + /// # Arguments + /// + /// * `text` - The text that may contain bracketed IPv6 notation + /// + /// # Returns + /// + /// Returns a string slice without brackets, or the original text if no brackets found. + fn strip_ipv6_brackets<'a>(&self, text: &'a str) -> &'a str { + IPV6_BRACKETS_REGEX + .captures(text) + .and_then(|caps| caps.get(1)) + .map_or(text, |m| m.as_str()) + } + + /// Detects IPv4 addresses in the given text + /// + /// This method uses a two-stage validation approach: + /// 1. Regex pre-filter for performance + /// 2. `std::net::Ipv4Addr` validation for correctness + /// + /// It also handles port suffixes (e.g., "192.168.1.1:8080"). + /// + /// # Note on Version Numbers + /// + /// This method accepts ALL valid IPv4 addresses in dotted-quad notation, + /// even if they could also be interpreted as version numbers (e.g., "1.2.3.4"). 
+ /// It is the responsibility of the caller to disambiguate between IP addresses + /// and version numbers based on context when necessary. + /// + /// # Arguments + /// + /// * `text` - The text to search for IPv4 addresses + /// + /// # Returns + /// + /// Returns `true` if a valid IPv4 address is found, `false` otherwise. + /// + /// # Examples + /// + /// ``` + /// use stringy::classification::SemanticClassifier; + /// + /// let classifier = SemanticClassifier::new(); + /// assert!(classifier.is_ipv4_address("192.168.1.1")); + /// assert!(classifier.is_ipv4_address("192.168.1.1:8080")); + /// assert!(classifier.is_ipv4_address("1.2.3.4")); // Valid IP (could also be a version number) + /// assert!(!classifier.is_ipv4_address("256.1.1.1")); // Invalid octet + /// ``` + pub fn is_ipv4_address(&self, text: &str) -> bool { + // Strip port suffix if present + let text_without_port = self.strip_port(text); + + // Two-stage validation: regex pre-filter first + if !IPV4_REGEX.is_match(text_without_port) { + return false; + } + + // Check for leading zeros in octets (e.g., 192.168.01.1 should be rejected) + for octet_str in text_without_port.split('.') { + // If an octet has more than 1 digit and starts with '0', it's invalid + if octet_str.len() > 1 && octet_str.starts_with('0') { + return false; + } + } + + // Validate using std::net::Ipv4Addr for correctness + // This is the authoritative check - regex is just a pre-filter + Ipv4Addr::from_str(text_without_port).is_ok() + } + + /// Detects IPv6 addresses in the given text + /// + /// This method uses a two-stage validation approach: + /// 1. Basic structure check (contains colons, looks like IPv6) + /// 2. `std::net::Ipv6Addr` validation for correctness + /// + /// It handles bracketed notation (e.g., `[::1]`) and port suffixes. + /// + /// # Arguments + /// + /// * `text` - The text to search for IPv6 addresses + /// + /// # Returns + /// + /// Returns `true` if a valid IPv6 address is found, `false` otherwise. 
+ /// + /// # Examples + /// + /// ``` + /// use stringy::classification::SemanticClassifier; + /// + /// let classifier = SemanticClassifier::new(); + /// assert!(classifier.is_ipv6_address("2001:db8::1")); + /// assert!(classifier.is_ipv6_address("::1")); + /// assert!(classifier.is_ipv6_address("[::1]:8080")); + /// assert!(!classifier.is_ipv6_address("gggg::1")); // Invalid hex + /// ``` + pub fn is_ipv6_address(&self, text: &str) -> bool { + // Handle bracketed IPv6 addresses like [::1] or [::1]:8080 + // Strategy: strip port first (if present), then strip brackets + + // If it looks like it has a port (contains ]:), strip port first + let after_port = if text.contains("]:") { + self.strip_port(text) + } else { + text + }; + + // Now strip brackets if present + let processed = self.strip_ipv6_brackets(after_port); + + // Two-stage validation: regex pre-filter first + // Basic structure check: must contain colons (IPv6 addresses always have colons) + if !processed.contains(':') { + return false; + } + + // For mixed notation (contains both colons and dots), skip regex check + // as the regex doesn't handle all mixed notation patterns + let is_mixed_notation = processed.contains('.'); + + if !is_mixed_notation { + // Use regex as pre-filter for non-mixed notation + if !IPV6_REGEX.is_match(processed) { + return false; + } + } + + // Validate using std::net::Ipv6Addr for canonical validation + // This handles all IPv6 formats: full, compressed, mixed notation + Ipv6Addr::from_str(processed).is_ok() + } + + /// Classifies IP addresses (IPv4 and IPv6) in the given text + /// + /// This method checks for both IPv4 and IPv6 addresses and returns + /// appropriate tags. A string may match both patterns (unlikely but possible). + /// + /// # Arguments + /// + /// * `text` - The text to search for IP addresses + /// + /// # Returns + /// + /// Returns a vector of `Tag` values (`Tag::IPv4` and/or `Tag::IPv6`). + /// The vector may be empty if no IP addresses are found. 
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use stringy::classification::SemanticClassifier;
+    /// use stringy::types::Tag;
+    ///
+    /// let classifier = SemanticClassifier::new();
+    /// let tags = classifier.classify_ip_addresses("192.168.1.1");
+    /// assert_eq!(tags, vec![Tag::IPv4]);
+    ///
+    /// let tags = classifier.classify_ip_addresses("::1");
+    /// assert_eq!(tags, vec![Tag::IPv6]);
+    ///
+    /// let tags = classifier.classify_ip_addresses("not an ip");
+    /// assert!(tags.is_empty());
+    /// ```
+    pub fn classify_ip_addresses(&self, text: &str) -> Vec<Tag> {
+        let mut tags = Vec::new();
+
+        // Check for IPv4
+        if self.is_ipv4_address(text) {
+            tags.push(Tag::IPv4);
+        }
+
+        // Check for IPv6
+        if self.is_ipv6_address(text) {
+            tags.push(Tag::IPv6);
+        }
+
+        tags
+    }
 }
 
 #[cfg(test)]
@@ -466,4 +702,174 @@ mod tests {
         let tags = classifier.classify(&malformed);
         assert_eq!(tags.len(), 0);
     }
+
+    #[test]
+    fn test_ipv4_valid_addresses() {
+        let classifier = SemanticClassifier::new();
+
+        // Valid IPv4 addresses
+        assert!(classifier.is_ipv4_address("192.168.1.1"));
+        assert!(classifier.is_ipv4_address("10.0.0.1"));
+        assert!(classifier.is_ipv4_address("8.8.8.8"));
+        assert!(classifier.is_ipv4_address("1.1.1.1"));
+        assert!(classifier.is_ipv4_address("127.0.0.1"));
+        assert!(classifier.is_ipv4_address("0.0.0.0"));
+        assert!(classifier.is_ipv4_address("255.255.255.255"));
+    }
+
+    #[test]
+    fn test_ipv4_invalid_addresses() {
+        let classifier = SemanticClassifier::new();
+
+        // Invalid IPv4 addresses
+        assert!(!classifier.is_ipv4_address("256.1.1.1")); // Octet > 255
+        assert!(!classifier.is_ipv4_address("192.168.1")); // Missing octet
+        assert!(!classifier.is_ipv4_address("192.168.1.1.1")); // Too many octets
+        assert!(!classifier.is_ipv4_address("999.999.999.999")); // All octets > 255
+        assert!(!classifier.is_ipv4_address("192.168.01.1")); // Leading zero (invalid format)
+    }
+
+    #[test]
+    fn test_ipv4_with_port() {
+        let classifier = SemanticClassifier::new();
+
+        // IPv4 addresses with ports should be detected
+        assert!(classifier.is_ipv4_address("192.168.1.1:8080"));
+        assert!(classifier.is_ipv4_address("10.0.0.1:443"));
+        assert!(classifier.is_ipv4_address("127.0.0.1:3000"));
+    }
+
+    #[test]
+    fn test_ipv4_version_numbers() {
+        let classifier = SemanticClassifier::new();
+
+        // Valid IPv4 addresses that could also be version numbers are accepted
+        // It's the caller's responsibility to disambiguate based on context
+        assert!(classifier.is_ipv4_address("1.2.3.4"));
+        assert!(classifier.is_ipv4_address("2.0.1.0"));
+        assert!(classifier.is_ipv4_address("10.5.2.1"));
+        assert!(classifier.is_ipv4_address("10.5.2.20"));
+    }
+
+    #[test]
+    fn test_ipv4_edge_cases() {
+        let classifier = SemanticClassifier::new();
+
+        // Boundary values
+        assert!(classifier.is_ipv4_address("0.0.0.0"));
+        assert!(classifier.is_ipv4_address("255.255.255.255"));
+        assert!(classifier.is_ipv4_address("192.0.0.1"));
+        assert!(classifier.is_ipv4_address("0.255.0.255"));
+
+        // Private network addresses
+        assert!(classifier.is_ipv4_address("192.168.0.1"));
+        assert!(classifier.is_ipv4_address("10.0.0.1"));
+        assert!(classifier.is_ipv4_address("172.16.0.1"));
+    }
+
+    #[test]
+    fn test_ipv6_full_notation() {
+        let classifier = SemanticClassifier::new();
+
+        // Full IPv6 notation
+        assert!(classifier.is_ipv6_address("2001:0db8:85a3:0000:0000:8a2e:0370:7334"));
+    }
+
+    #[test]
+    fn test_ipv6_compressed() {
+        let classifier = SemanticClassifier::new();
+
+        // Compressed IPv6 notation
+        assert!(classifier.is_ipv6_address("2001:db8::1"));
+        assert!(classifier.is_ipv6_address("::1"));
+        assert!(classifier.is_ipv6_address("fe80::1"));
+        assert!(classifier.is_ipv6_address("::"));
+    }
+
+    #[test]
+    fn test_ipv6_mixed_notation() {
+        let classifier = SemanticClassifier::new();
+
+        // Mixed IPv4/IPv6 notation
+        assert!(classifier.is_ipv6_address("::ffff:192.0.2.1"));
+        assert!(classifier.is_ipv6_address("64:ff9b::192.0.2.1"));
+    }
+
+    #[test]
+    fn test_ipv6_invalid() {
+        let classifier = SemanticClassifier::new();
+
+        // Invalid IPv6 addresses
+        assert!(!classifier.is_ipv6_address("gggg::1")); // Invalid hex
+        assert!(!classifier.is_ipv6_address("2001:db8::1::2")); // Double ::
+        assert!(!classifier.is_ipv6_address("2001:db8:1")); // Too short
+    }
+
+    #[test]
+    fn test_ipv6_with_brackets() {
+        let classifier = SemanticClassifier::new();
+
+        // IPv6 addresses with brackets
+        assert!(classifier.is_ipv6_address("[2001:db8::1]"));
+        assert!(classifier.is_ipv6_address("[::1]"));
+    }
+
+    #[test]
+    fn test_ipv6_with_port() {
+        let classifier = SemanticClassifier::new();
+
+        // IPv6 addresses with brackets and ports
+        assert!(classifier.is_ipv6_address("[2001:db8::1]:8080"));
+        assert!(classifier.is_ipv6_address("[::1]:8080"));
+    }
+
+    #[test]
+    fn test_classify_ipv4() {
+        let classifier = SemanticClassifier::new();
+        let found_string = create_test_string("192.168.1.1");
+
+        let tags = classifier.classify(&found_string);
+        assert_eq!(tags.len(), 1);
+        assert!(matches!(tags[0], Tag::IPv4));
+    }
+
+    #[test]
+    fn test_classify_ipv6() {
+        let classifier = SemanticClassifier::new();
+        let found_string = create_test_string("::1");
+
+        let tags = classifier.classify(&found_string);
+        assert_eq!(tags.len(), 1);
+        assert!(matches!(tags[0], Tag::IPv6));
+    }
+
+    #[test]
+    fn test_classify_no_ip() {
+        let classifier = SemanticClassifier::new();
+        let found_string = create_test_string("not an ip address");
+
+        let tags = classifier.classify_ip_addresses(&found_string.text);
+        assert!(tags.is_empty());
+    }
+
+    #[test]
+    fn test_classify_ipv4_with_port() {
+        let classifier = SemanticClassifier::new();
+        let found_string = create_test_string("192.168.1.1:8080");
+
+        let tags = classifier.classify(&found_string);
+        assert_eq!(tags.len(), 1);
+        assert!(matches!(tags[0], Tag::IPv4));
+    }
+
+    #[test]
+    fn test_classify_ipv6_with_brackets_and_port() {
+        let classifier = SemanticClassifier::new();
+        let found_string = create_test_string("[::1]:8080");
+
+        let tags = classifier.classify(&found_string);
+        assert_eq!(tags.len(), 1);
+        assert!(matches!(tags[0], Tag::IPv6));
+    }
 }
diff --git a/src/extraction/utf16.rs b/src/extraction/utf16.rs
index b7d02ab..711ffbb 100644
--- a/src/extraction/utf16.rs
+++ b/src/extraction/utf16.rs
@@ -706,131 +706,7 @@ fn extract_utf16le_strings_internal(
     data: &[u8],
     config: &Utf16ExtractionConfig,
 ) -> Vec<FoundString> {
-    let mut strings = Vec::new();
-
-    // Need at least 2 bytes for a UTF-16LE character
-    if data.len() < 2 {
-        return strings;
-    }
-
-    // Helper function to scan from a given start offset
-    fn scan_from_offset_le(
-        data: &[u8],
-        config: &Utf16ExtractionConfig,
-        start_offset: usize,
-    ) -> Vec<FoundString> {
-        let mut found_strings = Vec::new();
-        let mut i = start_offset;
-        while i + 1 < data.len() {
-            let mut char_count = 0;
-            let start = i;
-            let mut has_null_terminator = false;
-            let mut chars = Vec::new();
-
-            // Accumulate printable UTF-16LE characters
-            while i + 1 < data.len() {
-                // Read current code unit as u16
-                let code_unit = u16::from_le_bytes([data[i], data[i + 1]]);
-
-                // Check for null terminator (0x0000)
-                if code_unit == 0x0000 {
-                    has_null_terminator = true;
-                    break;
-                }
-
-                // Check if we have a next code unit for surrogate pair detection
-                let next_code_unit = if i + 3 < data.len() {
-                    Some(u16::from_le_bytes([data[i + 2], data[i + 3]]))
-                } else {
-                    None
-                };
-
-                // Check if character is printable (handles surrogate pairs)
-                let (is_printable, consumed_units) =
-                    is_printable_code_unit_or_pair(code_unit, next_code_unit);
-
-                if is_printable {
-                    char_count += 1; // Count as single character (even if surrogate pair)
-                    chars.push(code_unit);
-                    if consumed_units == 2 {
-                        // Surrogate pair - also add the low surrogate
-                        if i + 3 < data.len() {
-                            chars.push(u16::from_le_bytes([data[i + 2], data[i + 3]]));
-                        }
-                    }
-                    i += consumed_units * 2; // Advance by number of bytes (2 per code unit)
-                } else {
-                    // Non-printable character or lone surrogate, end of string candidate
-                    break;
-                }
-            }
-
-            // Check if we found a valid string
-            if char_count >= config.min_length {
-                // Check maximum length if configured
-                if let Some(max_len) = config.max_length
-                    && char_count > max_len
-                {
-                    // Skip this string, move to next position
-                    i += 2;
-                    continue;
-                }
-
-                // Calculate end position (including null terminator if present)
-                let end = if has_null_terminator { i + 2 } else { i };
-
-                // Extract the string bytes (excluding null terminator for decoding)
-                let string_bytes = &data[start..end.min(data.len())];
-                let bytes_for_decoding = if has_null_terminator && string_bytes.len() >= 2 {
-                    &string_bytes[..string_bytes.len() - 2]
-                } else {
-                    string_bytes
-                };
-
-                // Decode to UTF-8 and get u16 vector
-                if let Ok((text, u16_vec)) = decode_utf16le(bytes_for_decoding) {
-                    // Calculate UTF-16-specific confidence
-                    let utf16_confidence = calculate_utf16_confidence(&u16_vec, ByteOrder::LE);
-
-                    // Apply confidence threshold
-                    if utf16_confidence >= config.confidence_threshold {
-                        found_strings.push(FoundString {
-                            text,
-                            encoding: Encoding::Utf16Le,
-                            offset: start as u64,
-                            rva: None,
-                            section: None,
-                            length: bytes_for_decoding.len() as u32,
-                            tags: Vec::new(),
-                            score: 0,
-                            source: StringSource::SectionData,
-                            confidence: utf16_confidence,
-                        });
-                    }
-                }
-            }
-
-            // Move to next potential start position
-            // If we found a null terminator, skip past it
-            if has_null_terminator {
-                i += 2;
-            } else {
-                // Move forward by 2 bytes to try next alignment
-                i += 2;
-            }
-        }
-        found_strings
-    }
-
-    // First pass: scan starting at even offset (index 0)
-    strings.extend(scan_from_offset_le(data, config, 0));
-
-    // Second pass: scan starting at odd offset (index 1) if enabled
-    if config.scan_both_alignments && data.len() >= 3 {
-        strings.extend(scan_from_offset_le(data, config, 1));
-    }
-
-    strings
+    extract_utf16_strings_with_byte_order(data, config, ByteOrder::LE)
 }
 
 /// Extract UTF-16BE strings from a byte slice (internal)
@@ -850,32 +726,42 @@ fn extract_utf16le_strings_internal(
 fn extract_utf16be_strings_internal(
     data: &[u8],
     config: &Utf16ExtractionConfig,
+) -> Vec<FoundString> {
+    extract_utf16_strings_with_byte_order(data, config, ByteOrder::BE)
+}
+
+/// Generic UTF-16 string extraction with the specified byte order
+///
+/// Scans through the byte slice looking for contiguous sequences of printable UTF-16
+/// characters in the specified byte order. Handles both even and odd alignment scanning.
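+///
+/// `byte_order` must be `ByteOrder::LE` or `ByteOrder::BE`; passing
+/// `ByteOrder::Auto` is a programmer error and panics via `unreachable!()`.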
+fn extract_utf16_strings_with_byte_order(
+    data: &[u8],
+    config: &Utf16ExtractionConfig,
+    byte_order: ByteOrder,
 ) -> Vec<FoundString> {
     let mut strings = Vec::new();
 
-    // Need at least 2 bytes for a UTF-16BE character
     if data.len() < 2 {
         return strings;
     }
 
     // Helper function to scan from a given start offset
-    fn scan_from_offset_be(
-        data: &[u8],
-        config: &Utf16ExtractionConfig,
-        start_offset: usize,
-    ) -> Vec<FoundString> {
+    let scan_from_offset = |start_offset: usize| -> Vec<FoundString> {
         let mut found_strings = Vec::new();
         let mut i = start_offset;
         while i + 1 < data.len() {
            let mut char_count = 0;
            let start = i;
            let mut has_null_terminator = false;
-            let mut chars = Vec::new();
 
-            // Accumulate printable UTF-16BE characters
+            // Accumulate printable UTF-16 characters
             while i + 1 < data.len() {
-                // Read current code unit as u16 (big-endian)
-                let code_unit = u16::from_be_bytes([data[i], data[i + 1]]);
+                // Read current code unit as u16
+                let code_unit = match byte_order {
+                    ByteOrder::LE => u16::from_le_bytes([data[i], data[i + 1]]),
+                    ByteOrder::BE => u16::from_be_bytes([data[i], data[i + 1]]),
+                    ByteOrder::Auto => unreachable!(),
+                };
 
                 // Check for null terminator (0x0000)
                 if code_unit == 0x0000 {
@@ -885,7 +771,11 @@ fn extract_utf16be_strings_internal(
 
                 // Check if we have a next code unit for surrogate pair detection
                 let next_code_unit = if i + 3 < data.len() {
-                    Some(u16::from_be_bytes([data[i + 2], data[i + 3]]))
+                    Some(match byte_order {
+                        ByteOrder::LE => u16::from_le_bytes([data[i + 2], data[i + 3]]),
+                        ByteOrder::BE => u16::from_be_bytes([data[i + 2], data[i + 3]]),
+                        ByteOrder::Auto => unreachable!(),
+                    })
                 } else {
                     None
                 };
@@ -895,36 +785,23 @@ fn extract_utf16be_strings_internal(
                     is_printable_code_unit_or_pair(code_unit, next_code_unit);
 
                 if is_printable {
-                    char_count += 1; // Count as single character (even if surrogate pair)
-                    chars.push(code_unit);
-                    if consumed_units == 2 {
-                        // Surrogate pair - also add the low surrogate
-                        if i + 3 < data.len() {
-                            chars.push(u16::from_be_bytes([data[i + 2], data[i + 3]]));
-                        }
-                    }
-                    i += consumed_units * 2; // Advance by number of bytes (2 per code unit)
+                    char_count += 1;
+                    i += consumed_units * 2;
                 } else {
-                    // Non-printable character or lone surrogate, end of string candidate
                    break;
                }
            }
 
            // Check if we found a valid string
            if char_count >= config.min_length {
-                // Check maximum length if configured
                 if let Some(max_len) = config.max_length
                     && char_count > max_len
                 {
-                    // Skip this string, move to next position
                     i += 2;
                     continue;
                 }
 
-                // Calculate end position (including null terminator if present)
                 let end = if has_null_terminator { i + 2 } else { i };
-
-                // Extract the string bytes (excluding null terminator for decoding)
                 let string_bytes = &data[start..end.min(data.len())];
                 let bytes_for_decoding = if has_null_terminator && string_bytes.len() >= 2 {
                     &string_bytes[..string_bytes.len() - 2]
@@ -933,15 +810,23 @@ fn extract_utf16be_strings_internal(
                 };
 
                 // Decode to UTF-8 and get u16 vector
-                if let Ok((text, u16_vec)) = decode_utf16be(bytes_for_decoding) {
-                    // Calculate UTF-16-specific confidence
-                    let utf16_confidence = calculate_utf16_confidence(&u16_vec, ByteOrder::BE);
+                let decode_result = match byte_order {
+                    ByteOrder::LE => decode_utf16le(bytes_for_decoding),
+                    ByteOrder::BE => decode_utf16be(bytes_for_decoding),
+                    ByteOrder::Auto => unreachable!(),
+                };
+
+                if let Ok((text, u16_vec)) = decode_result {
+                    let utf16_confidence = calculate_utf16_confidence(&u16_vec, byte_order);
 
-                    // Apply confidence threshold
                     if utf16_confidence >= config.confidence_threshold
                     {
                         found_strings.push(FoundString {
                             text,
-                            encoding: Encoding::Utf16Be,
+                            encoding: match byte_order {
+                                ByteOrder::LE => Encoding::Utf16Le,
+                                ByteOrder::BE => Encoding::Utf16Be,
+                                ByteOrder::Auto => unreachable!(),
+                            },
                             offset: start as u64,
                             rva: None,
                             section: None,
@@ -956,23 +841,17 @@ fn extract_utf16be_strings_internal(
             }
 
             // Move to next potential start position
-            // If we found a null terminator, skip past it
-            if has_null_terminator {
-                i += 2;
-            } else {
-                // Move forward by 2 bytes to try next alignment
-                i += 2;
-            }
+            i += 2;
         }
         found_strings
-    }
+    };
 
     // First pass: scan starting at even offset (index 0)
-    strings.extend(scan_from_offset_be(data, config, 0));
+    strings.extend(scan_from_offset(0));
 
     // Second pass: scan starting at odd offset (index 1) if enabled
     if config.scan_both_alignments && data.len() >= 3 {
-        strings.extend(scan_from_offset_be(data, config, 1));
+        strings.extend(scan_from_offset(1));
     }
 
     strings
@@ -1005,42 +884,26 @@ pub fn extract_utf16_strings(data: &[u8], config: &Utf16ExtractionConfig) -> Vec<FoundString>
         let le_strings = extract_utf16le_strings_internal(data, config);
         let be_strings = extract_utf16be_strings_internal(data, config);
 
-        // Merge results, deduplicating by (offset, encoding, text) to avoid true duplicates
-        // while retaining distinct strings at the same offset (different encoding or text)
-        // Maintain a Vec and check for exact duplicates manually, preferring higher confidence
-        for string in le_strings {
-            // Check if we already have an exact duplicate
+        // Helper to add a string with deduplication
+        let mut add_with_dedup = |string: FoundString| {
             if let Some(existing) = strings.iter_mut().find(|s| {
                 s.offset == string.offset
                     && s.encoding == string.encoding
                     && s.text == string.text
             }) {
-                // Prefer the one with higher confidence
                 if string.confidence > existing.confidence {
                     *existing = string;
                 }
             } else {
-                // New distinct string, add it
                 strings.push(string);
             }
-        }
+        };
 
-        // Add BE strings, preferring higher confidence for exact duplicates
+        for string in le_strings {
+            add_with_dedup(string);
+        }
         for string in be_strings {
-            // Check if we already have an exact duplicate
-            if let Some(existing) = strings.iter_mut().find(|s| {
-                s.offset == string.offset
-                    && s.encoding == string.encoding
-                    && s.text == string.text
-            }) {
-                // Prefer the one with higher confidence
-                if string.confidence > existing.confidence {
-                    *existing = string;
-                }
-            } else {
-                // New distinct string, add it
-                strings.push(string);
-            }
+            add_with_dedup(string);
         }
     }
 }