diff --git a/.cursor/commands/simplicity_review.md b/.cursor/commands/simplicity_review.md new file mode 100644 index 0000000..527d914 --- /dev/null +++ b/.cursor/commands/simplicity_review.md @@ -0,0 +1 @@ +Follow the steps in the .github/prompt/simplicity-review.prompt.md file to review the code for simplicity. diff --git a/.github/prompt/simplicity-review.prompt.md b/.github/prompt/simplicity-review.prompt.md new file mode 100644 index 0000000..7464254 --- /dev/null +++ b/.github/prompt/simplicity-review.prompt.md @@ -0,0 +1,36 @@ +CODE SIMPLIFICATION REVIEW + +Start by examining the uncommitted changes in the current codebase. + +ANALYSIS STEPS: + +1. Identify what files have been modified or added +2. Review the actual code changes +3. Apply simplification principles below +4. Refactor directly, then show what you changed + +SIMPLIFICATION PRINCIPLES: + +Complexity Reduction: + +- Remove abstraction layers that don't provide clear value +- Replace complex patterns with straightforward implementations +- Use language idioms over custom abstractions +- If a simple function/lambda works, use itβ€”don't create classes + +Test Proportionality: + +- Keep only tests for critical functionality and real edge cases +- Delete tests for trivial operations, framework behavior, or hypothetical scenarios +- For small projects: aim for \<10 meaningful tests per feature +- Test code should be shorter than implementation + +Idiomatic Code: + +- Use conventional patterns for the language +- Prioritize readability and maintainability +- Apply the principle of least surprise + +Ask yourself: "What's the simplest version that actually works reliably?" + +Make the refactoring changes, then summarize what you simplified and why. Always finish by running `just ci-check` and ensuring that all checks and tests remain green. diff --git a/.kiro/hooks/code-review-refactor.kiro.hook b/.kiro/hooks/code-review-refactor.kiro.hook new file mode 100644 index 0000000..7ccacf3 --- /dev/null +++ b/.kiro/hooks/code-review-refactor.kiro.hook @@ -0,0 +1,13 @@ +{ + "enabled": true, + "name": "Code Simplicity Checker", + "description": "When the agent finishes its work, automatically trigger a comprehensive code review to eliminate unnecessary complexity, refactor for simplicity, reduce test bloat, and verify idiomatic style before finalizing any code changes", + "version": "1", + "when": { + "type": "agentStop" + }, + "then": { + "type": "askAgent", + "prompt": "CODE SIMPLIFICATION REVIEW\n\nStart by examining the uncommitted changes in the current codebase.\n\nANALYSIS STEPS:\n1. Identify what files have been modified or added\n2. Review the actual code changes\n3. Apply simplification principles below\n4. 
Refactor directly, then show what you changed\n\nSIMPLIFICATION PRINCIPLES:\n\nComplexity Reduction:\n- Remove abstraction layers that don't provide clear value\n- Replace complex patterns with straightforward implementations\n- Use language idioms over custom abstractions\n- If a simple function/lambda works, use itβ€”don't create classes\n\nTest Proportionality:\n- Keep only tests for critical functionality and real edge cases\n- Delete tests for trivial operations, framework behavior, or hypothetical scenarios\n- For small projects: aim for <10 meaningful tests per feature\n- Test code should be shorter than implementation\n\nIdiomatic Code:\n- Use conventional patterns for the language\n- Prioritize readability and maintainability\n- Apply the principle of least surprise\n\nAsk yourself: \"What's the simplest version that actually works reliably?\"\n\nMake the refactoring changes, then summarize what you simplified and why. Always finish by running `just ci-check` and ensuring that all checks and tests remain green." + } +} \ No newline at end of file diff --git a/.kiro/steering/rust/cargo-toml.md b/.kiro/steering/rust/cargo-toml.md new file mode 100644 index 0000000..1843b93 --- /dev/null +++ b/.kiro/steering/rust/cargo-toml.md @@ -0,0 +1,84 @@ +--- +inclusion: fileMatch +fileMatchPattern: [Cargo.toml] +--- + +# Cargo.toml Standards for Stringy + +## Package Configuration + +- Use **Rust 2024 Edition** (MSRV: 1.91+) as specified in the package +- Single crate structure (not a workspace) +- Enforce lint policy via `[lints.rust]` configuration + - Forbid unsafe code globally + - Deny all warnings to preserve code quality + +Example `Cargo.toml` structure: + +```toml +[package] +name = "stringy" +version = "0.1.0" +edition = "2024" +authors = ["UncleSp1d3r "] +description = "A smarter alternative to the strings command that leverages format-specific knowledge" +license = "Apache-2.0" +repository = "https://github.com/EvilBit-Labs/StringyMcStringFace" +homepage = "http://evilbitlabs.io/StringyMcStringFace/" +keywords = ["binary", "strings", "analysis", "reverse-engineering", "malware"] +categories = ["command-line-utilities", "development-tools"] + +[lib] +name = "stringy" +path = "src/lib.rs" + +[[bin]] +name = "stringy" +path = "src/main.rs" + +[lints.rust] +unsafe_code = "forbid" +warnings = "deny" +``` + +## Dependencies + +- **Core dependencies**: + + - `clap = { version = "4.5", features = ["derive"] }` - CLI argument parsing + - `goblin = "0.10"` - Binary format parsing (ELF, PE, Mach-O) + - `serde = { version = "1.0", features = ["derive"] }` - Serialization + - `serde_json = "1.0"` - JSON output + - `thiserror = "2.0"` - Structured error types + +- **Dev dependencies**: + + - `criterion = "0.7"` - Benchmarking + - `insta = "1.0"` - Snapshot testing + - `tempfile = "3.8"` - Temporary file handling in tests + +## Build Profiles + +- Use `[profile.dist]` for distribution builds with LTO: + +```toml +[profile.dist] +inherits = "release" +lto = "thin" +``` + +## Benchmarks + +- Define benchmarks in `[[bench]]` sections: + +```toml +[[bench]] +name = "elf" +harness = false +``` + +## Package Metadata + +- Include proper license (Apache-2.0) +- Provide clear description for binary analysis tool +- Include relevant keywords for discoverability diff --git a/.kiro/steering/rust/configuration-management.md b/.kiro/steering/rust/configuration-management.md new file mode 100644 index 0000000..985b721 --- /dev/null +++ b/.kiro/steering/rust/configuration-management.md @@ -0,0 +1,80 @@ +--- 
+inclusion: fileMatch +fileMatchPattern: ['**/config*.rs', '**/*config*.rs'] +--- + +# Configuration Management Standards for Stringy + +## Configuration Architecture + +Stringy uses **CLI arguments only** for configuration via `clap`: + +- **Command-line flags** (only source of configuration) +- **No configuration files** - all options specified via CLI +- **No environment variables** - use CLI flags instead +- **No hierarchical configuration** - simple argument parsing + +## CLI Configuration + +Define CLI arguments using `clap` with derive macros: + +```rust +use clap::Parser; +use std::path::PathBuf; + +#[derive(Parser)] +#[command(name = "stringy")] +#[command(about = "Extract meaningful strings from binary files")] +struct Cli { + /// Input binary file to analyze + #[arg(value_name = "FILE")] + input: PathBuf, + + /// Minimum string length + #[arg(short, long, default_value_t = 4)] + min_len: usize, + + /// Output format (json, text, yara) + #[arg(short, long, default_value = "text")] + format: String, + + /// Only extract strings matching specific tags + #[arg(long)] + only: Option<Vec<String>>, +} +``` + +## Configuration Validation + +Validate CLI arguments: + +```rust +impl Cli { + pub fn validate(&self) -> Result<(), StringyError> { + if self.min_len < 1 { + return Err(StringyError::ConfigError( + "Minimum string length must be at least 1".to_string(), + )); + } + + let valid_formats = ["json", "text", "yara"]; + if !valid_formats.contains(&self.format.as_str()) { + return Err(StringyError::ConfigError(format!( + "Invalid output format: {}. Must be one of: {:?}", + self.format, valid_formats + ))); + } + + Ok(()) + } +} +``` + +## No File-Based Configuration + +Stringy intentionally does not support configuration files to keep it simple and portable: + +- All configuration comes from CLI arguments +- No need to manage config file locations +- Works the same way across all environments +- Easier to use in scripts and pipelines diff --git a/.kiro/steering/rust/error-handling-patterns.md b/.kiro/steering/rust/error-handling-patterns.md new file mode 100644 index 0000000..0262e2a --- /dev/null +++ b/.kiro/steering/rust/error-handling-patterns.md @@ -0,0 +1,251 @@ +--- +inclusion: fileMatch +fileMatchPattern: ['**/*.rs'] +--- + +# Error Handling Patterns for Stringy + +## Error Handling Philosophy + +Stringy uses structured error handling with clear error boundaries: + +- **Structured Errors**: Use `thiserror` for all error types with derive macros +- **Error Context**: Provide detailed context in error messages (offsets, section names, file paths) +- **Error Boundaries**: Implement proper error boundaries for different components (parsing, extraction, classification) +- **Recovery Strategies**: Continue processing when possible, provide partial results + +## Error Type Definition + +Define structured error types using thiserror: + +```rust +use thiserror::Error; + +#[derive(Debug, Error)] +pub enum StringyError { + #[error("Unsupported file format")] + UnsupportedFormat, + + #[error("File I/O error: {0}")] + IoError(#[from] std::io::Error), + + #[error("Binary parsing error: {0}")] + ParseError(String), + + #[error("Invalid encoding in string at offset {offset}")] + EncodingError { offset: u64 }, + + #[error("Configuration error: {0}")] + ConfigError(String), + + #[error("Memory mapping error: {0}")] + MemoryMapError(String), +} + +// Convert from goblin errors +impl From<goblin::error::Error> for StringyError { + fn from(err: goblin::error::Error) -> Self { + StringyError::ParseError(err.to_string()) + } +} +``` + 
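+With the `#[from]` attribute and the manual `From` impl in place, any function returning `Result<_, StringyError>` can propagate both I/O and goblin errors with `?` alone. A minimal sketch of the pattern (the `binary_kind` helper is illustrative, not an existing API): + +```rust +use std::path::Path; + +fn binary_kind(path: &Path) -> Result<&'static str, StringyError> { + let data = std::fs::read(path)?; // std::io::Error -> StringyError::IoError via #[from] + let object = goblin::Object::parse(&data)?; // goblin::error::Error -> StringyError::ParseError via From + Ok(match object { + goblin::Object::Elf(_) => "ELF", + goblin::Object::PE(_) => "PE", + goblin::Object::Mach(_) => "Mach-O", + _ => "other", + }) +} +``` + 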
+## Error Context + +Provide detailed error context: + +```rust +fn parse_elf_section(data: &[u8], section_name: &str) -> Result<SectionInfo, StringyError> { + let elf = goblin::elf::Elf::parse(data).map_err(|e| { + StringyError::ParseError(format!( + "Failed to parse ELF section '{}': {}", + section_name, e + )) + })?; + + // ... parsing logic deriving `section_info` from `elf` + Ok(section_info) +} +``` + +## Component-Specific Error Handling + +Implement error boundaries for different components: + +```rust +// Container parsing errors +#[derive(Debug, Error)] +pub enum ContainerError { + #[error("Failed to detect binary format")] + FormatDetectionFailed, + + #[error("Unsupported binary format: {format:?}")] + UnsupportedFormat { format: BinaryFormat }, + + #[error("Section parsing failed: {section}: {error}")] + SectionParseFailed { section: String, error: String }, +} + +// String extraction errors +#[derive(Debug, Error)] +pub enum ExtractionError { + #[error("Invalid encoding at offset {offset}")] + InvalidEncoding { offset: u64 }, + + #[error("String extraction failed: {0}")] + ExtractionFailed(String), +} + +// Classification errors +#[derive(Debug, Error)] +pub enum ClassificationError { + #[error("Tagging failed: {0}")] + TaggingFailed(String), +} +``` + +## Error Recovery Patterns + +Implement graceful degradation: + +```rust +fn extract_strings_from_sections(data: &[u8], sections: &[SectionInfo]) -> Vec<FoundString> { + let mut found_strings = Vec::new(); + + for section in sections { + match extract_strings_from_section(data, section) { + Ok(strings) => found_strings.extend(strings), + Err(e) => { + eprintln!( + "Warning: Failed to extract strings from section '{}': {}", + section.name, e + ); + // Continue with other sections + } + } + } + + found_strings +} +``` + +## Error Logging + +Implement structured error logging: + +```rust +fn handle_parsing_error(error: StringyError, context: &str) { + match error { + StringyError::UnsupportedFormat => { + eprintln!("Error: {} - Unsupported binary format", context); + } + StringyError::ParseError(msg) => { + eprintln!("Error: {} - Parse error: {}", context, msg); + } + StringyError::EncodingError { offset } => { + eprintln!( + "Error: {} - Invalid encoding at offset 0x{:x}", + context, offset + ); + } + StringyError::IoError(e) => { + eprintln!("Error: {} - I/O error: {}", context, e); + } + _ => { + eprintln!("Error: {} - {}", context, error); + } + } +} +``` + +## Error Propagation + +Use proper error propagation patterns: + +```rust +// Use ? 
operator for early returns +fn parse_binary_file(path: &Path) -> Result<ContainerInfo, StringyError> { + let data = std::fs::read(path)?; // io::Error converts via #[from] + + let format = detect_format(&data); + let parser = create_parser(format)?; + let container_info = parser.parse(&data)?; + + Ok(container_info) +} + +// Use map_err for error transformation +fn extract_from_section(data: &[u8], section: &SectionInfo) -> Result<Vec<FoundString>, StringyError> { + let section_data = &data[section.offset as usize..(section.offset + section.size) as usize]; + + // Assumes a `StringyError::ExtractionError { section, error }` variant; add one + // alongside the variants above if this pattern is adopted. + extract_strings(section_data).map_err(|e| StringyError::ExtractionError { + section: section.name.clone(), + error: e.to_string(), + }) +} +``` + +## Error Testing + +Test error conditions thoroughly: + +```rust +#[test] +fn test_unsupported_format() { + let data = b"NOT_A_BINARY_FORMAT"; + let format = detect_format(data); + assert_eq!(format, BinaryFormat::Unknown); + + let result = create_parser(format); + assert!(matches!(result, Err(StringyError::UnsupportedFormat))); +} + +#[test] +fn test_malformed_binary() { + let data = b"\x7fELF\x01\x01\x01"; // Invalid ELF header + let parser = ElfParser::new(); + let result = parser.parse(data); + + assert!(result.is_err()); + if let Err(StringyError::ParseError(_)) = result { + // Expected + } else { + panic!("Expected ParseError"); + } +} + +#[test] +fn test_error_recovery() { + // Test that errors in one section don't stop processing of others + let result = extract_strings_from_sections(&data, &sections); + // Should return partial results even if some sections fail + assert!(!result.is_empty()); +} +``` + +## Error Documentation + +Document error conditions in rustdoc: + +````rust +/// Parses a binary file and extracts container information. +/// +/// # Errors +/// +/// Returns [`StringyError`] for: +/// - Unsupported binary formats +/// - I/O errors reading the file +/// - Binary parsing errors (malformed headers, invalid structures) +/// +/// # Examples +/// +/// ```rust,no_run +/// use std::path::Path; +/// use stringy::container::parse_binary_file; +/// +/// let info = parse_binary_file(Path::new("binary.exe"))?; +/// println!("Format: {:?}", info.format); +/// ``` +pub fn parse_binary_file(path: &Path) -> Result<ContainerInfo, StringyError> { + // Implementation +} +```` diff --git a/.kiro/steering/rust/error-handling.md b/.kiro/steering/rust/error-handling.md new file mode 100644 index 0000000..a0729f6 --- /dev/null +++ b/.kiro/steering/rust/error-handling.md @@ -0,0 +1,63 @@ +--- +inclusion: fileMatch +fileMatchPattern: ['**/*.rs'] +--- + +# Error Handling Standards for Stringy + +## Error Types + +Use `thiserror` for structured error types: + +```rust +#[derive(Debug, thiserror::Error)] +pub enum StringyError { + #[error("Unsupported file format")] + UnsupportedFormat, + + #[error("File I/O error: {0}")] + IoError(#[from] std::io::Error), + + #[error("Binary parsing error: {0}")] + ParseError(String), + + #[error("Invalid encoding in string at offset {offset}")] + EncodingError { offset: u64 }, + + #[error("Configuration error: {0}")] + ConfigError(String), + + #[error("Memory mapping error: {0}")] + MemoryMapError(String), +} +``` + +## Error Context + +- Provide detailed error messages with actionable suggestions +- Include relevant context information (file paths, offsets, section names, etc.) 
+- Convert `goblin` errors to `StringyError` using `From` implementations + +## Error Propagation + +- Use `?` operator for error propagation +- Convert between error types using `From` implementations +- Avoid `unwrap()` and `expect()` in production code + +## Binary Parsing Errors + +- Handle malformed binary files gracefully +- Provide clear error messages indicating what went wrong +- Include file offset information when available + +## Error Recovery + +- Continue processing other sections when one section fails +- Provide partial results when possible +- Log errors but don't crash on non-critical failures + +## Error Testing + +- Test all error conditions (invalid formats, malformed binaries, I/O errors) +- Validate error messages and context +- Test error propagation through the parsing pipeline diff --git a/.kiro/steering/rust/linting-rules.md b/.kiro/steering/rust/linting-rules.md new file mode 100644 index 0000000..ab7a881 --- /dev/null +++ b/.kiro/steering/rust/linting-rules.md @@ -0,0 +1,370 @@ +--- +inclusion: fileMatch +fileMatchPattern: ['**/*.rs'] +--- + +# Rust Linting Rules for Stringy + +This document explains the intent behind each clippy lint rule in Stringy's Cargo.toml. These rules are carefully chosen for a binary analysis tool and should not be disabled without understanding their purpose. + +## Critical Security Rules (FORBIDDEN) + +### `panic = "forbid"` + +**Intent**: Panics crash the binary analysis tool, leaving users without results. In a CLI tool context, this is unacceptable. **Why not to disable**: A crashed tool provides no value to users analyzing binaries. + +### `unwrap_used = "forbid"` + +**Intent**: Unwraps can cause unexpected crashes. Production CLI tools must handle all error cases gracefully. **Why not to disable**: Silent failures in error handling can mask parsing errors or provide incomplete results. + +### `await_holding_lock = "deny"` + +**Intent**: Prevents deadlocks in async code (though Stringy is synchronous, this rule still applies if async is added). **Why not to disable**: Deadlocks freeze the tool, providing no value to users. + +## Memory Safety Rules + +### `as_conversions = "warn"` + +**Intent**: Prevents potentially lossy type conversions that could corrupt data or cause security bypasses. **Why not to disable**: Lossy conversions of offsets or lengths can silently corrupt analysis results. + +### `as_ptr_cast_mut = "warn"` + +**Intent**: Prevents dangerous mutable pointer casts that could lead to memory corruption or use-after-free. **Why not to disable**: Memory corruption in security-critical code is a major vulnerability. + +### `cast_ptr_alignment = "warn"` + +**Intent**: Ensures pointer alignment is correct to prevent undefined behavior and potential crashes. **Why not to disable**: Misaligned pointers can cause crashes or security issues. + +### `indexing_slicing = "warn"` + +**Intent**: Prevents out-of-bounds array access that could cause crashes when parsing binary data. **Why not to disable**: Buffer overflows when parsing binary sections can crash the tool or produce incorrect results. + +## Arithmetic Safety Rules + +### `arithmetic_side_effects = "warn"` + +**Intent**: Catches unchecked arithmetic that can overflow and produce incorrect calculations. **Why not to disable**: Overflowing offset and length arithmetic can produce wrong or out-of-bounds reads. + +### `integer_division = "warn"` + +**Intent**: Warns about integer division, which silently truncates the fractional part of the result. 
**Why not to disable**: Silent truncation in offset or score arithmetic produces subtly wrong results. + +### `modulo_arithmetic = "warn"` + +**Intent**: Flags modulo operations, whose results differ in sign for negative operands and are easy to get wrong. **Why not to disable**: Sign-dependent modulo results can silently skew offset and bucketing calculations. + +### `float_cmp = "warn"` + +**Intent**: Ensures safe floating-point comparisons to prevent incorrect decisions. **Why not to disable**: Incorrect float comparisons can skew string scoring and ranking. + +## Performance Rules + +### `clone_on_ref_ptr = "warn"` + +**Intent**: Prevents unnecessary cloning of reference-counted types that wastes memory and CPU. **Why not to disable**: In a tool that processes large binaries, needless clones directly slow analysis. + +### `rc_buffer = "warn"` + +**Intent**: Optimizes reference-counted buffer usage for better performance. **Why not to disable**: Poor buffer management causes memory pressure when processing large binaries. + +### `rc_mutex = "warn"` + +**Intent**: Warns about inefficient reference-counted mutex usage that can cause contention. **Why not to disable**: Lock contention would slow processing if concurrency is ever introduced. + +### `large_stack_arrays = "warn"` + +**Intent**: Prevents stack overflow from large arrays that could crash the tool. **Why not to disable**: Stack overflows crash the tool mid-analysis. + +### `str_to_string = "warn"` + +**Intent**: Avoids unnecessary string allocations that waste memory. **Why not to disable**: Allocation overhead adds up in hot string-extraction loops. + +### `string_add = "warn"` + +**Intent**: Prevents inefficient string concatenation that can cause performance issues. **Why not to disable**: Concatenation performance matters when assembling large result sets. + +### `string_add_assign = "warn"` + +**Intent**: Optimizes string building operations for better performance. **Why not to disable**: String building is common in output formatting. + +### `unused_async = "warn"` + +**Intent**: Removes unnecessary async overhead that wastes resources. **Why not to disable**: Unnecessary async adds complexity to a deliberately synchronous tool. + +## Correctness Rules + +### `correctness = { level = "deny", priority = -1 }` + +**Intent**: Denies all correctness issues that could lead to bugs or security vulnerabilities. **Why not to disable**: Correctness is fundamental to trustworthy analysis results. + +### `suspicious = { level = "warn", priority = -1 }` + +**Intent**: Warns about suspicious patterns that might indicate bugs or security issues. **Why not to disable**: Suspicious patterns often indicate real problems. + +### `perf = { level = "warn", priority = -1 }` + +**Intent**: Optimizes performance-critical code paths. **Why not to disable**: Performance determines how quickly large binaries can be analyzed. + +## Error Handling Rules + +### `expect_used = "warn"` + +**Intent**: Prefers proper error handling over expect() for better error messages and handling. **Why not to disable**: Proper error handling is crucial for diagnosing parsing failures. + +### `map_err_ignore = "warn"` + +**Intent**: Ensures error transformations are meaningful and not ignored. **Why not to disable**: Ignored errors can mask parsing problems. + +### `let_underscore_must_use = "warn"` + +**Intent**: Prevents ignoring important return values that might indicate errors. **Why not to disable**: Ignored return values can hide failures that matter to the analysis. 
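+Taken together, the panic, unwrap, indexing, and arithmetic rules push parsing code toward one shape: fallible lookups plus structured errors. A minimal sketch of a compliant section read (an illustrative helper, not an existing API, reusing the project's `StringyError::ParseError`): + +```rust +fn section_bytes(data: &[u8], offset: usize, len: usize) -> Result<&[u8], StringyError> { + // checked_add instead of `offset + len` satisfies arithmetic_side_effects + let end = offset.checked_add(len).ok_or_else(|| { + StringyError::ParseError(format!("section range overflows at offset {offset}")) + })?; + // `get` instead of `data[offset..end]` satisfies indexing_slicing and never panics + data.get(offset..end).ok_or_else(|| { + StringyError::ParseError(format!("section out of bounds: {offset}..{end}")) + }) +} +``` + 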
+ +## Code Organization Rules + +### `missing_docs_in_private_items = "allow"` + +**Intent**: Private items don't need documentation to reduce noise. **Why this exception**: Private implementation details don't need public documentation. + +### `redundant_type_annotations = "warn"` + +**Intent**: Removes unnecessary type annotations that clutter code. **Why not to disable**: Clean code is easier to audit for security issues. + +### `ref_binding_to_reference = "warn"` + +**Intent**: Prevents unnecessary reference binding that can hide ownership issues. **Why not to disable**: Ownership issues can lead to use-after-free vulnerabilities. + +### `pattern_type_mismatch = "warn"` + +**Intent**: Ensures pattern matching is type-safe to prevent runtime errors. **Why not to disable**: Type mismatches can cause crashes or security bypasses. + +## Additional Security Rules + +### `dbg_macro = "warn"` + +**Intent**: Prevents debug output from accidentally reaching production logs. **Why not to disable**: Debug output in production can leak sensitive information. + +### `todo = "warn"` + +**Intent**: Ensures TODO comments are addressed before production deployment. **Why not to disable**: Unfinished code paths in a released analysis tool produce wrong or missing results. + +### `unimplemented = "warn"` + +**Intent**: Prevents unimplemented code from reaching production. **Why not to disable**: Unimplemented code will panic at runtime. + +### `unreachable = "warn"` + +**Intent**: Identifies unreachable code that might indicate logic errors. **Why not to disable**: Unreachable code often indicates security bypasses or bugs. + +## Performance and Resource Rules + +### `create_dir = "warn"` + +**Intent**: Ensures directory creation is handled properly to prevent race conditions. **Why not to disable**: Race conditions in file operations can cause security issues. + +### `exit = "warn"` + +**Intent**: Prevents unexpected program termination that could end an analysis run without results. **Why not to disable**: Unexpected exits discard partial results and skip cleanup. + +### `filetype_is_file = "warn"` + +**Intent**: Ensures proper file type checking to prevent security bypasses. **Why not to disable**: Incorrect file type checks can lead to security vulnerabilities. + +### `float_equality_without_abs = "warn"` + +**Intent**: Prevents incorrect floating-point comparisons that could affect scoring calculations. **Why not to disable**: Incorrect comparisons can lead to wrong classification decisions. + +### `if_then_some_else_none = "warn"` + +**Intent**: Identifies potentially confusing conditional logic that might hide bugs. **Why not to disable**: Confusing logic can hide security vulnerabilities. + +### `lossy_float_literal = "warn"` + +**Intent**: Prevents precision loss in floating-point calculations that could affect scoring metrics. **Why not to disable**: Precision loss can lead to incorrect ranking. + +### `match_same_arms = "warn"` + +**Intent**: Identifies duplicate match arms that might indicate copy-paste errors or logic bugs. **Why not to disable**: Duplicate arms can hide security logic errors. + +### `missing_assert_message = "warn"` + +**Intent**: Ensures assertions have meaningful messages for debugging. **Why not to disable**: Good assertion messages make test failures much faster to diagnose. + +### `mixed_read_write_in_expression = "warn"` + +**Intent**: Prevents confusing read/write operations that could hide race conditions. **Why not to disable**: Race conditions can lead to security vulnerabilities. 
+ +### `mutex_atomic = "warn"` + +**Intent**: Suggests using atomic operations instead of mutexes for better performance. **Why not to disable**: Performance matters when scanning large binaries. + +### `mutex_integer = "warn"` + +**Intent**: Suggests using atomic integers instead of mutex-protected integers. **Why not to disable**: Atomic operations are more efficient for simple data. + +### `non_ascii_literal = "warn"` + +**Intent**: Ensures non-ASCII literals are intentional and properly handled. **Why not to disable**: Improper handling of non-ASCII can lead to security bypasses. + +### `non_send_fields_in_send_ty = "warn"` + +**Intent**: Ensures thread safety if concurrent processing is ever introduced. **Why not to disable**: Thread-safety bugs are notoriously hard to diagnose after the fact. + +### `partial_pub_fields = "warn"` + +**Intent**: Prevents partially public structs that can break encapsulation. **Why not to disable**: Encapsulation is important for security-critical data structures. + +### `same_name_method = "warn"` + +**Intent**: Prevents method name conflicts that could lead to confusion or bugs. **Why not to disable**: Confusing method names can hide security logic errors. + +### `self_named_module_files = "warn"` + +**Intent**: Ensures consistent module naming that aids in code organization and security auditing. **Why not to disable**: Consistent naming helps with security code reviews. + +### `semicolon_inside_block = "warn"` + +**Intent**: Prevents confusing semicolon usage that could change code behavior. **Why not to disable**: Incorrect semicolons can change security logic. + +### `semicolon_outside_block = "warn"` + +**Intent**: Ensures proper semicolon usage for clear code structure. **Why not to disable**: Clear code structure aids in security auditing. + +### `shadow_reuse = "warn"` + +**Intent**: Prevents variable shadowing that can hide bugs or security issues. **Why not to disable**: Variable shadowing can hide security logic errors. + +### `shadow_same = "warn"` + +**Intent**: Prevents shadowing variables with the same name. **Why not to disable**: Same-name shadowing can hide bugs. + +### `shadow_unrelated = "warn"` + +**Intent**: Prevents shadowing unrelated variables that can cause confusion. **Why not to disable**: Unrelated shadowing can hide security bugs. + +### `string_lit_as_bytes = "warn"` + +**Intent**: Prevents unnecessary string literal to bytes conversion. **Why not to disable**: Unnecessary conversions waste resources in hot extraction paths. + +### `string_slice = "warn"` + +**Intent**: Optimizes string slicing operations for better performance. **Why not to disable**: String operations are common in extraction and output formatting. + +### `suspicious_operation_groupings = "warn"` + +**Intent**: Identifies suspicious operation groupings that might indicate bugs. **Why not to disable**: Suspicious patterns often indicate real bugs. + +### `trailing_empty_array = "warn"` + +**Intent**: Prevents trailing empty arrays that can cause confusion or bugs. **Why not to disable**: Confusing array structures can hide security bugs. + +### `transmute_undefined_repr = "warn"` + +**Intent**: Prevents undefined behavior from transmute operations. **Why not to disable**: Undefined behavior can lead to security vulnerabilities. + +### `trivial_regex = "warn"` + +**Intent**: Identifies trivial regex patterns that could be simplified. **Why not to disable**: Simple patterns are easier to audit for security issues. 
+ +### `undocumented_unsafe_blocks = "warn"` + +**Intent**: Ensures unsafe blocks are documented to explain their necessity (unsafe code is forbidden crate-wide, but this rule guards any future, explicitly approved exception). **Why not to disable**: Unsafe code must be carefully documented for security auditing. + +### `unnecessary_self_imports = "warn"` + +**Intent**: Removes unnecessary self imports that clutter code. **Why not to disable**: Clean code is easier to audit for security issues. + +### `unseparated_literal_suffix = "warn"` + +**Intent**: Ensures proper literal suffix formatting for readability. **Why not to disable**: Readable code aids in security auditing. + +### `unused_peekable = "warn"` + +**Intent**: Removes unused peekable iterators that waste resources. **Why not to disable**: Wasted work adds up in extraction loops. + +### `unused_rounding = "warn"` + +**Intent**: Removes unused rounding operations that waste CPU cycles. **Why not to disable**: CPU cycles matter when scanning large binaries. + +### `use_debug = "warn"` + +**Intent**: Prevents debug formatting in production code that can leak information. **Why not to disable**: Debug formatting can leak sensitive information in output. + +### `verbose_file_reads = "warn"` + +**Intent**: Optimizes file reading operations for better performance. **Why not to disable**: File I/O performance matters when reading large binaries. + +### `wildcard_enum_match_arm = "warn"` + +**Intent**: Prevents wildcard enum matching that can silently swallow new cases. **Why not to disable**: Wildcard matching can hide unhandled cases, such as newly added binary formats or tags. + +### `zero_sized_map_values = "warn"` + +**Intent**: Identifies zero-sized map values that might indicate inefficient data structures. **Why not to disable**: Inefficient data structures can impact performance. + +## Pragmatic Exceptions + +These exceptions are allowed currently while the project is in early development. + +### `missing_errors_doc = "allow"` + +**Intent**: Error documentation can be verbose and obvious from context. **Why this exception**: Reduces noise while maintaining code clarity. + +### `missing_panics_doc = "allow"` + +**Intent**: Panic documentation is often obvious from the panic message. **Why this exception**: Reduces documentation overhead for obvious cases. + +### `must_use_candidate = "allow"` + +**Intent**: Some must-use candidates are too noisy for this project. **Why this exception**: Balances safety with developer productivity. + +### `cast_possible_truncation = "allow"` + +**Intent**: Some truncation warnings are too noisy for this project. **Why this exception**: Reduces noise while maintaining type safety. + +### `cast_precision_loss = "allow"` + +**Intent**: Some precision loss warnings are acceptable in this context. **Why this exception**: Balances precision with practical considerations. + +### `cast_sign_loss = "allow"` + +**Intent**: Some sign loss warnings are acceptable in this context. **Why this exception**: Reduces noise while maintaining correctness. + +### `module_name_repetitions = "allow"` + +**Intent**: Module name repetitions are sometimes necessary for clarity. **Why this exception**: Allows clear module organization. + +### `similar_names = "allow"` + +**Intent**: Similar names are sometimes necessary for related functionality. **Why this exception**: Reduces noise while maintaining clear naming. + +### `too_many_lines = "allow"` + +**Intent**: Some modules are naturally large due to their complexity. **Why this exception**: Allows complex modules when necessary. 
+ +### `type_complexity = "allow"` + +**Intent**: Complex types are sometimes necessary for security-critical code. **Why this exception**: Balances complexity with functionality. + +### `async_yields_async = "allow"` + +**Intent**: Some async yields are necessary for proper async patterns. **Why this exception**: Allows necessary async patterns. + +### `large_futures = "allow"` + +**Intent**: Some futures are naturally large due to their complexity. **Why this exception**: Allows complex futures when necessary. + +### `result_large_err = "allow"` + +**Intent**: Some error types are naturally large due to their complexity. **Why this exception**: Allows complex error types when necessary. + +### `cargo_common_metadata = "allow"` + +**Intent**: Common metadata warnings are too noisy for this project. **Why this exception**: Reduces noise while maintaining package metadata. + +## Summary + +These linting rules are carefully chosen for a binary analysis tool. Each rule serves a specific purpose in preventing crashes, ensuring performance, and maintaining code quality. They should not be disabled without understanding their intent and the implications of doing so. + +## AI Assistant Restrictions + +**CRITICAL**: AI assistants are explicitly prohibited from removing clippy restrictions or allowing linters marked as `deny` without explicit permission. All `-D warnings` and `deny` attributes must be preserved. Any changes to linting configuration require explicit user approval. diff --git a/.kiro/steering/rust/performance-optimization.md b/.kiro/steering/rust/performance-optimization.md new file mode 100644 index 0000000..942933b --- /dev/null +++ b/.kiro/steering/rust/performance-optimization.md @@ -0,0 +1,273 @@ +--- +inclusion: fileMatch +fileMatchPattern: ['**/benches/**/*.rs', '**/*bench*.rs', '**/*performance*.rs'] +--- + +# Performance Optimization Standards for Stringy + +## High-Performance Binary Processing + +Stringy uses idiomatic best practices for high-performance binary analysis: + +- **Zero-Copy Parsing**: Use `goblin` for efficient binary format parsing without unnecessary allocations +- **Memory-Mapped Files**: Consider memory-mapped I/O for large binary files +- **Lazy Evaluation**: Process sections on-demand rather than loading everything into memory +- **Efficient String Extraction**: Use slice-based operations for string extraction to avoid allocations + +## Performance Targets + +Stringy must meet strict performance requirements: + +- **Large Binary Processing**: Handle binaries up to several GB efficiently +- **Memory Usage**: Minimize memory footprint, especially for large binaries +- **Processing Speed**: Process typical binaries (10-100 MB) in < 1 second +- **String Extraction**: Extract strings from sections efficiently without excessive allocations +- **Format Detection**: Detect binary format in < 10ms + +## Benchmarking with Criterion + +Use Criterion for performance benchmarking: + +```rust +use criterion::{Criterion, black_box, criterion_group, criterion_main}; +use std::fs; + +fn benchmark_format_detection(c: &mut Criterion) { + let mut group = c.benchmark_group("format_detection"); + + let elf_data = fs::read("test_data/sample.elf").unwrap(); + let pe_data = fs::read("test_data/sample.exe").unwrap(); + let macho_data = fs::read("test_data/sample.macho").unwrap(); + + group.bench_function("detect_elf", |b| { + b.iter(|| black_box(stringy::container::detect_format(&elf_data))) + }); + + group.bench_function("detect_pe", |b| { + b.iter(|| 
black_box(stringy::container::detect_format(&pe_data))) + }); + + group.bench_function("detect_macho", |b| { + b.iter(|| black_box(stringy::container::detect_format(&macho_data))) + }); + + group.finish(); +} + +fn benchmark_string_extraction(c: &mut Criterion) { + let mut group = c.benchmark_group("string_extraction"); + + let binary_data = fs::read("test_data/large_binary").unwrap(); + let container_info = stringy::container::parse_binary(&binary_data).unwrap(); + + group.bench_function("extract_strings", |b| { + b.iter(|| { + black_box(stringy::extraction::extract_strings( + &binary_data, + &container_info.sections, + )) + }) + }); + + group.finish(); +} + +criterion_group!( + benches, + benchmark_format_detection, + benchmark_string_extraction +); +criterion_main!(benches); +``` + +## Memory Management + +Implement efficient memory usage patterns for binary processing: + +```rust +use memmap2::MmapOptions; +use std::fs::File; + +// Memory-mapped file for large binaries +// NOTE: requires the `memmap2` crate (not yet in the dependency list), and +// `map` is an `unsafe` call, which conflicts with `unsafe_code = "forbid"`; +// adopting this path would need an explicitly approved, documented exception. +fn process_large_binary(path: &Path) -> Result<ContainerInfo, StringyError> { + let file = File::open(path)?; + let mmap = unsafe { MmapOptions::new().map(&file)? }; + + // Process memory-mapped data without loading entire file + let format = detect_format(&mmap); + let parser = create_parser(format)?; + let container_info = parser.parse(&mmap)?; + + Ok(container_info) +} + +// Slice-based string extraction to avoid allocations +fn extract_strings_efficient(data: &[u8]) -> Vec<FoundString> { + let mut strings = Vec::new(); + let mut i = 0; + + while i < data.len() { + if let Some(string) = find_string_at_offset(data, i) { + // Advance before pushing (push moves `string`); max(1) guarantees progress + i += (string.length as usize).max(1); + strings.push(string); + } else { + i += 1; + } + } + + strings +} +``` + +## Binary Parsing Optimization + +Optimize binary format parsing: + +```rust +// Use goblin's zero-copy parsing +fn parse_elf_efficient(data: &[u8]) -> Result<ContainerInfo, StringyError> { + let elf = goblin::elf::Elf::parse(data)?; + + // Process sections without cloning + let sections: Vec<SectionInfo> = elf + .section_headers + .iter() + .enumerate() + .filter_map(|(idx, header)| { + // Only process sections that are likely to contain strings + if is_string_section(&elf, idx) { + Some(parse_section_info(&elf, header, idx)) + } else { + None + } + }) + .collect(); + + Ok(ContainerInfo { + format: BinaryFormat::Elf, + sections, + imports: extract_imports(&elf)?, + exports: extract_exports(&elf)?, + }) +} +``` + +## String Extraction Optimization + +Optimize string extraction algorithms: + +```rust +// Efficient UTF-8 string extraction +fn extract_utf8_strings(data: &[u8], min_len: usize) -> Vec<FoundString> { + let mut strings = Vec::new(); + let mut start = None; + + for (i, &byte) in data.iter().enumerate() { + if byte.is_ascii() && byte >= 0x20 && byte < 0x7F { + if start.is_none() { + start = Some(i); + } + } else if byte == 0 { + if let Some(s) = start { + let len = i - s; + if len >= min_len { + if let Ok(text) = std::str::from_utf8(&data[s..i]) { + strings.push(FoundString { + text: text.to_string(), + offset: s as u64, + length: len as u32, + // ... 
other fields + }); + } + } + start = None; + } else { + start = None; + } + } + + strings +} +``` + +## Section Processing Optimization + +Process sections efficiently: + +```rust +// Process sections in priority order (highest weight first) +fn process_sections_prioritized(data: &[u8], sections: &[SectionInfo]) -> Vec<FoundString> { + let mut sections = sections.to_vec(); + + // Sort by weight (descending) to process high-value sections first; + // total_cmp avoids the forbidden `unwrap()` on partial_cmp + sections.sort_by(|a, b| b.weight.total_cmp(&a.weight)); + + let mut all_strings = Vec::new(); + + for section in sections { + if let Ok(strings) = extract_strings_from_section(data, &section) { + all_strings.extend(strings); + } + } + + all_strings +} +``` + +## Performance Testing + +Include performance regression tests: + +```rust +use std::fs; +use std::time::{Duration, Instant}; + +#[test] +fn test_format_detection_performance() { + let data = include_bytes!("../test_data/sample.elf"); + let start = Instant::now(); + + for _ in 0..1000 { + let _format = detect_format(data); + } + + let duration = start.elapsed(); + + // Must complete 1000 detections in < 100ms + assert!(duration < Duration::from_millis(100)); +} + +#[test] +fn test_large_binary_processing() { + let data = fs::read("test_data/large_binary").unwrap(); + let start = Instant::now(); + + let format = detect_format(&data); + let parser = create_parser(format).unwrap(); + let _container_info = parser.parse(&data).unwrap(); + + let duration = start.elapsed(); + + // Must process 100MB binary in < 2 seconds + assert!(duration < Duration::from_secs(2)); +} +``` + +## Memory Usage Testing + +Test memory efficiency: + +```rust +use std::fs; + +#[test] +fn test_memory_efficiency() { + let data = fs::read("test_data/large_binary").unwrap(); + + // Process binary multiple times + for _ in 0..10 { + let format = detect_format(&data); + let parser = create_parser(format).unwrap(); + let _container_info = parser.parse(&data).unwrap(); + } + + // Memory should not grow unbounded + // (In a real test, you'd measure actual memory usage) +} +``` diff --git a/.kiro/steering/rust/rust-standards.md b/.kiro/steering/rust/rust-standards.md new file mode 100644 index 0000000..245a4c7 --- /dev/null +++ b/.kiro/steering/rust/rust-standards.md @@ -0,0 +1,43 @@ +--- +inclusion: fileMatch +fileMatchPattern: ['**/*.rs'] +--- + +# Rust Coding Standards for Stringy + +## Language and Edition + +- Always use **Rust 2024 Edition** (MSRV: 1.91+) as specified in [Cargo.toml](mdc:Cargo.toml) +- Follow the package configuration in [Cargo.toml](mdc:Cargo.toml) with `unsafe_code = "forbid"` and `warnings = "deny"` + +## Code Quality Requirements + +- **Zero warnings policy**: All code must pass `cargo clippy -- -D warnings` +- **No unsafe code**: `unsafe_code = "forbid"` is enforced at package level +- **Formatting**: Use standard `rustfmt` with project-specific line length +- **Error Handling**: Use `thiserror` for structured errors +- **Synchronous Design**: This is a synchronous CLI tool - no async runtime needed +- **Focused and Manageable Files**: Keep source files focused and manageable; split large files into smaller, more focused ones of no more than 500-600 lines where possible. +- **Strictness**: `warnings = "deny"` enforced at package level; any use of `allow` **MUST** be accompanied by a justification in the code and cannot be applied to entire files or modules (see the sketch below). 
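+A sketch of the expected shape of such an exception (the lint choice and function are illustrative, not existing code): + +```rust +// Justification: enumerating every ELF section kind is naturally long and is +// clearer as a single match than split across helpers. Scoped to this function only. +#[allow(clippy::too_many_lines)] +fn classify_elf_sections(elf: &goblin::elf::Elf) -> Vec<SectionInfo> { + let sections = Vec::new(); + // ... one classification arm per section kind ... + sections +} +``` + 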
+ +## Code Organization + +- Use trait-based interfaces for format parsers (`ContainerParser` trait) +- Implement comprehensive error handling with `thiserror` +- Use strongly-typed structures with `serde` for serialization +- Organize by domain: `container/`, `extraction/`, `classification/`, `output/`, `types/` + +## Module Structure + +- **container/**: Binary format detection and parsing (ELF, PE, Mach-O) +- **extraction/**: String extraction algorithms +- **classification/**: Semantic analysis and tagging +- **output/**: Result formatting (JSON, human-readable, YARA-friendly) +- **types/**: Core data structures and error handling + +## Testing Requirements + +- Include comprehensive tests with `insta` for snapshot testing +- Test binary format detection and parsing +- Test string extraction from various formats +- Use `tempfile` for temporary binary files in tests diff --git a/README.md b/README.md index cd68c40..86f13ae 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ Presents the most relevant strings first using a scoring algorithm. - **Section targeting**: `.rodata`, `.rdata`, `__cstring`, resources, manifests - **Encoding support**: ASCII, UTF-8, UTF-16LE/BE with confidence scoring - **Smart classification**: - - URLs, domains, IPs + - URLs, domains, IPv4/IPv6 addresses (implemented) - Filepaths & registry keys - GUIDs & user agents - Format strings (`%s`, `%d`, etc.) @@ -172,7 +172,7 @@ This project is in active development. Current implementation status: - βœ… **Section Analysis**: Smart classification of string-rich sections - βœ… **PE Resource Enumeration**: VERSIONINFO, STRINGTABLE, and MANIFEST resource detection (Phase 1 complete) - 🚧 **String Extraction**: ASCII/UTF-8 and UTF-16 extraction engines (framework ready) -- 🚧 **Semantic Classification**: URL, domain, path, GUID pattern matching (types defined) +- 🚧 **Semantic Classification**: IPv4/IPv6 detection implemented; URL, domain, path, GUID pattern matching in progress (types defined) - 🚧 **Ranking System**: Section-aware scoring algorithm (framework in place) - 🚧 **Output Formats**: JSONL, human-readable, and YARA-friendly output (types ready) - 🚧 **CLI Interface**: Basic argument parsing implemented, main pipeline in progress diff --git a/docs/src/classification.md b/docs/src/classification.md index fb033e9..8507b17 100644 --- a/docs/src/classification.md +++ b/docs/src/classification.md @@ -28,10 +28,14 @@ Raw String β†’ Pattern Matching β†’ Context Analysis β†’ Tag Assignment β†’ Conf #### IP Addresses -- **IPv4 Pattern**: `\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b` -- **IPv6 Pattern**: `\b(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\b` -- **Examples**: `192.168.1.1`, `2001:db8::1` -- **Validation**: Range checking, reserved address detection +- **IPv4 Pattern**: `\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b` +- **IPv6 Pattern**: Comprehensive pattern supporting full notation, compressed notation (`::1`), and mixed notation (`::ffff:192.0.2.1`) +- **Examples**: `192.168.1.1`, `2001:db8::1`, `[::1]:8080` +- **Validation**: Two-stage validation using regex pre-filter followed by `std::net::IpAddr` parsing for correctness +- **Port Handling**: IP addresses with ports (e.g., `192.168.1.1:8080`) are supported by automatically stripping the port suffix before validation +- **IPv6 Bracket Handling**: Bracketed IPv6 addresses (e.g., `[::1]` and `[::1]:8080`) are supported +- **Version Numbers**: Dotted-quad strings such as `1.2.3.4` that parse as valid IPv4 addresses are still tagged as IPv4; disambiguating IP addresses from version numbers is left to the caller based on context
+- **Implementation**: See `src/classification/semantic.rs` for the complete implementation - **Security relevance**: High - infrastructure indicators ### File System Indicators diff --git a/src/classification/mod.rs b/src/classification/mod.rs index 09b5a75..f7c7a98 100644 --- a/src/classification/mod.rs +++ b/src/classification/mod.rs @@ -1,4 +1,49 @@ -// String analysis and tagging +//! String analysis and tagging +//! +//! This module provides semantic analysis capabilities to identify and tag +//! extracted strings based on their content patterns. The classification system +//! uses pattern matching (regex) combined with validation to reduce false positives. +//! +//! ## Current Capabilities +//! +//! - **IPv4/IPv6 Address Detection**: Identifies IP addresses with support for +//! ports, bracketed IPv6 notation, and false positive mitigation for version numbers +//! - **URL Detection**: Identifies HTTP/HTTPS URLs +//! - **Domain Detection**: Identifies domain names with TLD validation +//! +//! ## Future Capabilities +//! +//! - File paths (POSIX and Windows) +//! - Registry paths +//! - GUIDs/UUIDs +//! - Email addresses +//! - Base64 data +//! - Format strings +//! - User agents +//! +//! ## Usage +//! +//! ```rust +//! use stringy::classification::SemanticClassifier; +//! use stringy::types::{FoundString, Encoding, StringSource, Tag}; +//! +//! let classifier = SemanticClassifier::new(); +//! let found_string = FoundString { +//! text: "192.168.1.1:8080".to_string(), +//! encoding: Encoding::Ascii, +//! offset: 0, +//! rva: None, +//! section: None, +//! length: 16, +//! tags: Vec::new(), +//! score: 0, +//! source: StringSource::SectionData, +//! confidence: 1.0, +//! }; +//! +//! let tags = classifier.classify(&found_string); +//! assert!(tags.contains(&Tag::IPv4)); +//! ``` pub mod semantic; pub use semantic::SemanticClassifier; diff --git a/src/classification/semantic.rs b/src/classification/semantic.rs index 4e04e09..da3143a 100644 --- a/src/classification/semantic.rs +++ b/src/classification/semantic.rs @@ -41,6 +41,8 @@ use crate::types::{FoundString, Tag}; use lazy_static::lazy_static; use regex::Regex; +use std::net::{Ipv4Addr, Ipv6Addr}; +use std::str::FromStr; lazy_static! { /// Regular expression for matching HTTP/HTTPS URLs @@ -55,6 +57,35 @@ lazy_static! { /// It ensures domains start and end with alphanumeric characters, allows hyphens /// in the middle, and requires at least a 2-character TLD. static ref DOMAIN_REGEX: Regex = Regex::new(r"\b(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,}\b").unwrap(); + + /// Regular expression for matching IPv4 addresses + /// + /// Pattern matches IPv4 addresses with proper octet validation (0-255). + /// Matches the entire string (used after port stripping). + static ref IPV4_REGEX: Regex = Regex::new(r"^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$").unwrap(); + + /// Regular expression for matching IPv6 addresses + /// + /// Pattern matches IPv6 addresses including: + /// - Full notation: 2001:0db8:85a3:0000:0000:8a2e:0370:7334 + /// - Compressed notation: 2001:db8::1, ::1, fe80::1 + /// - Mixed notation: ::ffff:192.0.2.1, 64:ff9b::192.0.2.1 + /// This is a permissive pattern that checks for basic IPv6 structure (colons and hex digits). + /// Actual validation is performed by std::net::Ipv6Addr::from_str. 
+ static ref IPV6_REGEX: Regex = Regex::new(r"(?i)^(?:[0-9a-f]{1,4}:){1,7}[0-9a-f]{1,4}$|^(?:[0-9a-f]{1,4}:){1,7}:$|^(?:[0-9a-f]{1,4}:){1,6}:[0-9a-f]{1,4}$|^(?:[0-9a-f]{1,4}:){1,5}(?::[0-9a-f]{1,4}){1,2}$|^(?:[0-9a-f]{1,4}:){1,4}(?::[0-9a-f]{1,4}){1,3}$|^(?:[0-9a-f]{1,4}:){1,3}(?::[0-9a-f]{1,4}){1,4}$|^(?:[0-9a-f]{1,4}:){1,2}(?::[0-9a-f]{1,4}){1,5}$|^[0-9a-f]{1,4}:(?::[0-9a-f]{1,4}){1,6}$|^:(?::[0-9a-f]{1,4}){1,7}$|^::$|^::ffff:(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$").unwrap(); + + /// Regular expression for detecting and stripping port suffixes + /// + /// Matches :port where port is in the valid range 0-65535. + /// Pattern: :[0-9]{1,4} matches 0-9999, |[1-5][0-9]{4} matches 10000-59999, + /// |6[0-4][0-9]{3} matches 60000-64999, |65[0-4][0-9]{2} matches 65000-65499, + /// |655[0-2][0-9] matches 65500-65529, |6553[0-5] matches 65530-65535. + static ref PORT_SUFFIX_REGEX: Regex = Regex::new(r":(?:[0-9]{1,4}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])$").unwrap(); + + /// Regular expression for handling bracketed IPv6 addresses + /// + /// Matches [IPv6] format used in URLs like [::1]:8080. + static ref IPV6_BRACKETS_REGEX: Regex = Regex::new(r"^\[(.+)\]").unwrap(); } /// Semantic classifier for identifying network indicators in extracted strings @@ -153,7 +184,8 @@ impl SemanticClassifier { /// /// This method analyzes a `FoundString` and returns a vector of semantic /// tags that apply to the string. URLs are checked first, then domains - /// (which automatically excludes URLs to prevent double-tagging). + /// (which automatically excludes URLs to prevent double-tagging), then + /// IP addresses (IPv4 and IPv6). /// /// # Arguments /// @@ -201,6 +233,10 @@ impl SemanticClassifier { tags.push(tag); } + // Check for IP addresses (IPv4 and IPv6) + let ip_tags = self.classify_ip_addresses(&string.text); + tags.extend(ip_tags); + tags } @@ -304,6 +340,206 @@ impl SemanticClassifier { valid_tlds.contains(&tld_lower.as_str()) } + + /// Strips port suffix from an IP address string + /// + /// Removes `:port` suffix if present (e.g., `192.168.1.1:8080` β†’ `192.168.1.1`). + /// + /// # Arguments + /// + /// * `text` - The text that may contain a port suffix + /// + /// # Returns + /// + /// Returns a string slice without the port suffix. + fn strip_port<'a>(&self, text: &'a str) -> &'a str { + PORT_SUFFIX_REGEX + .find(text) + .map_or(text, |m| &text[..m.start()]) + } + + /// Strips bracketed notation from IPv6 addresses + /// + /// Removes `[` and `]` from bracketed IPv6 addresses (e.g., `[::1]` β†’ `::1`). + /// + /// # Arguments + /// + /// * `text` - The text that may contain bracketed IPv6 notation + /// + /// # Returns + /// + /// Returns a string slice without brackets, or the original text if no brackets found. + fn strip_ipv6_brackets<'a>(&self, text: &'a str) -> &'a str { + IPV6_BRACKETS_REGEX + .captures(text) + .and_then(|caps| caps.get(1)) + .map_or(text, |m| m.as_str()) + } + + /// Detects IPv4 addresses in the given text + /// + /// This method uses a two-stage validation approach: + /// 1. Regex pre-filter for performance + /// 2. `std::net::Ipv4Addr` validation for correctness + /// + /// It also handles port suffixes (e.g., "192.168.1.1:8080"). + /// + /// # Note on Version Numbers + /// + /// This method accepts ALL valid IPv4 addresses in dotted-quad notation, + /// even if they could also be interpreted as version numbers (e.g., "1.2.3.4"). 
+ /// It is the responsibility of the caller to disambiguate between IP addresses + /// and version numbers based on context when necessary. + /// + /// # Arguments + /// + /// * `text` - The text to search for IPv4 addresses + /// + /// # Returns + /// + /// Returns `true` if a valid IPv4 address is found, `false` otherwise. + /// + /// # Examples + /// + /// ``` + /// use stringy::classification::SemanticClassifier; + /// + /// let classifier = SemanticClassifier::new(); + /// assert!(classifier.is_ipv4_address("192.168.1.1")); + /// assert!(classifier.is_ipv4_address("192.168.1.1:8080")); + /// assert!(classifier.is_ipv4_address("1.2.3.4")); // Valid IP (could also be a version number) + /// assert!(!classifier.is_ipv4_address("256.1.1.1")); // Invalid octet + /// ``` + pub fn is_ipv4_address(&self, text: &str) -> bool { + // Strip port suffix if present + let text_without_port = self.strip_port(text); + + // Two-stage validation: regex pre-filter first + if !IPV4_REGEX.is_match(text_without_port) { + return false; + } + + // Check for leading zeros in octets (e.g., 192.168.01.1 should be rejected) + for octet_str in text_without_port.split('.') { + // If an octet has more than 1 digit and starts with '0', it's invalid + if octet_str.len() > 1 && octet_str.starts_with('0') { + return false; + } + } + + // Validate using std::net::Ipv4Addr for correctness + // This is the authoritative check - regex is just a pre-filter + Ipv4Addr::from_str(text_without_port).is_ok() + } + + /// Detects IPv6 addresses in the given text + /// + /// This method uses a two-stage validation approach: + /// 1. Basic structure check (contains colons, looks like IPv6) + /// 2. `std::net::Ipv6Addr` validation for correctness + /// + /// It handles bracketed notation (e.g., `[::1]`) and port suffixes. + /// + /// # Arguments + /// + /// * `text` - The text to search for IPv6 addresses + /// + /// # Returns + /// + /// Returns `true` if a valid IPv6 address is found, `false` otherwise. 
+ /// + /// # Examples + /// + /// ``` + /// use stringy::classification::SemanticClassifier; + /// + /// let classifier = SemanticClassifier::new(); + /// assert!(classifier.is_ipv6_address("2001:db8::1")); + /// assert!(classifier.is_ipv6_address("::1")); + /// assert!(classifier.is_ipv6_address("[::1]:8080")); + /// assert!(!classifier.is_ipv6_address("gggg::1")); // Invalid hex + /// ``` + pub fn is_ipv6_address(&self, text: &str) -> bool { + // Handle bracketed IPv6 addresses like [::1] or [::1]:8080 + // Strategy: strip port first (if present), then strip brackets + + // If it looks like it has a port (contains ]:), strip port first + let after_port = if text.contains("]:") { + self.strip_port(text) + } else { + text + }; + + // Now strip brackets if present + let processed = self.strip_ipv6_brackets(after_port); + + // Two-stage validation: regex pre-filter first + // Basic structure check: must contain colons (IPv6 addresses always have colons) + if !processed.contains(':') { + return false; + } + + // For mixed notation (contains both colons and dots), skip regex check + // as the regex doesn't handle all mixed notation patterns + let is_mixed_notation = processed.contains('.'); + + if !is_mixed_notation { + // Use regex as pre-filter for non-mixed notation + if !IPV6_REGEX.is_match(processed) { + return false; + } + } + + // Validate using std::net::Ipv6Addr for canonical validation + // This handles all IPv6 formats: full, compressed, mixed notation + Ipv6Addr::from_str(processed).is_ok() + } + + /// Classifies IP addresses (IPv4 and IPv6) in the given text + /// + /// This method checks for both IPv4 and IPv6 addresses and returns + /// appropriate tags. A string may match both patterns (unlikely but possible). + /// + /// # Arguments + /// + /// * `text` - The text to search for IP addresses + /// + /// # Returns + /// + /// Returns a vector of `Tag` values (`Tag::IPv4` and/or `Tag::IPv6`). + /// The vector may be empty if no IP addresses are found. 
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use stringy::classification::SemanticClassifier;
+    /// use stringy::types::Tag;
+    ///
+    /// let classifier = SemanticClassifier::new();
+    /// let tags = classifier.classify_ip_addresses("192.168.1.1");
+    /// assert_eq!(tags, vec![Tag::IPv4]);
+    ///
+    /// let tags = classifier.classify_ip_addresses("::1");
+    /// assert_eq!(tags, vec![Tag::IPv6]);
+    ///
+    /// let tags = classifier.classify_ip_addresses("not an ip");
+    /// assert!(tags.is_empty());
+    /// ```
+    pub fn classify_ip_addresses(&self, text: &str) -> Vec<Tag> {
+        let mut tags = Vec::new();
+
+        // Check for IPv4
+        if self.is_ipv4_address(text) {
+            tags.push(Tag::IPv4);
+        }
+
+        // Check for IPv6
+        if self.is_ipv6_address(text) {
+            tags.push(Tag::IPv6);
+        }
+
+        tags
+    }
 }
 
 #[cfg(test)]
@@ -466,4 +702,174 @@ mod tests {
         let tags = classifier.classify(&malformed);
         assert_eq!(tags.len(), 0);
     }
+
+    #[test]
+    fn test_ipv4_valid_addresses() {
+        let classifier = SemanticClassifier::new();
+
+        // Valid IPv4 addresses
+        assert!(classifier.is_ipv4_address("192.168.1.1"));
+        assert!(classifier.is_ipv4_address("10.0.0.1"));
+        assert!(classifier.is_ipv4_address("8.8.8.8"));
+        assert!(classifier.is_ipv4_address("1.1.1.1"));
+        assert!(classifier.is_ipv4_address("127.0.0.1"));
+        assert!(classifier.is_ipv4_address("0.0.0.0"));
+        assert!(classifier.is_ipv4_address("255.255.255.255"));
+    }
+
+    #[test]
+    fn test_ipv4_invalid_addresses() {
+        let classifier = SemanticClassifier::new();
+
+        // Invalid IPv4 addresses
+        assert!(!classifier.is_ipv4_address("256.1.1.1")); // Octet > 255
+        assert!(!classifier.is_ipv4_address("192.168.1")); // Missing octet
+        assert!(!classifier.is_ipv4_address("192.168.1.1.1")); // Too many octets
+        assert!(!classifier.is_ipv4_address("999.999.999.999")); // All octets > 255
+        assert!(!classifier.is_ipv4_address("192.168.01.1")); // Leading zero (invalid format)
+    }
+
+    #[test]
+    fn test_ipv4_with_port() {
+        let classifier = SemanticClassifier::new();
+
+        // IPv4 addresses with ports should be detected
+        assert!(classifier.is_ipv4_address("192.168.1.1:8080"));
+        assert!(classifier.is_ipv4_address("10.0.0.1:443"));
+        assert!(classifier.is_ipv4_address("127.0.0.1:3000"));
+    }
+
+    #[test]
+    fn test_ipv4_version_numbers() {
+        let classifier = SemanticClassifier::new();
+
+        // Valid IPv4 addresses that could also be version numbers are accepted
+        // It's the caller's responsibility to disambiguate based on context
+        assert!(classifier.is_ipv4_address("1.2.3.4"));
+        assert!(classifier.is_ipv4_address("2.0.1.0"));
+        assert!(classifier.is_ipv4_address("10.5.2.1"));
+        assert!(classifier.is_ipv4_address("10.5.2.20"));
+    }
+
+    #[test]
+    fn test_ipv4_edge_cases() {
+        let classifier = SemanticClassifier::new();
+
+        // Boundary values
+        assert!(classifier.is_ipv4_address("0.0.0.0"));
+        assert!(classifier.is_ipv4_address("255.255.255.255"));
+        assert!(classifier.is_ipv4_address("192.0.0.1"));
+        assert!(classifier.is_ipv4_address("0.255.0.255"));
+
+        // Private network addresses
+        assert!(classifier.is_ipv4_address("192.168.0.1"));
+        assert!(classifier.is_ipv4_address("10.0.0.1"));
+        assert!(classifier.is_ipv4_address("172.16.0.1"));
+    }
+
+    #[test]
+    fn test_ipv6_full_notation() {
+        let classifier = SemanticClassifier::new();
+
+        // Full IPv6 notation
+        assert!(classifier.is_ipv6_address("2001:0db8:85a3:0000:0000:8a2e:0370:7334"));
+    }
+
+    #[test]
+    fn test_ipv6_compressed() {
+        let classifier = SemanticClassifier::new();
+
+        // Compressed IPv6 notation
+        assert!(classifier.is_ipv6_address("2001:db8::1"));
+        assert!(classifier.is_ipv6_address("::1"));
+        assert!(classifier.is_ipv6_address("fe80::1"));
+        assert!(classifier.is_ipv6_address("::"));
+    }
+
+    #[test]
+    fn test_ipv6_mixed_notation() {
+        let classifier = SemanticClassifier::new();
+
+        // Mixed IPv4/IPv6 notation
+        assert!(classifier.is_ipv6_address("::ffff:192.0.2.1"));
+        assert!(classifier.is_ipv6_address("64:ff9b::192.0.2.1"));
+    }
+
+    #[test]
+    fn test_ipv6_invalid() {
+        let classifier = SemanticClassifier::new();
+
+        // Invalid IPv6 addresses
+        assert!(!classifier.is_ipv6_address("gggg::1")); // Invalid hex
+        assert!(!classifier.is_ipv6_address("2001:db8::1::2")); // Double ::
+        assert!(!classifier.is_ipv6_address("2001:db8:1")); // Too short
+    }
+
+    #[test]
+    fn test_ipv6_with_brackets() {
+        let classifier = SemanticClassifier::new();
+
+        // IPv6 addresses with brackets
+        assert!(classifier.is_ipv6_address("[2001:db8::1]"));
+        assert!(classifier.is_ipv6_address("[::1]"));
+    }
+
+    #[test]
+    fn test_ipv6_with_port() {
+        let classifier = SemanticClassifier::new();
+
+        // IPv6 addresses with brackets and ports
+        assert!(classifier.is_ipv6_address("[2001:db8::1]:8080"));
+        assert!(classifier.is_ipv6_address("[::1]:8080"));
+    }
+
+    #[test]
+    fn test_classify_ipv4() {
+        let classifier = SemanticClassifier::new();
+        let found_string = create_test_string("192.168.1.1");
+
+        let tags = classifier.classify(&found_string);
+        assert_eq!(tags.len(), 1);
+        assert!(matches!(tags[0], Tag::IPv4));
+    }
+
+    #[test]
+    fn test_classify_ipv6() {
+        let classifier = SemanticClassifier::new();
+        let found_string = create_test_string("::1");
+
+        let tags = classifier.classify(&found_string);
+        assert_eq!(tags.len(), 1);
+        assert!(matches!(tags[0], Tag::IPv6));
+    }
+
+    #[test]
+    fn test_classify_no_ip() {
+        let classifier = SemanticClassifier::new();
+        let found_string = create_test_string("not an ip address");
+
+        let tags = classifier.classify_ip_addresses(&found_string.text);
+        assert!(tags.is_empty());
+    }
+
+    #[test]
+    fn test_classify_ipv4_with_port() {
+        let classifier = SemanticClassifier::new();
+        let found_string = create_test_string("192.168.1.1:8080");
+
+        let tags = classifier.classify(&found_string);
+        assert_eq!(tags.len(), 1);
+        assert!(matches!(tags[0], Tag::IPv4));
+    }
+
+    #[test]
+    fn test_classify_ipv6_with_brackets_and_port() {
+        let classifier = SemanticClassifier::new();
+        let found_string = create_test_string("[::1]:8080");
+
+        let tags = classifier.classify(&found_string);
+        assert_eq!(tags.len(), 1);
+        assert!(matches!(tags[0], Tag::IPv6));
+    }
 }
diff --git a/src/extraction/utf16.rs b/src/extraction/utf16.rs
index b7d02ab..711ffbb 100644
--- a/src/extraction/utf16.rs
+++ b/src/extraction/utf16.rs
@@ -706,131 +706,7 @@ fn extract_utf16le_strings_internal(
     data: &[u8],
     config: &Utf16ExtractionConfig,
 ) -> Vec<FoundString> {
-    let mut strings = Vec::new();
-
-    // Need at least 2 bytes for a UTF-16LE character
-    if data.len() < 2 {
-        return strings;
-    }
-
-    // Helper function to scan from a given start offset
-    fn scan_from_offset_le(
-        data: &[u8],
-        config: &Utf16ExtractionConfig,
-        start_offset: usize,
-    ) -> Vec<FoundString> {
-        let mut found_strings = Vec::new();
-        let mut i = start_offset;
-        while i + 1 < data.len() {
-            let mut char_count = 0;
-            let start = i;
-            let mut has_null_terminator = false;
-            let mut chars = Vec::new();
-
-            // Accumulate printable UTF-16LE characters
-            while i + 1 < data.len() {
-                // Read current code unit as u16
-                let code_unit = u16::from_le_bytes([data[i], data[i + 1]]);
-
-                // Check for null terminator (0x0000)
-                if code_unit == 0x0000 {
-                    has_null_terminator = true;
-                    break;
-                }
-
-                // Check if we have a next code unit for surrogate pair detection
-                let next_code_unit = if i + 3 < data.len() {
-                    Some(u16::from_le_bytes([data[i + 2], data[i + 3]]))
-                } else {
-                    None
-                };
-
-                // Check if character is printable (handles surrogate pairs)
-                let (is_printable, consumed_units) =
-                    is_printable_code_unit_or_pair(code_unit, next_code_unit);
-
-                if is_printable {
-                    char_count += 1; // Count as single character (even if surrogate pair)
-                    chars.push(code_unit);
-                    if consumed_units == 2 {
-                        // Surrogate pair - also add the low surrogate
-                        if i + 3 < data.len() {
-                            chars.push(u16::from_le_bytes([data[i + 2], data[i + 3]]));
-                        }
-                    }
-                    i += consumed_units * 2; // Advance by number of bytes (2 per code unit)
-                } else {
-                    // Non-printable character or lone surrogate, end of string candidate
-                    break;
-                }
-            }
-
-            // Check if we found a valid string
-            if char_count >= config.min_length {
-                // Check maximum length if configured
-                if let Some(max_len) = config.max_length
-                    && char_count > max_len
-                {
-                    // Skip this string, move to next position
-                    i += 2;
-                    continue;
-                }
-
-                // Calculate end position (including null terminator if present)
-                let end = if has_null_terminator { i + 2 } else { i };
-
-                // Extract the string bytes (excluding null terminator for decoding)
-                let string_bytes = &data[start..end.min(data.len())];
-                let bytes_for_decoding = if has_null_terminator && string_bytes.len() >= 2 {
-                    &string_bytes[..string_bytes.len() - 2]
-                } else {
-                    string_bytes
-                };
-
-                // Decode to UTF-8 and get u16 vector
-                if let Ok((text, u16_vec)) = decode_utf16le(bytes_for_decoding) {
-                    // Calculate UTF-16-specific confidence
-                    let utf16_confidence = calculate_utf16_confidence(&u16_vec, ByteOrder::LE);
-
-                    // Apply confidence threshold
-                    if utf16_confidence >= config.confidence_threshold {
-                        found_strings.push(FoundString {
-                            text,
-                            encoding: Encoding::Utf16Le,
-                            offset: start as u64,
-                            rva: None,
-                            section: None,
-                            length: bytes_for_decoding.len() as u32,
-                            tags: Vec::new(),
-                            score: 0,
-                            source: StringSource::SectionData,
-                            confidence: utf16_confidence,
-                        });
-                    }
-                }
-            }
-
-            // Move to next potential start position
-            // If we found a null terminator, skip past it
-            if has_null_terminator {
-                i += 2;
-            } else {
-                // Move forward by 2 bytes to try next alignment
-                i += 2;
-            }
-        }
-        found_strings
-    }
-
-    // First pass: scan starting at even offset (index 0)
-    strings.extend(scan_from_offset_le(data, config, 0));
-
-    // Second pass: scan starting at odd offset (index 1) if enabled
-    if config.scan_both_alignments && data.len() >= 3 {
-        strings.extend(scan_from_offset_le(data, config, 1));
-    }
-
-    strings
+    extract_utf16_strings_with_byte_order(data, config, ByteOrder::LE)
 }
 
 /// Extract UTF-16BE strings from a byte slice (internal)
@@ -850,32 +726,42 @@ fn extract_utf16le_strings_internal(
 fn extract_utf16be_strings_internal(
     data: &[u8],
     config: &Utf16ExtractionConfig,
+) -> Vec<FoundString> {
+    extract_utf16_strings_with_byte_order(data, config, ByteOrder::BE)
+}
+
+/// Generic UTF-16 string extraction with the specified byte order
+///
+/// Scans through the byte slice looking for contiguous sequences of printable UTF-16
+/// characters in the specified byte order. Handles both even and odd alignment scanning.
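+///
+/// `byte_order` must be `ByteOrder::LE` or `ByteOrder::BE`; passing
+/// `ByteOrder::Auto` is a programmer error and panics via `unreachable!()`.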
+fn extract_utf16_strings_with_byte_order(
+    data: &[u8],
+    config: &Utf16ExtractionConfig,
+    byte_order: ByteOrder,
 ) -> Vec<FoundString> {
     let mut strings = Vec::new();
 
-    // Need at least 2 bytes for a UTF-16BE character
     if data.len() < 2 {
         return strings;
     }
 
     // Helper function to scan from a given start offset
-    fn scan_from_offset_be(
-        data: &[u8],
-        config: &Utf16ExtractionConfig,
-        start_offset: usize,
-    ) -> Vec<FoundString> {
+    let scan_from_offset = |start_offset: usize| -> Vec<FoundString> {
         let mut found_strings = Vec::new();
         let mut i = start_offset;
         while i + 1 < data.len() {
            let mut char_count = 0;
            let start = i;
            let mut has_null_terminator = false;
-            let mut chars = Vec::new();
 
-            // Accumulate printable UTF-16BE characters
+            // Accumulate printable UTF-16 characters
             while i + 1 < data.len() {
-                // Read current code unit as u16 (big-endian)
-                let code_unit = u16::from_be_bytes([data[i], data[i + 1]]);
+                // Read current code unit as u16
+                let code_unit = match byte_order {
+                    ByteOrder::LE => u16::from_le_bytes([data[i], data[i + 1]]),
+                    ByteOrder::BE => u16::from_be_bytes([data[i], data[i + 1]]),
+                    ByteOrder::Auto => unreachable!(),
+                };
 
                 // Check for null terminator (0x0000)
                 if code_unit == 0x0000 {
@@ -885,7 +771,11 @@ fn extract_utf16be_strings_internal(
 
                 // Check if we have a next code unit for surrogate pair detection
                 let next_code_unit = if i + 3 < data.len() {
-                    Some(u16::from_be_bytes([data[i + 2], data[i + 3]]))
+                    Some(match byte_order {
+                        ByteOrder::LE => u16::from_le_bytes([data[i + 2], data[i + 3]]),
+                        ByteOrder::BE => u16::from_be_bytes([data[i + 2], data[i + 3]]),
+                        ByteOrder::Auto => unreachable!(),
+                    })
                 } else {
                     None
                 };
@@ -895,36 +785,23 @@ fn extract_utf16be_strings_internal(
                     is_printable_code_unit_or_pair(code_unit, next_code_unit);
 
                 if is_printable {
-                    char_count += 1; // Count as single character (even if surrogate pair)
-                    chars.push(code_unit);
-                    if consumed_units == 2 {
-                        // Surrogate pair - also add the low surrogate
-                        if i + 3 < data.len() {
-                            chars.push(u16::from_be_bytes([data[i + 2], data[i + 3]]));
-                        }
-                    }
-                    i += consumed_units * 2; // Advance by number of bytes (2 per code unit)
+                    char_count += 1;
+                    i += consumed_units * 2;
                 } else {
-                    // Non-printable character or lone surrogate, end of string candidate
                    break;
                }
            }
 
            // Check if we found a valid string
            if char_count >= config.min_length {
-                // Check maximum length if configured
                 if let Some(max_len) = config.max_length
                     && char_count > max_len
                 {
-                    // Skip this string, move to next position
                     i += 2;
                     continue;
                 }
 
-                // Calculate end position (including null terminator if present)
                 let end = if has_null_terminator { i + 2 } else { i };
-
-                // Extract the string bytes (excluding null terminator for decoding)
                 let string_bytes = &data[start..end.min(data.len())];
                 let bytes_for_decoding = if has_null_terminator && string_bytes.len() >= 2 {
                     &string_bytes[..string_bytes.len() - 2]
@@ -933,15 +810,23 @@ fn extract_utf16be_strings_internal(
                 };
 
                 // Decode to UTF-8 and get u16 vector
-                if let Ok((text, u16_vec)) = decode_utf16be(bytes_for_decoding) {
-                    // Calculate UTF-16-specific confidence
-                    let utf16_confidence = calculate_utf16_confidence(&u16_vec, ByteOrder::BE);
+                let decode_result = match byte_order {
+                    ByteOrder::LE => decode_utf16le(bytes_for_decoding),
+                    ByteOrder::BE => decode_utf16be(bytes_for_decoding),
+                    ByteOrder::Auto => unreachable!(),
+                };
+
+                if let Ok((text, u16_vec)) = decode_result {
+                    let utf16_confidence = calculate_utf16_confidence(&u16_vec, byte_order);
 
-                    // Apply confidence threshold
                     if utf16_confidence >= config.confidence_threshold
                     {
                         found_strings.push(FoundString {
                             text,
-                            encoding: Encoding::Utf16Be,
+                            encoding: match byte_order {
+                                ByteOrder::LE => Encoding::Utf16Le,
+                                ByteOrder::BE => Encoding::Utf16Be,
+                                ByteOrder::Auto => unreachable!(),
+                            },
                             offset: start as u64,
                             rva: None,
                             section: None,
@@ -956,23 +841,17 @@ fn extract_utf16be_strings_internal(
             }
 
             // Move to next potential start position
-            // If we found a null terminator, skip past it
-            if has_null_terminator {
-                i += 2;
-            } else {
-                // Move forward by 2 bytes to try next alignment
-                i += 2;
-            }
+            i += 2;
         }
         found_strings
-    }
+    };
 
     // First pass: scan starting at even offset (index 0)
-    strings.extend(scan_from_offset_be(data, config, 0));
+    strings.extend(scan_from_offset(0));
 
     // Second pass: scan starting at odd offset (index 1) if enabled
     if config.scan_both_alignments && data.len() >= 3 {
-        strings.extend(scan_from_offset_be(data, config, 1));
+        strings.extend(scan_from_offset(1));
     }
 
     strings
@@ -1005,42 +884,26 @@ pub fn extract_utf16_strings(data: &[u8], config: &Utf16ExtractionConfig) -> Vec<FoundString>
         let le_strings = extract_utf16le_strings_internal(data, config);
         let be_strings = extract_utf16be_strings_internal(data, config);
 
-        // Merge results, deduplicating by (offset, encoding, text) to avoid true duplicates
-        // while retaining distinct strings at the same offset (different encoding or text)
-        // Maintain a Vec and check for exact duplicates manually, preferring higher confidence
-        for string in le_strings {
-            // Check if we already have an exact duplicate
+        // Helper to add a string with deduplication
+        let mut add_with_dedup = |string: FoundString| {
             if let Some(existing) = strings.iter_mut().find(|s| {
                 s.offset == string.offset
                     && s.encoding == string.encoding
                     && s.text == string.text
             }) {
-                // Prefer the one with higher confidence
                 if string.confidence > existing.confidence {
                     *existing = string;
                 }
             } else {
-                // New distinct string, add it
                 strings.push(string);
             }
-        }
+        };
 
-        // Add BE strings, preferring higher confidence for exact duplicates
+        for string in le_strings {
+            add_with_dedup(string);
+        }
         for string in be_strings {
-            // Check if we already have an exact duplicate
-            if let Some(existing) = strings.iter_mut().find(|s| {
-                s.offset == string.offset
-                    && s.encoding == string.encoding
-                    && s.text == string.text
-            }) {
-                // Prefer the one with higher confidence
-                if string.confidence > existing.confidence {
-                    *existing = string;
-                }
-            } else {
-                // New distinct string, add it
-                strings.push(string);
-            }
+            add_with_dedup(string);
         }
     }
 }