From 0cd354d9259aded774b3dce550e89f43cde9e189 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 5 Jan 2026 22:48:46 +0000 Subject: [PATCH 1/3] Initial plan From 6df0f27daecd047afb7c6ceadb95d64c77a3a837 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 5 Jan 2026 22:54:30 +0000 Subject: [PATCH 2/3] fix: address PR review comments - spelling, port validation, test comment, and documentation Co-authored-by: unclesp1d3r <251112+unclesp1d3r@users.noreply.github.com> --- .kiro/hooks/code-review-refactor.kiro.hook | 2 +- src/classification/semantic.rs | 18 +++++++++++++++--- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/.kiro/hooks/code-review-refactor.kiro.hook b/.kiro/hooks/code-review-refactor.kiro.hook index 17dcbc8..7ccacf3 100644 --- a/.kiro/hooks/code-review-refactor.kiro.hook +++ b/.kiro/hooks/code-review-refactor.kiro.hook @@ -1,6 +1,6 @@ { "enabled": true, - "name": "Code Simplicy Checker", + "name": "Code Simplicity Checker", "description": "When the agent finishes its work, automatically trigger a comprehensive code review to eliminate unnecessary complexity, refactor for simplicity, reduce test bloat, and verify idiomatic style before finalizing any code changes", "version": "1", "when": { diff --git a/src/classification/semantic.rs b/src/classification/semantic.rs index fa148a3..3729597 100644 --- a/src/classification/semantic.rs +++ b/src/classification/semantic.rs @@ -76,8 +76,11 @@ lazy_static! { /// Regular expression for detecting and stripping port suffixes /// - /// Matches :port where port is 1-5 digits (0-65535). - static ref PORT_SUFFIX_REGEX: Regex = Regex::new(r":[0-9]{1,5}$").unwrap(); + /// Matches :port where port is in the valid range 0-65535. + /// Pattern: :[0-9]{1,4} matches 0-9999, |[1-5][0-9]{4} matches 10000-59999, + /// |6[0-4][0-9]{3} matches 60000-64999, |65[0-4][0-9]{2} matches 65000-65499, + /// |655[0-2][0-9] matches 65500-65529, |6553[0-5] matches 65530-65535. + static ref PORT_SUFFIX_REGEX: Regex = Regex::new(r":(?:[0-9]{1,4}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])$").unwrap(); /// Regular expression for handling bracketed IPv6 addresses /// @@ -381,6 +384,15 @@ impl SemanticClassifier { /// /// It also handles port suffixes and filters out version numbers. /// + /// # Version Number Heuristic + /// + /// To reduce false positives from version numbers (e.g., "1.2.3.4"), + /// this method rejects IPv4 addresses where all octets are less than 20. + /// This heuristic may occasionally produce false positives for legitimate + /// IP addresses that happen to match version number patterns (e.g., "10.5.2.15"). + /// Common addresses like 8.8.8.8 (Google DNS) and private network addresses + /// in well-known ranges are explicitly allowed to mitigate this. + /// /// # Arguments /// /// * `text` - The text to search for IPv4 addresses @@ -766,7 +778,7 @@ mod tests { // Version numbers should be rejected assert!(!classifier.is_ipv4_address("1.2.3.4")); assert!(!classifier.is_ipv4_address("2.0.1.0")); - assert!(!classifier.is_ipv4_address("10.5.2.1")); // Some octets < 20, but not all + assert!(!classifier.is_ipv4_address("10.5.2.1")); // All octets < 20 -> treated as version number, so rejected assert!(classifier.is_ipv4_address("10.5.2.20")); // Valid IP (not all < 20) } From 2e7c1ec4d627dc6282b6a7a7ac7eb23b33a055e8 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 5 Jan 2026 23:06:46 +0000 Subject: [PATCH 3/3] refactor: remove version number heuristic, accept all valid IPv4 addresses Co-authored-by: unclesp1d3r <251112+unclesp1d3r@users.noreply.github.com> --- src/classification/semantic.rs | 59 ++++++++-------------------------- 1 file changed, 14 insertions(+), 45 deletions(-) diff --git a/src/classification/semantic.rs b/src/classification/semantic.rs index 3729597..da3143a 100644 --- a/src/classification/semantic.rs +++ b/src/classification/semantic.rs @@ -382,16 +382,14 @@ impl SemanticClassifier { /// 1. Regex pre-filter for performance /// 2. `std::net::Ipv4Addr` validation for correctness /// - /// It also handles port suffixes and filters out version numbers. + /// It also handles port suffixes (e.g., "192.168.1.1:8080"). /// - /// # Version Number Heuristic + /// # Note on Version Numbers /// - /// To reduce false positives from version numbers (e.g., "1.2.3.4"), - /// this method rejects IPv4 addresses where all octets are less than 20. - /// This heuristic may occasionally produce false positives for legitimate - /// IP addresses that happen to match version number patterns (e.g., "10.5.2.15"). - /// Common addresses like 8.8.8.8 (Google DNS) and private network addresses - /// in well-known ranges are explicitly allowed to mitigate this. + /// This method accepts ALL valid IPv4 addresses in dotted-quad notation, + /// even if they could also be interpreted as version numbers (e.g., "1.2.3.4"). + /// It is the responsibility of the caller to disambiguate between IP addresses + /// and version numbers based on context when necessary. /// /// # Arguments /// @@ -409,7 +407,7 @@ impl SemanticClassifier { /// let classifier = SemanticClassifier::new(); /// assert!(classifier.is_ipv4_address("192.168.1.1")); /// assert!(classifier.is_ipv4_address("192.168.1.1:8080")); - /// assert!(!classifier.is_ipv4_address("1.2.3.4")); // Version number + /// assert!(classifier.is_ipv4_address("1.2.3.4")); // Valid IP (could also be a version number) /// assert!(!classifier.is_ipv4_address("256.1.1.1")); // Invalid octet /// ``` pub fn is_ipv4_address(&self, text: &str) -> bool { @@ -431,37 +429,7 @@ impl SemanticClassifier { // Validate using std::net::Ipv4Addr for correctness // This is the authoritative check - regex is just a pre-filter - let ip = match Ipv4Addr::from_str(text_without_port) { - Ok(ip) => ip, - Err(_) => return false, - }; - - // Apply false positive mitigation: reject version numbers - // Version numbers like 1.2.3.4 or 10.5.2.1 typically have all octets < 20 - // We use a heuristic: reject if all octets are < 20 (as per plan) - // But allow common real IP addresses and private network ranges - let octets = ip.octets(); - - // Allow 0.0.0.0 (unspecified address) and common single-digit IPs - // Also allow specific common private IPs that would otherwise be rejected - let common_ips = [ - [0, 0, 0, 0], // Unspecified - [1, 1, 1, 1], // Cloudflare DNS - [8, 8, 8, 8], // Google DNS - [8, 8, 4, 4], // Google DNS alt - [10, 0, 0, 1], // Common private IP - ]; - - if common_ips.contains(&octets) { - return true; - } - - // Reject if all octets are < 20 (likely a version number) - if octets.iter().all(|&octet| octet < 20) { - return false; - } - - true + Ipv4Addr::from_str(text_without_port).is_ok() } /// Detects IPv6 addresses in the given text @@ -775,11 +743,12 @@ mod tests { fn test_ipv4_version_numbers() { let classifier = SemanticClassifier::new(); - // Version numbers should be rejected - assert!(!classifier.is_ipv4_address("1.2.3.4")); - assert!(!classifier.is_ipv4_address("2.0.1.0")); - assert!(!classifier.is_ipv4_address("10.5.2.1")); // All octets < 20 -> treated as version number, so rejected - assert!(classifier.is_ipv4_address("10.5.2.20")); // Valid IP (not all < 20) + // Valid IPv4 addresses that could also be version numbers are accepted + // It's the caller's responsibility to disambiguate based on context + assert!(classifier.is_ipv4_address("1.2.3.4")); + assert!(classifier.is_ipv4_address("2.0.1.0")); + assert!(classifier.is_ipv4_address("10.5.2.1")); + assert!(classifier.is_ipv4_address("10.5.2.20")); } #[test]