From cedb13800318e8e57507d0aeca66f2edf6e4ecb6 Mon Sep 17 00:00:00 2001
From: Yeray <yeraymd44@gmail.com>
Date: Tue, 8 Jul 2025 13:33:35 +0200
Subject: [PATCH] Fix XML parsing: preserve attributes & handle UTF-16 BOM
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Bypass extractous for .xml files and read raw bytes
- Detect BOM and decode UTF-16/UTF-8-sig correctly
- Ensure regex “passw” matches Password attributes in XML
---
 man_spider/lib/parser/parser.py | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/man_spider/lib/parser/parser.py b/man_spider/lib/parser/parser.py
index cea5911..0c002f2 100644
--- a/man_spider/lib/parser/parser.py
+++ b/man_spider/lib/parser/parser.py
@@ -141,7 +141,26 @@ def extractous(self, file, pretty_filename):
         if not self.match_magic(file):
             return matches
 
-        text_content, metadata = self.extractor.extract_file_to_string(str(file))
+        # XML files: read raw to preserve attributes and handle encoding
+        if suffix == '.xml':
+            log.debug(f'Parsing raw XML for {pretty_filename}')
+            try:
+                # read raw bytes and detect BOM for encoding
+                with open(file, 'rb') as f:
+                    raw_bytes = f.read()
+                # UTF-16 BOM (LE or BE): the utf-16 codec reads byte order from it
+                if raw_bytes.startswith((b'\xff\xfe', b'\xfe\xff')):
+                    text_content = raw_bytes.decode('utf-16', errors='ignore')
+                else:
+                    # utf-8-sig skips a leading EF BB BF BOM, else acts as utf-8
+                    text_content = raw_bytes.decode('utf-8-sig', errors='ignore')
+                metadata = {}
+            except Exception as e:
+                log.warning(f"Error reading raw XML for {pretty_filename}: {e}")
+                return matches
+        else:
+            # non-XML: extract text via extractous
+            text_content, metadata = self.extractor.extract_file_to_string(str(file))
 
         # try to convert to UTF-8 for grep-friendliness
         try:
@@ -162,4 +181,4 @@ def extractous(self, file, pretty_filename):
             if not self.quiet:
                 self.grep(binary_content, _filter.pattern)
 
-        return matches
+        return matches
\ No newline at end of file