From cedb13800318e8e57507d0aeca66f2edf6e4ecb6 Mon Sep 17 00:00:00 2001
From: Yeray
Date: Tue, 8 Jul 2025 13:33:35 +0200
Subject: [PATCH] Fix XML parsing: preserve attributes & handle UTF-16 BOM
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Bypass extractous for .xml files and read raw bytes
- Detect BOM and decode UTF-16/UTF-8-sig correctly
- Ensure regex “passw” matches Password attributes in XML
---
Review note: line breaks and indentation of this patch were reconstructed
(four-space indentation assumed); verify the context lines against
parser.py before applying. The UTF-8 BOM literal was corrected from
b'\xef\xbb\bf' (backspace + "f") to b'\xef\xbb\xbf'.

 man_spider/lib/parser/parser.py | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/man_spider/lib/parser/parser.py b/man_spider/lib/parser/parser.py
index cea5911..0c002f2 100644
--- a/man_spider/lib/parser/parser.py
+++ b/man_spider/lib/parser/parser.py
@@ -141,7 +141,26 @@ def extractous(self, file, pretty_filename):
         if not self.match_magic(file):
             return matches
 
-        text_content, metadata = self.extractor.extract_file_to_string(str(file))
+        # XML files: read raw to preserve attributes and handle encoding
+        if suffix == '.xml':
+            log.debug(f'Parsing raw XML for {pretty_filename}')
+            try:
+                # read raw bytes and pick the codec from the BOM (UTF-16 LE/BE or UTF-8)
+                with open(file, 'rb') as f:
+                    raw_bytes = f.read()
+                if raw_bytes.startswith((b'\xff\xfe', b'\xfe\xff')):
+                    text_content = raw_bytes.decode('utf-16', errors='ignore')
+                elif raw_bytes.startswith(b'\xef\xbb\xbf'):
+                    text_content = raw_bytes.decode('utf-8-sig', errors='ignore')
+                else:
+                    text_content = raw_bytes.decode('utf-8', errors='ignore')
+                metadata = {}
+            except Exception as e:
+                log.warning(f"Error reading raw XML for {pretty_filename}: {e}")
+                return matches
+        else:
+            # non-XML: extract text via extractous
+            text_content, metadata = self.extractor.extract_file_to_string(str(file))
 
         # try to convert to UTF-8 for grep-friendliness
         try:
@@ -162,4 +181,4 @@ def extractous(self, file, pretty_filename):
                 if not self.quiet:
                     self.grep(binary_content, _filter.pattern)
 
-        return matches
+        return matches
\ No newline at end of file