From 6232593677ca02c530e8fd0a3eaedd06fff3fa3d Mon Sep 17 00:00:00 2001
From: Cursor Agent
Date: Tue, 5 Aug 2025 08:46:50 +0000
Subject: [PATCH] Add BOM removal function to handle encoding issues in GTFS files

Co-authored-by: mvoina
---
 logicway/database/upload_data.py | 63 ++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/logicway/database/upload_data.py b/logicway/database/upload_data.py
index f5e9215c..f7387c26 100644
--- a/logicway/database/upload_data.py
+++ b/logicway/database/upload_data.py
@@ -2,6 +2,7 @@
 import requests
 import zipfile
 import git
+import glob
 
 url = "https://www.ztm.poznan.pl/pl/dla-deweloperow/getGTFSFile"
 headers = {
@@ -30,6 +31,67 @@ def is_git_repository(directory):
     return os.path.isdir(os.path.join(directory, '.git'))
 
 
+def remove_bom_from_files():
+    """
+    Remove BOM (Byte Order Mark) from all .txt files in the extracted data directory.
+
+    A UTF-8 BOM is stripped in place; files starting with a UTF-16 LE/BE BOM are
+    re-encoded as UTF-8 without a BOM. Files without one of these BOMs are left
+    untouched. Errors are reported per file and do not abort the remaining files.
+    BOM can cause issues when reading CSV files, especially with pandas.
+    """
+    if not os.path.exists(data_dir):
+        print(f"Data directory does not exist: {data_dir}")
+        return
+
+    # Find all .txt files (GTFS data files)
+    txt_files = glob.glob(os.path.join(data_dir, "*.txt"))
+
+    bom_removed_count = 0
+
+    for file_path in txt_files:
+        file_name = os.path.basename(file_path)
+        try:
+            # Read the file in binary mode to detect BOM
+            with open(file_path, 'rb') as f:
+                content = f.read()
+
+            # Check for UTF-8 BOM (EF BB BF)
+            if content.startswith(b'\xef\xbb\xbf'):
+                new_content = content[3:]  # Remove first 3 bytes (BOM)
+                message = f"BOM removed from: {file_name}"
+
+            # Check for UTF-16 LE BOM (FF FE)
+            elif content.startswith(b'\xff\xfe'):
+                # Skip the 2 BOM bytes before decoding: 'utf-16-le' does not
+                # consume the BOM, so it would come back as a UTF-8 BOM.
+                new_content = content[2:].decode('utf-16-le').encode('utf-8')
+                message = f"UTF-16 LE BOM removed and converted to UTF-8: {file_name}"
+
+            # Check for UTF-16 BE BOM (FE FF)
+            elif content.startswith(b'\xfe\xff'):
+                # Same as above: drop the BOM bytes, then transcode to UTF-8.
+                new_content = content[2:].decode('utf-16-be').encode('utf-8')
+                message = f"UTF-16 BE BOM removed and converted to UTF-8: {file_name}"
+
+            else:
+                continue  # No known BOM: leave the file untouched
+
+            # Write bytes in binary mode so line endings are kept exactly as they were
+            with open(file_path, 'wb') as f:
+                f.write(new_content)
+            bom_removed_count += 1
+            print(message)
+
+        except (OSError, UnicodeError) as e:
+            print(f"Error processing file {file_path}: {e}")
+
+    if bom_removed_count > 0:
+        print(f"BOM removed from {bom_removed_count} file(s)")
+    else:
+        print("No BOM found in any files")
+
+
 #########################################################
 
 
@@ -41,6 +103,7 @@ def update_internal_storage():
 
     download_from_external_storage()
     unzip_data()
+    remove_bom_from_files()  # Add BOM removal after unzipping
     delete_zip()
 
     if repo.is_dirty():