From 064810a82c6e42563bed5ac1ed718516c71c579a Mon Sep 17 00:00:00 2001 From: Abbey Marye Date: Wed, 12 Nov 2025 12:18:41 -0700 Subject: [PATCH 1/2] Add files via upload Script to merge fastas and create date file for measles phylogenetic analysis. --- Measles_File_Prep.py | 97 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 Measles_File_Prep.py diff --git a/Measles_File_Prep.py b/Measles_File_Prep.py new file mode 100644 index 0000000..ea96af2 --- /dev/null +++ b/Measles_File_Prep.py @@ -0,0 +1,97 @@ +import os +from Bio import SeqIO +import pandas as pd + +# --- User Input Section --- +print("--- FASTA Merging Setup ---") +input_folder1 = input("Enter the path for the FASTA input folder (ML tree preparation): ") +input_folder2 = input("Enter the path for the FASTA input folder (BEAST preparation): ") + +# Define output paths for the FASTA merging (you can make these inputs too if desired) +output_file1 = "measles_merged_ML.fasta" +output_file2 = "measles_merged_Beast.fasta" + +print("\n--- CSV Processing Setup ---") +input_csv = input("Enter the path for the input CSV file: ") +# Define output path for the TSV (you can make this an input too if desired) +output_tsv = "measles_dates.txt" + + +# --- FASTA Merging Function (First Block) --- +def merge_fasta_files(input_folder, output_file): + """ + Finds all .fa files in a folder, merges them into a single FASTA file, + and replaces each sequence header with its original filename (prefix). + """ + try: + # Get all FASTA files in the folder + fasta_files = [f for f in os.listdir(input_folder) if f.endswith(".fa")] + + + + with open(output_file, "w") as out_f: + for file in fasta_files: + file_path = os.path.join(input_folder, file) + # Extract filename without extension for the new header + file_prefix = os.path.splitext(file)[0] + + with open(file_path, "r") as in_f: + for record in SeqIO.parse(in_f, "fasta"): + record.id = file_prefix # Replace header with filename + record.description = "" # Remove any additional description + SeqIO.write(record, out_f, "fasta") + + print(f"Merged {len(fasta_files)} FASTA files from '{input_folder}' into '{output_file}' with modified headers.") + except FileNotFoundError: + print(f"Error: The input folder '{input_folder}' was not found.") + except Exception as e: + print(f"An error occurred during FASTA merging for '{input_folder}': {e}") + +# Run the FASTA merging for both paths +merge_fasta_files(input_folder1, output_file1) +merge_fasta_files(input_folder2, output_file2) + +print("-" * 40) + +# --- CSV Processing Function (Second Block) --- +def process_data(input_csv, output_tsv): + """ + Reads a CSV, filters rows without required accession or date, and saves + the accession number and date columns to a TSV file. + + Args: + input_csv (str): The path to the input CSV file. + output_tsv (str): The path to the output TSV file. + """ + try: + # 1. Read the CSV file into a pandas DataFrame. + # Assuming 'UPHL_lab_accession' and 'patient_disease_onset_date' are the column names. + df = pd.read_csv(input_csv, dtype={'UPHL ID #': str}) + + # 2. Remove rows where the required columns are null or missing. + df_filtered = df.dropna(subset=['UPHL ID #', 'Collection Date']) + + # 3. Select only the required columns. + df_filtered = df_filtered[['UPHL ID #', 'Collection Date']] + + # 4. Rename the columns to "Accession" and "Date". + df_final = df_filtered.rename( + columns={'UPHL_lab_accession': 'Accession', + 'Collection Date': 'Date'} + ) + + # 5. Save the new DataFrame to a TSV file. + + df_final.to_csv(output_tsv, sep='\t', index=False) + + print(f"Successfully processed data. Filtered data saved to '{output_tsv}'.") + + except FileNotFoundError: + print(f"Error: The file '{input_csv}' was not found.") + except KeyError as e: + print(f"Error: A required column was not found. Please check your column names. Missing column: {e}") + except Exception as e: + print(f"An unexpected error occurred during CSV processing: {e}") + +# Run the CSV processing block +process_data(input_csv, output_tsv) \ No newline at end of file From bb3f0e70398fe46afb2e8762d9c41639c5ddabf9 Mon Sep 17 00:00:00 2001 From: Abbey Marye Date: Mon, 9 Feb 2026 08:41:43 -0700 Subject: [PATCH 2/2] Update to only create date file --- Measles_File_Prep.py | 66 ++++++-------------------------------------- 1 file changed, 8 insertions(+), 58 deletions(-) diff --git a/Measles_File_Prep.py b/Measles_File_Prep.py index ea96af2..54d7984 100644 --- a/Measles_File_Prep.py +++ b/Measles_File_Prep.py @@ -3,55 +3,8 @@ import pandas as pd # --- User Input Section --- -print("--- FASTA Merging Setup ---") -input_folder1 = input("Enter the path for the FASTA input folder (ML tree preparation): ") -input_folder2 = input("Enter the path for the FASTA input folder (BEAST preparation): ") +input_csv = input("Enter the path to Measles_Metadata.csv: ") -# Define output paths for the FASTA merging (you can make these inputs too if desired) -output_file1 = "measles_merged_ML.fasta" -output_file2 = "measles_merged_Beast.fasta" - -print("\n--- CSV Processing Setup ---") -input_csv = input("Enter the path for the input CSV file: ") -# Define output path for the TSV (you can make this an input too if desired) -output_tsv = "measles_dates.txt" - - -# --- FASTA Merging Function (First Block) --- -def merge_fasta_files(input_folder, output_file): - """ - Finds all .fa files in a folder, merges them into a single FASTA file, - and replaces each sequence header with its original filename (prefix). - """ - try: - # Get all FASTA files in the folder - fasta_files = [f for f in os.listdir(input_folder) if f.endswith(".fa")] - - - - with open(output_file, "w") as out_f: - for file in fasta_files: - file_path = os.path.join(input_folder, file) - # Extract filename without extension for the new header - file_prefix = os.path.splitext(file)[0] - - with open(file_path, "r") as in_f: - for record in SeqIO.parse(in_f, "fasta"): - record.id = file_prefix # Replace header with filename - record.description = "" # Remove any additional description - SeqIO.write(record, out_f, "fasta") - - print(f"Merged {len(fasta_files)} FASTA files from '{input_folder}' into '{output_file}' with modified headers.") - except FileNotFoundError: - print(f"Error: The input folder '{input_folder}' was not found.") - except Exception as e: - print(f"An error occurred during FASTA merging for '{input_folder}': {e}") - -# Run the FASTA merging for both paths -merge_fasta_files(input_folder1, output_file1) -merge_fasta_files(input_folder2, output_file2) - -print("-" * 40) # --- CSV Processing Function (Second Block) --- def process_data(input_csv, output_tsv): @@ -66,22 +19,19 @@ def process_data(input_csv, output_tsv): try: # 1. Read the CSV file into a pandas DataFrame. # Assuming 'UPHL_lab_accession' and 'patient_disease_onset_date' are the column names. - df = pd.read_csv(input_csv, dtype={'UPHL ID #': str}) + df = pd.read_csv(input_csv, dtype={'UPHL_lab_accession': str}) # 2. Remove rows where the required columns are null or missing. - df_filtered = df.dropna(subset=['UPHL ID #', 'Collection Date']) + df_filtered = df.dropna(subset=['UPHL_lab_accession']) + df_filtered = df_filtered.dropna(subset=['patient_disease_onset_date']) - # 3. Select only the required columns. - df_filtered = df_filtered[['UPHL ID #', 'Collection Date']] + # 3. Select only the "Accession" and "Date" columns. + df_filtered = df_filtered[['UPHL_lab_accession', 'patient_disease_onset_date']] # 4. Rename the columns to "Accession" and "Date". - df_final = df_filtered.rename( - columns={'UPHL_lab_accession': 'Accession', - 'Collection Date': 'Date'} - ) + df_final = df_filtered.rename(columns={'UPHL_lab_accession': 'Accession', 'patient_disease_onset_date': 'Date'}) # 5. Save the new DataFrame to a TSV file. - df_final.to_csv(output_tsv, sep='\t', index=False) print(f"Successfully processed data. Filtered data saved to '{output_tsv}'.") @@ -94,4 +44,4 @@ def process_data(input_csv, output_tsv): print(f"An unexpected error occurred during CSV processing: {e}") # Run the CSV processing block -process_data(input_csv, output_tsv) \ No newline at end of file +process_data(input_csv, 'measles_dates.txt')