From 26a86343a959895332951660fcc6124f5169c3b0 Mon Sep 17 00:00:00 2001
From: baudrly
Date: Sun, 19 Oct 2025 22:39:26 +0200
Subject: [PATCH] Fix preprocessing export and filtering logic

---
 src/clop/data_utils/preprocessing.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/clop/data_utils/preprocessing.py b/src/clop/data_utils/preprocessing.py
index cfe2259..b6930ca 100644
--- a/src/clop/data_utils/preprocessing.py
+++ b/src/clop/data_utils/preprocessing.py
@@ -1,4 +1,7 @@
+import os
+
 import numpy as np
+import pandas as pd
 from matplotlib import pyplot as plt
 
 
@@ -72,13 +75,16 @@ def preprocess_inputs(path, path_save):
     sequences = df[input_column]
     labels = df[label_column]
 
-    labels = sequences[~sequences.str.contains("U|O")]
-    sequences = sequences[~sequences.str.contains("U|O")]
-    dna_sequences = sequences.apply(protein_to_dna)
+    mask = ~sequences.str.contains("U|O", na=False)
+    filtered_labels = labels[mask]
+    filtered_sequences = sequences[mask]
+    dna_sequences = filtered_sequences.apply(protein_to_dna)
 
-    labels.save(path_save + "labels.csv")
-    dna_sequences.save(path_save + "sequences.csv")
-    print("saved processed sequences and labels (annotations")
+    labels_path = os.path.join(path_save, "labels.csv")
+    sequences_path = os.path.join(path_save, "sequences.csv")
+    filtered_labels.to_csv(labels_path, index=False)
+    dna_sequences.to_csv(sequences_path, index=False)
+    print("saved processed sequences and labels (annotations)")
 
 
 if __name__ == "__main__":