From b6197fa9863adf8c26e39648be9844f52e66cc60 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Fri, 26 Jul 2024 17:36:38 +0200 Subject: [PATCH 01/42] changed from target to category in line original_train_data --- DatasetLoad.py | 1 + SentimentAnalyzer.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/DatasetLoad.py b/DatasetLoad.py index dc3f877..10a2f42 100644 --- a/DatasetLoad.py +++ b/DatasetLoad.py @@ -57,6 +57,7 @@ def load_datasets(self): data = data.rename(columns={'Tweet': 'text'}) # remove the rows of the text column in which the text is "Not Available" data = data[data['text'] != 'Not Available'] + data = data.dropna() # Ensure the first column is 'text' and the second column is 'category' data = data[['text', 'category'] + [col for col in data.columns if col not in ['text', 'category']]] diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index 66546c3..642d1af 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -29,11 +29,11 @@ def map_label_to_target(self, label): return None # Generate synthetic data using LLMs to be defined - def generate_synthetic_data(self, topic, n_samples): + def generate_synthetic_data(self, topic, text, n_samples): openai.api_key = 'YOUR_API_KEY' synthetic_data = [] for _ in range(n_samples): - prompt = f"Generate six tweets related to {topic} that expresses sentiment." + prompt = f"Generate six tweets related to {topic} that expresses sentiment similar to {text}" response = openai.Completion.create( engine="text-davinci-003", prompt=prompt, From 7c9ae480147ce7203d24f8b0fa702b392dc44683 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Fri, 26 Jul 2024 17:54:39 +0200 Subject: [PATCH 02/42] First implementation in main of generating new data --- SentimentAnalyzer.py | 24 +++++++-------- main.py | 73 +++++++++++++++++++++++++++++--------------- 2 files changed, 60 insertions(+), 37 deletions(-) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index 642d1af..b8ca503 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -13,6 +13,10 @@ def __init__(self): self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, ignore_mismatched_sizes=True).to(self.device) self.classifier = pipeline("sentiment-analysis", model=self.model, tokenizer=self.tokenizer, device=self.device) + # Initialize FLAN model for synthetic data generation + self.flan_model_name = "google/flan-t5-small" + self.flan_tokenizer = AutoTokenizer.from_pretrained(self.flan_model_name) + self.flan_model = AutoModelForSeq2SeqLM.from_pretrained(self.flan_model_name).to(self.device) def analyze_sentiment(self, text): results = self.classifier(text) return results[0]['label'] @@ -28,18 +32,15 @@ def map_label_to_target(self, label): else: return None - # Generate synthetic data using LLMs to be defined + # Generate synthetic data using Hugging Face model def generate_synthetic_data(self, topic, text, n_samples): - openai.api_key = 'YOUR_API_KEY' synthetic_data = [] for _ in range(n_samples): - prompt = f"Generate six tweets related to {topic} that expresses sentiment similar to {text}" - response = openai.Completion.create( - engine="text-davinci-003", - prompt=prompt, - max_tokens=60 - ) - synthetic_data.append(response.choices[0].text.strip()) + prompt = f"Generate a tweet related to {topic} that expresses sentiment similar to: '{text}'" + inputs = self.flan_tokenizer(prompt, return_tensors="pt").to(self.device) + outputs = 
self.flan_model.generate(inputs.input_ids, max_length=60, num_return_sequences=1) + generated_text = self.flan_tokenizer.decode(outputs[0], skip_special_tokens=True) + synthetic_data.append(generated_text) return synthetic_data def augment_training_data(self, topics, n_samples=100): @@ -47,7 +48,6 @@ def augment_training_data(self, topics, n_samples=100): augmented_data_with_topics = {'text': [], 'label': [], 'topic': []} for topic in topics: synthetic_texts = self.generate_synthetic_data(topic, n_samples) - # Assuming the sentiment label for generated data augmented_data['text'].extend(synthetic_texts) augmented_data['label'].extend([1] * len(synthetic_texts)) # Defaulting to neutral augmented_data_with_topics['text'].extend(synthetic_texts) @@ -55,10 +55,10 @@ def augment_training_data(self, topics, n_samples=100): augmented_data_with_topics['topic'].extend([topic] * len(synthetic_texts)) augmented_df = pd.DataFrame(augmented_data) - augmented_df_with_topics = pd.DaataFrame(augmented_data_with_topics) + augmented_df_with_topics = pd.DataFrame(augmented_data_with_topics) return augmented_df, augmented_df_with_topics - def fine_tune_with_augmented_data(self, topics, n_samples=100, epochs=3, batch_size=16, learning_rate=2e-5): + def fine_tune_with_augmented_data(self, topics, n_samples=6, epochs=3, batch_size=16, learning_rate=2e-5): augmented_train_data, augmented_train_data_with_topics = self.augment_training_data(topics, n_samples) return self.fine_tune(augmented_train_data, epochs, batch_size, learning_rate), augmented_train_data_with_topics diff --git a/main.py b/main.py index cc7b16e..32cca72 100644 --- a/main.py +++ b/main.py @@ -153,32 +153,55 @@ def compute_metrics(subgroups, true_labels_column='category', pred_labels_column print("\nValidation Metrics per Topic") print(val_metrics) - # Function to analyze disparities in sentiment predictions - def analyze_disparities(subgroups): - analysis_results = [] - for subgroup_name, subgroup_data in subgroups.items(): - if not subgroup_data.empty: - sentiment_counts = subgroup_data['sentiment'].value_counts(normalize=True) * 100 - analysis_results.append({ - 'subgroup': subgroup_name, - 'total': len(subgroup_data), - 'negative': sentiment_counts.get(0, 0), - 'neutral': sentiment_counts.get(1, 0), - 'positive': sentiment_counts.get(2, 0), - }) - return pd.DataFrame(analysis_results) + # Identify least performing topics + overall_accuracy = train_metrics['accuracy'].mean() + least_performing_topics = train_metrics[train_metrics['accuracy'] < overall_accuracy]['topic'].tolist() + print(f"Least performing topics: {least_performing_topics}") + + # Generate and augment data for least performing topics + synthetic_texts = generate_and_augment_data(sentiment_analyzer, least_performing_topics, original_train_data, + n_samples=100) + # Create a new DataFrame for the synthetic data + synthetic_df = pd.DataFrame({ + 'text': synthetic_texts, + 'category': [1] * len(synthetic_texts), # Assuming neutral category for synthetic data + 'topic': least_performing_topics * (len(synthetic_texts) // len(least_performing_topics)) + }) - # Analyze disparities for the datasets - train_analysis = analyze_disparities(train_subgroups) - test_analysis = analyze_disparities(test_subgroups) - val_analysis = analyze_disparities(val_subgroups) + # Augment original training data with synthetic data + augmented_train_data = pd.concat([original_train_data, synthetic_df], ignore_index=True) - # Print the analysis results - print("Train Percentage Analysis") - 
print(train_analysis) - print("\nTest Percentage Analysis") - print(test_analysis) - print("\nValidation Percentage Analysis") - print(val_analysis) + # Fine-tune the sentiment analyzer with the augmented dataset + augmented_fine_tuning_results = sentiment_analyzer.fine_tune(augmented_train_data) + print(f"Fine-tuning results with augmented data: {augmented_fine_tuning_results}") + + # Function to analyze disparities in sentiment predictions + # def analyze_disparities(subgroups): + # analysis_results = [] + # for subgroup_name, subgroup_data in subgroups.items(): + # if not subgroup_data.empty: + # sentiment_counts = subgroup_data['sentiment'].value_counts(normalize=True) * 100 + # analysis_results.append({ + # 'subgroup': subgroup_name, + # 'total': len(subgroup_data), + # 'negative': sentiment_counts.get(0, 0), + # 'neutral': sentiment_counts.get(1, 0), + # 'positive': sentiment_counts.get(2, 0), + # }) + # return pd.DataFrame(analysis_results) + # + # + # # Analyze disparities for the datasets + # train_analysis = analyze_disparities(train_subgroups) + # test_analysis = analyze_disparities(test_subgroups) + # val_analysis = analyze_disparities(val_subgroups) + # + # # Print the analysis results + # print("Train Percentage Analysis") + # print(train_analysis) + # print("\nTest Percentage Analysis") + # print(test_analysis) + # print("\nValidation Percentage Analysis") + # print(val_analysis) From cf24a153328c992ab0d42554ef30ae92a9e43f87 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Fri, 26 Jul 2024 17:57:49 +0200 Subject: [PATCH 03/42] Check if saved model exists or not --- main.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/main.py b/main.py index 32cca72..b69aca1 100644 --- a/main.py +++ b/main.py @@ -48,12 +48,18 @@ # Initialize the sentiment analyzer sentiment_analyzer = SentimentAnalyzer() - # Fine-tune the sentiment analyzer with the original dataset - fine_tuning_results = sentiment_analyzer.fine_tune(original_train_data) - print(f"Fine-tuning results: {fine_tuning_results}") - # Extract metadata for the datasets base_path = os.path.dirname(os.path.abspath(__file__)) + # Check if a saved model exists + if os.path.exists(model_save_path): + print("Loading the fine-tuned model from disk...") + sentiment_analyzer.model = torch.load(model_save_path) + else: + print("Fine-tuning the sentiment analyzer with the original dataset...") + fine_tuning_results = sentiment_analyzer.fine_tune(original_train_data) + print(f"Fine-tuning results: {fine_tuning_results}") + # Save the fine-tuned model + torch.save(sentiment_analyzer.model, model_save_path) # Extract metadata for the datasets train_sentiment_file_name = os.path.join(base_path, f'train_sentiment_{args.dataset_type}_{args.percentage}.csv') From 36cc3d9f791e75233653021e35965ac93185bc7f Mon Sep 17 00:00:00 2001 From: Leonardo Moraglia Date: Fri, 26 Jul 2024 18:04:28 +0200 Subject: [PATCH 04/42] implemented model_save_path --- main.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/main.py b/main.py index b69aca1..d4f0d04 100644 --- a/main.py +++ b/main.py @@ -1,15 +1,15 @@ import argparse import os -import wandb + import pandas as pd import torch from sklearn.metrics import classification_report + from DatasetLoad import DatasetLoad from MetadataExtractor import MetadataExtractor from SentimentAnalyzer import SentimentAnalyzer from extract_stuff import augment_and_extract_metadata, predict_sentiment - os.environ["WANDB_API_KEY"] = 
"21cb0c9433eeca19401ee01e9b1bc9e4b6f7a696" if __name__ == "__main__": @@ -50,6 +50,7 @@ # Extract metadata for the datasets base_path = os.path.dirname(os.path.abspath(__file__)) + model_save_path = os.path.join(base_path, f'sentiment_model_{args.dataset_type}_{args.percentage}.pt') # Check if a saved model exists if os.path.exists(model_save_path): print("Loading the fine-tuned model from disk...") From 61c687c97ff172e62946daa616107803cedb96fc Mon Sep 17 00:00:00 2001 From: Leonardo Moraglia Date: Fri, 26 Jul 2024 18:37:15 +0200 Subject: [PATCH 05/42] first implementation fo the bottom topics --- main.py | 117 ++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 84 insertions(+), 33 deletions(-) diff --git a/main.py b/main.py index d4f0d04..25046e2 100644 --- a/main.py +++ b/main.py @@ -165,9 +165,90 @@ def compute_metrics(subgroups, true_labels_column='category', pred_labels_column least_performing_topics = train_metrics[train_metrics['accuracy'] < overall_accuracy]['topic'].tolist() print(f"Least performing topics: {least_performing_topics}") + + # Function to analyze disparities in sentiment predictions + def analyze_disparities(subgroups): + analysis_results = [] + for subgroup_name, subgroup_data in subgroups.items(): + if not subgroup_data.empty: + sentiment_counts = subgroup_data['sentiment'].value_counts(normalize=True) * 100 + analysis_results.append({ + 'subgroup': subgroup_name, + 'total': len(subgroup_data), + 'negative': sentiment_counts.get(0, 0), + 'neutral': sentiment_counts.get(1, 0), + 'positive': sentiment_counts.get(2, 0), + }) + return pd.DataFrame(analysis_results) + + + # Analyze disparities for the datasets + train_analysis = analyze_disparities(train_subgroups) + test_analysis = analyze_disparities(test_subgroups) + val_analysis = analyze_disparities(val_subgroups) + + # Print the analysis results + print("Train Percentage Analysis") + print(train_analysis) + print("\nTest Percentage Analysis") + print(test_analysis) + print("\nValidation Percentage Analysis") + print(val_analysis) + + + def weighted_metrics(metrics_df, support_df, metric='accuracy'): + # Join metrics with their respective support counts + metrics_df = metrics_df.copy() + metrics_df = metrics_df.merge(support_df, left_on='topic', right_on='subgroup') + metrics_df['weighted_metric'] = metrics_df[metric] * metrics_df['support'] + return metrics_df + + + def get_top_bottom_topics(test_metrics_df, test_percentage_analysis_df, metric='accuracy'): + # Get support for each topic + support_df = test_percentage_analysis_df[['subgroup', 'total']].rename(columns={'total': 'support'}) + + # Compute weighted metrics + weighted_metrics_df = weighted_metrics(test_metrics_df, support_df, metric) + + # Compute baseline accuracy + baseline_accuracy = weighted_metrics_df['accuracy'].mean() + + # Sort topics by their weighted metrics + sorted_metrics = weighted_metrics_df.sort_values(by='weighted_metric', ascending=False) + + # Get top 3 and bottom 3 topics + top_3_topics = sorted_metrics.head(3)['topic'].tolist() + bottom_3_topics = sorted_metrics.tail(3)['topic'].tolist() + + # Adjust for baseline accuracy + bottom_3_topics_below_baseline = sorted_metrics[sorted_metrics['accuracy'] < baseline_accuracy].tail(3)[ + 'topic'].tolist() + + return top_3_topics, bottom_3_topics_below_baseline + + + topics = get_top_bottom_topics(test_metrics, test_analysis, metric='accuracy') + print(f"Top 3 (lower score) topics: {topics[0]}") + + + def generate_and_augment_data(sentiment_analyzer, topics, 
train_data_with_metadata, n_samples): + # for each topic, select randomly n samples of text from the training data augmented with metadata (containing the topic). Then using the sentences as baseline, generate more that will later be useed to train the model + synthetic_texts = [] + for topic in topics: + topic_data = train_data_with_metadata[train_data_with_metadata['topic'] == topic] + topic_samples = topic_data.sample(n_samples, replace=True) + for index, row in topic_samples.iterrows(): + synthetic_texts.extend(sentiment_analyzer.generate_synthetic_data(row['topic'], row['text'], n_samples)) + return synthetic_texts + # Generate and augment data for least performing topics - synthetic_texts = generate_and_augment_data(sentiment_analyzer, least_performing_topics, original_train_data, - n_samples=100) + synthetic_texts = generate_and_augment_data(sentiment_analyzer, topics, train_data_with_metadata, + n_samples=10) + if args.debug: + print(f"Generated {len(synthetic_texts)} synthetic texts for least performing topics") + print("Sample synthetic texts:") + print(synthetic_texts[:5]) # Create a new DataFrame for the synthetic data synthetic_df = pd.DataFrame({ @@ -181,34 +262,4 @@ def compute_metrics(subgroups, true_labels_column='category', pred_labels_column # Fine-tune the sentiment analyzer with the augmented dataset augmented_fine_tuning_results = sentiment_analyzer.fine_tune(augmented_train_data) - print(f"Fine-tuning results with augmented data: {augmented_fine_tuning_results}") - - # Function to analyze disparities in sentiment predictions - # def analyze_disparities(subgroups): - # analysis_results = [] - # for subgroup_name, subgroup_data in subgroups.items(): - # if not subgroup_data.empty: - # sentiment_counts = subgroup_data['sentiment'].value_counts(normalize=True) * 100 - # analysis_results.append({ - # 'subgroup': subgroup_name, - # 'total': len(subgroup_data), - # 'negative': sentiment_counts.get(0, 0), - # 'neutral': sentiment_counts.get(1, 0), - # 'positive': sentiment_counts.get(2, 0), - # }) - # return pd.DataFrame(analysis_results) - # - # - # # Analyze disparities for the datasets - # train_analysis = analyze_disparities(train_subgroups) - # test_analysis = analyze_disparities(test_subgroups) - # val_analysis = analyze_disparities(val_subgroups) - # - # # Print the analysis results - # print("Train Percentage Analysis") - # print(train_analysis) - # print("\nTest Percentage Analysis") - # print(test_analysis) - # print("\nValidation Percentage Analysis") - # print(val_analysis) - + print(f"Fine-tuning results with augmented data: {augmented_fine_tuning_results}") \ No newline at end of file From 4af3e448b076779edae40a5d584777530da08475 Mon Sep 17 00:00:00 2001 From: Leonardo Moraglia Date: Fri, 26 Jul 2024 18:45:55 +0200 Subject: [PATCH 06/42] missing import --- SentimentAnalyzer.py | 4 +++- main.py | 7 ++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index b8ca503..febb21e 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -1,7 +1,9 @@ import pandas as pd import torch from sklearn.model_selection import train_test_split -from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, pipeline +from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, \ + DataCollatorWithPadding, pipeline, AutoModelForSeq2SeqLM + from datasets import Dataset diff --git a/main.py b/main.py index 
25046e2..c2ae170 100644 --- a/main.py +++ b/main.py @@ -204,7 +204,7 @@ def weighted_metrics(metrics_df, support_df, metric='accuracy'): return metrics_df - def get_top_bottom_topics(test_metrics_df, test_percentage_analysis_df, metric='accuracy'): + def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='accuracy'): # Get support for each topic support_df = test_percentage_analysis_df[['subgroup', 'total']].rename(columns={'total': 'support'}) @@ -228,12 +228,13 @@ def get_top_bottom_topics(test_metrics_df, test_percentage_analysis_df, metric=' return top_3_topics, bottom_3_topics_below_baseline - topics = get_top_bottom_topics(test_metrics, test_analysis, metric='accuracy') + topics = get_top_lower_topics(test_metrics, test_analysis, metric='accuracy') print(f"Top 3 (lower score) topics: {topics[0]}") def generate_and_augment_data(sentiment_analyzer, topics, train_data_with_metadata, n_samples): - # for each topic, select randomly n samples of text from the training data augmented with metadata (containing the topic). Then using the sentences as baseline, generate more that will later be useed to train the model + # for each topic, select randomly n samples of text from the training data augmented with metadata (containing the topic). + # Then using the sentences as baseline, generate more that will later be used to train the model synthetic_texts = [] for topic in topics: topic_data = train_data_with_metadata[train_data_with_metadata['topic'] == topic] From 8b0adbf8c00e98a6e7d9eb37e3e82ea8a3dadea4 Mon Sep 17 00:00:00 2001 From: Leonardo Moraglia Date: Fri, 26 Jul 2024 18:56:52 +0200 Subject: [PATCH 07/42] use val instead of test --- main.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/main.py b/main.py index c2ae170..af746bf 100644 --- a/main.py +++ b/main.py @@ -155,16 +155,17 @@ def compute_metrics(subgroups, true_labels_column='category', pred_labels_column print("Train Metrics per Topic") print(train_metrics) + + # Identify the least performing topics + overall_accuracy = train_metrics['accuracy'].mean() + least_performing_topics = train_metrics[train_metrics['accuracy'] < overall_accuracy]['topic'].tolist() + print(f"Least performing training topics: {least_performing_topics}") + print("\nTest Metrics per Topic") print(test_metrics) print("\nValidation Metrics per Topic") print(val_metrics) - # Identify least performing topics - overall_accuracy = train_metrics['accuracy'].mean() - least_performing_topics = train_metrics[train_metrics['accuracy'] < overall_accuracy]['topic'].tolist() - print(f"Least performing topics: {least_performing_topics}") - # Function to analyze disparities in sentiment predictions def analyze_disparities(subgroups): @@ -228,8 +229,8 @@ def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='a return top_3_topics, bottom_3_topics_below_baseline - topics = get_top_lower_topics(test_metrics, test_analysis, metric='accuracy') - print(f"Top 3 (lower score) topics: {topics[0]}") + topics = get_top_lower_topics(val_metrics, val_analysis, metric='accuracy') + print(f"Top 3 (lower score) validation topics: {topics[0]}") def generate_and_augment_data(sentiment_analyzer, topics, train_data_with_metadata, n_samples): From 929c7324139bee74a608cdbff3ce79c2b6d762b3 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Fri, 26 Jul 2024 19:08:37 +0200 Subject: [PATCH 08/42] Solving raul pull problems --- SentimentAnalyzer.py | 26 
++++++++++++++++---------- main.py | 2 +- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index febb21e..63003fc 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -34,34 +34,40 @@ def map_label_to_target(self, label): else: return None - # Generate synthetic data using Hugging Face model - def generate_synthetic_data(self, topic, text, n_samples): + # Generate synthetic data using the FLAN model + def generate_synthetic_data(self, topic, text, sentiment, n_samples): synthetic_data = [] for _ in range(n_samples): - prompt = f"Generate a tweet related to {topic} that expresses sentiment similar to: '{text}'" + prompt = f"Generate a tweet related to {topic} that expresses a {sentiment} sentiment similar to: '{text}' " inputs = self.flan_tokenizer(prompt, return_tensors="pt").to(self.device) outputs = self.flan_model.generate(inputs.input_ids, max_length=60, num_return_sequences=1) generated_text = self.flan_tokenizer.decode(outputs[0], skip_special_tokens=True) synthetic_data.append(generated_text) return synthetic_data - def augment_training_data(self, topics, n_samples=100): + # Augment the training data with synthetic data + def augment_training_data(self, topics, texts, sentiments, n_samples=6): augmented_data = {'text': [], 'label': []} augmented_data_with_topics = {'text': [], 'label': [], 'topic': []} - for topic in topics: - synthetic_texts = self.generate_synthetic_data(topic, n_samples) + + for topic, text, sentiment in zip(topics, texts, sentiments): + synthetic_texts = self.generate_synthetic_data(topic, text, sentiment, n_samples) + sentiment_label = self.map_label_to_target(sentiment) augmented_data['text'].extend(synthetic_texts) - augmented_data['label'].extend([1] * len(synthetic_texts)) # Defaulting to neutral + augmented_data['label'].extend([sentiment_label] * len(synthetic_texts)) augmented_data_with_topics['text'].extend(synthetic_texts) - augmented_data_with_topics['label'].extend([1] * len(synthetic_texts)) + augmented_data_with_topics['label'].extend([sentiment_label] * len(synthetic_texts)) augmented_data_with_topics['topic'].extend([topic] * len(synthetic_texts)) augmented_df = pd.DataFrame(augmented_data) augmented_df_with_topics = pd.DataFrame(augmented_data_with_topics) return augmented_df, augmented_df_with_topics - def fine_tune_with_augmented_data(self, topics, n_samples=6, epochs=3, batch_size=16, learning_rate=2e-5): - augmented_train_data, augmented_train_data_with_topics = self.augment_training_data(topics, n_samples) + # Fine-tune the model with augmented data + def fine_tune_with_augmented_data(self, topics, texts, sentiments, n_samples=6, epochs=3, batch_size=16, + learning_rate=2e-5): + augmented_train_data, augmented_train_data_with_topics = self.augment_training_data(topics, texts, sentiments, + n_samples) return self.fine_tune(augmented_train_data, epochs, batch_size, learning_rate), augmented_train_data_with_topics # Fine-tune the model on a custom dataset diff --git a/main.py b/main.py index c2ae170..7d2dbad 100644 --- a/main.py +++ b/main.py @@ -263,4 +263,4 @@ def generate_and_augment_data(sentiment_analyzer, topics, train_data_with_metada # Fine-tune the sentiment analyzer with the augmented dataset augmented_fine_tuning_results = sentiment_analyzer.fine_tune(augmented_train_data) - print(f"Fine-tuning results with augmented data: {augmented_fine_tuning_results}") \ No newline at end of file + print(f"Fine-tuning results with augmented data: 
{augmented_fine_tuning_results}") From 077b9a47afbdecccc487c229773600cc1db5accf Mon Sep 17 00:00:00 2001 From: Leonardo Moraglia Date: Fri, 26 Jul 2024 19:12:39 +0200 Subject: [PATCH 09/42] use val instead of test --- main.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/main.py b/main.py index 67e3632..7b77d23 100644 --- a/main.py +++ b/main.py @@ -155,12 +155,6 @@ def compute_metrics(subgroups, true_labels_column='category', pred_labels_column print("Train Metrics per Topic") print(train_metrics) - - # Identify the least performing topics - overall_accuracy = train_metrics['accuracy'].mean() - least_performing_topics = train_metrics[train_metrics['accuracy'] < overall_accuracy]['topic'].tolist() - print(f"Least performing training topics: {least_performing_topics}") - print("\nTest Metrics per Topic") print(test_metrics) print("\nValidation Metrics per Topic") @@ -234,8 +228,10 @@ def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='a def generate_and_augment_data(sentiment_analyzer, topics, train_data_with_metadata, n_samples): - # for each topic, select randomly n samples of text from the training data augmented with metadata (containing the topic). - # Then using the sentences as baseline, generate more that will later be used to train the model + # Ensure topics is a list of individual topics + if isinstance(topics[0], list): + topics = [item for sublist in topics for item in sublist] + synthetic_texts = [] for topic in topics: topic_data = train_data_with_metadata[train_data_with_metadata['topic'] == topic] From e5eb93fd41e15b3776eff9f6f990417e8de8f999 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Fri, 26 Jul 2024 20:54:55 +0200 Subject: [PATCH 10/42] is this the end? 
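For reference while reading the topic-selection code in the surrounding patches, the sketch below shows the support-weighted ranking idea on invented numbers. The column names (topic, accuracy, support) mirror the diffs, but the data and the standalone layout are illustrative assumptions, not the project's output.

    import pandas as pd

    # Invented per-topic metrics; in the pipeline, accuracy comes from
    # compute_metrics() and the support counts from analyze_disparities().
    metrics = pd.DataFrame({
        "topic": ["sports", "health", "finance", "education"],
        "accuracy": [0.82, 0.55, 0.48, 0.91],
        "support": [200, 50, 120, 30],
    })

    # Weight each topic's accuracy by how many examples it covers, then rank.
    metrics["weighted_metric"] = metrics["accuracy"] * metrics["support"]
    baseline_accuracy = metrics["accuracy"].mean()  # unweighted baseline, as in the diff
    ranked = metrics.sort_values("weighted_metric", ascending=False)

    top_3 = ranked.head(3)["topic"].tolist()
    bottom_3_below_baseline = (
        ranked[ranked["accuracy"] < baseline_accuracy].tail(3)["topic"].tolist()
    )
    print(top_3, bottom_3_below_baseline)

Weighting by support keeps very small topic subsets from dominating the ranking when the worst-performing topics are picked for augmentation.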
--- SentimentAnalyzer.py | 51 +++++++++++--------- main.py | 110 +++++++++++++++++++++++++++---------------- 2 files changed, 98 insertions(+), 63 deletions(-) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index 63003fc..fbe6a82 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -3,7 +3,7 @@ from sklearn.model_selection import train_test_split from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, \ DataCollatorWithPadding, pipeline, AutoModelForSeq2SeqLM - +from sklearn.preprocessing import OneHotEncoder from datasets import Dataset @@ -34,11 +34,22 @@ def map_label_to_target(self, label): else: return None + def map_target_to_label(self, target): + # Map the target value to the sentiment label + if target == 0: + return "negative" + elif target == 1: + return "neutral" + elif target == 2: + return "positive" + else: + return None + # Generate synthetic data using the FLAN model def generate_synthetic_data(self, topic, text, sentiment, n_samples): synthetic_data = [] for _ in range(n_samples): - prompt = f"Generate a tweet related to {topic} that expresses a {sentiment} sentiment similar to: '{text}' " + prompt = f"Generate a tweet related to {topic} that expresses a {sentiment} sentiment and the tweet has to be semantically similar to: '{text}' " inputs = self.flan_tokenizer(prompt, return_tensors="pt").to(self.device) outputs = self.flan_model.generate(inputs.input_ids, max_length=60, num_return_sequences=1) generated_text = self.flan_tokenizer.decode(outputs[0], skip_special_tokens=True) @@ -46,29 +57,23 @@ def generate_synthetic_data(self, topic, text, sentiment, n_samples): return synthetic_data # Augment the training data with synthetic data - def augment_training_data(self, topics, texts, sentiments, n_samples=6): - augmented_data = {'text': [], 'label': []} - augmented_data_with_topics = {'text': [], 'label': [], 'topic': []} + def generate_training_data(self, topics, texts, sentiments, n_samples=6): + generated_data = {'text': [], 'label': []} + generated_data_with_topic = {'text': [], 'label': [], 'topic': []} for topic, text, sentiment in zip(topics, texts, sentiments): - synthetic_texts = self.generate_synthetic_data(topic, text, sentiment, n_samples) - sentiment_label = self.map_label_to_target(sentiment) - augmented_data['text'].extend(synthetic_texts) - augmented_data['label'].extend([sentiment_label] * len(synthetic_texts)) - augmented_data_with_topics['text'].extend(synthetic_texts) - augmented_data_with_topics['label'].extend([sentiment_label] * len(synthetic_texts)) - augmented_data_with_topics['topic'].extend([topic] * len(synthetic_texts)) - - augmented_df = pd.DataFrame(augmented_data) - augmented_df_with_topics = pd.DataFrame(augmented_data_with_topics) - return augmented_df, augmented_df_with_topics - - # Fine-tune the model with augmented data - def fine_tune_with_augmented_data(self, topics, texts, sentiments, n_samples=6, epochs=3, batch_size=16, - learning_rate=2e-5): - augmented_train_data, augmented_train_data_with_topics = self.augment_training_data(topics, texts, sentiments, - n_samples) - return self.fine_tune(augmented_train_data, epochs, batch_size, learning_rate), augmented_train_data_with_topics + sentiment_text = self.map_target_to_label(sentiment) + synthetic_texts = self.generate_synthetic_data(topic, text, sentiment_text, n_samples) # List of synthetic texts + generated_data['text'].extend(synthetic_texts) + generated_data['category'].extend([sentiment] * len(synthetic_texts)) 
# append sentiment to texts many times + generated_data_with_topic['text'].extend(synthetic_texts) + generated_data_with_topic['category'].extend([sentiment] * len(synthetic_texts)) + generated_data_with_topic['topic'].extend([topic] * len(synthetic_texts)) + + generated_df = pd.DataFrame(generated_data) + generated_df_with_topics = pd.DataFrame(generated_data_with_topic) + return generated_df, generated_df_with_topics + # Fine-tune the model on a custom dataset def fine_tune(self, df, epochs=3, batch_size=16, learning_rate=2e-5): diff --git a/main.py b/main.py index 7b77d23..469768a 100644 --- a/main.py +++ b/main.py @@ -62,7 +62,7 @@ # Save the fine-tuned model torch.save(sentiment_analyzer.model, model_save_path) - # Extract metadata for the datasets + # Define the file names for the sentiment predictions train_sentiment_file_name = os.path.join(base_path, f'train_sentiment_{args.dataset_type}_{args.percentage}.csv') test_sentiment_file_name = os.path.join(base_path, f'test_sentiment_{args.dataset_type}_{args.percentage}.csv') val_sentiment_file_name = os.path.join(base_path, f'val_sentiment_{args.dataset_type}_{args.percentage}.csv') @@ -210,7 +210,7 @@ def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='a baseline_accuracy = weighted_metrics_df['accuracy'].mean() # Sort topics by their weighted metrics - sorted_metrics = weighted_metrics_df.sort_values(by='weighted_metric', ascending=False) + sorted_metrics = weighted_metrics_df.sort_values(by='weighted_metric', ascending=False) # Sort by descending # Get top 3 and bottom 3 topics top_3_topics = sorted_metrics.head(3)['topic'].tolist() @@ -223,41 +223,71 @@ def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='a return top_3_topics, bottom_3_topics_below_baseline - topics = get_top_lower_topics(val_metrics, val_analysis, metric='accuracy') - print(f"Top 3 (lower score) validation topics: {topics[0]}") - - - def generate_and_augment_data(sentiment_analyzer, topics, train_data_with_metadata, n_samples): - # Ensure topics is a list of individual topics - if isinstance(topics[0], list): - topics = [item for sublist in topics for item in sublist] - - synthetic_texts = [] - for topic in topics: - topic_data = train_data_with_metadata[train_data_with_metadata['topic'] == topic] - topic_samples = topic_data.sample(n_samples, replace=True) - for index, row in topic_samples.iterrows(): - synthetic_texts.extend(sentiment_analyzer.generate_synthetic_data(row['topic'], row['text'], n_samples)) - return synthetic_texts - - # Generate and augment data for least performing topics - synthetic_texts = generate_and_augment_data(sentiment_analyzer, topics, train_data_with_metadata, - n_samples=10) - if args.debug: - print(f"Generated {len(synthetic_texts)} synthetic texts for least performing topics") - print("Sample synthetic texts:") - print(synthetic_texts[:5]) - - # Create a new DataFrame for the synthetic data - synthetic_df = pd.DataFrame({ - 'text': synthetic_texts, - 'category': [1] * len(synthetic_texts), # Assuming neutral category for synthetic data - 'topic': least_performing_topics * (len(synthetic_texts) // len(least_performing_topics)) - }) - - # Augment original training data with synthetic data - augmented_train_data = pd.concat([original_train_data, synthetic_df], ignore_index=True) - - # Fine-tune the sentiment analyzer with the augmented dataset - augmented_fine_tuning_results = sentiment_analyzer.fine_tune(augmented_train_data) - print(f"Fine-tuning results with augmented data: 
{augmented_fine_tuning_results}") + top_3_topics, bottom_3_topics = get_top_lower_topics(val_metrics, val_analysis, metric='accuracy') + print(f"Bottom 3 validation topics: {bottom_3_topics }") + + # Randomly select rows from bottom three topics in the training set + train_data_bottom_3 = train_data_with_metadata[train_data_with_metadata['topic'].isin(bottom_3_topics)] + selected_samples = train_data_bottom_3.sample(n=50, random_state=42) # Select n samples from the bottom 3 topics + + # Augment the selected samples using the sentiment analyzer + generated_df, generated_df_with_metadata = sentiment_analyzer.generate_training_data( + selected_samples['topic'].tolist(), + selected_samples['text'].tolist(), + selected_samples['sentiment'].tolist() + ) + + # Combine the original and augmented datasets + train_original_and_generated_data = pd.concat([original_train_data, generated_df], ignore_index=True) + # Save the combined datasets + train_original_and_generated_data.to_csv(os.path.join(base_path, 'train_original_and_generated_data.csv'), index=False) + + model_save_path = os.path.join(base_path, f'finetuned_sentiment_model_{args.dataset_type}_{args.percentage}.pt') + print("Fine-tuning the sentiment analyzer with the generated+original dataset...") + fine_tuning_results_new = sentiment_analyzer.fine_tune(new_training_data) # TODO NON CE + print(f"Fine-tuning results: {fine_tuning_results_new}") + # Save the fine-tuned model + torch.save(sentiment_analyzer.model, model_save_path) + + # Predict sentiment for the original dataset to see for improvements + test_sentiment_file_name_v2 = os.path.join(base_path, f'test_sentiment_v2_{args.dataset_type}_{args.percentage}.csv') + val_sentiment_file_name_v2 = os.path.join(base_path, f'val_sentiment_v2_{args.dataset_type}_{args.percentage}.csv') + test_data_with_sentiment_v2 = predict_sentiment(original_test_data.copy(), sentiment_analyzer, test_sentiment_file_name_v2, args.debug) + val_data_with_sentiment_v2 = predict_sentiment(original_val_data.copy(), sentiment_analyzer, val_sentiment_file_name_v2, args.debug) + + # Compute metrics for the test dataset + test_true_labels = original_test_data['category'] + test_predicted_labels_v2 = test_data_with_sentiment_v2['sentiment'] + print("\nTest Classification Report:") + print(classification_report(test_true_labels, test_predicted_labels_v2, labels=[0, 1, 2], zero_division=0)) + + # Compute metrics for the validation dataset + val_true_labels = original_val_data['category'] + val_predicted_labels_v2 = val_data_with_sentiment_v2['sentiment'] + print("\nValidation Classification Report:") + print(classification_report(val_true_labels, val_predicted_labels_v2, labels=[0, 1, 2], zero_division=0)) + + test_file_name_v2 = os.path.join(base_path, f'test_augmented_v2_{args.dataset_type}_{args.percentage}.csv') + val_file_name_v2 = os.path.join(base_path, f'val_augmented_v2_{args.dataset_type}_{args.percentage}.csv') + test_data_with_metadata_v2 = augment_and_extract_metadata(test_data_with_sentiment_v2.copy(), extractor, topic_labels, test_file_name_v2, args.debug) + val_data_with_metadata_v2 = augment_and_extract_metadata(val_data_with_sentiment_v2.copy(), extractor, topic_labels, val_file_name_v2, args.debug) + + # Create subgroups for the datasets + test_subgroups_v2 = create_subgroups(test_data_with_metadata_v2) + val_subgroups_v2 = create_subgroups(val_data_with_metadata_v2) + + + test_metrics_v2 = compute_metrics(test_subgroups_v2) + val_metrics_v2 = compute_metrics(val_subgroups_v2) + + print("\nTest 
Metrics per Topic") + print(test_metrics_v2) + print("\nValidation Metrics per Topic") + print(val_metrics_v2) + + test_analysis_v2 = analyze_disparities(test_subgroups_v2) + val_analysis_v2 = analyze_disparities(val_subgroups_v2) + print("\nTest Percentage Analysis") + print(test_analysis_v2) + print("\nValidation Percentage Analysis") + print(val_analysis_v2) \ No newline at end of file From 63f353bcbbfc5f235ad330b6a3e395e4a9ac3a81 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Fri, 26 Jul 2024 21:01:41 +0200 Subject: [PATCH 11/42] is this the end? --- SentimentAnalyzer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index fbe6a82..40d7b5a 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -58,8 +58,8 @@ def generate_synthetic_data(self, topic, text, sentiment, n_samples): # Augment the training data with synthetic data def generate_training_data(self, topics, texts, sentiments, n_samples=6): - generated_data = {'text': [], 'label': []} - generated_data_with_topic = {'text': [], 'label': [], 'topic': []} + generated_data = {'text': [], 'category': []} + generated_data_with_topic = {'text': [], 'category': [], 'topic': []} for topic, text, sentiment in zip(topics, texts, sentiments): sentiment_text = self.map_target_to_label(sentiment) From ea46935bd0eee25c92f659b32fb16e856d9d875c Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Fri, 26 Jul 2024 21:09:44 +0200 Subject: [PATCH 12/42] is this the end? --- SentimentAnalyzer.py | 1 + main.py | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index 40d7b5a..9f4e835 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -54,6 +54,7 @@ def generate_synthetic_data(self, topic, text, sentiment, n_samples): outputs = self.flan_model.generate(inputs.input_ids, max_length=60, num_return_sequences=1) generated_text = self.flan_tokenizer.decode(outputs[0], skip_special_tokens=True) synthetic_data.append(generated_text) + print(f"Generated Text: {generated_text}") return synthetic_data # Augment the training data with synthetic data diff --git a/main.py b/main.py index 469768a..58f6f8e 100644 --- a/main.py +++ b/main.py @@ -226,6 +226,7 @@ def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='a top_3_topics, bottom_3_topics = get_top_lower_topics(val_metrics, val_analysis, metric='accuracy') print(f"Bottom 3 validation topics: {bottom_3_topics }") + print("Augmenting the training dataset with synthetic data...") # Randomly select rows from bottom three topics in the training set train_data_bottom_3 = train_data_with_metadata[train_data_with_metadata['topic'].isin(bottom_3_topics)] selected_samples = train_data_bottom_3.sample(n=50, random_state=42) # Select n samples from the bottom 3 topics @@ -242,12 +243,12 @@ def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='a # Save the combined datasets train_original_and_generated_data.to_csv(os.path.join(base_path, 'train_original_and_generated_data.csv'), index=False) - model_save_path = os.path.join(base_path, f'finetuned_sentiment_model_{args.dataset_type}_{args.percentage}.pt') + model_save_path_v2 = os.path.join(base_path, f'finetuned_sentiment_model_{args.dataset_type}_{args.percentage}.pt') print("Fine-tuning the sentiment analyzer with the generated+original dataset...") - fine_tuning_results_new = 
sentiment_analyzer.fine_tune(new_training_data) # TODO NON CE + fine_tuning_results_new = sentiment_analyzer.fine_tune(train_original_and_generated_data) # TODO NON CE print(f"Fine-tuning results: {fine_tuning_results_new}") # Save the fine-tuned model - torch.save(sentiment_analyzer.model, model_save_path) + torch.save(sentiment_analyzer.model, model_save_path_v2) # Predict sentiment for the original dataset to see for improvements test_sentiment_file_name_v2 = os.path.join(base_path, f'test_sentiment_v2_{args.dataset_type}_{args.percentage}.csv') From 7b27fb1c3e5e52ff026507ce5fe913e5b6dc6000 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Fri, 26 Jul 2024 21:22:18 +0200 Subject: [PATCH 13/42] is this the end? --- SentimentAnalyzer.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index 9f4e835..a6807b2 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -48,10 +48,19 @@ def map_target_to_label(self, target): # Generate synthetic data using the FLAN model def generate_synthetic_data(self, topic, text, sentiment, n_samples): synthetic_data = [] + print(f"Generating synthetic data for topic: {topic}, text: {text}, sentiment: {sentiment}") for _ in range(n_samples): - prompt = f"Generate a tweet related to {topic} that expresses a {sentiment} sentiment and the tweet has to be semantically similar to: '{text}' " + prompt = f"Generate a tweet related to {topic} that expresses a {sentiment} sentiment and the tweet has to be semantically similar to: '{text}' " inputs = self.flan_tokenizer(prompt, return_tensors="pt").to(self.device) - outputs = self.flan_model.generate(inputs.input_ids, max_length=60, num_return_sequences=1) + # Use top-k sampling and temperature sampling for more diverse outputs + outputs = self.flan_model.generate( + inputs.input_ids, + max_length=60, + num_return_sequences=1, + do_sample=True, + top_k=50, # Consider top 50 tokens + temperature=0.7 # Adjust temperature to control diversity + ) generated_text = self.flan_tokenizer.decode(outputs[0], skip_special_tokens=True) synthetic_data.append(generated_text) print(f"Generated Text: {generated_text}") From 3a062fc5416839760cc373611739260416d06867 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Fri, 26 Jul 2024 21:33:20 +0200 Subject: [PATCH 14/42] is this the end? --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index 58f6f8e..e9a2ea6 100644 --- a/main.py +++ b/main.py @@ -99,7 +99,7 @@ extractor = MetadataExtractor() # Define topic labels - topic_labels = ["politics", "entertainment", "sports", "technology", "health", "education", "finance", "food", "other"] + topic_labels = ["news", "entertainment", "sports", "technology", "health", "education", "business", "lifestyle", "opinions", "other"] # Define the base path where main.py is located base_path = os.path.dirname(os.path.abspath(__file__)) From a74fb8652ecc19920ed9a4153332c6e6f14fb24c Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 08:36:11 +0200 Subject: [PATCH 15/42] is this the end? 
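The generation parameters being adjusted in these patches (do_sample, top_k, temperature, max_length) can be exercised on their own; below is a minimal standalone sketch with google/flan-t5-small, where the prompt string and parameter values are placeholders rather than the project's final settings.

    import torch
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

    model_name = "google/flan-t5-small"
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

    prompt = "Generate a tweet about sports that expresses a positive sentiment."
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Sampling instead of greedy decoding: top_k keeps only the 50 most likely
    # tokens at each step, and temperature rescales their probabilities
    # (values below 1 make outputs more conservative, above 1 more diverse).
    outputs = model.generate(
        inputs.input_ids,
        max_length=60,
        num_return_sequences=1,
        do_sample=True,
        top_k=50,
        temperature=0.7,
    )
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))

The later patches move temperature between 0.5 and 0.9 and max_length between 60 and 100, trading closeness to the seed tweet against variety in the synthetic data.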
--- SentimentAnalyzer.py | 6 +-- main.py | 94 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 96 insertions(+), 4 deletions(-) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index a6807b2..7aa7db5 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -48,7 +48,7 @@ def map_target_to_label(self, target): # Generate synthetic data using the FLAN model def generate_synthetic_data(self, topic, text, sentiment, n_samples): synthetic_data = [] - print(f"Generating synthetic data for topic: {topic}, text: {text}, sentiment: {sentiment}") + #print(f"Generating synthetic data for topic: {topic}, text: {text}, sentiment: {sentiment}") for _ in range(n_samples): prompt = f"Generate a tweet related to {topic} that expresses a {sentiment} sentiment and the tweet has to be semantically similar to: '{text}' " inputs = self.flan_tokenizer(prompt, return_tensors="pt").to(self.device) @@ -59,11 +59,11 @@ def generate_synthetic_data(self, topic, text, sentiment, n_samples): num_return_sequences=1, do_sample=True, top_k=50, # Consider top 50 tokens - temperature=0.7 # Adjust temperature to control diversity + temperature=0.5 # Adjust temperature to control diversity ) generated_text = self.flan_tokenizer.decode(outputs[0], skip_special_tokens=True) synthetic_data.append(generated_text) - print(f"Generated Text: {generated_text}") + #print(f"Generated Text: {generated_text}") return synthetic_data # Augment the training data with synthetic data diff --git a/main.py b/main.py index e9a2ea6..06841b1 100644 --- a/main.py +++ b/main.py @@ -291,4 +291,96 @@ def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='a print("\nTest Percentage Analysis") print(test_analysis_v2) print("\nValidation Percentage Analysis") - print(val_analysis_v2) \ No newline at end of file + print(val_analysis_v2) + + + def plot_metrics_comparison(old_metrics, new_metrics, metric='accuracy'): + """ + Plots a comparison of the given metric before and after fine-tuning. 
+ + Parameters: + - old_metrics: DataFrame containing the old metrics + - new_metrics: DataFrame containing the new metrics + - metric: The metric to compare (default is 'accuracy') + """ + # Merge the old and new metrics on the 'topic' column + comparison_df = old_metrics.merge(new_metrics, on='topic', suffixes=('_old', '_new')) + + # Sort the DataFrame by the new metric for better visualization + comparison_df = comparison_df.sort_values(by=f'{metric}_new', ascending=False) + + # Plot the comparison + plt.figure(figsize=(12, 8)) + bar_width = 0.4 + + # Positioning the bars + r1 = range(len(comparison_df)) + r2 = [x + bar_width for x in r1] + + plt.bar(r1, comparison_df[f'{metric}_old'], color='blue', width=bar_width, edgecolor='grey', label='Old') + plt.bar(r2, comparison_df[f'{metric}_new'], color='green', width=bar_width, edgecolor='grey', label='New') + + plt.xlabel('Topics', fontweight='bold') + plt.ylabel(metric.capitalize(), fontweight='bold') + plt.title(f'Comparison of {metric.capitalize()} by Topic', fontweight='bold') + plt.xticks([r + bar_width / 2 for r in range(len(comparison_df))], comparison_df['topic'], rotation=90) + plt.legend() + + plt.tight_layout() + plt.show() + + + # Plot the comparison for accuracy, precision, recall, and f1-score + plot_metrics_comparison(test_metrics, test_metrics_v2, metric='accuracy') + plot_metrics_comparison(test_metrics, test_metrics_v2, metric='precision') + plot_metrics_comparison(test_metrics, test_metrics_v2, metric='recall') + plot_metrics_comparison(test_metrics, test_metrics_v2, metric='f1-score') + + + def calculate_overall_accuracy(metrics_df): + """ + Calculate the overall accuracy from the metrics DataFrame. + + Parameters: + - metrics_df: DataFrame containing the metrics + + Returns: + - overall_accuracy: The overall accuracy + """ + total_support = metrics_df['total'].sum() + weighted_accuracy_sum = (metrics_df['accuracy'] * metrics_df['total']).sum() + overall_accuracy = weighted_accuracy_sum / total_support + return overall_accuracy + + + def plot_overall_accuracy_comparison(old_metrics, new_metrics): + """ + Plot the overall accuracy comparison before and after fine-tuning. + + Parameters: + - old_metrics: DataFrame containing the old metrics + - new_metrics: DataFrame containing the new metrics + """ + overall_accuracy_old = calculate_overall_accuracy(old_metrics) + overall_accuracy_new = calculate_overall_accuracy(new_metrics) + + accuracies = [overall_accuracy_old, overall_accuracy_new] + labels = ['Old Model', 'New Model'] + + plt.figure(figsize=(8, 6)) + plt.bar(labels, accuracies, color=['blue', 'green'], edgecolor='grey') + + plt.xlabel('Model', fontweight='bold') + plt.ylabel('Overall Accuracy', fontweight='bold') + plt.title('Overall Accuracy Comparison', fontweight='bold') + plt.ylim(0, 1) # Assuming accuracy is between 0 and 1 + + for i, v in enumerate(accuracies): + plt.text(i, v + 0.01, f"{v:.2f}", ha='center', fontweight='bold') + + plt.tight_layout() + plt.show() + + + # Calculate and plot the overall accuracy comparison + plot_overall_accuracy_comparison(test_metrics, test_metrics_v2) \ No newline at end of file From 3290b60eb149a14f444983c05935ed2362a5a5a5 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 09:14:03 +0200 Subject: [PATCH 16/42] is this the end? 
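calculate_overall_accuracy in the previous patch is a support-weighted mean of the per-topic accuracies; a tiny worked example with invented numbers (the total column name follows the diff):

    import pandas as pd

    # Two invented topics: a large accurate one and a small inaccurate one.
    metrics = pd.DataFrame({
        "topic": ["sports", "health"],
        "accuracy": [0.90, 0.50],
        "total": [90, 10],
    })

    # Weighted: (0.90 * 90 + 0.50 * 10) / (90 + 10) = 0.86,
    # whereas a plain mean would report 0.70 and overweight the small topic.
    overall = (metrics["accuracy"] * metrics["total"]).sum() / metrics["total"].sum()
    print(overall)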
--- SentimentAnalyzer.py | 10 ++++++---- main.py | 3 ++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index 7aa7db5..60491cf 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -46,7 +46,7 @@ def map_target_to_label(self, target): return None # Generate synthetic data using the FLAN model - def generate_synthetic_data(self, topic, text, sentiment, n_samples): + def generate_synthetic_data(self, topic, text, sentiment, n_samples, debug=False): synthetic_data = [] #print(f"Generating synthetic data for topic: {topic}, text: {text}, sentiment: {sentiment}") for _ in range(n_samples): @@ -58,12 +58,14 @@ def generate_synthetic_data(self, topic, text, sentiment, n_samples): max_length=60, num_return_sequences=1, do_sample=True, - top_k=50, # Consider top 50 tokens - temperature=0.5 # Adjust temperature to control diversity + top_k=50, + temperature=0.5 ) generated_text = self.flan_tokenizer.decode(outputs[0], skip_special_tokens=True) synthetic_data.append(generated_text) - #print(f"Generated Text: {generated_text}") + if debug: + print(f"Generated Text: {generated_text}") + print(f"Percentage of Completion: {count / n_samples * 100:.2f}%, {count} of {n_samples}") return synthetic_data # Augment the training data with synthetic data diff --git a/main.py b/main.py index 06841b1..a7b2ca2 100644 --- a/main.py +++ b/main.py @@ -235,7 +235,8 @@ def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='a generated_df, generated_df_with_metadata = sentiment_analyzer.generate_training_data( selected_samples['topic'].tolist(), selected_samples['text'].tolist(), - selected_samples['sentiment'].tolist() + selected_samples['sentiment'].tolist(), + debug=args.debug ) # Combine the original and augmented datasets From c8e22cc6c1381c185460e43092c129277590693c Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 09:16:50 +0200 Subject: [PATCH 17/42] is this the end? 
--- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index a7b2ca2..0b0814b 100644 --- a/main.py +++ b/main.py @@ -1,6 +1,6 @@ import argparse import os - +import matplotlib.pyplot as plt import pandas as pd import torch from sklearn.metrics import classification_report From 74b0b848c1415874d154551f6af36937a13b5e5c Mon Sep 17 00:00:00 2001 From: Leonardo Moraglia Date: Sat, 27 Jul 2024 09:19:28 +0200 Subject: [PATCH 18/42] count was initialized --- SentimentAnalyzer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index 60491cf..66f5677 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -3,7 +3,7 @@ from sklearn.model_selection import train_test_split from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, \ DataCollatorWithPadding, pipeline, AutoModelForSeq2SeqLM -from sklearn.preprocessing import OneHotEncoder + from datasets import Dataset @@ -49,6 +49,7 @@ def map_target_to_label(self, target): def generate_synthetic_data(self, topic, text, sentiment, n_samples, debug=False): synthetic_data = [] #print(f"Generating synthetic data for topic: {topic}, text: {text}, sentiment: {sentiment}") + count = 0 for _ in range(n_samples): prompt = f"Generate a tweet related to {topic} that expresses a {sentiment} sentiment and the tweet has to be semantically similar to: '{text}' " inputs = self.flan_tokenizer(prompt, return_tensors="pt").to(self.device) From e12f205bfe1ef0b9918f4e36556f7248b19c60b7 Mon Sep 17 00:00:00 2001 From: Leonardo Moraglia Date: Sat, 27 Jul 2024 09:20:01 +0200 Subject: [PATCH 19/42] added matplotlib to reqs --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 9a3e5b1..11143d8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,4 @@ scikit_learn==1.2.0 transformers==4.42.4 tensorflow gdown - +matplotlib From c0997dd5dae126e05554a6352d472e9c0c9550b2 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 09:23:06 +0200 Subject: [PATCH 20/42] is this the end? --- SentimentAnalyzer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index 66f5677..2b1ffd4 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -64,6 +64,7 @@ def generate_synthetic_data(self, topic, text, sentiment, n_samples, debug=False ) generated_text = self.flan_tokenizer.decode(outputs[0], skip_special_tokens=True) synthetic_data.append(generated_text) + count += 1 if debug: print(f"Generated Text: {generated_text}") print(f"Percentage of Completion: {count / n_samples * 100:.2f}%, {count} of {n_samples}") From 1b911b5e90c1c0e200889d703b4318269d63309b Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 09:24:03 +0200 Subject: [PATCH 21/42] is this the end? 
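The count bookkeeping introduced over the last few patches (count = 0 up front, count += 1 in the loop) can also be written with a 1-based range, which avoids initializing and incrementing the counter by hand. The sketch below shows only the loop skeleton; generate_one stands in for the FLAN tokenize/generate/decode calls and is not a function that exists in this repository.

    def generate_with_progress(n_samples, generate_one, debug=False):
        synthetic_data = []
        for count in range(1, n_samples + 1):
            generated_text = generate_one()  # placeholder for the FLAN generate/decode step
            synthetic_data.append(generated_text)
            if debug:
                print(f"Generated Text: {generated_text}")
            if count % 5 == 0:
                print(f"Percentage of Completion: {count / n_samples * 100:.2f}%, {count} of {n_samples}")
        return synthetic_data

Called as generate_with_progress(12, lambda: "stub tweet", debug=True), it reports progress after samples 5 and 10.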
--- SentimentAnalyzer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index 2b1ffd4..1d3e2e1 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -56,7 +56,7 @@ def generate_synthetic_data(self, topic, text, sentiment, n_samples, debug=False # Use top-k sampling and temperature sampling for more diverse outputs outputs = self.flan_model.generate( inputs.input_ids, - max_length=60, + max_length=100, num_return_sequences=1, do_sample=True, top_k=50, From b4df0cab9232cc606af6bf7bd09c4a8c57e3c372 Mon Sep 17 00:00:00 2001 From: Leonardo Moraglia Date: Sat, 27 Jul 2024 09:25:02 +0200 Subject: [PATCH 22/42] temperature set to 0.9 --- SentimentAnalyzer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index 2b1ffd4..2c8ab0a 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -60,7 +60,7 @@ def generate_synthetic_data(self, topic, text, sentiment, n_samples, debug=False num_return_sequences=1, do_sample=True, top_k=50, - temperature=0.5 + temperature=0.9 ) generated_text = self.flan_tokenizer.decode(outputs[0], skip_special_tokens=True) synthetic_data.append(generated_text) From bda7149cdb89aad631149341792563471972c4ac Mon Sep 17 00:00:00 2001 From: Leonardo Moraglia Date: Sat, 27 Jul 2024 09:26:17 +0200 Subject: [PATCH 23/42] max lenght 60 --- SentimentAnalyzer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index 16bc427..2c8ab0a 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -56,7 +56,7 @@ def generate_synthetic_data(self, topic, text, sentiment, n_samples, debug=False # Use top-k sampling and temperature sampling for more diverse outputs outputs = self.flan_model.generate( inputs.input_ids, - max_length=100, + max_length=60, num_return_sequences=1, do_sample=True, top_k=50, From 9eaddb61830e10131601089ac29f08c9ae8ee91f Mon Sep 17 00:00:00 2001 From: Leonardo Moraglia Date: Sat, 27 Jul 2024 09:45:15 +0200 Subject: [PATCH 24/42] implemenmted deep debug --- SentimentAnalyzer.py | 34 +++++++++++++++++++++------------- extract_stuff.py | 10 +++++----- main.py | 28 ++++++++++++++++++---------- 3 files changed, 44 insertions(+), 28 deletions(-) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index 2c8ab0a..a19fc77 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -12,13 +12,15 @@ def __init__(self): self.model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest" self.device = 0 if torch.cuda.is_available() else -1 # Use GPU if available self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) - self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, ignore_mismatched_sizes=True).to(self.device) + self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, + ignore_mismatched_sizes=True).to(self.device) self.classifier = pipeline("sentiment-analysis", model=self.model, tokenizer=self.tokenizer, device=self.device) # Initialize FLAN model for synthetic data generation self.flan_model_name = "google/flan-t5-small" self.flan_tokenizer = AutoTokenizer.from_pretrained(self.flan_model_name) self.flan_model = AutoModelForSeq2SeqLM.from_pretrained(self.flan_model_name).to(self.device) + def analyze_sentiment(self, text): results = self.classifier(text) return results[0]['label'] @@ -48,7 +50,7 @@ def map_target_to_label(self, target): # Generate synthetic data using the FLAN model def 
generate_synthetic_data(self, topic, text, sentiment, n_samples, debug=False): synthetic_data = [] - #print(f"Generating synthetic data for topic: {topic}, text: {text}, sentiment: {sentiment}") + # print(f"Generating synthetic data for topic: {topic}, text: {text}, sentiment: {sentiment}") count = 0 for _ in range(n_samples): prompt = f"Generate a tweet related to {topic} that expresses a {sentiment} sentiment and the tweet has to be semantically similar to: '{text}' " @@ -66,40 +68,46 @@ def generate_synthetic_data(self, topic, text, sentiment, n_samples, debug=False synthetic_data.append(generated_text) count += 1 if debug: - print(f"Generated Text: {generated_text}") + print(f"DEBUG - Generated Text: {generated_text}") + print(f"DEBUG - Percentage of Completion: {count / n_samples * 100:.2f}%, {count} of {n_samples}") + if int(count % 5) == 0: print(f"Percentage of Completion: {count / n_samples * 100:.2f}%, {count} of {n_samples}") return synthetic_data # Augment the training data with synthetic data - def generate_training_data(self, topics, texts, sentiments, n_samples=6): + def generate_training_data(self, topics, texts, sentiments, n_samples=6, debug=False): + print("Generating synthetic data...") generated_data = {'text': [], 'category': []} generated_data_with_topic = {'text': [], 'category': [], 'topic': []} for topic, text, sentiment in zip(topics, texts, sentiments): sentiment_text = self.map_target_to_label(sentiment) - synthetic_texts = self.generate_synthetic_data(topic, text, sentiment_text, n_samples) # List of synthetic texts + synthetic_texts = self.generate_synthetic_data(topic, text, sentiment_text, + n_samples) # List of synthetic texts generated_data['text'].extend(synthetic_texts) - generated_data['category'].extend([sentiment] * len(synthetic_texts)) # append sentiment to texts many times + generated_data['category'].extend( + [sentiment] * len(synthetic_texts)) # append sentiment to texts many times generated_data_with_topic['text'].extend(synthetic_texts) generated_data_with_topic['category'].extend([sentiment] * len(synthetic_texts)) generated_data_with_topic['topic'].extend([topic] * len(synthetic_texts)) + if debug: + print(f"DEBUG - Generated synthetic data for topic: {topic}, text: {text}, sentiment: {sentiment_text}") generated_df = pd.DataFrame(generated_data) generated_df_with_topics = pd.DataFrame(generated_data_with_topic) return generated_df, generated_df_with_topics - # Fine-tune the model on a custom dataset def fine_tune(self, df, epochs=3, batch_size=16, learning_rate=2e-5): # Preprocess the dataset - df = df.rename(columns={"text": "text", "category": "label"}) # Rename the columns - df['label'] = df['label'].astype(int) # Ensure the labels are integers - train_df, test_df = train_test_split(df, test_size=0.2, random_state=42) # Split the dataset + df = df.rename(columns={"text": "text", "category": "label"}) # Rename the columns + df['label'] = df['label'].astype(int) # Ensure the labels are integers + train_df, test_df = train_test_split(df, test_size=0.2, random_state=42) # Split the dataset - train_dataset = Dataset.from_pandas(train_df) # Load the dataset + train_dataset = Dataset.from_pandas(train_df) # Load the dataset test_dataset = Dataset.from_pandas(test_df) - def tokenize_function(examples): # Tokenize the text + def tokenize_function(examples): # Tokenize the text return self.tokenizer(examples["text"], padding="max_length", truncation=True) train_dataset = train_dataset.map(tokenize_function, batched=True) # Tokenize the dataset @@ 
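The fine_tune method being re-indented in this patch follows the usual datasets plus Trainer recipe: rename 'category' to 'label', split, tokenize, then train with a padding collator. A compact sketch of those steps, assuming a DataFrame with 'text' and 'category' columns and the sentiment model name used elsewhere in the series; the hyperparameters mirror the values mentioned in the code but are not authoritative:

import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          DataCollatorWithPadding, Trainer, TrainingArguments)

def fine_tune_sketch(df: pd.DataFrame,
                     model_name="cardiffnlp/twitter-roberta-base-sentiment-latest"):
    # Rename 'category' to 'label' and keep labels as integers (0, 1, 2).
    df = df.rename(columns={"category": "label"})
    df["label"] = df["label"].astype(int)
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                               ignore_mismatched_sizes=True)

    def tokenize(batch):
        return tokenizer(batch["text"], padding="max_length", truncation=True)

    # Tokenize, drop the raw text, and hand PyTorch tensors to the Trainer.
    train_ds = Dataset.from_pandas(train_df).map(tokenize, batched=True).remove_columns(["text"])
    test_ds = Dataset.from_pandas(test_df).map(tokenize, batched=True).remove_columns(["text"])
    train_ds.set_format("torch")
    test_ds.set_format("torch")

    args = TrainingArguments(output_dir="./results", eval_strategy="epoch",
                             num_train_epochs=3, per_device_train_batch_size=16,
                             learning_rate=2e-5)
    trainer = Trainer(model=model, args=args, train_dataset=train_ds,
                      eval_dataset=test_ds,
                      data_collator=DataCollatorWithPadding(tokenizer=tokenizer))
    trainer.train()
    return trainer.evaluate()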
-108,7 +116,7 @@ def tokenize_function(examples): # Tokenize the text train_dataset = train_dataset.remove_columns(["text"]) # Remove the text column after tokenization test_dataset = test_dataset.remove_columns(["text"]) - train_dataset.set_format("torch") # Set the format to PyTorch + train_dataset.set_format("torch") # Set the format to PyTorch test_dataset.set_format("torch") # Define the data collator diff --git a/extract_stuff.py b/extract_stuff.py index 2d779e9..392c06e 100644 --- a/extract_stuff.py +++ b/extract_stuff.py @@ -28,9 +28,9 @@ def augment_and_extract_metadata(dataset, extractor, topic_labels, file_path, de # If debug mode is enabled, print debug information percentage_complete = ((count + 1) / total_rows) * 100 if debug: - print(f"Text: {row['text']}") - print(f"Generated Metadata: Topic - {topic}") - print(f"Percentage of Completion: {percentage_complete:.2f}%, {count + 1} of {total_rows}") + print(f"DEBUG - Text: {row['text']}") + print(f"DEBUG - Generated Metadata: Topic - {topic}") + print(f"DEBUG - Percentage of Completion: {percentage_complete:.2f}%, {count + 1} of {total_rows}") if percentage_complete % 5 == 0: print(f"Percentage of Completion: {percentage_complete:.2f}%, {count + 1} of {total_rows}") @@ -70,8 +70,8 @@ def predict_sentiment(dataset, sentiment_analyzer, file_path, debug=False, batch # Calculate the percentage of completion percentage_complete = ((end) / total_rows) * 100 if debug: - print(f"Processed batch {start // batch_size + 1}: {start} to {end}") - print(f"Percentage of Completion: {percentage_complete:.2f}%, {end} of {total_rows}") + print(f"DEBUG - Processed batch {start // batch_size + 1}: {start} to {end}") + print(f"DEBUG - Percentage of Completion: {percentage_complete:.2f}%, {end} of {total_rows}") if percentage_complete % 5 == 0: print(f"Percentage of Completion: {percentage_complete:.2f}%") diff --git a/main.py b/main.py index 0b0814b..e442de7 100644 --- a/main.py +++ b/main.py @@ -1,5 +1,6 @@ import argparse import os + import matplotlib.pyplot as plt import pandas as pd import torch @@ -19,6 +20,8 @@ parser.add_argument('--dataset_type', type=str, default='tweets', choices=['tweets', 'TODO'], help='Type of dataset to load') parser.add_argument('--debug', type=bool, default=False, + help='Enable debug mode to print even more additional information') + parser.add_argument('--deep_debug', type=bool, default=False, help='Enable debug mode to print additional information') parser.add_argument('--percentage', type=float, default=100.0, help='Percentage of the dataset to use (e.g., 0.1 for 0.1%)') @@ -26,6 +29,7 @@ # Parse command-line arguments args = parser.parse_args() print("Debugging is set to: ", args.debug) + print("Deep Debugging is set to: ", args.deep_debug) print("Percentage is set to: ", args.percentage) # Print Torch availability and device information @@ -69,11 +73,11 @@ # Predict sentiment for the datasets train_data_with_sentiment = predict_sentiment(original_train_data.copy(), sentiment_analyzer, - train_sentiment_file_name, args.debug) + train_sentiment_file_name, args.deep_debug) test_data_with_sentiment = predict_sentiment(original_test_data.copy(), sentiment_analyzer, - test_sentiment_file_name, args.debug) + test_sentiment_file_name, args.deep_debug) val_data_with_sentiment = predict_sentiment(original_val_data.copy(), sentiment_analyzer, val_sentiment_file_name, - args.debug) + args.deep_debug) # Compute metrics for the train dataset @@ -110,11 +114,11 @@ val_file_name = os.path.join(base_path, 
f'val_augmented_{args.dataset_type}_{args.percentage}.csv') train_data_with_metadata = augment_and_extract_metadata(train_data_with_sentiment.copy(), extractor, - topic_labels, train_file_name, args.debug) + topic_labels, train_file_name, args.deep_debug) test_data_with_metadata = augment_and_extract_metadata(test_data_with_sentiment.copy(), extractor, - topic_labels, test_file_name, args.debug) + topic_labels, test_file_name, args.deep_debug) val_data_with_metadata = augment_and_extract_metadata(val_data_with_sentiment.copy(), extractor, - topic_labels, val_file_name, args.debug) + topic_labels, val_file_name, args.deep_debug) # Function to create subgroups based on metadata @@ -254,8 +258,10 @@ def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='a # Predict sentiment for the original dataset to see for improvements test_sentiment_file_name_v2 = os.path.join(base_path, f'test_sentiment_v2_{args.dataset_type}_{args.percentage}.csv') val_sentiment_file_name_v2 = os.path.join(base_path, f'val_sentiment_v2_{args.dataset_type}_{args.percentage}.csv') - test_data_with_sentiment_v2 = predict_sentiment(original_test_data.copy(), sentiment_analyzer, test_sentiment_file_name_v2, args.debug) - val_data_with_sentiment_v2 = predict_sentiment(original_val_data.copy(), sentiment_analyzer, val_sentiment_file_name_v2, args.debug) + test_data_with_sentiment_v2 = predict_sentiment(original_test_data.copy(), sentiment_analyzer, + test_sentiment_file_name_v2, args.deep_debug) + val_data_with_sentiment_v2 = predict_sentiment(original_val_data.copy(), sentiment_analyzer, + val_sentiment_file_name_v2, args.deep_debug) # Compute metrics for the test dataset test_true_labels = original_test_data['category'] @@ -271,8 +277,10 @@ def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='a test_file_name_v2 = os.path.join(base_path, f'test_augmented_v2_{args.dataset_type}_{args.percentage}.csv') val_file_name_v2 = os.path.join(base_path, f'val_augmented_v2_{args.dataset_type}_{args.percentage}.csv') - test_data_with_metadata_v2 = augment_and_extract_metadata(test_data_with_sentiment_v2.copy(), extractor, topic_labels, test_file_name_v2, args.debug) - val_data_with_metadata_v2 = augment_and_extract_metadata(val_data_with_sentiment_v2.copy(), extractor, topic_labels, val_file_name_v2, args.debug) + test_data_with_metadata_v2 = augment_and_extract_metadata(test_data_with_sentiment_v2.copy(), extractor, + topic_labels, test_file_name_v2, args.deep_debug) + val_data_with_metadata_v2 = augment_and_extract_metadata(val_data_with_sentiment_v2.copy(), extractor, topic_labels, + val_file_name_v2, args.deep_debug) # Create subgroups for the datasets test_subgroups_v2 = create_subgroups(test_data_with_metadata_v2) From fe5f47c9c8ee97307804db9344a15a379aaf2b1f Mon Sep 17 00:00:00 2001 From: Leonardo Moraglia Date: Sat, 27 Jul 2024 10:01:32 +0200 Subject: [PATCH 25/42] small fix --- SentimentAnalyzer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index a19fc77..ef39adc 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -82,8 +82,8 @@ def generate_training_data(self, topics, texts, sentiments, n_samples=6, debug=F for topic, text, sentiment in zip(topics, texts, sentiments): sentiment_text = self.map_target_to_label(sentiment) - synthetic_texts = self.generate_synthetic_data(topic, text, sentiment_text, - n_samples) # List of synthetic texts + synthetic_texts = self.generate_synthetic_data(topic, text, 
sentiment_text, n_samples, + debug) # List of synthetic texts generated_data['text'].extend(synthetic_texts) generated_data['category'].extend( [sentiment] * len(synthetic_texts)) # append sentiment to texts many times From 359cde1c8b3f35436e3da10c8ec3f4ed7a215f71 Mon Sep 17 00:00:00 2001 From: Leonardo Moraglia Date: Sat, 27 Jul 2024 10:34:43 +0200 Subject: [PATCH 26/42] small fix of print statements 5% --- extract_stuff.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/extract_stuff.py b/extract_stuff.py index 392c06e..8d10f33 100644 --- a/extract_stuff.py +++ b/extract_stuff.py @@ -32,7 +32,7 @@ def augment_and_extract_metadata(dataset, extractor, topic_labels, file_path, de print(f"DEBUG - Generated Metadata: Topic - {topic}") print(f"DEBUG - Percentage of Completion: {percentage_complete:.2f}%, {count + 1} of {total_rows}") - if percentage_complete % 5 == 0: + if int(percentage_complete % 5) == 0: print(f"Percentage of Completion: {percentage_complete:.2f}%, {count + 1} of {total_rows}") count += 1 @@ -68,11 +68,11 @@ def predict_sentiment(dataset, sentiment_analyzer, file_path, debug=False, batch # Extend the sentiments list with the batch sentiments sentiments.extend(batch_sentiments) # Calculate the percentage of completion - percentage_complete = ((end) / total_rows) * 100 + percentage_complete = (end / total_rows) * 100 if debug: print(f"DEBUG - Processed batch {start // batch_size + 1}: {start} to {end}") print(f"DEBUG - Percentage of Completion: {percentage_complete:.2f}%, {end} of {total_rows}") - if percentage_complete % 5 == 0: + if int(percentage_complete % 5) == 0: print(f"Percentage of Completion: {percentage_complete:.2f}%") dataset['sentiment'] = sentiments From 1ac3ec8dd26236e0e7ecbf87a122a003eedee2a6 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 11:24:32 +0200 Subject: [PATCH 27/42] tried to fix plotting --- main.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index e442de7..9a05d71 100644 --- a/main.py +++ b/main.py @@ -202,7 +202,6 @@ def weighted_metrics(metrics_df, support_df, metric='accuracy'): metrics_df['weighted_metric'] = metrics_df[metric] * metrics_df['support'] return metrics_df - def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='accuracy'): # Get support for each topic support_df = test_percentage_analysis_df[['subgroup', 'total']].rename(columns={'total': 'support'}) @@ -391,5 +390,15 @@ def plot_overall_accuracy_comparison(old_metrics, new_metrics): plt.show() + # Ensure the 'total' column exists in the metrics DataFrame + test_analysis_v2 = analyze_disparities(test_subgroups_v2) + val_analysis_v2 = analyze_disparities(val_subgroups_v2) + + # Print the DataFrame to verify the 'total' column + print("\nTest Analysis V2:") + print(test_analysis_v2) + print("\nValidation Analysis V2:") + print(val_analysis_v2) + # Calculate and plot the overall accuracy comparison plot_overall_accuracy_comparison(test_metrics, test_metrics_v2) \ No newline at end of file From 58d00e2a0e28b1287bef3a4d7b5a5730eff70d9b Mon Sep 17 00:00:00 2001 From: Leonardo Moraglia Date: Sat, 27 Jul 2024 11:27:00 +0200 Subject: [PATCH 28/42] small fix of print statements 5%, more print statements --- SentimentAnalyzer.py | 9 +++++++++ extract_stuff.py | 4 ++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index ef39adc..878d115 100644 --- a/SentimentAnalyzer.py +++ 
b/SentimentAnalyzer.py @@ -80,6 +80,8 @@ def generate_training_data(self, topics, texts, sentiments, n_samples=6, debug=F generated_data = {'text': [], 'category': []} generated_data_with_topic = {'text': [], 'category': [], 'topic': []} + count = 0 + total = len(texts) for topic, text, sentiment in zip(topics, texts, sentiments): sentiment_text = self.map_target_to_label(sentiment) synthetic_texts = self.generate_synthetic_data(topic, text, sentiment_text, n_samples, @@ -90,8 +92,15 @@ def generate_training_data(self, topics, texts, sentiments, n_samples=6, debug=F generated_data_with_topic['text'].extend(synthetic_texts) generated_data_with_topic['category'].extend([sentiment] * len(synthetic_texts)) generated_data_with_topic['topic'].extend([topic] * len(synthetic_texts)) + count += 1 if debug: print(f"DEBUG - Generated synthetic data for topic: {topic}, text: {text}, sentiment: {sentiment_text}") + # Print percentage of completion of total texts + percentage_complete = count / total * 100 + if int(percentage_complete) % 5 == 0: + print(f"DEBUG - Percentage of Completion: {percentage_complete:.2f}%, {count} of {total}") + + generated_df = pd.DataFrame(generated_data) generated_df_with_topics = pd.DataFrame(generated_data_with_topic) diff --git a/extract_stuff.py b/extract_stuff.py index 8d10f33..706c2af 100644 --- a/extract_stuff.py +++ b/extract_stuff.py @@ -32,7 +32,7 @@ def augment_and_extract_metadata(dataset, extractor, topic_labels, file_path, de print(f"DEBUG - Generated Metadata: Topic - {topic}") print(f"DEBUG - Percentage of Completion: {percentage_complete:.2f}%, {count + 1} of {total_rows}") - if int(percentage_complete % 5) == 0: + if int(percentage_complete) % 5 == 0: print(f"Percentage of Completion: {percentage_complete:.2f}%, {count + 1} of {total_rows}") count += 1 @@ -72,7 +72,7 @@ def predict_sentiment(dataset, sentiment_analyzer, file_path, debug=False, batch if debug: print(f"DEBUG - Processed batch {start // batch_size + 1}: {start} to {end}") print(f"DEBUG - Percentage of Completion: {percentage_complete:.2f}%, {end} of {total_rows}") - if int(percentage_complete % 5) == 0: + if int(percentage_complete) % 5 == 0: print(f"Percentage of Completion: {percentage_complete:.2f}%") dataset['sentiment'] = sentiments From 37daa304978779471d898172d41a0ceeb66ca9b5 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 11:43:34 +0200 Subject: [PATCH 29/42] tried to fix plotting --- main.py | 125 +++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 120 insertions(+), 5 deletions(-) diff --git a/main.py b/main.py index 9a05d71..0edb9a9 100644 --- a/main.py +++ b/main.py @@ -199,9 +199,10 @@ def weighted_metrics(metrics_df, support_df, metric='accuracy'): # Join metrics with their respective support counts metrics_df = metrics_df.copy() metrics_df = metrics_df.merge(support_df, left_on='topic', right_on='subgroup') - metrics_df['weighted_metric'] = metrics_df[metric] * metrics_df['support'] + metrics_df['weighted_metric'] = metrics_df[metric] * metrics_df['total'] return metrics_df + def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='accuracy'): # Get support for each topic support_df = test_percentage_analysis_df[['subgroup', 'total']].rename(columns={'total': 'support'}) @@ -361,16 +362,130 @@ def calculate_overall_accuracy(metrics_df): return overall_accuracy - def plot_overall_accuracy_comparison(old_metrics, new_metrics): + The + error + you + 're encountering, KeyError: ' 
+ total + ', indicates that the DataFrame metrics_df does not contain a column named ' + total + '. This likely happens in the function calculate_overall_accuracy. + + To + resolve + this, you + need + to + ensure + that + the + DataFrame + passed + to + calculate_overall_accuracy + has + a + 'total' + column.The + 'total' + column + appears + to + represent + the + support(i.e., the + count + of + instances) for each topic in your analysis.This support data should be extracted from the analyze_disparities function. + + Here’s + a + modified + version + of + the + relevant + functions and parts + of + your + code, ensuring + that + the + 'total' + column is present in the + DataFrame + passed + to + calculate_overall_accuracy. + + python + + + def weighted_metrics(metrics_df, support_df, metric='accuracy'): + # Join metrics with their respective support counts + metrics_df = metrics_df.copy() + metrics_df = metrics_df.merge(support_df, left_on='topic', right_on='subgroup') + metrics_df['weighted_metric'] = metrics_df[metric] * metrics_df['total'] + return metrics_df + + + def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='accuracy'): + # Get support for each topic + support_df = test_percentage_analysis_df[['subgroup', 'total']].rename(columns={'total': 'support'}) + + # Compute weighted metrics + weighted_metrics_df = weighted_metrics(test_metrics_df, support_df, metric) + + # Compute baseline accuracy + baseline_accuracy = weighted_metrics_df['accuracy'].mean() + + # Sort topics by their weighted metrics + sorted_metrics = weighted_metrics_df.sort_values(by='weighted_metric', ascending=False) # Sort by descending + + # Get top 3 and bottom 3 topics + top_3_topics = sorted_metrics.head(3)['topic'].tolist() + bottom_3_topics = sorted_metrics.tail(3)['topic'].tolist() + + # Adjust for baseline accuracy + bottom_3_topics_below_baseline = sorted_metrics[sorted_metrics['accuracy'] < baseline_accuracy].tail(3)[ + 'topic'].tolist() + + return top_3_topics, bottom_3_topics_below_baseline + + + def calculate_overall_accuracy(metrics_df): + """ + Calculate the overall accuracy from the metrics DataFrame. + + Parameters: + - metrics_df: DataFrame containing the metrics + + Returns: + - overall_accuracy: The overall accuracy + """ + total_support = metrics_df['total'].sum() + weighted_accuracy_sum = (metrics_df['accuracy'] * metrics_df['total']).sum() + overall_accuracy = weighted_accuracy_sum / total_support + return overall_accuracy + + + def plot_overall_accuracy_comparison(old_metrics, new_metrics, support_old, support_new): """ Plot the overall accuracy comparison before and after fine-tuning. 
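The calculate_overall_accuracy helper defined just above is a support-weighted mean: each topic's accuracy counts in proportion to how many examples ('total') that topic contributes. A toy example with made-up numbers shows why this differs from a plain average:

import pandas as pd

# Hypothetical per-topic metrics with their example counts.
metrics = pd.DataFrame({
    "topic": ["sports", "politics", "music"],
    "accuracy": [0.90, 0.60, 0.75],
    "total": [100, 50, 50],
})

# Support-weighted accuracy: sum(accuracy * total) / sum(total)
weighted = (metrics["accuracy"] * metrics["total"]).sum() / metrics["total"].sum()
print(weighted)                    # 0.7875
print(metrics["accuracy"].mean())  # 0.75, the unweighted mean, which ignores topic size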
Parameters: - old_metrics: DataFrame containing the old metrics - new_metrics: DataFrame containing the new metrics + - support_old: DataFrame containing the support data for the old metrics + - support_new: DataFrame containing the support data for the new metrics """ - overall_accuracy_old = calculate_overall_accuracy(old_metrics) - overall_accuracy_new = calculate_overall_accuracy(new_metrics) + old_metrics_with_total = old_metrics.merge(support_old[['subgroup', 'total']], left_on='topic', + right_on='subgroup') + new_metrics_with_total = new_metrics.merge(support_new[['subgroup', 'total']], left_on='topic', + right_on='subgroup') + + overall_accuracy_old = calculate_overall_accuracy(old_metrics_with_total) + overall_accuracy_new = calculate_overall_accuracy(new_metrics_with_total) accuracies = [overall_accuracy_old, overall_accuracy_new] labels = ['Old Model', 'New Model'] @@ -401,4 +516,4 @@ def plot_overall_accuracy_comparison(old_metrics, new_metrics): print(val_analysis_v2) # Calculate and plot the overall accuracy comparison - plot_overall_accuracy_comparison(test_metrics, test_metrics_v2) \ No newline at end of file + plot_overall_accuracy_comparison(test_metrics, test_metrics_v2, test_analysis, test_analysis_v2) \ No newline at end of file From e7bd36fbba41fe2f47fefe5c6ea76a85ffad8c7b Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 11:53:02 +0200 Subject: [PATCH 30/42] tried to fix plotting --- main.py | 107 -------------------------------------------------------- 1 file changed, 107 deletions(-) diff --git a/main.py b/main.py index 0edb9a9..55a9f21 100644 --- a/main.py +++ b/main.py @@ -346,113 +346,6 @@ def plot_metrics_comparison(old_metrics, new_metrics, metric='accuracy'): plot_metrics_comparison(test_metrics, test_metrics_v2, metric='f1-score') - def calculate_overall_accuracy(metrics_df): - """ - Calculate the overall accuracy from the metrics DataFrame. - - Parameters: - - metrics_df: DataFrame containing the metrics - - Returns: - - overall_accuracy: The overall accuracy - """ - total_support = metrics_df['total'].sum() - weighted_accuracy_sum = (metrics_df['accuracy'] * metrics_df['total']).sum() - overall_accuracy = weighted_accuracy_sum / total_support - return overall_accuracy - - - The - error - you - 're encountering, KeyError: ' - total - ', indicates that the DataFrame metrics_df does not contain a column named ' - total - '. This likely happens in the function calculate_overall_accuracy. - - To - resolve - this, you - need - to - ensure - that - the - DataFrame - passed - to - calculate_overall_accuracy - has - a - 'total' - column.The - 'total' - column - appears - to - represent - the - support(i.e., the - count - of - instances) for each topic in your analysis.This support data should be extracted from the analyze_disparities function. - - Here’s - a - modified - version - of - the - relevant - functions and parts - of - your - code, ensuring - that - the - 'total' - column is present in the - DataFrame - passed - to - calculate_overall_accuracy. 
- - python - - - def weighted_metrics(metrics_df, support_df, metric='accuracy'): - # Join metrics with their respective support counts - metrics_df = metrics_df.copy() - metrics_df = metrics_df.merge(support_df, left_on='topic', right_on='subgroup') - metrics_df['weighted_metric'] = metrics_df[metric] * metrics_df['total'] - return metrics_df - - - def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='accuracy'): - # Get support for each topic - support_df = test_percentage_analysis_df[['subgroup', 'total']].rename(columns={'total': 'support'}) - - # Compute weighted metrics - weighted_metrics_df = weighted_metrics(test_metrics_df, support_df, metric) - - # Compute baseline accuracy - baseline_accuracy = weighted_metrics_df['accuracy'].mean() - - # Sort topics by their weighted metrics - sorted_metrics = weighted_metrics_df.sort_values(by='weighted_metric', ascending=False) # Sort by descending - - # Get top 3 and bottom 3 topics - top_3_topics = sorted_metrics.head(3)['topic'].tolist() - bottom_3_topics = sorted_metrics.tail(3)['topic'].tolist() - - # Adjust for baseline accuracy - bottom_3_topics_below_baseline = sorted_metrics[sorted_metrics['accuracy'] < baseline_accuracy].tail(3)[ - 'topic'].tolist() - - return top_3_topics, bottom_3_topics_below_baseline - - def calculate_overall_accuracy(metrics_df): """ Calculate the overall accuracy from the metrics DataFrame. From 671040b255ebb1a456c2189cf8cbb375728a2b19 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 12:02:52 +0200 Subject: [PATCH 31/42] tried to fix plotting --- main.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/main.py b/main.py index 55a9f21..b191253 100644 --- a/main.py +++ b/main.py @@ -180,7 +180,6 @@ def analyze_disparities(subgroups): }) return pd.DataFrame(analysis_results) - # Analyze disparities for the datasets train_analysis = analyze_disparities(train_subgroups) test_analysis = analyze_disparities(test_subgroups) @@ -198,11 +197,14 @@ def analyze_disparities(subgroups): def weighted_metrics(metrics_df, support_df, metric='accuracy'): # Join metrics with their respective support counts metrics_df = metrics_df.copy() + print("Columns before merging with support_df:", metrics_df.columns) + print("Support DataFrame columns:", support_df.columns) metrics_df = metrics_df.merge(support_df, left_on='topic', right_on='subgroup') metrics_df['weighted_metric'] = metrics_df[metric] * metrics_df['total'] return metrics_df + # Function to get top and bottom topics based on weighted metrics def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='accuracy'): # Get support for each topic support_df = test_percentage_analysis_df[['subgroup', 'total']].rename(columns={'total': 'support'}) @@ -214,7 +216,7 @@ def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='a baseline_accuracy = weighted_metrics_df['accuracy'].mean() # Sort topics by their weighted metrics - sorted_metrics = weighted_metrics_df.sort_values(by='weighted_metric', ascending=False) # Sort by descending + sorted_metrics = weighted_metrics_df.sort_values(by='weighted_metric', ascending=False) # Sort by descending # Get top 3 and bottom 3 topics top_3_topics = sorted_metrics.head(3)['topic'].tolist() From c318f9fe45a3a81496b5e68637751936c88c31a3 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 12:04:46 +0200 Subject: [PATCH 32/42] tried to fix 
plotting --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index b191253..381957e 100644 --- a/main.py +++ b/main.py @@ -199,7 +199,7 @@ def weighted_metrics(metrics_df, support_df, metric='accuracy'): metrics_df = metrics_df.copy() print("Columns before merging with support_df:", metrics_df.columns) print("Support DataFrame columns:", support_df.columns) - metrics_df = metrics_df.merge(support_df, left_on='topic', right_on='subgroup') + metrics_df = metrics_df.merge(support_df, left_on='support', right_on='subgroup') metrics_df['weighted_metric'] = metrics_df[metric] * metrics_df['total'] return metrics_df From bbc65bc2befb2e03b2c5fa5f74121fce84549794 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 12:07:12 +0200 Subject: [PATCH 33/42] tried to fix plotting --- main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.py b/main.py index 381957e..de4f7f7 100644 --- a/main.py +++ b/main.py @@ -199,8 +199,8 @@ def weighted_metrics(metrics_df, support_df, metric='accuracy'): metrics_df = metrics_df.copy() print("Columns before merging with support_df:", metrics_df.columns) print("Support DataFrame columns:", support_df.columns) - metrics_df = metrics_df.merge(support_df, left_on='support', right_on='subgroup') - metrics_df['weighted_metric'] = metrics_df[metric] * metrics_df['total'] + metrics_df = metrics_df.merge(support_df, left_on='topic', right_on='subgroup') + metrics_df['weighted_metric'] = metrics_df[metric] * metrics_df['support'] return metrics_df From 34beb4f073fe87a0926b15a6bec9fe2a763a9418 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 12:21:04 +0200 Subject: [PATCH 34/42] tried to fix plotting --- main.py | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/main.py b/main.py index de4f7f7..d8bc9e2 100644 --- a/main.py +++ b/main.py @@ -197,8 +197,8 @@ def analyze_disparities(subgroups): def weighted_metrics(metrics_df, support_df, metric='accuracy'): # Join metrics with their respective support counts metrics_df = metrics_df.copy() - print("Columns before merging with support_df:", metrics_df.columns) - print("Support DataFrame columns:", support_df.columns) + # print("Columns before merging with support_df:", metrics_df.columns) + # print("Support DataFrame columns:", support_df.columns) metrics_df = metrics_df.merge(support_df, left_on='topic', right_on='subgroup') metrics_df['weighted_metric'] = metrics_df[metric] * metrics_df['support'] return metrics_df @@ -251,11 +251,15 @@ def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='a train_original_and_generated_data.to_csv(os.path.join(base_path, 'train_original_and_generated_data.csv'), index=False) model_save_path_v2 = os.path.join(base_path, f'finetuned_sentiment_model_{args.dataset_type}_{args.percentage}.pt') - print("Fine-tuning the sentiment analyzer with the generated+original dataset...") - fine_tuning_results_new = sentiment_analyzer.fine_tune(train_original_and_generated_data) # TODO NON CE - print(f"Fine-tuning results: {fine_tuning_results_new}") - # Save the fine-tuned model - torch.save(sentiment_analyzer.model, model_save_path_v2) + if os.path.exists(model_save_path_v2): + print("Loading the fine-tuned model from disk...") + sentiment_analyzer.model = torch.load(model_save_path_v2) + else: + print("Fine-tuning the sentiment analyzer with the 
generated+original dataset...") + fine_tuning_results_new = sentiment_analyzer.fine_tune(train_original_and_generated_data) # TODO NON CE + print(f"Fine-tuning results: {fine_tuning_results_new}") + # Save the fine-tuned model + torch.save(sentiment_analyzer.model, model_save_path_v2) # Predict sentiment for the original dataset to see for improvements test_sentiment_file_name_v2 = os.path.join(base_path, f'test_sentiment_v2_{args.dataset_type}_{args.percentage}.csv') @@ -338,8 +342,8 @@ def plot_metrics_comparison(old_metrics, new_metrics, metric='accuracy'): plt.legend() plt.tight_layout() - plt.show() - + plt.savefig(f'comparison_{metric}.png') + plt.close() # Plot the comparison for accuracy, precision, recall, and f1-score plot_metrics_comparison(test_metrics, test_metrics_v2, metric='accuracy') @@ -347,7 +351,6 @@ def plot_metrics_comparison(old_metrics, new_metrics, metric='accuracy'): plot_metrics_comparison(test_metrics, test_metrics_v2, metric='recall') plot_metrics_comparison(test_metrics, test_metrics_v2, metric='f1-score') - def calculate_overall_accuracy(metrics_df): """ Calculate the overall accuracy from the metrics DataFrame. @@ -363,7 +366,6 @@ def calculate_overall_accuracy(metrics_df): overall_accuracy = weighted_accuracy_sum / total_support return overall_accuracy - def plot_overall_accuracy_comparison(old_metrics, new_metrics, support_old, support_new): """ Plot the overall accuracy comparison before and after fine-tuning. @@ -397,18 +399,13 @@ def plot_overall_accuracy_comparison(old_metrics, new_metrics, support_old, supp plt.text(i, v + 0.01, f"{v:.2f}", ha='center', fontweight='bold') plt.tight_layout() - plt.show() + plt.savefig('overall_accuracy_comparison.png') + plt.close() # Ensure the 'total' column exists in the metrics DataFrame test_analysis_v2 = analyze_disparities(test_subgroups_v2) val_analysis_v2 = analyze_disparities(val_subgroups_v2) - # Print the DataFrame to verify the 'total' column - print("\nTest Analysis V2:") - print(test_analysis_v2) - print("\nValidation Analysis V2:") - print(val_analysis_v2) - # Calculate and plot the overall accuracy comparison plot_overall_accuracy_comparison(test_metrics, test_metrics_v2, test_analysis, test_analysis_v2) \ No newline at end of file From 724a4f29d119bf53625b397e2a4d26b3f351602d Mon Sep 17 00:00:00 2001 From: Leonardo Moraglia Date: Sat, 27 Jul 2024 14:55:06 +0200 Subject: [PATCH 35/42] reddit dataset is now enabled --- main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.py b/main.py index d8bc9e2..aa33960 100644 --- a/main.py +++ b/main.py @@ -17,7 +17,7 @@ # Set up argument parser for command-line options parser = argparse.ArgumentParser(description='Load dataset') - parser.add_argument('--dataset_type', type=str, default='tweets', choices=['tweets', 'TODO'], + parser.add_argument('--dataset_type', type=str, default='tweets', choices=['tweets', 'reddit'], help='Type of dataset to load') parser.add_argument('--debug', type=bool, default=False, help='Enable debug mode to print even more additional information') @@ -39,7 +39,7 @@ # Initialize dataset loader with the specified type and base path base_path = os.path.dirname(os.path.abspath(__file__)) - dataset_loader = DatasetLoad('tweets', base_path, args.percentage) + dataset_loader = DatasetLoad(args.dataset_type, base_path, args.percentage) dataset_loader.load_datasets() # Load the original train, test, and validation datasets From 23b89072a4c545eba5feb81bfb22ebd638b9e86b Mon Sep 17 00:00:00 2001 From: Raul Gatto 
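The load-if-present change a few patches back (checking model_save_path_v2 before fine-tuning) is a simple caching pattern so repeated runs skip the expensive training step. Reduced to its core, with the analyzer object and path treated as placeholders, it looks roughly like this:

import os
import torch

def load_or_fine_tune(sentiment_analyzer, train_df, model_path):
    # Reuse a previously fine-tuned model when it is already on disk.
    if os.path.exists(model_path):
        print("Loading the fine-tuned model from disk...")
        sentiment_analyzer.model = torch.load(model_path)
    else:
        print("Fine-tuning the sentiment analyzer...")
        results = sentiment_analyzer.fine_tune(train_df)
        print(f"Fine-tuning results: {results}")
        # Persist the whole model object, as the patch does with torch.save.
        torch.save(sentiment_analyzer.model, model_path)
    return sentiment_analyzer

Saving the whole module with torch.save, as the patch does, ties the checkpoint to the current class definitions; saving a state_dict, or using the transformers save_pretrained/from_pretrained pair, would be the more portable alternative.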
<126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 14:56:18 +0200 Subject: [PATCH 36/42] tried to fix plotting --- DatasetLoad.py | 3 ++- SentimentAnalyzer.py | 2 +- main.py | 38 +++++++++++++------------------------- 3 files changed, 16 insertions(+), 27 deletions(-) diff --git a/DatasetLoad.py b/DatasetLoad.py index 10a2f42..059a032 100644 --- a/DatasetLoad.py +++ b/DatasetLoad.py @@ -43,7 +43,8 @@ def load_datasets(self): data = data.rename(columns={'clean_comment': 'text'}) # truncate the text in the text column with over 512 characters data['text'] = data['text'].str.slice(0, 512) - + data['category'] = data['category'].map({-1: 0, 0: 1, 1: 2}) + data = data.dropna() elif self.dataset_type == 'tweets': print("Loading Twitter dataset...") diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index 878d115..cd9c727 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -132,7 +132,7 @@ def tokenize_function(examples): # Tokenize the text data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer) # Define training arguments - training_args = TrainingArguments( # Define the training arguments + training_args = TrainingArguments( # output_dir="./results", run_name="finetuning_sentiment_classifier", eval_strategy="epoch", diff --git a/main.py b/main.py index d8bc9e2..a5e7951 100644 --- a/main.py +++ b/main.py @@ -96,8 +96,10 @@ val_true_labels = original_val_data['category'] val_predicted_labels = val_data_with_sentiment['sentiment'] print("\nValidation Classification Report:") + val_report = classification_report(val_true_labels, val_predicted_labels, labels=[0, 1, 2], zero_division=0, + output_dict=True) print(classification_report(val_true_labels, val_predicted_labels, labels=[0, 1, 2], zero_division=0)) - + val_accuracy = val_report['accuracy'] # Initialize the metadata extractor extractor = MetadataExtractor() @@ -194,42 +196,33 @@ def analyze_disparities(subgroups): print(val_analysis) - def weighted_metrics(metrics_df, support_df, metric='accuracy'): + def weighted_metrics(metrics_df, support_df, accuracy, metric='accuracy'): # Join metrics with their respective support counts metrics_df = metrics_df.copy() - # print("Columns before merging with support_df:", metrics_df.columns) - # print("Support DataFrame columns:", support_df.columns) metrics_df = metrics_df.merge(support_df, left_on='topic', right_on='subgroup') - metrics_df['weighted_metric'] = metrics_df[metric] * metrics_df['support'] + metrics_df['weighted_metric'] = (metrics_df[metric] - accuracy) * metrics_df['support'] return metrics_df # Function to get top and bottom topics based on weighted metrics - def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='accuracy'): + def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, accuracy=val_accuracy, metric='accuracy'): # Get support for each topic support_df = test_percentage_analysis_df[['subgroup', 'total']].rename(columns={'total': 'support'}) # Compute weighted metrics - weighted_metrics_df = weighted_metrics(test_metrics_df, support_df, metric) - - # Compute baseline accuracy - baseline_accuracy = weighted_metrics_df['accuracy'].mean() + weighted_metrics_df = weighted_metrics(test_metrics_df, support_df, accuracy, metric) # Sort topics by their weighted metrics - sorted_metrics = weighted_metrics_df.sort_values(by='weighted_metric', ascending=False) # Sort by descending + sorted_metrics = weighted_metrics_df.sort_values(by='weighted_metric', ascending=False) # Sort by descending, so the first 3 
are the most disadvantaged # Get top 3 and bottom 3 topics - top_3_topics = sorted_metrics.head(3)['topic'].tolist() - bottom_3_topics = sorted_metrics.tail(3)['topic'].tolist() + bottom_3_topics = sorted_metrics.head(3)['topic'].tolist() + top_3_topics = sorted_metrics.tail(3)['topic'].tolist() - # Adjust for baseline accuracy - bottom_3_topics_below_baseline = sorted_metrics[sorted_metrics['accuracy'] < baseline_accuracy].tail(3)[ - 'topic'].tolist() + return bottom_3_topics, top_3_topics - return top_3_topics, bottom_3_topics_below_baseline - - top_3_topics, bottom_3_topics = get_top_lower_topics(val_metrics, val_analysis, metric='accuracy') + bottom_3_topics, top_3_topics = get_top_lower_topics(val_metrics, val_analysis, val_accuracy, metric='accuracy') print(f"Bottom 3 validation topics: {bottom_3_topics }") print("Augmenting the training dataset with synthetic data...") @@ -393,7 +386,7 @@ def plot_overall_accuracy_comparison(old_metrics, new_metrics, support_old, supp plt.xlabel('Model', fontweight='bold') plt.ylabel('Overall Accuracy', fontweight='bold') plt.title('Overall Accuracy Comparison', fontweight='bold') - plt.ylim(0, 1) # Assuming accuracy is between 0 and 1 + plt.ylim(0, 1) for i, v in enumerate(accuracies): plt.text(i, v + 0.01, f"{v:.2f}", ha='center', fontweight='bold') @@ -402,10 +395,5 @@ def plot_overall_accuracy_comparison(old_metrics, new_metrics, support_old, supp plt.savefig('overall_accuracy_comparison.png') plt.close() - - # Ensure the 'total' column exists in the metrics DataFrame - test_analysis_v2 = analyze_disparities(test_subgroups_v2) - val_analysis_v2 = analyze_disparities(val_subgroups_v2) - # Calculate and plot the overall accuracy comparison plot_overall_accuracy_comparison(test_metrics, test_metrics_v2, test_analysis, test_analysis_v2) \ No newline at end of file From a54d4d92f69834c3b0ebec252596850b546f3dd1 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 15:01:56 +0200 Subject: [PATCH 37/42] fixed bottom 3 --- main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.py b/main.py index b1d37e1..2068297 100644 --- a/main.py +++ b/main.py @@ -200,7 +200,7 @@ def weighted_metrics(metrics_df, support_df, accuracy, metric='accuracy'): # Join metrics with their respective support counts metrics_df = metrics_df.copy() metrics_df = metrics_df.merge(support_df, left_on='topic', right_on='subgroup') - metrics_df['weighted_metric'] = (metrics_df[metric] - accuracy) * metrics_df['support'] + metrics_df['weighted_metric'] = (accuracy - metrics_df[metric]) * metrics_df['support'] return metrics_df @@ -216,7 +216,7 @@ def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, accuracy= sorted_metrics = weighted_metrics_df.sort_values(by='weighted_metric', ascending=False) # Sort by descending, so the first 3 are the most disadvantaged # Get top 3 and bottom 3 topics - bottom_3_topics = sorted_metrics.head(3)['topic'].tolist() + bottom_3_topics = sorted_metrics[sorted_metrics['weighted_metric'] > 0].head(3)['topic'].tolist() top_3_topics = sorted_metrics.tail(3)['topic'].tolist() return bottom_3_topics, top_3_topics From 820e2833df02c08dd560cc5d697dc5a491325153 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 15:10:35 +0200 Subject: [PATCH 38/42] changed temperature --- SentimentAnalyzer.py | 2 +- main.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/SentimentAnalyzer.py 
b/SentimentAnalyzer.py index cd9c727..dbdd3d4 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -62,7 +62,7 @@ def generate_synthetic_data(self, topic, text, sentiment, n_samples, debug=False num_return_sequences=1, do_sample=True, top_k=50, - temperature=0.9 + temperature=0.7 ) generated_text = self.flan_tokenizer.decode(outputs[0], skip_special_tokens=True) synthetic_data.append(generated_text) diff --git a/main.py b/main.py index 2068297..52ebfea 100644 --- a/main.py +++ b/main.py @@ -223,7 +223,7 @@ def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, accuracy= bottom_3_topics, top_3_topics = get_top_lower_topics(val_metrics, val_analysis, val_accuracy, metric='accuracy') - print(f"Bottom 3 validation topics: {bottom_3_topics }") + print(f"Bottom 3 validation topics: {bottom_3_topics}") print("Augmenting the training dataset with synthetic data...") # Randomly select rows from bottom three topics in the training set From 7fe919977852f3c7affbcc26d43c0f758fceac8b Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 15:27:39 +0200 Subject: [PATCH 39/42] debug for top_lower --- main.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/main.py b/main.py index 52ebfea..7803442 100644 --- a/main.py +++ b/main.py @@ -100,7 +100,7 @@ output_dict=True) print(classification_report(val_true_labels, val_predicted_labels, labels=[0, 1, 2], zero_division=0)) val_accuracy = val_report['accuracy'] - + print(val_accuracy) # Initialize the metadata extractor extractor = MetadataExtractor() @@ -211,14 +211,14 @@ def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, accuracy= # Compute weighted metrics weighted_metrics_df = weighted_metrics(test_metrics_df, support_df, accuracy, metric) - + print(weighted_metrics_df) # Sort topics by their weighted metrics sorted_metrics = weighted_metrics_df.sort_values(by='weighted_metric', ascending=False) # Sort by descending, so the first 3 are the most disadvantaged - + print(sorted_metrics) # Get top 3 and bottom 3 topics bottom_3_topics = sorted_metrics[sorted_metrics['weighted_metric'] > 0].head(3)['topic'].tolist() top_3_topics = sorted_metrics.tail(3)['topic'].tolist() - + print(bottom_3_topics) return bottom_3_topics, top_3_topics From ec0a0a6608efb97720ec4c419317a9e2492c50b6 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 15:35:34 +0200 Subject: [PATCH 40/42] added stratification --- DatasetLoad.py | 4 ++-- main.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/DatasetLoad.py b/DatasetLoad.py index 059a032..23d2f69 100644 --- a/DatasetLoad.py +++ b/DatasetLoad.py @@ -63,8 +63,8 @@ def load_datasets(self): # Ensure the first column is 'text' and the second column is 'category' data = data[['text', 'category'] + [col for col in data.columns if col not in ['text', 'category']]] - train_data, temp_data = train_test_split(data, test_size=0.3, random_state=42) - self.val_data, self.test_data = train_test_split(temp_data, test_size=0.5, random_state=42) + train_data, temp_data = train_test_split(data, test_size=0.3, random_state=42, stratify=data['category']) + self.val_data, self.test_data = train_test_split(temp_data, test_size=0.5, random_state=42, stratify=temp_data['category']) self.train_data = train_data if self.percentage < 100.0: diff --git a/main.py b/main.py index 7803442..cc23c76 100644 --- a/main.py +++ b/main.py @@ -211,14 +211,14 @@ def 
get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, accuracy= # Compute weighted metrics weighted_metrics_df = weighted_metrics(test_metrics_df, support_df, accuracy, metric) - print(weighted_metrics_df) + # print(weighted_metrics_df) # Sort topics by their weighted metrics sorted_metrics = weighted_metrics_df.sort_values(by='weighted_metric', ascending=False) # Sort by descending, so the first 3 are the most disadvantaged - print(sorted_metrics) + # print(sorted_metrics) # Get top 3 and bottom 3 topics bottom_3_topics = sorted_metrics[sorted_metrics['weighted_metric'] > 0].head(3)['topic'].tolist() top_3_topics = sorted_metrics.tail(3)['topic'].tolist() - print(bottom_3_topics) + # print(bottom_3_topics) return bottom_3_topics, top_3_topics From eaff928e97e697bb0542f81ce4c1a69187b2ee69 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 15:41:40 +0200 Subject: [PATCH 41/42] removed stratification --- DatasetLoad.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DatasetLoad.py b/DatasetLoad.py index 23d2f69..059a032 100644 --- a/DatasetLoad.py +++ b/DatasetLoad.py @@ -63,8 +63,8 @@ def load_datasets(self): # Ensure the first column is 'text' and the second column is 'category' data = data[['text', 'category'] + [col for col in data.columns if col not in ['text', 'category']]] - train_data, temp_data = train_test_split(data, test_size=0.3, random_state=42, stratify=data['category']) - self.val_data, self.test_data = train_test_split(temp_data, test_size=0.5, random_state=42, stratify=temp_data['category']) + train_data, temp_data = train_test_split(data, test_size=0.3, random_state=42) + self.val_data, self.test_data = train_test_split(temp_data, test_size=0.5, random_state=42) self.train_data = train_data if self.percentage < 100.0: From bacb3a9295a46778a8ab7dc139f352d9e99bbf51 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 16:35:03 +0200 Subject: [PATCH 42/42] samples changed from 6 to 50 --- SentimentAnalyzer.py | 2 +- main.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index dbdd3d4..e726486 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -75,7 +75,7 @@ def generate_synthetic_data(self, topic, text, sentiment, n_samples, debug=False return synthetic_data # Augment the training data with synthetic data - def generate_training_data(self, topics, texts, sentiments, n_samples=6, debug=False): + def generate_training_data(self, topics, texts, sentiments, n_samples=50, debug=False): print("Generating synthetic data...") generated_data = {'text': [], 'category': []} generated_data_with_topic = {'text': [], 'category': [], 'topic': []} diff --git a/main.py b/main.py index cc23c76..d811b48 100644 --- a/main.py +++ b/main.py @@ -205,7 +205,7 @@ def weighted_metrics(metrics_df, support_df, accuracy, metric='accuracy'): # Function to get top and bottom topics based on weighted metrics - def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, accuracy=val_accuracy, metric='accuracy'): + def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df,k=3, accuracy=val_accuracy, metric='accuracy'): # Get support for each topic support_df = test_percentage_analysis_df[['subgroup', 'total']].rename(columns={'total': 'support'}) @@ -217,12 +217,12 @@ def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, accuracy= # 
print(sorted_metrics) # Get top 3 and bottom 3 topics bottom_3_topics = sorted_metrics[sorted_metrics['weighted_metric'] > 0].head(3)['topic'].tolist() - top_3_topics = sorted_metrics.tail(3)['topic'].tolist() + top_3_topics = sorted_metrics.tail(k)['topic'].tolist() # print(bottom_3_topics) return bottom_3_topics, top_3_topics - bottom_3_topics, top_3_topics = get_top_lower_topics(val_metrics, val_analysis, val_accuracy, metric='accuracy') + bottom_3_topics, top_3_topics = get_top_lower_topics(val_metrics, val_analysis, 3, val_accuracy, metric='accuracy') print(f"Bottom 3 validation topics: {bottom_3_topics}") print("Augmenting the training dataset with synthetic data...")
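After this last patch, the selection of topics to augment works on a support-weighted accuracy gap: topics whose accuracy falls below the overall validation accuracy get a positive gap, scaled by their example count, and the largest positive gaps are augmented first. A condensed sketch with simplified names (get_disadvantaged_topics and weighted_gap stand in for the script's get_top_lower_topics and weighted_metric) and toy numbers:

import pandas as pd

def get_disadvantaged_topics(metrics_df, analysis_df, overall_accuracy, k=3, metric="accuracy"):
    # analysis_df has one row per topic with its example count in 'total'.
    support = analysis_df[["subgroup", "total"]].rename(columns={"total": "support"})
    merged = metrics_df.merge(support, left_on="topic", right_on="subgroup")

    # Positive weighted gap: the topic performs below the overall accuracy, scaled by its size.
    merged["weighted_gap"] = (overall_accuracy - merged[metric]) * merged["support"]
    ranked = merged.sort_values("weighted_gap", ascending=False)

    bottom = ranked[ranked["weighted_gap"] > 0].head(k)["topic"].tolist()
    top = ranked.tail(k)["topic"].tolist()
    return bottom, top

# Toy data: overall accuracy 0.80, so topics 'b' and 'c' sit below it.
metrics = pd.DataFrame({"topic": ["a", "b", "c"], "accuracy": [0.90, 0.70, 0.60]})
analysis = pd.DataFrame({"subgroup": ["a", "b", "c"], "total": [100, 80, 20]})
bottom, top = get_disadvantaged_topics(metrics, analysis, overall_accuracy=0.80, k=2)
print(bottom)  # ['b', 'c'] -> the most disadvantaged topics, by weighted gap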