From b6197fa9863adf8c26e39648be9844f52e66cc60 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Fri, 26 Jul 2024 17:36:38 +0200 Subject: [PATCH 01/42] changed from target to category in line original_train_data --- DatasetLoad.py | 1 + SentimentAnalyzer.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/DatasetLoad.py b/DatasetLoad.py index dc3f877..10a2f42 100644 --- a/DatasetLoad.py +++ b/DatasetLoad.py @@ -57,6 +57,7 @@ def load_datasets(self): data = data.rename(columns={'Tweet': 'text'}) # remove the rows of the text column in which the text is "Not Available" data = data[data['text'] != 'Not Available'] + data = data.dropna() # Ensure the first column is 'text' and the second column is 'category' data = data[['text', 'category'] + [col for col in data.columns if col not in ['text', 'category']]] diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index 66546c3..642d1af 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -29,11 +29,11 @@ def map_label_to_target(self, label): return None # Generate synthetic data using LLMs to be defined - def generate_synthetic_data(self, topic, n_samples): + def generate_synthetic_data(self, topic, text, n_samples): openai.api_key = 'YOUR_API_KEY' synthetic_data = [] for _ in range(n_samples): - prompt = f"Generate six tweets related to {topic} that expresses sentiment." + prompt = f"Generate six tweets related to {topic} that expresses sentiment similar to {text}" response = openai.Completion.create( engine="text-davinci-003", prompt=prompt, From 7c9ae480147ce7203d24f8b0fa702b392dc44683 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Fri, 26 Jul 2024 17:54:39 +0200 Subject: [PATCH 02/42] First implementation in main of generating new data --- SentimentAnalyzer.py | 24 +++++++-------- main.py | 73 +++++++++++++++++++++++++++++--------------- 2 files changed, 60 insertions(+), 37 deletions(-) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index 642d1af..b8ca503 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -13,6 +13,10 @@ def __init__(self): self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, ignore_mismatched_sizes=True).to(self.device) self.classifier = pipeline("sentiment-analysis", model=self.model, tokenizer=self.tokenizer, device=self.device) + # Initialize FLAN model for synthetic data generation + self.flan_model_name = "google/flan-t5-small" + self.flan_tokenizer = AutoTokenizer.from_pretrained(self.flan_model_name) + self.flan_model = AutoModelForSeq2SeqLM.from_pretrained(self.flan_model_name).to(self.device) def analyze_sentiment(self, text): results = self.classifier(text) return results[0]['label'] @@ -28,18 +32,15 @@ def map_label_to_target(self, label): else: return None - # Generate synthetic data using LLMs to be defined + # Generate synthetic data using Hugging Face model def generate_synthetic_data(self, topic, text, n_samples): - openai.api_key = 'YOUR_API_KEY' synthetic_data = [] for _ in range(n_samples): - prompt = f"Generate six tweets related to {topic} that expresses sentiment similar to {text}" - response = openai.Completion.create( - engine="text-davinci-003", - prompt=prompt, - max_tokens=60 - ) - synthetic_data.append(response.choices[0].text.strip()) + prompt = f"Generate a tweet related to {topic} that expresses sentiment similar to: '{text}'" + inputs = self.flan_tokenizer(prompt, return_tensors="pt").to(self.device) + outputs = 
self.flan_model.generate(inputs.input_ids, max_length=60, num_return_sequences=1) + generated_text = self.flan_tokenizer.decode(outputs[0], skip_special_tokens=True) + synthetic_data.append(generated_text) return synthetic_data def augment_training_data(self, topics, n_samples=100): @@ -47,7 +48,6 @@ def augment_training_data(self, topics, n_samples=100): augmented_data_with_topics = {'text': [], 'label': [], 'topic': []} for topic in topics: synthetic_texts = self.generate_synthetic_data(topic, n_samples) - # Assuming the sentiment label for generated data augmented_data['text'].extend(synthetic_texts) augmented_data['label'].extend([1] * len(synthetic_texts)) # Defaulting to neutral augmented_data_with_topics['text'].extend(synthetic_texts) @@ -55,10 +55,10 @@ def augment_training_data(self, topics, n_samples=100): augmented_data_with_topics['topic'].extend([topic] * len(synthetic_texts)) augmented_df = pd.DataFrame(augmented_data) - augmented_df_with_topics = pd.DaataFrame(augmented_data_with_topics) + augmented_df_with_topics = pd.DataFrame(augmented_data_with_topics) return augmented_df, augmented_df_with_topics - def fine_tune_with_augmented_data(self, topics, n_samples=100, epochs=3, batch_size=16, learning_rate=2e-5): + def fine_tune_with_augmented_data(self, topics, n_samples=6, epochs=3, batch_size=16, learning_rate=2e-5): augmented_train_data, augmented_train_data_with_topics = self.augment_training_data(topics, n_samples) return self.fine_tune(augmented_train_data, epochs, batch_size, learning_rate), augmented_train_data_with_topics diff --git a/main.py b/main.py index cc7b16e..32cca72 100644 --- a/main.py +++ b/main.py @@ -153,32 +153,55 @@ def compute_metrics(subgroups, true_labels_column='category', pred_labels_column print("\nValidation Metrics per Topic") print(val_metrics) - # Function to analyze disparities in sentiment predictions - def analyze_disparities(subgroups): - analysis_results = [] - for subgroup_name, subgroup_data in subgroups.items(): - if not subgroup_data.empty: - sentiment_counts = subgroup_data['sentiment'].value_counts(normalize=True) * 100 - analysis_results.append({ - 'subgroup': subgroup_name, - 'total': len(subgroup_data), - 'negative': sentiment_counts.get(0, 0), - 'neutral': sentiment_counts.get(1, 0), - 'positive': sentiment_counts.get(2, 0), - }) - return pd.DataFrame(analysis_results) + # Identify least performing topics + overall_accuracy = train_metrics['accuracy'].mean() + least_performing_topics = train_metrics[train_metrics['accuracy'] < overall_accuracy]['topic'].tolist() + print(f"Least performing topics: {least_performing_topics}") + + # Generate and augment data for least performing topics + synthetic_texts = generate_and_augment_data(sentiment_analyzer, least_performing_topics, original_train_data, + n_samples=100) + # Create a new DataFrame for the synthetic data + synthetic_df = pd.DataFrame({ + 'text': synthetic_texts, + 'category': [1] * len(synthetic_texts), # Assuming neutral category for synthetic data + 'topic': least_performing_topics * (len(synthetic_texts) // len(least_performing_topics)) + }) - # Analyze disparities for the datasets - train_analysis = analyze_disparities(train_subgroups) - test_analysis = analyze_disparities(test_subgroups) - val_analysis = analyze_disparities(val_subgroups) + # Augment original training data with synthetic data + augmented_train_data = pd.concat([original_train_data, synthetic_df], ignore_index=True) - # Print the analysis results - print("Train Percentage Analysis") - 
print(train_analysis) - print("\nTest Percentage Analysis") - print(test_analysis) - print("\nValidation Percentage Analysis") - print(val_analysis) + # Fine-tune the sentiment analyzer with the augmented dataset + augmented_fine_tuning_results = sentiment_analyzer.fine_tune(augmented_train_data) + print(f"Fine-tuning results with augmented data: {augmented_fine_tuning_results}") + + # Function to analyze disparities in sentiment predictions + # def analyze_disparities(subgroups): + # analysis_results = [] + # for subgroup_name, subgroup_data in subgroups.items(): + # if not subgroup_data.empty: + # sentiment_counts = subgroup_data['sentiment'].value_counts(normalize=True) * 100 + # analysis_results.append({ + # 'subgroup': subgroup_name, + # 'total': len(subgroup_data), + # 'negative': sentiment_counts.get(0, 0), + # 'neutral': sentiment_counts.get(1, 0), + # 'positive': sentiment_counts.get(2, 0), + # }) + # return pd.DataFrame(analysis_results) + # + # + # # Analyze disparities for the datasets + # train_analysis = analyze_disparities(train_subgroups) + # test_analysis = analyze_disparities(test_subgroups) + # val_analysis = analyze_disparities(val_subgroups) + # + # # Print the analysis results + # print("Train Percentage Analysis") + # print(train_analysis) + # print("\nTest Percentage Analysis") + # print(test_analysis) + # print("\nValidation Percentage Analysis") + # print(val_analysis) From cf24a153328c992ab0d42554ef30ae92a9e43f87 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Fri, 26 Jul 2024 17:57:49 +0200 Subject: [PATCH 03/42] Check if saved model exists or not --- main.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/main.py b/main.py index 32cca72..b69aca1 100644 --- a/main.py +++ b/main.py @@ -48,12 +48,18 @@ # Initialize the sentiment analyzer sentiment_analyzer = SentimentAnalyzer() - # Fine-tune the sentiment analyzer with the original dataset - fine_tuning_results = sentiment_analyzer.fine_tune(original_train_data) - print(f"Fine-tuning results: {fine_tuning_results}") - # Extract metadata for the datasets base_path = os.path.dirname(os.path.abspath(__file__)) + # Check if a saved model exists + if os.path.exists(model_save_path): + print("Loading the fine-tuned model from disk...") + sentiment_analyzer.model = torch.load(model_save_path) + else: + print("Fine-tuning the sentiment analyzer with the original dataset...") + fine_tuning_results = sentiment_analyzer.fine_tune(original_train_data) + print(f"Fine-tuning results: {fine_tuning_results}") + # Save the fine-tuned model + torch.save(sentiment_analyzer.model, model_save_path) # Extract metadata for the datasets train_sentiment_file_name = os.path.join(base_path, f'train_sentiment_{args.dataset_type}_{args.percentage}.csv') From 36cc3d9f791e75233653021e35965ac93185bc7f Mon Sep 17 00:00:00 2001 From: Leonardo Moraglia Date: Fri, 26 Jul 2024 18:04:28 +0200 Subject: [PATCH 04/42] implemented model_save_path --- main.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/main.py b/main.py index b69aca1..d4f0d04 100644 --- a/main.py +++ b/main.py @@ -1,15 +1,15 @@ import argparse import os -import wandb + import pandas as pd import torch from sklearn.metrics import classification_report + from DatasetLoad import DatasetLoad from MetadataExtractor import MetadataExtractor from SentimentAnalyzer import SentimentAnalyzer from extract_stuff import augment_and_extract_metadata, predict_sentiment - os.environ["WANDB_API_KEY"] = 
"21cb0c9433eeca19401ee01e9b1bc9e4b6f7a696" if __name__ == "__main__": @@ -50,6 +50,7 @@ # Extract metadata for the datasets base_path = os.path.dirname(os.path.abspath(__file__)) + model_save_path = os.path.join(base_path, f'sentiment_model_{args.dataset_type}_{args.percentage}.pt') # Check if a saved model exists if os.path.exists(model_save_path): print("Loading the fine-tuned model from disk...") From 61c687c97ff172e62946daa616107803cedb96fc Mon Sep 17 00:00:00 2001 From: Leonardo Moraglia Date: Fri, 26 Jul 2024 18:37:15 +0200 Subject: [PATCH 05/42] first implementation fo the bottom topics --- main.py | 117 ++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 84 insertions(+), 33 deletions(-) diff --git a/main.py b/main.py index d4f0d04..25046e2 100644 --- a/main.py +++ b/main.py @@ -165,9 +165,90 @@ def compute_metrics(subgroups, true_labels_column='category', pred_labels_column least_performing_topics = train_metrics[train_metrics['accuracy'] < overall_accuracy]['topic'].tolist() print(f"Least performing topics: {least_performing_topics}") + + # Function to analyze disparities in sentiment predictions + def analyze_disparities(subgroups): + analysis_results = [] + for subgroup_name, subgroup_data in subgroups.items(): + if not subgroup_data.empty: + sentiment_counts = subgroup_data['sentiment'].value_counts(normalize=True) * 100 + analysis_results.append({ + 'subgroup': subgroup_name, + 'total': len(subgroup_data), + 'negative': sentiment_counts.get(0, 0), + 'neutral': sentiment_counts.get(1, 0), + 'positive': sentiment_counts.get(2, 0), + }) + return pd.DataFrame(analysis_results) + + + # Analyze disparities for the datasets + train_analysis = analyze_disparities(train_subgroups) + test_analysis = analyze_disparities(test_subgroups) + val_analysis = analyze_disparities(val_subgroups) + + # Print the analysis results + print("Train Percentage Analysis") + print(train_analysis) + print("\nTest Percentage Analysis") + print(test_analysis) + print("\nValidation Percentage Analysis") + print(val_analysis) + + + def weighted_metrics(metrics_df, support_df, metric='accuracy'): + # Join metrics with their respective support counts + metrics_df = metrics_df.copy() + metrics_df = metrics_df.merge(support_df, left_on='topic', right_on='subgroup') + metrics_df['weighted_metric'] = metrics_df[metric] * metrics_df['support'] + return metrics_df + + + def get_top_bottom_topics(test_metrics_df, test_percentage_analysis_df, metric='accuracy'): + # Get support for each topic + support_df = test_percentage_analysis_df[['subgroup', 'total']].rename(columns={'total': 'support'}) + + # Compute weighted metrics + weighted_metrics_df = weighted_metrics(test_metrics_df, support_df, metric) + + # Compute baseline accuracy + baseline_accuracy = weighted_metrics_df['accuracy'].mean() + + # Sort topics by their weighted metrics + sorted_metrics = weighted_metrics_df.sort_values(by='weighted_metric', ascending=False) + + # Get top 3 and bottom 3 topics + top_3_topics = sorted_metrics.head(3)['topic'].tolist() + bottom_3_topics = sorted_metrics.tail(3)['topic'].tolist() + + # Adjust for baseline accuracy + bottom_3_topics_below_baseline = sorted_metrics[sorted_metrics['accuracy'] < baseline_accuracy].tail(3)[ + 'topic'].tolist() + + return top_3_topics, bottom_3_topics_below_baseline + + + topics = get_top_bottom_topics(test_metrics, test_analysis, metric='accuracy') + print(f"Top 3 (lower score) topics: {topics[0]}") + + + def generate_and_augment_data(sentiment_analyzer, topics, 
train_data_with_metadata, n_samples): + # for each topic, select randomly n samples of text from the training data augmented with metadata (containing the topic). Then using the sentences as baseline, generate more that will later be useed to train the model + synthetic_texts = [] + for topic in topics: + topic_data = train_data_with_metadata[train_data_with_metadata['topic'] == topic] + topic_samples = topic_data.sample(n_samples, replace=True) + for index, row in topic_samples.iterrows(): + synthetic_texts.extend(sentiment_analyzer.generate_synthetic_data(row['topic'], row['text'], n_samples)) + return synthetic_texts + # Generate and augment data for least performing topics - synthetic_texts = generate_and_augment_data(sentiment_analyzer, least_performing_topics, original_train_data, - n_samples=100) + synthetic_texts = generate_and_augment_data(sentiment_analyzer, topics, train_data_with_metadata, + n_samples=10) + if args.debug: + print(f"Generated {len(synthetic_texts)} synthetic texts for least performing topics") + print("Sample synthetic texts:") + print(synthetic_texts[:5]) # Create a new DataFrame for the synthetic data synthetic_df = pd.DataFrame({ @@ -181,34 +262,4 @@ def compute_metrics(subgroups, true_labels_column='category', pred_labels_column # Fine-tune the sentiment analyzer with the augmented dataset augmented_fine_tuning_results = sentiment_analyzer.fine_tune(augmented_train_data) - print(f"Fine-tuning results with augmented data: {augmented_fine_tuning_results}") - - # Function to analyze disparities in sentiment predictions - # def analyze_disparities(subgroups): - # analysis_results = [] - # for subgroup_name, subgroup_data in subgroups.items(): - # if not subgroup_data.empty: - # sentiment_counts = subgroup_data['sentiment'].value_counts(normalize=True) * 100 - # analysis_results.append({ - # 'subgroup': subgroup_name, - # 'total': len(subgroup_data), - # 'negative': sentiment_counts.get(0, 0), - # 'neutral': sentiment_counts.get(1, 0), - # 'positive': sentiment_counts.get(2, 0), - # }) - # return pd.DataFrame(analysis_results) - # - # - # # Analyze disparities for the datasets - # train_analysis = analyze_disparities(train_subgroups) - # test_analysis = analyze_disparities(test_subgroups) - # val_analysis = analyze_disparities(val_subgroups) - # - # # Print the analysis results - # print("Train Percentage Analysis") - # print(train_analysis) - # print("\nTest Percentage Analysis") - # print(test_analysis) - # print("\nValidation Percentage Analysis") - # print(val_analysis) - + print(f"Fine-tuning results with augmented data: {augmented_fine_tuning_results}") \ No newline at end of file From 4af3e448b076779edae40a5d584777530da08475 Mon Sep 17 00:00:00 2001 From: Leonardo Moraglia Date: Fri, 26 Jul 2024 18:45:55 +0200 Subject: [PATCH 06/42] missing import --- SentimentAnalyzer.py | 4 +++- main.py | 7 ++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index b8ca503..febb21e 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -1,7 +1,9 @@ import pandas as pd import torch from sklearn.model_selection import train_test_split -from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, pipeline +from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, \ + DataCollatorWithPadding, pipeline, AutoModelForSeq2SeqLM + from datasets import Dataset diff --git a/main.py b/main.py index 
25046e2..c2ae170 100644 --- a/main.py +++ b/main.py @@ -204,7 +204,7 @@ def weighted_metrics(metrics_df, support_df, metric='accuracy'): return metrics_df - def get_top_bottom_topics(test_metrics_df, test_percentage_analysis_df, metric='accuracy'): + def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='accuracy'): # Get support for each topic support_df = test_percentage_analysis_df[['subgroup', 'total']].rename(columns={'total': 'support'}) @@ -228,12 +228,13 @@ def get_top_bottom_topics(test_metrics_df, test_percentage_analysis_df, metric=' return top_3_topics, bottom_3_topics_below_baseline - topics = get_top_bottom_topics(test_metrics, test_analysis, metric='accuracy') + topics = get_top_lower_topics(test_metrics, test_analysis, metric='accuracy') print(f"Top 3 (lower score) topics: {topics[0]}") def generate_and_augment_data(sentiment_analyzer, topics, train_data_with_metadata, n_samples): - # for each topic, select randomly n samples of text from the training data augmented with metadata (containing the topic). Then using the sentences as baseline, generate more that will later be useed to train the model + # for each topic, select randomly n samples of text from the training data augmented with metadata (containing the topic). + # Then using the sentences as baseline, generate more that will later be used to train the model synthetic_texts = [] for topic in topics: topic_data = train_data_with_metadata[train_data_with_metadata['topic'] == topic] From 8b0adbf8c00e98a6e7d9eb37e3e82ea8a3dadea4 Mon Sep 17 00:00:00 2001 From: Leonardo Moraglia Date: Fri, 26 Jul 2024 18:56:52 +0200 Subject: [PATCH 07/42] use val instead of test --- main.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/main.py b/main.py index c2ae170..af746bf 100644 --- a/main.py +++ b/main.py @@ -155,16 +155,17 @@ def compute_metrics(subgroups, true_labels_column='category', pred_labels_column print("Train Metrics per Topic") print(train_metrics) + + # Identify the least performing topics + overall_accuracy = train_metrics['accuracy'].mean() + least_performing_topics = train_metrics[train_metrics['accuracy'] < overall_accuracy]['topic'].tolist() + print(f"Least performing training topics: {least_performing_topics}") + print("\nTest Metrics per Topic") print(test_metrics) print("\nValidation Metrics per Topic") print(val_metrics) - # Identify least performing topics - overall_accuracy = train_metrics['accuracy'].mean() - least_performing_topics = train_metrics[train_metrics['accuracy'] < overall_accuracy]['topic'].tolist() - print(f"Least performing topics: {least_performing_topics}") - # Function to analyze disparities in sentiment predictions def analyze_disparities(subgroups): @@ -228,8 +229,8 @@ def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='a return top_3_topics, bottom_3_topics_below_baseline - topics = get_top_lower_topics(test_metrics, test_analysis, metric='accuracy') - print(f"Top 3 (lower score) topics: {topics[0]}") + topics = get_top_lower_topics(val_metrics, val_analysis, metric='accuracy') + print(f"Top 3 (lower score) validation topics: {topics[0]}") def generate_and_augment_data(sentiment_analyzer, topics, train_data_with_metadata, n_samples): From 929c7324139bee74a608cdbff3ce79c2b6d762b3 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Fri, 26 Jul 2024 19:08:37 +0200 Subject: [PATCH 08/42] Solving raul pull problems --- SentimentAnalyzer.py | 26 
++++++++++++++++---------- main.py | 2 +- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index febb21e..63003fc 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -34,34 +34,40 @@ def map_label_to_target(self, label): else: return None - # Generate synthetic data using Hugging Face model - def generate_synthetic_data(self, topic, text, n_samples): + # Generate synthetic data using the FLAN model + def generate_synthetic_data(self, topic, text, sentiment, n_samples): synthetic_data = [] for _ in range(n_samples): - prompt = f"Generate a tweet related to {topic} that expresses sentiment similar to: '{text}'" + prompt = f"Generate a tweet related to {topic} that expresses a {sentiment} sentiment similar to: '{text}' " inputs = self.flan_tokenizer(prompt, return_tensors="pt").to(self.device) outputs = self.flan_model.generate(inputs.input_ids, max_length=60, num_return_sequences=1) generated_text = self.flan_tokenizer.decode(outputs[0], skip_special_tokens=True) synthetic_data.append(generated_text) return synthetic_data - def augment_training_data(self, topics, n_samples=100): + # Augment the training data with synthetic data + def augment_training_data(self, topics, texts, sentiments, n_samples=6): augmented_data = {'text': [], 'label': []} augmented_data_with_topics = {'text': [], 'label': [], 'topic': []} - for topic in topics: - synthetic_texts = self.generate_synthetic_data(topic, n_samples) + + for topic, text, sentiment in zip(topics, texts, sentiments): + synthetic_texts = self.generate_synthetic_data(topic, text, sentiment, n_samples) + sentiment_label = self.map_label_to_target(sentiment) augmented_data['text'].extend(synthetic_texts) - augmented_data['label'].extend([1] * len(synthetic_texts)) # Defaulting to neutral + augmented_data['label'].extend([sentiment_label] * len(synthetic_texts)) augmented_data_with_topics['text'].extend(synthetic_texts) - augmented_data_with_topics['label'].extend([1] * len(synthetic_texts)) + augmented_data_with_topics['label'].extend([sentiment_label] * len(synthetic_texts)) augmented_data_with_topics['topic'].extend([topic] * len(synthetic_texts)) augmented_df = pd.DataFrame(augmented_data) augmented_df_with_topics = pd.DataFrame(augmented_data_with_topics) return augmented_df, augmented_df_with_topics - def fine_tune_with_augmented_data(self, topics, n_samples=6, epochs=3, batch_size=16, learning_rate=2e-5): - augmented_train_data, augmented_train_data_with_topics = self.augment_training_data(topics, n_samples) + # Fine-tune the model with augmented data + def fine_tune_with_augmented_data(self, topics, texts, sentiments, n_samples=6, epochs=3, batch_size=16, + learning_rate=2e-5): + augmented_train_data, augmented_train_data_with_topics = self.augment_training_data(topics, texts, sentiments, + n_samples) return self.fine_tune(augmented_train_data, epochs, batch_size, learning_rate), augmented_train_data_with_topics # Fine-tune the model on a custom dataset diff --git a/main.py b/main.py index c2ae170..7d2dbad 100644 --- a/main.py +++ b/main.py @@ -263,4 +263,4 @@ def generate_and_augment_data(sentiment_analyzer, topics, train_data_with_metada # Fine-tune the sentiment analyzer with the augmented dataset augmented_fine_tuning_results = sentiment_analyzer.fine_tune(augmented_train_data) - print(f"Fine-tuning results with augmented data: {augmented_fine_tuning_results}") \ No newline at end of file + print(f"Fine-tuning results with augmented data: 
{augmented_fine_tuning_results}") From 077b9a47afbdecccc487c229773600cc1db5accf Mon Sep 17 00:00:00 2001 From: Leonardo Moraglia Date: Fri, 26 Jul 2024 19:12:39 +0200 Subject: [PATCH 09/42] use val instead of test --- main.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/main.py b/main.py index 67e3632..7b77d23 100644 --- a/main.py +++ b/main.py @@ -155,12 +155,6 @@ def compute_metrics(subgroups, true_labels_column='category', pred_labels_column print("Train Metrics per Topic") print(train_metrics) - - # Identify the least performing topics - overall_accuracy = train_metrics['accuracy'].mean() - least_performing_topics = train_metrics[train_metrics['accuracy'] < overall_accuracy]['topic'].tolist() - print(f"Least performing training topics: {least_performing_topics}") - print("\nTest Metrics per Topic") print(test_metrics) print("\nValidation Metrics per Topic") @@ -234,8 +228,10 @@ def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='a def generate_and_augment_data(sentiment_analyzer, topics, train_data_with_metadata, n_samples): - # for each topic, select randomly n samples of text from the training data augmented with metadata (containing the topic). - # Then using the sentences as baseline, generate more that will later be used to train the model + # Ensure topics is a list of individual topics + if isinstance(topics[0], list): + topics = [item for sublist in topics for item in sublist] + synthetic_texts = [] for topic in topics: topic_data = train_data_with_metadata[train_data_with_metadata['topic'] == topic] From e5eb93fd41e15b3776eff9f6f990417e8de8f999 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Fri, 26 Jul 2024 20:54:55 +0200 Subject: [PATCH 10/42] is this the end? 
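For reference while reading the topic-selection code in the surrounding patches, the sketch below shows the support-weighted ranking idea on invented numbers. The column names (topic, accuracy, support) mirror the diffs, but the data and the standalone layout are illustrative assumptions, not the project's output.

    import pandas as pd

    # Invented per-topic metrics; in the pipeline, accuracy comes from
    # compute_metrics() and the support counts from analyze_disparities().
    metrics = pd.DataFrame({
        "topic": ["sports", "health", "finance", "education"],
        "accuracy": [0.82, 0.55, 0.48, 0.91],
        "support": [200, 50, 120, 30],
    })

    # Weight each topic's accuracy by how many examples it covers, then rank.
    metrics["weighted_metric"] = metrics["accuracy"] * metrics["support"]
    baseline_accuracy = metrics["accuracy"].mean()  # unweighted baseline, as in the diff
    ranked = metrics.sort_values("weighted_metric", ascending=False)

    top_3 = ranked.head(3)["topic"].tolist()
    bottom_3_below_baseline = (
        ranked[ranked["accuracy"] < baseline_accuracy].tail(3)["topic"].tolist()
    )
    print(top_3, bottom_3_below_baseline)

Weighting by support keeps very small topic subsets from dominating the ranking when the worst-performing topics are picked for augmentation.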
--- SentimentAnalyzer.py | 51 +++++++++++--------- main.py | 110 +++++++++++++++++++++++++++---------------- 2 files changed, 98 insertions(+), 63 deletions(-) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index 63003fc..fbe6a82 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -3,7 +3,7 @@ from sklearn.model_selection import train_test_split from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, \ DataCollatorWithPadding, pipeline, AutoModelForSeq2SeqLM - +from sklearn.preprocessing import OneHotEncoder from datasets import Dataset @@ -34,11 +34,22 @@ def map_label_to_target(self, label): else: return None + def map_target_to_label(self, target): + # Map the target value to the sentiment label + if target == 0: + return "negative" + elif target == 1: + return "neutral" + elif target == 2: + return "positive" + else: + return None + # Generate synthetic data using the FLAN model def generate_synthetic_data(self, topic, text, sentiment, n_samples): synthetic_data = [] for _ in range(n_samples): - prompt = f"Generate a tweet related to {topic} that expresses a {sentiment} sentiment similar to: '{text}' " + prompt = f"Generate a tweet related to {topic} that expresses a {sentiment} sentiment and the tweet has to be semantically similar to: '{text}' " inputs = self.flan_tokenizer(prompt, return_tensors="pt").to(self.device) outputs = self.flan_model.generate(inputs.input_ids, max_length=60, num_return_sequences=1) generated_text = self.flan_tokenizer.decode(outputs[0], skip_special_tokens=True) @@ -46,29 +57,23 @@ def generate_synthetic_data(self, topic, text, sentiment, n_samples): return synthetic_data # Augment the training data with synthetic data - def augment_training_data(self, topics, texts, sentiments, n_samples=6): - augmented_data = {'text': [], 'label': []} - augmented_data_with_topics = {'text': [], 'label': [], 'topic': []} + def generate_training_data(self, topics, texts, sentiments, n_samples=6): + generated_data = {'text': [], 'label': []} + generated_data_with_topic = {'text': [], 'label': [], 'topic': []} for topic, text, sentiment in zip(topics, texts, sentiments): - synthetic_texts = self.generate_synthetic_data(topic, text, sentiment, n_samples) - sentiment_label = self.map_label_to_target(sentiment) - augmented_data['text'].extend(synthetic_texts) - augmented_data['label'].extend([sentiment_label] * len(synthetic_texts)) - augmented_data_with_topics['text'].extend(synthetic_texts) - augmented_data_with_topics['label'].extend([sentiment_label] * len(synthetic_texts)) - augmented_data_with_topics['topic'].extend([topic] * len(synthetic_texts)) - - augmented_df = pd.DataFrame(augmented_data) - augmented_df_with_topics = pd.DataFrame(augmented_data_with_topics) - return augmented_df, augmented_df_with_topics - - # Fine-tune the model with augmented data - def fine_tune_with_augmented_data(self, topics, texts, sentiments, n_samples=6, epochs=3, batch_size=16, - learning_rate=2e-5): - augmented_train_data, augmented_train_data_with_topics = self.augment_training_data(topics, texts, sentiments, - n_samples) - return self.fine_tune(augmented_train_data, epochs, batch_size, learning_rate), augmented_train_data_with_topics + sentiment_text = self.map_target_to_label(sentiment) + synthetic_texts = self.generate_synthetic_data(topic, text, sentiment_text, n_samples) # List of synthetic texts + generated_data['text'].extend(synthetic_texts) + generated_data['category'].extend([sentiment] * len(synthetic_texts)) 
# append sentiment to texts many times + generated_data_with_topic['text'].extend(synthetic_texts) + generated_data_with_topic['category'].extend([sentiment] * len(synthetic_texts)) + generated_data_with_topic['topic'].extend([topic] * len(synthetic_texts)) + + generated_df = pd.DataFrame(generated_data) + generated_df_with_topics = pd.DataFrame(generated_data_with_topic) + return generated_df, generated_df_with_topics + # Fine-tune the model on a custom dataset def fine_tune(self, df, epochs=3, batch_size=16, learning_rate=2e-5): diff --git a/main.py b/main.py index 7b77d23..469768a 100644 --- a/main.py +++ b/main.py @@ -62,7 +62,7 @@ # Save the fine-tuned model torch.save(sentiment_analyzer.model, model_save_path) - # Extract metadata for the datasets + # Define the file names for the sentiment predictions train_sentiment_file_name = os.path.join(base_path, f'train_sentiment_{args.dataset_type}_{args.percentage}.csv') test_sentiment_file_name = os.path.join(base_path, f'test_sentiment_{args.dataset_type}_{args.percentage}.csv') val_sentiment_file_name = os.path.join(base_path, f'val_sentiment_{args.dataset_type}_{args.percentage}.csv') @@ -210,7 +210,7 @@ def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='a baseline_accuracy = weighted_metrics_df['accuracy'].mean() # Sort topics by their weighted metrics - sorted_metrics = weighted_metrics_df.sort_values(by='weighted_metric', ascending=False) + sorted_metrics = weighted_metrics_df.sort_values(by='weighted_metric', ascending=False) # Sort by descending # Get top 3 and bottom 3 topics top_3_topics = sorted_metrics.head(3)['topic'].tolist() @@ -223,41 +223,71 @@ def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='a return top_3_topics, bottom_3_topics_below_baseline - topics = get_top_lower_topics(val_metrics, val_analysis, metric='accuracy') - print(f"Top 3 (lower score) validation topics: {topics[0]}") - - - def generate_and_augment_data(sentiment_analyzer, topics, train_data_with_metadata, n_samples): - # Ensure topics is a list of individual topics - if isinstance(topics[0], list): - topics = [item for sublist in topics for item in sublist] - - synthetic_texts = [] - for topic in topics: - topic_data = train_data_with_metadata[train_data_with_metadata['topic'] == topic] - topic_samples = topic_data.sample(n_samples, replace=True) - for index, row in topic_samples.iterrows(): - synthetic_texts.extend(sentiment_analyzer.generate_synthetic_data(row['topic'], row['text'], n_samples)) - return synthetic_texts - - # Generate and augment data for least performing topics - synthetic_texts = generate_and_augment_data(sentiment_analyzer, topics, train_data_with_metadata, - n_samples=10) - if args.debug: - print(f"Generated {len(synthetic_texts)} synthetic texts for least performing topics") - print("Sample synthetic texts:") - print(synthetic_texts[:5]) - - # Create a new DataFrame for the synthetic data - synthetic_df = pd.DataFrame({ - 'text': synthetic_texts, - 'category': [1] * len(synthetic_texts), # Assuming neutral category for synthetic data - 'topic': least_performing_topics * (len(synthetic_texts) // len(least_performing_topics)) - }) - - # Augment original training data with synthetic data - augmented_train_data = pd.concat([original_train_data, synthetic_df], ignore_index=True) - - # Fine-tune the sentiment analyzer with the augmented dataset - augmented_fine_tuning_results = sentiment_analyzer.fine_tune(augmented_train_data) - print(f"Fine-tuning results with augmented data: 
{augmented_fine_tuning_results}") + top_3_topics, bottom_3_topics = get_top_lower_topics(val_metrics, val_analysis, metric='accuracy') + print(f"Bottom 3 validation topics: {bottom_3_topics }") + + # Randomly select rows from bottom three topics in the training set + train_data_bottom_3 = train_data_with_metadata[train_data_with_metadata['topic'].isin(bottom_3_topics)] + selected_samples = train_data_bottom_3.sample(n=50, random_state=42) # Select n samples from the bottom 3 topics + + # Augment the selected samples using the sentiment analyzer + generated_df, generated_df_with_metadata = sentiment_analyzer.generate_training_data( + selected_samples['topic'].tolist(), + selected_samples['text'].tolist(), + selected_samples['sentiment'].tolist() + ) + + # Combine the original and augmented datasets + train_original_and_generated_data = pd.concat([original_train_data, generated_df], ignore_index=True) + # Save the combined datasets + train_original_and_generated_data.to_csv(os.path.join(base_path, 'train_original_and_generated_data.csv'), index=False) + + model_save_path = os.path.join(base_path, f'finetuned_sentiment_model_{args.dataset_type}_{args.percentage}.pt') + print("Fine-tuning the sentiment analyzer with the generated+original dataset...") + fine_tuning_results_new = sentiment_analyzer.fine_tune(new_training_data) # TODO NON CE + print(f"Fine-tuning results: {fine_tuning_results_new}") + # Save the fine-tuned model + torch.save(sentiment_analyzer.model, model_save_path) + + # Predict sentiment for the original dataset to see for improvements + test_sentiment_file_name_v2 = os.path.join(base_path, f'test_sentiment_v2_{args.dataset_type}_{args.percentage}.csv') + val_sentiment_file_name_v2 = os.path.join(base_path, f'val_sentiment_v2_{args.dataset_type}_{args.percentage}.csv') + test_data_with_sentiment_v2 = predict_sentiment(original_test_data.copy(), sentiment_analyzer, test_sentiment_file_name_v2, args.debug) + val_data_with_sentiment_v2 = predict_sentiment(original_val_data.copy(), sentiment_analyzer, val_sentiment_file_name_v2, args.debug) + + # Compute metrics for the test dataset + test_true_labels = original_test_data['category'] + test_predicted_labels_v2 = test_data_with_sentiment_v2['sentiment'] + print("\nTest Classification Report:") + print(classification_report(test_true_labels, test_predicted_labels_v2, labels=[0, 1, 2], zero_division=0)) + + # Compute metrics for the validation dataset + val_true_labels = original_val_data['category'] + val_predicted_labels_v2 = val_data_with_sentiment_v2['sentiment'] + print("\nValidation Classification Report:") + print(classification_report(val_true_labels, val_predicted_labels_v2, labels=[0, 1, 2], zero_division=0)) + + test_file_name_v2 = os.path.join(base_path, f'test_augmented_v2_{args.dataset_type}_{args.percentage}.csv') + val_file_name_v2 = os.path.join(base_path, f'val_augmented_v2_{args.dataset_type}_{args.percentage}.csv') + test_data_with_metadata_v2 = augment_and_extract_metadata(test_data_with_sentiment_v2.copy(), extractor, topic_labels, test_file_name_v2, args.debug) + val_data_with_metadata_v2 = augment_and_extract_metadata(val_data_with_sentiment_v2.copy(), extractor, topic_labels, val_file_name_v2, args.debug) + + # Create subgroups for the datasets + test_subgroups_v2 = create_subgroups(test_data_with_metadata_v2) + val_subgroups_v2 = create_subgroups(val_data_with_metadata_v2) + + + test_metrics_v2 = compute_metrics(test_subgroups_v2) + val_metrics_v2 = compute_metrics(val_subgroups_v2) + + print("\nTest 
Metrics per Topic") + print(test_metrics_v2) + print("\nValidation Metrics per Topic") + print(val_metrics_v2) + + test_analysis_v2 = analyze_disparities(test_subgroups_v2) + val_analysis_v2 = analyze_disparities(val_subgroups_v2) + print("\nTest Percentage Analysis") + print(test_analysis_v2) + print("\nValidation Percentage Analysis") + print(val_analysis_v2) \ No newline at end of file From 63f353bcbbfc5f235ad330b6a3e395e4a9ac3a81 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Fri, 26 Jul 2024 21:01:41 +0200 Subject: [PATCH 11/42] is this the end? --- SentimentAnalyzer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index fbe6a82..40d7b5a 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -58,8 +58,8 @@ def generate_synthetic_data(self, topic, text, sentiment, n_samples): # Augment the training data with synthetic data def generate_training_data(self, topics, texts, sentiments, n_samples=6): - generated_data = {'text': [], 'label': []} - generated_data_with_topic = {'text': [], 'label': [], 'topic': []} + generated_data = {'text': [], 'category': []} + generated_data_with_topic = {'text': [], 'category': [], 'topic': []} for topic, text, sentiment in zip(topics, texts, sentiments): sentiment_text = self.map_target_to_label(sentiment) From ea46935bd0eee25c92f659b32fb16e856d9d875c Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Fri, 26 Jul 2024 21:09:44 +0200 Subject: [PATCH 12/42] is this the end? --- SentimentAnalyzer.py | 1 + main.py | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index 40d7b5a..9f4e835 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -54,6 +54,7 @@ def generate_synthetic_data(self, topic, text, sentiment, n_samples): outputs = self.flan_model.generate(inputs.input_ids, max_length=60, num_return_sequences=1) generated_text = self.flan_tokenizer.decode(outputs[0], skip_special_tokens=True) synthetic_data.append(generated_text) + print(f"Generated Text: {generated_text}") return synthetic_data # Augment the training data with synthetic data diff --git a/main.py b/main.py index 469768a..58f6f8e 100644 --- a/main.py +++ b/main.py @@ -226,6 +226,7 @@ def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='a top_3_topics, bottom_3_topics = get_top_lower_topics(val_metrics, val_analysis, metric='accuracy') print(f"Bottom 3 validation topics: {bottom_3_topics }") + print("Augmenting the training dataset with synthetic data...") # Randomly select rows from bottom three topics in the training set train_data_bottom_3 = train_data_with_metadata[train_data_with_metadata['topic'].isin(bottom_3_topics)] selected_samples = train_data_bottom_3.sample(n=50, random_state=42) # Select n samples from the bottom 3 topics @@ -242,12 +243,12 @@ def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='a # Save the combined datasets train_original_and_generated_data.to_csv(os.path.join(base_path, 'train_original_and_generated_data.csv'), index=False) - model_save_path = os.path.join(base_path, f'finetuned_sentiment_model_{args.dataset_type}_{args.percentage}.pt') + model_save_path_v2 = os.path.join(base_path, f'finetuned_sentiment_model_{args.dataset_type}_{args.percentage}.pt') print("Fine-tuning the sentiment analyzer with the generated+original dataset...") - fine_tuning_results_new = 
sentiment_analyzer.fine_tune(new_training_data) # TODO NON CE + fine_tuning_results_new = sentiment_analyzer.fine_tune(train_original_and_generated_data) # TODO NON CE print(f"Fine-tuning results: {fine_tuning_results_new}") # Save the fine-tuned model - torch.save(sentiment_analyzer.model, model_save_path) + torch.save(sentiment_analyzer.model, model_save_path_v2) # Predict sentiment for the original dataset to see for improvements test_sentiment_file_name_v2 = os.path.join(base_path, f'test_sentiment_v2_{args.dataset_type}_{args.percentage}.csv') From 7b27fb1c3e5e52ff026507ce5fe913e5b6dc6000 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Fri, 26 Jul 2024 21:22:18 +0200 Subject: [PATCH 13/42] is this the end? --- SentimentAnalyzer.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index 9f4e835..a6807b2 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -48,10 +48,19 @@ def map_target_to_label(self, target): # Generate synthetic data using the FLAN model def generate_synthetic_data(self, topic, text, sentiment, n_samples): synthetic_data = [] + print(f"Generating synthetic data for topic: {topic}, text: {text}, sentiment: {sentiment}") for _ in range(n_samples): - prompt = f"Generate a tweet related to {topic} that expresses a {sentiment} sentiment and the tweet has to be semantically similar to: '{text}' " + prompt = f"Generate a tweet related to {topic} that expresses a {sentiment} sentiment and the tweet has to be semantically similar to: '{text}' " inputs = self.flan_tokenizer(prompt, return_tensors="pt").to(self.device) - outputs = self.flan_model.generate(inputs.input_ids, max_length=60, num_return_sequences=1) + # Use top-k sampling and temperature sampling for more diverse outputs + outputs = self.flan_model.generate( + inputs.input_ids, + max_length=60, + num_return_sequences=1, + do_sample=True, + top_k=50, # Consider top 50 tokens + temperature=0.7 # Adjust temperature to control diversity + ) generated_text = self.flan_tokenizer.decode(outputs[0], skip_special_tokens=True) synthetic_data.append(generated_text) print(f"Generated Text: {generated_text}") From 3a062fc5416839760cc373611739260416d06867 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Fri, 26 Jul 2024 21:33:20 +0200 Subject: [PATCH 14/42] is this the end? --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index 58f6f8e..e9a2ea6 100644 --- a/main.py +++ b/main.py @@ -99,7 +99,7 @@ extractor = MetadataExtractor() # Define topic labels - topic_labels = ["politics", "entertainment", "sports", "technology", "health", "education", "finance", "food", "other"] + topic_labels = ["news", "entertainment", "sports", "technology", "health", "education", "business", "lifestyle", "opinions", "other"] # Define the base path where main.py is located base_path = os.path.dirname(os.path.abspath(__file__)) From a74fb8652ecc19920ed9a4153332c6e6f14fb24c Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 08:36:11 +0200 Subject: [PATCH 15/42] is this the end? 
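The generation parameters being adjusted in these patches (do_sample, top_k, temperature, max_length) can be exercised on their own; below is a minimal standalone sketch with google/flan-t5-small, where the prompt string and parameter values are placeholders rather than the project's final settings.

    import torch
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

    model_name = "google/flan-t5-small"
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

    prompt = "Generate a tweet about sports that expresses a positive sentiment."
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Sampling instead of greedy decoding: top_k keeps only the 50 most likely
    # tokens at each step, and temperature rescales their probabilities
    # (values below 1 make outputs more conservative, above 1 more diverse).
    outputs = model.generate(
        inputs.input_ids,
        max_length=60,
        num_return_sequences=1,
        do_sample=True,
        top_k=50,
        temperature=0.7,
    )
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))

The later patches move temperature between 0.5 and 0.9 and max_length between 60 and 100, trading closeness to the seed tweet against variety in the synthetic data.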
--- SentimentAnalyzer.py | 6 +-- main.py | 94 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 96 insertions(+), 4 deletions(-) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index a6807b2..7aa7db5 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -48,7 +48,7 @@ def map_target_to_label(self, target): # Generate synthetic data using the FLAN model def generate_synthetic_data(self, topic, text, sentiment, n_samples): synthetic_data = [] - print(f"Generating synthetic data for topic: {topic}, text: {text}, sentiment: {sentiment}") + #print(f"Generating synthetic data for topic: {topic}, text: {text}, sentiment: {sentiment}") for _ in range(n_samples): prompt = f"Generate a tweet related to {topic} that expresses a {sentiment} sentiment and the tweet has to be semantically similar to: '{text}' " inputs = self.flan_tokenizer(prompt, return_tensors="pt").to(self.device) @@ -59,11 +59,11 @@ def generate_synthetic_data(self, topic, text, sentiment, n_samples): num_return_sequences=1, do_sample=True, top_k=50, # Consider top 50 tokens - temperature=0.7 # Adjust temperature to control diversity + temperature=0.5 # Adjust temperature to control diversity ) generated_text = self.flan_tokenizer.decode(outputs[0], skip_special_tokens=True) synthetic_data.append(generated_text) - print(f"Generated Text: {generated_text}") + #print(f"Generated Text: {generated_text}") return synthetic_data # Augment the training data with synthetic data diff --git a/main.py b/main.py index e9a2ea6..06841b1 100644 --- a/main.py +++ b/main.py @@ -291,4 +291,96 @@ def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='a print("\nTest Percentage Analysis") print(test_analysis_v2) print("\nValidation Percentage Analysis") - print(val_analysis_v2) \ No newline at end of file + print(val_analysis_v2) + + + def plot_metrics_comparison(old_metrics, new_metrics, metric='accuracy'): + """ + Plots a comparison of the given metric before and after fine-tuning. 
+ + Parameters: + - old_metrics: DataFrame containing the old metrics + - new_metrics: DataFrame containing the new metrics + - metric: The metric to compare (default is 'accuracy') + """ + # Merge the old and new metrics on the 'topic' column + comparison_df = old_metrics.merge(new_metrics, on='topic', suffixes=('_old', '_new')) + + # Sort the DataFrame by the new metric for better visualization + comparison_df = comparison_df.sort_values(by=f'{metric}_new', ascending=False) + + # Plot the comparison + plt.figure(figsize=(12, 8)) + bar_width = 0.4 + + # Positioning the bars + r1 = range(len(comparison_df)) + r2 = [x + bar_width for x in r1] + + plt.bar(r1, comparison_df[f'{metric}_old'], color='blue', width=bar_width, edgecolor='grey', label='Old') + plt.bar(r2, comparison_df[f'{metric}_new'], color='green', width=bar_width, edgecolor='grey', label='New') + + plt.xlabel('Topics', fontweight='bold') + plt.ylabel(metric.capitalize(), fontweight='bold') + plt.title(f'Comparison of {metric.capitalize()} by Topic', fontweight='bold') + plt.xticks([r + bar_width / 2 for r in range(len(comparison_df))], comparison_df['topic'], rotation=90) + plt.legend() + + plt.tight_layout() + plt.show() + + + # Plot the comparison for accuracy, precision, recall, and f1-score + plot_metrics_comparison(test_metrics, test_metrics_v2, metric='accuracy') + plot_metrics_comparison(test_metrics, test_metrics_v2, metric='precision') + plot_metrics_comparison(test_metrics, test_metrics_v2, metric='recall') + plot_metrics_comparison(test_metrics, test_metrics_v2, metric='f1-score') + + + def calculate_overall_accuracy(metrics_df): + """ + Calculate the overall accuracy from the metrics DataFrame. + + Parameters: + - metrics_df: DataFrame containing the metrics + + Returns: + - overall_accuracy: The overall accuracy + """ + total_support = metrics_df['total'].sum() + weighted_accuracy_sum = (metrics_df['accuracy'] * metrics_df['total']).sum() + overall_accuracy = weighted_accuracy_sum / total_support + return overall_accuracy + + + def plot_overall_accuracy_comparison(old_metrics, new_metrics): + """ + Plot the overall accuracy comparison before and after fine-tuning. + + Parameters: + - old_metrics: DataFrame containing the old metrics + - new_metrics: DataFrame containing the new metrics + """ + overall_accuracy_old = calculate_overall_accuracy(old_metrics) + overall_accuracy_new = calculate_overall_accuracy(new_metrics) + + accuracies = [overall_accuracy_old, overall_accuracy_new] + labels = ['Old Model', 'New Model'] + + plt.figure(figsize=(8, 6)) + plt.bar(labels, accuracies, color=['blue', 'green'], edgecolor='grey') + + plt.xlabel('Model', fontweight='bold') + plt.ylabel('Overall Accuracy', fontweight='bold') + plt.title('Overall Accuracy Comparison', fontweight='bold') + plt.ylim(0, 1) # Assuming accuracy is between 0 and 1 + + for i, v in enumerate(accuracies): + plt.text(i, v + 0.01, f"{v:.2f}", ha='center', fontweight='bold') + + plt.tight_layout() + plt.show() + + + # Calculate and plot the overall accuracy comparison + plot_overall_accuracy_comparison(test_metrics, test_metrics_v2) \ No newline at end of file From 3290b60eb149a14f444983c05935ed2362a5a5a5 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 09:14:03 +0200 Subject: [PATCH 16/42] is this the end? 
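calculate_overall_accuracy in the previous patch is a support-weighted mean of the per-topic accuracies; a tiny worked example with invented numbers (the total column name follows the diff):

    import pandas as pd

    # Two invented topics: a large accurate one and a small inaccurate one.
    metrics = pd.DataFrame({
        "topic": ["sports", "health"],
        "accuracy": [0.90, 0.50],
        "total": [90, 10],
    })

    # Weighted: (0.90 * 90 + 0.50 * 10) / (90 + 10) = 0.86,
    # whereas a plain mean would report 0.70 and overweight the small topic.
    overall = (metrics["accuracy"] * metrics["total"]).sum() / metrics["total"].sum()
    print(overall)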
--- SentimentAnalyzer.py | 10 ++++++---- main.py | 3 ++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index 7aa7db5..60491cf 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -46,7 +46,7 @@ def map_target_to_label(self, target): return None # Generate synthetic data using the FLAN model - def generate_synthetic_data(self, topic, text, sentiment, n_samples): + def generate_synthetic_data(self, topic, text, sentiment, n_samples, debug=False): synthetic_data = [] #print(f"Generating synthetic data for topic: {topic}, text: {text}, sentiment: {sentiment}") for _ in range(n_samples): @@ -58,12 +58,14 @@ def generate_synthetic_data(self, topic, text, sentiment, n_samples): max_length=60, num_return_sequences=1, do_sample=True, - top_k=50, # Consider top 50 tokens - temperature=0.5 # Adjust temperature to control diversity + top_k=50, + temperature=0.5 ) generated_text = self.flan_tokenizer.decode(outputs[0], skip_special_tokens=True) synthetic_data.append(generated_text) - #print(f"Generated Text: {generated_text}") + if debug: + print(f"Generated Text: {generated_text}") + print(f"Percentage of Completion: {count / n_samples * 100:.2f}%, {count} of {n_samples}") return synthetic_data # Augment the training data with synthetic data diff --git a/main.py b/main.py index 06841b1..a7b2ca2 100644 --- a/main.py +++ b/main.py @@ -235,7 +235,8 @@ def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='a generated_df, generated_df_with_metadata = sentiment_analyzer.generate_training_data( selected_samples['topic'].tolist(), selected_samples['text'].tolist(), - selected_samples['sentiment'].tolist() + selected_samples['sentiment'].tolist(), + debug=args.debug ) # Combine the original and augmented datasets From c8e22cc6c1381c185460e43092c129277590693c Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 09:16:50 +0200 Subject: [PATCH 17/42] is this the end? 
--- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index a7b2ca2..0b0814b 100644 --- a/main.py +++ b/main.py @@ -1,6 +1,6 @@ import argparse import os - +import matplotlib.pyplot as plt import pandas as pd import torch from sklearn.metrics import classification_report From 74b0b848c1415874d154551f6af36937a13b5e5c Mon Sep 17 00:00:00 2001 From: Leonardo Moraglia Date: Sat, 27 Jul 2024 09:19:28 +0200 Subject: [PATCH 18/42] count was initialized --- SentimentAnalyzer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index 60491cf..66f5677 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -3,7 +3,7 @@ from sklearn.model_selection import train_test_split from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, \ DataCollatorWithPadding, pipeline, AutoModelForSeq2SeqLM -from sklearn.preprocessing import OneHotEncoder + from datasets import Dataset @@ -49,6 +49,7 @@ def map_target_to_label(self, target): def generate_synthetic_data(self, topic, text, sentiment, n_samples, debug=False): synthetic_data = [] #print(f"Generating synthetic data for topic: {topic}, text: {text}, sentiment: {sentiment}") + count = 0 for _ in range(n_samples): prompt = f"Generate a tweet related to {topic} that expresses a {sentiment} sentiment and the tweet has to be semantically similar to: '{text}' " inputs = self.flan_tokenizer(prompt, return_tensors="pt").to(self.device) From e12f205bfe1ef0b9918f4e36556f7248b19c60b7 Mon Sep 17 00:00:00 2001 From: Leonardo Moraglia Date: Sat, 27 Jul 2024 09:20:01 +0200 Subject: [PATCH 19/42] added matplotlib to reqs --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 9a3e5b1..11143d8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,4 @@ scikit_learn==1.2.0 transformers==4.42.4 tensorflow gdown - +matplotlib From c0997dd5dae126e05554a6352d472e9c0c9550b2 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 09:23:06 +0200 Subject: [PATCH 20/42] is this the end? --- SentimentAnalyzer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index 66f5677..2b1ffd4 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -64,6 +64,7 @@ def generate_synthetic_data(self, topic, text, sentiment, n_samples, debug=False ) generated_text = self.flan_tokenizer.decode(outputs[0], skip_special_tokens=True) synthetic_data.append(generated_text) + count += 1 if debug: print(f"Generated Text: {generated_text}") print(f"Percentage of Completion: {count / n_samples * 100:.2f}%, {count} of {n_samples}") From 1b911b5e90c1c0e200889d703b4318269d63309b Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 09:24:03 +0200 Subject: [PATCH 21/42] is this the end? 
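The count bookkeeping introduced over the last few patches (count = 0 up front, count += 1 in the loop) can also be written with a 1-based range, which avoids initializing and incrementing the counter by hand. The sketch below shows only the loop skeleton; generate_one stands in for the FLAN tokenize/generate/decode calls and is not a function that exists in this repository.

    def generate_with_progress(n_samples, generate_one, debug=False):
        synthetic_data = []
        for count in range(1, n_samples + 1):
            generated_text = generate_one()  # placeholder for the FLAN generate/decode step
            synthetic_data.append(generated_text)
            if debug:
                print(f"Generated Text: {generated_text}")
            if count % 5 == 0:
                print(f"Percentage of Completion: {count / n_samples * 100:.2f}%, {count} of {n_samples}")
        return synthetic_data

Called as generate_with_progress(12, lambda: "stub tweet", debug=True), it reports progress after samples 5 and 10.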
--- SentimentAnalyzer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index 2b1ffd4..1d3e2e1 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -56,7 +56,7 @@ def generate_synthetic_data(self, topic, text, sentiment, n_samples, debug=False # Use top-k sampling and temperature sampling for more diverse outputs outputs = self.flan_model.generate( inputs.input_ids, - max_length=60, + max_length=100, num_return_sequences=1, do_sample=True, top_k=50, From b4df0cab9232cc606af6bf7bd09c4a8c57e3c372 Mon Sep 17 00:00:00 2001 From: Leonardo Moraglia Date: Sat, 27 Jul 2024 09:25:02 +0200 Subject: [PATCH 22/42] temperature set to 0.9 --- SentimentAnalyzer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index 2b1ffd4..2c8ab0a 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -60,7 +60,7 @@ def generate_synthetic_data(self, topic, text, sentiment, n_samples, debug=False num_return_sequences=1, do_sample=True, top_k=50, - temperature=0.5 + temperature=0.9 ) generated_text = self.flan_tokenizer.decode(outputs[0], skip_special_tokens=True) synthetic_data.append(generated_text) From bda7149cdb89aad631149341792563471972c4ac Mon Sep 17 00:00:00 2001 From: Leonardo Moraglia Date: Sat, 27 Jul 2024 09:26:17 +0200 Subject: [PATCH 23/42] max lenght 60 --- SentimentAnalyzer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index 16bc427..2c8ab0a 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -56,7 +56,7 @@ def generate_synthetic_data(self, topic, text, sentiment, n_samples, debug=False # Use top-k sampling and temperature sampling for more diverse outputs outputs = self.flan_model.generate( inputs.input_ids, - max_length=100, + max_length=60, num_return_sequences=1, do_sample=True, top_k=50, From 9eaddb61830e10131601089ac29f08c9ae8ee91f Mon Sep 17 00:00:00 2001 From: Leonardo Moraglia Date: Sat, 27 Jul 2024 09:45:15 +0200 Subject: [PATCH 24/42] implemenmted deep debug --- SentimentAnalyzer.py | 34 +++++++++++++++++++++------------- extract_stuff.py | 10 +++++----- main.py | 28 ++++++++++++++++++---------- 3 files changed, 44 insertions(+), 28 deletions(-) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index 2c8ab0a..a19fc77 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -12,13 +12,15 @@ def __init__(self): self.model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest" self.device = 0 if torch.cuda.is_available() else -1 # Use GPU if available self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) - self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, ignore_mismatched_sizes=True).to(self.device) + self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, + ignore_mismatched_sizes=True).to(self.device) self.classifier = pipeline("sentiment-analysis", model=self.model, tokenizer=self.tokenizer, device=self.device) # Initialize FLAN model for synthetic data generation self.flan_model_name = "google/flan-t5-small" self.flan_tokenizer = AutoTokenizer.from_pretrained(self.flan_model_name) self.flan_model = AutoModelForSeq2SeqLM.from_pretrained(self.flan_model_name).to(self.device) + def analyze_sentiment(self, text): results = self.classifier(text) return results[0]['label'] @@ -48,7 +50,7 @@ def map_target_to_label(self, target): # Generate synthetic data using the FLAN model def 
generate_synthetic_data(self, topic, text, sentiment, n_samples, debug=False): synthetic_data = [] - #print(f"Generating synthetic data for topic: {topic}, text: {text}, sentiment: {sentiment}") + # print(f"Generating synthetic data for topic: {topic}, text: {text}, sentiment: {sentiment}") count = 0 for _ in range(n_samples): prompt = f"Generate a tweet related to {topic} that expresses a {sentiment} sentiment and the tweet has to be semantically similar to: '{text}' " @@ -66,40 +68,46 @@ def generate_synthetic_data(self, topic, text, sentiment, n_samples, debug=False synthetic_data.append(generated_text) count += 1 if debug: - print(f"Generated Text: {generated_text}") + print(f"DEBUG - Generated Text: {generated_text}") + print(f"DEBUG - Percentage of Completion: {count / n_samples * 100:.2f}%, {count} of {n_samples}") + if int(count % 5) == 0: print(f"Percentage of Completion: {count / n_samples * 100:.2f}%, {count} of {n_samples}") return synthetic_data # Augment the training data with synthetic data - def generate_training_data(self, topics, texts, sentiments, n_samples=6): + def generate_training_data(self, topics, texts, sentiments, n_samples=6, debug=False): + print("Generating synthetic data...") generated_data = {'text': [], 'category': []} generated_data_with_topic = {'text': [], 'category': [], 'topic': []} for topic, text, sentiment in zip(topics, texts, sentiments): sentiment_text = self.map_target_to_label(sentiment) - synthetic_texts = self.generate_synthetic_data(topic, text, sentiment_text, n_samples) # List of synthetic texts + synthetic_texts = self.generate_synthetic_data(topic, text, sentiment_text, + n_samples) # List of synthetic texts generated_data['text'].extend(synthetic_texts) - generated_data['category'].extend([sentiment] * len(synthetic_texts)) # append sentiment to texts many times + generated_data['category'].extend( + [sentiment] * len(synthetic_texts)) # append sentiment to texts many times generated_data_with_topic['text'].extend(synthetic_texts) generated_data_with_topic['category'].extend([sentiment] * len(synthetic_texts)) generated_data_with_topic['topic'].extend([topic] * len(synthetic_texts)) + if debug: + print(f"DEBUG - Generated synthetic data for topic: {topic}, text: {text}, sentiment: {sentiment_text}") generated_df = pd.DataFrame(generated_data) generated_df_with_topics = pd.DataFrame(generated_data_with_topic) return generated_df, generated_df_with_topics - # Fine-tune the model on a custom dataset def fine_tune(self, df, epochs=3, batch_size=16, learning_rate=2e-5): # Preprocess the dataset - df = df.rename(columns={"text": "text", "category": "label"}) # Rename the columns - df['label'] = df['label'].astype(int) # Ensure the labels are integers - train_df, test_df = train_test_split(df, test_size=0.2, random_state=42) # Split the dataset + df = df.rename(columns={"text": "text", "category": "label"}) # Rename the columns + df['label'] = df['label'].astype(int) # Ensure the labels are integers + train_df, test_df = train_test_split(df, test_size=0.2, random_state=42) # Split the dataset - train_dataset = Dataset.from_pandas(train_df) # Load the dataset + train_dataset = Dataset.from_pandas(train_df) # Load the dataset test_dataset = Dataset.from_pandas(test_df) - def tokenize_function(examples): # Tokenize the text + def tokenize_function(examples): # Tokenize the text return self.tokenizer(examples["text"], padding="max_length", truncation=True) train_dataset = train_dataset.map(tokenize_function, batched=True) # Tokenize the dataset @@ 
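The fine_tune method being re-indented in this patch follows the usual datasets plus Trainer recipe: rename 'category' to 'label', split, tokenize, then train with a padding collator. A compact sketch of those steps, assuming a DataFrame with 'text' and 'category' columns and the sentiment model name used elsewhere in the series; the hyperparameters mirror the values mentioned in the code but are not authoritative:

import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          DataCollatorWithPadding, Trainer, TrainingArguments)

def fine_tune_sketch(df: pd.DataFrame,
                     model_name="cardiffnlp/twitter-roberta-base-sentiment-latest"):
    # Rename 'category' to 'label' and keep labels as integers (0, 1, 2).
    df = df.rename(columns={"category": "label"})
    df["label"] = df["label"].astype(int)
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                               ignore_mismatched_sizes=True)

    def tokenize(batch):
        return tokenizer(batch["text"], padding="max_length", truncation=True)

    # Tokenize, drop the raw text, and hand PyTorch tensors to the Trainer.
    train_ds = Dataset.from_pandas(train_df).map(tokenize, batched=True).remove_columns(["text"])
    test_ds = Dataset.from_pandas(test_df).map(tokenize, batched=True).remove_columns(["text"])
    train_ds.set_format("torch")
    test_ds.set_format("torch")

    args = TrainingArguments(output_dir="./results", eval_strategy="epoch",
                             num_train_epochs=3, per_device_train_batch_size=16,
                             learning_rate=2e-5)
    trainer = Trainer(model=model, args=args, train_dataset=train_ds,
                      eval_dataset=test_ds,
                      data_collator=DataCollatorWithPadding(tokenizer=tokenizer))
    trainer.train()
    return trainer.evaluate()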
-108,7 +116,7 @@ def tokenize_function(examples): # Tokenize the text train_dataset = train_dataset.remove_columns(["text"]) # Remove the text column after tokenization test_dataset = test_dataset.remove_columns(["text"]) - train_dataset.set_format("torch") # Set the format to PyTorch + train_dataset.set_format("torch") # Set the format to PyTorch test_dataset.set_format("torch") # Define the data collator diff --git a/extract_stuff.py b/extract_stuff.py index 2d779e9..392c06e 100644 --- a/extract_stuff.py +++ b/extract_stuff.py @@ -28,9 +28,9 @@ def augment_and_extract_metadata(dataset, extractor, topic_labels, file_path, de # If debug mode is enabled, print debug information percentage_complete = ((count + 1) / total_rows) * 100 if debug: - print(f"Text: {row['text']}") - print(f"Generated Metadata: Topic - {topic}") - print(f"Percentage of Completion: {percentage_complete:.2f}%, {count + 1} of {total_rows}") + print(f"DEBUG - Text: {row['text']}") + print(f"DEBUG - Generated Metadata: Topic - {topic}") + print(f"DEBUG - Percentage of Completion: {percentage_complete:.2f}%, {count + 1} of {total_rows}") if percentage_complete % 5 == 0: print(f"Percentage of Completion: {percentage_complete:.2f}%, {count + 1} of {total_rows}") @@ -70,8 +70,8 @@ def predict_sentiment(dataset, sentiment_analyzer, file_path, debug=False, batch # Calculate the percentage of completion percentage_complete = ((end) / total_rows) * 100 if debug: - print(f"Processed batch {start // batch_size + 1}: {start} to {end}") - print(f"Percentage of Completion: {percentage_complete:.2f}%, {end} of {total_rows}") + print(f"DEBUG - Processed batch {start // batch_size + 1}: {start} to {end}") + print(f"DEBUG - Percentage of Completion: {percentage_complete:.2f}%, {end} of {total_rows}") if percentage_complete % 5 == 0: print(f"Percentage of Completion: {percentage_complete:.2f}%") diff --git a/main.py b/main.py index 0b0814b..e442de7 100644 --- a/main.py +++ b/main.py @@ -1,5 +1,6 @@ import argparse import os + import matplotlib.pyplot as plt import pandas as pd import torch @@ -19,6 +20,8 @@ parser.add_argument('--dataset_type', type=str, default='tweets', choices=['tweets', 'TODO'], help='Type of dataset to load') parser.add_argument('--debug', type=bool, default=False, + help='Enable debug mode to print even more additional information') + parser.add_argument('--deep_debug', type=bool, default=False, help='Enable debug mode to print additional information') parser.add_argument('--percentage', type=float, default=100.0, help='Percentage of the dataset to use (e.g., 0.1 for 0.1%)') @@ -26,6 +29,7 @@ # Parse command-line arguments args = parser.parse_args() print("Debugging is set to: ", args.debug) + print("Deep Debugging is set to: ", args.deep_debug) print("Percentage is set to: ", args.percentage) # Print Torch availability and device information @@ -69,11 +73,11 @@ # Predict sentiment for the datasets train_data_with_sentiment = predict_sentiment(original_train_data.copy(), sentiment_analyzer, - train_sentiment_file_name, args.debug) + train_sentiment_file_name, args.deep_debug) test_data_with_sentiment = predict_sentiment(original_test_data.copy(), sentiment_analyzer, - test_sentiment_file_name, args.debug) + test_sentiment_file_name, args.deep_debug) val_data_with_sentiment = predict_sentiment(original_val_data.copy(), sentiment_analyzer, val_sentiment_file_name, - args.debug) + args.deep_debug) # Compute metrics for the train dataset @@ -110,11 +114,11 @@ val_file_name = os.path.join(base_path, 
f'val_augmented_{args.dataset_type}_{args.percentage}.csv') train_data_with_metadata = augment_and_extract_metadata(train_data_with_sentiment.copy(), extractor, - topic_labels, train_file_name, args.debug) + topic_labels, train_file_name, args.deep_debug) test_data_with_metadata = augment_and_extract_metadata(test_data_with_sentiment.copy(), extractor, - topic_labels, test_file_name, args.debug) + topic_labels, test_file_name, args.deep_debug) val_data_with_metadata = augment_and_extract_metadata(val_data_with_sentiment.copy(), extractor, - topic_labels, val_file_name, args.debug) + topic_labels, val_file_name, args.deep_debug) # Function to create subgroups based on metadata @@ -254,8 +258,10 @@ def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='a # Predict sentiment for the original dataset to see for improvements test_sentiment_file_name_v2 = os.path.join(base_path, f'test_sentiment_v2_{args.dataset_type}_{args.percentage}.csv') val_sentiment_file_name_v2 = os.path.join(base_path, f'val_sentiment_v2_{args.dataset_type}_{args.percentage}.csv') - test_data_with_sentiment_v2 = predict_sentiment(original_test_data.copy(), sentiment_analyzer, test_sentiment_file_name_v2, args.debug) - val_data_with_sentiment_v2 = predict_sentiment(original_val_data.copy(), sentiment_analyzer, val_sentiment_file_name_v2, args.debug) + test_data_with_sentiment_v2 = predict_sentiment(original_test_data.copy(), sentiment_analyzer, + test_sentiment_file_name_v2, args.deep_debug) + val_data_with_sentiment_v2 = predict_sentiment(original_val_data.copy(), sentiment_analyzer, + val_sentiment_file_name_v2, args.deep_debug) # Compute metrics for the test dataset test_true_labels = original_test_data['category'] @@ -271,8 +277,10 @@ def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='a test_file_name_v2 = os.path.join(base_path, f'test_augmented_v2_{args.dataset_type}_{args.percentage}.csv') val_file_name_v2 = os.path.join(base_path, f'val_augmented_v2_{args.dataset_type}_{args.percentage}.csv') - test_data_with_metadata_v2 = augment_and_extract_metadata(test_data_with_sentiment_v2.copy(), extractor, topic_labels, test_file_name_v2, args.debug) - val_data_with_metadata_v2 = augment_and_extract_metadata(val_data_with_sentiment_v2.copy(), extractor, topic_labels, val_file_name_v2, args.debug) + test_data_with_metadata_v2 = augment_and_extract_metadata(test_data_with_sentiment_v2.copy(), extractor, + topic_labels, test_file_name_v2, args.deep_debug) + val_data_with_metadata_v2 = augment_and_extract_metadata(val_data_with_sentiment_v2.copy(), extractor, topic_labels, + val_file_name_v2, args.deep_debug) # Create subgroups for the datasets test_subgroups_v2 = create_subgroups(test_data_with_metadata_v2) From fe5f47c9c8ee97307804db9344a15a379aaf2b1f Mon Sep 17 00:00:00 2001 From: Leonardo Moraglia Date: Sat, 27 Jul 2024 10:01:32 +0200 Subject: [PATCH 25/42] small fix --- SentimentAnalyzer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index a19fc77..ef39adc 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -82,8 +82,8 @@ def generate_training_data(self, topics, texts, sentiments, n_samples=6, debug=F for topic, text, sentiment in zip(topics, texts, sentiments): sentiment_text = self.map_target_to_label(sentiment) - synthetic_texts = self.generate_synthetic_data(topic, text, sentiment_text, - n_samples) # List of synthetic texts + synthetic_texts = self.generate_synthetic_data(topic, text, 
sentiment_text, n_samples, + debug) # List of synthetic texts generated_data['text'].extend(synthetic_texts) generated_data['category'].extend( [sentiment] * len(synthetic_texts)) # append sentiment to texts many times From 359cde1c8b3f35436e3da10c8ec3f4ed7a215f71 Mon Sep 17 00:00:00 2001 From: Leonardo Moraglia Date: Sat, 27 Jul 2024 10:34:43 +0200 Subject: [PATCH 26/42] small fix of print statements 5% --- extract_stuff.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/extract_stuff.py b/extract_stuff.py index 392c06e..8d10f33 100644 --- a/extract_stuff.py +++ b/extract_stuff.py @@ -32,7 +32,7 @@ def augment_and_extract_metadata(dataset, extractor, topic_labels, file_path, de print(f"DEBUG - Generated Metadata: Topic - {topic}") print(f"DEBUG - Percentage of Completion: {percentage_complete:.2f}%, {count + 1} of {total_rows}") - if percentage_complete % 5 == 0: + if int(percentage_complete % 5) == 0: print(f"Percentage of Completion: {percentage_complete:.2f}%, {count + 1} of {total_rows}") count += 1 @@ -68,11 +68,11 @@ def predict_sentiment(dataset, sentiment_analyzer, file_path, debug=False, batch # Extend the sentiments list with the batch sentiments sentiments.extend(batch_sentiments) # Calculate the percentage of completion - percentage_complete = ((end) / total_rows) * 100 + percentage_complete = (end / total_rows) * 100 if debug: print(f"DEBUG - Processed batch {start // batch_size + 1}: {start} to {end}") print(f"DEBUG - Percentage of Completion: {percentage_complete:.2f}%, {end} of {total_rows}") - if percentage_complete % 5 == 0: + if int(percentage_complete % 5) == 0: print(f"Percentage of Completion: {percentage_complete:.2f}%") dataset['sentiment'] = sentiments From 1ac3ec8dd26236e0e7ecbf87a122a003eedee2a6 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 11:24:32 +0200 Subject: [PATCH 27/42] tried to fix plotting --- main.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index e442de7..9a05d71 100644 --- a/main.py +++ b/main.py @@ -202,7 +202,6 @@ def weighted_metrics(metrics_df, support_df, metric='accuracy'): metrics_df['weighted_metric'] = metrics_df[metric] * metrics_df['support'] return metrics_df - def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='accuracy'): # Get support for each topic support_df = test_percentage_analysis_df[['subgroup', 'total']].rename(columns={'total': 'support'}) @@ -391,5 +390,15 @@ def plot_overall_accuracy_comparison(old_metrics, new_metrics): plt.show() + # Ensure the 'total' column exists in the metrics DataFrame + test_analysis_v2 = analyze_disparities(test_subgroups_v2) + val_analysis_v2 = analyze_disparities(val_subgroups_v2) + + # Print the DataFrame to verify the 'total' column + print("\nTest Analysis V2:") + print(test_analysis_v2) + print("\nValidation Analysis V2:") + print(val_analysis_v2) + # Calculate and plot the overall accuracy comparison plot_overall_accuracy_comparison(test_metrics, test_metrics_v2) \ No newline at end of file From 58d00e2a0e28b1287bef3a4d7b5a5730eff70d9b Mon Sep 17 00:00:00 2001 From: Leonardo Moraglia Date: Sat, 27 Jul 2024 11:27:00 +0200 Subject: [PATCH 28/42] small fix of print statements 5%, more print statements --- SentimentAnalyzer.py | 9 +++++++++ extract_stuff.py | 4 ++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index ef39adc..878d115 100644 --- a/SentimentAnalyzer.py +++ 
b/SentimentAnalyzer.py @@ -80,6 +80,8 @@ def generate_training_data(self, topics, texts, sentiments, n_samples=6, debug=F generated_data = {'text': [], 'category': []} generated_data_with_topic = {'text': [], 'category': [], 'topic': []} + count = 0 + total = len(texts) for topic, text, sentiment in zip(topics, texts, sentiments): sentiment_text = self.map_target_to_label(sentiment) synthetic_texts = self.generate_synthetic_data(topic, text, sentiment_text, n_samples, @@ -90,8 +92,15 @@ def generate_training_data(self, topics, texts, sentiments, n_samples=6, debug=F generated_data_with_topic['text'].extend(synthetic_texts) generated_data_with_topic['category'].extend([sentiment] * len(synthetic_texts)) generated_data_with_topic['topic'].extend([topic] * len(synthetic_texts)) + count += 1 if debug: print(f"DEBUG - Generated synthetic data for topic: {topic}, text: {text}, sentiment: {sentiment_text}") + # Print percentage of completion of total texts + percentage_complete = count / total * 100 + if int(percentage_complete) % 5 == 0: + print(f"DEBUG - Percentage of Completion: {percentage_complete:.2f}%, {count} of {total}") + + generated_df = pd.DataFrame(generated_data) generated_df_with_topics = pd.DataFrame(generated_data_with_topic) diff --git a/extract_stuff.py b/extract_stuff.py index 8d10f33..706c2af 100644 --- a/extract_stuff.py +++ b/extract_stuff.py @@ -32,7 +32,7 @@ def augment_and_extract_metadata(dataset, extractor, topic_labels, file_path, de print(f"DEBUG - Generated Metadata: Topic - {topic}") print(f"DEBUG - Percentage of Completion: {percentage_complete:.2f}%, {count + 1} of {total_rows}") - if int(percentage_complete % 5) == 0: + if int(percentage_complete) % 5 == 0: print(f"Percentage of Completion: {percentage_complete:.2f}%, {count + 1} of {total_rows}") count += 1 @@ -72,7 +72,7 @@ def predict_sentiment(dataset, sentiment_analyzer, file_path, debug=False, batch if debug: print(f"DEBUG - Processed batch {start // batch_size + 1}: {start} to {end}") print(f"DEBUG - Percentage of Completion: {percentage_complete:.2f}%, {end} of {total_rows}") - if int(percentage_complete % 5) == 0: + if int(percentage_complete) % 5 == 0: print(f"Percentage of Completion: {percentage_complete:.2f}%") dataset['sentiment'] = sentiments From 37daa304978779471d898172d41a0ceeb66ca9b5 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 11:43:34 +0200 Subject: [PATCH 29/42] tried to fix plotting --- main.py | 125 +++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 120 insertions(+), 5 deletions(-) diff --git a/main.py b/main.py index 9a05d71..0edb9a9 100644 --- a/main.py +++ b/main.py @@ -199,9 +199,10 @@ def weighted_metrics(metrics_df, support_df, metric='accuracy'): # Join metrics with their respective support counts metrics_df = metrics_df.copy() metrics_df = metrics_df.merge(support_df, left_on='topic', right_on='subgroup') - metrics_df['weighted_metric'] = metrics_df[metric] * metrics_df['support'] + metrics_df['weighted_metric'] = metrics_df[metric] * metrics_df['total'] return metrics_df + def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='accuracy'): # Get support for each topic support_df = test_percentage_analysis_df[['subgroup', 'total']].rename(columns={'total': 'support'}) @@ -361,16 +362,130 @@ def calculate_overall_accuracy(metrics_df): return overall_accuracy - def plot_overall_accuracy_comparison(old_metrics, new_metrics): + The + error + you + 're encountering, KeyError: ' 
+ total + ', indicates that the DataFrame metrics_df does not contain a column named ' + total + '. This likely happens in the function calculate_overall_accuracy. + + To + resolve + this, you + need + to + ensure + that + the + DataFrame + passed + to + calculate_overall_accuracy + has + a + 'total' + column.The + 'total' + column + appears + to + represent + the + support(i.e., the + count + of + instances) for each topic in your analysis.This support data should be extracted from the analyze_disparities function. + + Here’s + a + modified + version + of + the + relevant + functions and parts + of + your + code, ensuring + that + the + 'total' + column is present in the + DataFrame + passed + to + calculate_overall_accuracy. + + python + + + def weighted_metrics(metrics_df, support_df, metric='accuracy'): + # Join metrics with their respective support counts + metrics_df = metrics_df.copy() + metrics_df = metrics_df.merge(support_df, left_on='topic', right_on='subgroup') + metrics_df['weighted_metric'] = metrics_df[metric] * metrics_df['total'] + return metrics_df + + + def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='accuracy'): + # Get support for each topic + support_df = test_percentage_analysis_df[['subgroup', 'total']].rename(columns={'total': 'support'}) + + # Compute weighted metrics + weighted_metrics_df = weighted_metrics(test_metrics_df, support_df, metric) + + # Compute baseline accuracy + baseline_accuracy = weighted_metrics_df['accuracy'].mean() + + # Sort topics by their weighted metrics + sorted_metrics = weighted_metrics_df.sort_values(by='weighted_metric', ascending=False) # Sort by descending + + # Get top 3 and bottom 3 topics + top_3_topics = sorted_metrics.head(3)['topic'].tolist() + bottom_3_topics = sorted_metrics.tail(3)['topic'].tolist() + + # Adjust for baseline accuracy + bottom_3_topics_below_baseline = sorted_metrics[sorted_metrics['accuracy'] < baseline_accuracy].tail(3)[ + 'topic'].tolist() + + return top_3_topics, bottom_3_topics_below_baseline + + + def calculate_overall_accuracy(metrics_df): + """ + Calculate the overall accuracy from the metrics DataFrame. + + Parameters: + - metrics_df: DataFrame containing the metrics + + Returns: + - overall_accuracy: The overall accuracy + """ + total_support = metrics_df['total'].sum() + weighted_accuracy_sum = (metrics_df['accuracy'] * metrics_df['total']).sum() + overall_accuracy = weighted_accuracy_sum / total_support + return overall_accuracy + + + def plot_overall_accuracy_comparison(old_metrics, new_metrics, support_old, support_new): """ Plot the overall accuracy comparison before and after fine-tuning. 
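The calculate_overall_accuracy helper defined just above is a support-weighted mean: each topic's accuracy counts in proportion to how many examples ('total') that topic contributes. A toy example with made-up numbers shows why this differs from a plain average:

import pandas as pd

# Hypothetical per-topic metrics with their example counts.
metrics = pd.DataFrame({
    "topic": ["sports", "politics", "music"],
    "accuracy": [0.90, 0.60, 0.75],
    "total": [100, 50, 50],
})

# Support-weighted accuracy: sum(accuracy * total) / sum(total)
weighted = (metrics["accuracy"] * metrics["total"]).sum() / metrics["total"].sum()
print(weighted)                    # 0.7875
print(metrics["accuracy"].mean())  # 0.75, the unweighted mean, which ignores topic size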
Parameters: - old_metrics: DataFrame containing the old metrics - new_metrics: DataFrame containing the new metrics + - support_old: DataFrame containing the support data for the old metrics + - support_new: DataFrame containing the support data for the new metrics """ - overall_accuracy_old = calculate_overall_accuracy(old_metrics) - overall_accuracy_new = calculate_overall_accuracy(new_metrics) + old_metrics_with_total = old_metrics.merge(support_old[['subgroup', 'total']], left_on='topic', + right_on='subgroup') + new_metrics_with_total = new_metrics.merge(support_new[['subgroup', 'total']], left_on='topic', + right_on='subgroup') + + overall_accuracy_old = calculate_overall_accuracy(old_metrics_with_total) + overall_accuracy_new = calculate_overall_accuracy(new_metrics_with_total) accuracies = [overall_accuracy_old, overall_accuracy_new] labels = ['Old Model', 'New Model'] @@ -401,4 +516,4 @@ def plot_overall_accuracy_comparison(old_metrics, new_metrics): print(val_analysis_v2) # Calculate and plot the overall accuracy comparison - plot_overall_accuracy_comparison(test_metrics, test_metrics_v2) \ No newline at end of file + plot_overall_accuracy_comparison(test_metrics, test_metrics_v2, test_analysis, test_analysis_v2) \ No newline at end of file From e7bd36fbba41fe2f47fefe5c6ea76a85ffad8c7b Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 11:53:02 +0200 Subject: [PATCH 30/42] tried to fix plotting --- main.py | 107 -------------------------------------------------------- 1 file changed, 107 deletions(-) diff --git a/main.py b/main.py index 0edb9a9..55a9f21 100644 --- a/main.py +++ b/main.py @@ -346,113 +346,6 @@ def plot_metrics_comparison(old_metrics, new_metrics, metric='accuracy'): plot_metrics_comparison(test_metrics, test_metrics_v2, metric='f1-score') - def calculate_overall_accuracy(metrics_df): - """ - Calculate the overall accuracy from the metrics DataFrame. - - Parameters: - - metrics_df: DataFrame containing the metrics - - Returns: - - overall_accuracy: The overall accuracy - """ - total_support = metrics_df['total'].sum() - weighted_accuracy_sum = (metrics_df['accuracy'] * metrics_df['total']).sum() - overall_accuracy = weighted_accuracy_sum / total_support - return overall_accuracy - - - The - error - you - 're encountering, KeyError: ' - total - ', indicates that the DataFrame metrics_df does not contain a column named ' - total - '. This likely happens in the function calculate_overall_accuracy. - - To - resolve - this, you - need - to - ensure - that - the - DataFrame - passed - to - calculate_overall_accuracy - has - a - 'total' - column.The - 'total' - column - appears - to - represent - the - support(i.e., the - count - of - instances) for each topic in your analysis.This support data should be extracted from the analyze_disparities function. - - Here’s - a - modified - version - of - the - relevant - functions and parts - of - your - code, ensuring - that - the - 'total' - column is present in the - DataFrame - passed - to - calculate_overall_accuracy. 
- - python - - - def weighted_metrics(metrics_df, support_df, metric='accuracy'): - # Join metrics with their respective support counts - metrics_df = metrics_df.copy() - metrics_df = metrics_df.merge(support_df, left_on='topic', right_on='subgroup') - metrics_df['weighted_metric'] = metrics_df[metric] * metrics_df['total'] - return metrics_df - - - def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='accuracy'): - # Get support for each topic - support_df = test_percentage_analysis_df[['subgroup', 'total']].rename(columns={'total': 'support'}) - - # Compute weighted metrics - weighted_metrics_df = weighted_metrics(test_metrics_df, support_df, metric) - - # Compute baseline accuracy - baseline_accuracy = weighted_metrics_df['accuracy'].mean() - - # Sort topics by their weighted metrics - sorted_metrics = weighted_metrics_df.sort_values(by='weighted_metric', ascending=False) # Sort by descending - - # Get top 3 and bottom 3 topics - top_3_topics = sorted_metrics.head(3)['topic'].tolist() - bottom_3_topics = sorted_metrics.tail(3)['topic'].tolist() - - # Adjust for baseline accuracy - bottom_3_topics_below_baseline = sorted_metrics[sorted_metrics['accuracy'] < baseline_accuracy].tail(3)[ - 'topic'].tolist() - - return top_3_topics, bottom_3_topics_below_baseline - - def calculate_overall_accuracy(metrics_df): """ Calculate the overall accuracy from the metrics DataFrame. From 671040b255ebb1a456c2189cf8cbb375728a2b19 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 12:02:52 +0200 Subject: [PATCH 31/42] tried to fix plotting --- main.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/main.py b/main.py index 55a9f21..b191253 100644 --- a/main.py +++ b/main.py @@ -180,7 +180,6 @@ def analyze_disparities(subgroups): }) return pd.DataFrame(analysis_results) - # Analyze disparities for the datasets train_analysis = analyze_disparities(train_subgroups) test_analysis = analyze_disparities(test_subgroups) @@ -198,11 +197,14 @@ def analyze_disparities(subgroups): def weighted_metrics(metrics_df, support_df, metric='accuracy'): # Join metrics with their respective support counts metrics_df = metrics_df.copy() + print("Columns before merging with support_df:", metrics_df.columns) + print("Support DataFrame columns:", support_df.columns) metrics_df = metrics_df.merge(support_df, left_on='topic', right_on='subgroup') metrics_df['weighted_metric'] = metrics_df[metric] * metrics_df['total'] return metrics_df + # Function to get top and bottom topics based on weighted metrics def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='accuracy'): # Get support for each topic support_df = test_percentage_analysis_df[['subgroup', 'total']].rename(columns={'total': 'support'}) @@ -214,7 +216,7 @@ def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='a baseline_accuracy = weighted_metrics_df['accuracy'].mean() # Sort topics by their weighted metrics - sorted_metrics = weighted_metrics_df.sort_values(by='weighted_metric', ascending=False) # Sort by descending + sorted_metrics = weighted_metrics_df.sort_values(by='weighted_metric', ascending=False) # Sort by descending # Get top 3 and bottom 3 topics top_3_topics = sorted_metrics.head(3)['topic'].tolist() From c318f9fe45a3a81496b5e68637751936c88c31a3 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 12:04:46 +0200 Subject: [PATCH 32/42] tried to fix 
plotting --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index b191253..381957e 100644 --- a/main.py +++ b/main.py @@ -199,7 +199,7 @@ def weighted_metrics(metrics_df, support_df, metric='accuracy'): metrics_df = metrics_df.copy() print("Columns before merging with support_df:", metrics_df.columns) print("Support DataFrame columns:", support_df.columns) - metrics_df = metrics_df.merge(support_df, left_on='topic', right_on='subgroup') + metrics_df = metrics_df.merge(support_df, left_on='support', right_on='subgroup') metrics_df['weighted_metric'] = metrics_df[metric] * metrics_df['total'] return metrics_df From bbc65bc2befb2e03b2c5fa5f74121fce84549794 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 12:07:12 +0200 Subject: [PATCH 33/42] tried to fix plotting --- main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.py b/main.py index 381957e..de4f7f7 100644 --- a/main.py +++ b/main.py @@ -199,8 +199,8 @@ def weighted_metrics(metrics_df, support_df, metric='accuracy'): metrics_df = metrics_df.copy() print("Columns before merging with support_df:", metrics_df.columns) print("Support DataFrame columns:", support_df.columns) - metrics_df = metrics_df.merge(support_df, left_on='support', right_on='subgroup') - metrics_df['weighted_metric'] = metrics_df[metric] * metrics_df['total'] + metrics_df = metrics_df.merge(support_df, left_on='topic', right_on='subgroup') + metrics_df['weighted_metric'] = metrics_df[metric] * metrics_df['support'] return metrics_df From 34beb4f073fe87a0926b15a6bec9fe2a763a9418 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 12:21:04 +0200 Subject: [PATCH 34/42] tried to fix plotting --- main.py | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/main.py b/main.py index de4f7f7..d8bc9e2 100644 --- a/main.py +++ b/main.py @@ -197,8 +197,8 @@ def analyze_disparities(subgroups): def weighted_metrics(metrics_df, support_df, metric='accuracy'): # Join metrics with their respective support counts metrics_df = metrics_df.copy() - print("Columns before merging with support_df:", metrics_df.columns) - print("Support DataFrame columns:", support_df.columns) + # print("Columns before merging with support_df:", metrics_df.columns) + # print("Support DataFrame columns:", support_df.columns) metrics_df = metrics_df.merge(support_df, left_on='topic', right_on='subgroup') metrics_df['weighted_metric'] = metrics_df[metric] * metrics_df['support'] return metrics_df @@ -251,11 +251,15 @@ def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='a train_original_and_generated_data.to_csv(os.path.join(base_path, 'train_original_and_generated_data.csv'), index=False) model_save_path_v2 = os.path.join(base_path, f'finetuned_sentiment_model_{args.dataset_type}_{args.percentage}.pt') - print("Fine-tuning the sentiment analyzer with the generated+original dataset...") - fine_tuning_results_new = sentiment_analyzer.fine_tune(train_original_and_generated_data) # TODO NON CE - print(f"Fine-tuning results: {fine_tuning_results_new}") - # Save the fine-tuned model - torch.save(sentiment_analyzer.model, model_save_path_v2) + if os.path.exists(model_save_path_v2): + print("Loading the fine-tuned model from disk...") + sentiment_analyzer.model = torch.load(model_save_path_v2) + else: + print("Fine-tuning the sentiment analyzer with the 
generated+original dataset...") + fine_tuning_results_new = sentiment_analyzer.fine_tune(train_original_and_generated_data) # TODO NON CE + print(f"Fine-tuning results: {fine_tuning_results_new}") + # Save the fine-tuned model + torch.save(sentiment_analyzer.model, model_save_path_v2) # Predict sentiment for the original dataset to see for improvements test_sentiment_file_name_v2 = os.path.join(base_path, f'test_sentiment_v2_{args.dataset_type}_{args.percentage}.csv') @@ -338,8 +342,8 @@ def plot_metrics_comparison(old_metrics, new_metrics, metric='accuracy'): plt.legend() plt.tight_layout() - plt.show() - + plt.savefig(f'comparison_{metric}.png') + plt.close() # Plot the comparison for accuracy, precision, recall, and f1-score plot_metrics_comparison(test_metrics, test_metrics_v2, metric='accuracy') @@ -347,7 +351,6 @@ def plot_metrics_comparison(old_metrics, new_metrics, metric='accuracy'): plot_metrics_comparison(test_metrics, test_metrics_v2, metric='recall') plot_metrics_comparison(test_metrics, test_metrics_v2, metric='f1-score') - def calculate_overall_accuracy(metrics_df): """ Calculate the overall accuracy from the metrics DataFrame. @@ -363,7 +366,6 @@ def calculate_overall_accuracy(metrics_df): overall_accuracy = weighted_accuracy_sum / total_support return overall_accuracy - def plot_overall_accuracy_comparison(old_metrics, new_metrics, support_old, support_new): """ Plot the overall accuracy comparison before and after fine-tuning. @@ -397,18 +399,13 @@ def plot_overall_accuracy_comparison(old_metrics, new_metrics, support_old, supp plt.text(i, v + 0.01, f"{v:.2f}", ha='center', fontweight='bold') plt.tight_layout() - plt.show() + plt.savefig('overall_accuracy_comparison.png') + plt.close() # Ensure the 'total' column exists in the metrics DataFrame test_analysis_v2 = analyze_disparities(test_subgroups_v2) val_analysis_v2 = analyze_disparities(val_subgroups_v2) - # Print the DataFrame to verify the 'total' column - print("\nTest Analysis V2:") - print(test_analysis_v2) - print("\nValidation Analysis V2:") - print(val_analysis_v2) - # Calculate and plot the overall accuracy comparison plot_overall_accuracy_comparison(test_metrics, test_metrics_v2, test_analysis, test_analysis_v2) \ No newline at end of file From 724a4f29d119bf53625b397e2a4d26b3f351602d Mon Sep 17 00:00:00 2001 From: Leonardo Moraglia Date: Sat, 27 Jul 2024 14:55:06 +0200 Subject: [PATCH 35/42] reddit dataset is now enabled --- main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.py b/main.py index d8bc9e2..aa33960 100644 --- a/main.py +++ b/main.py @@ -17,7 +17,7 @@ # Set up argument parser for command-line options parser = argparse.ArgumentParser(description='Load dataset') - parser.add_argument('--dataset_type', type=str, default='tweets', choices=['tweets', 'TODO'], + parser.add_argument('--dataset_type', type=str, default='tweets', choices=['tweets', 'reddit'], help='Type of dataset to load') parser.add_argument('--debug', type=bool, default=False, help='Enable debug mode to print even more additional information') @@ -39,7 +39,7 @@ # Initialize dataset loader with the specified type and base path base_path = os.path.dirname(os.path.abspath(__file__)) - dataset_loader = DatasetLoad('tweets', base_path, args.percentage) + dataset_loader = DatasetLoad(args.dataset_type, base_path, args.percentage) dataset_loader.load_datasets() # Load the original train, test, and validation datasets From 23b89072a4c545eba5feb81bfb22ebd638b9e86b Mon Sep 17 00:00:00 2001 From: Raul Gatto 
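The load-if-present change a few patches back (checking model_save_path_v2 before fine-tuning) is a simple caching pattern so repeated runs skip the expensive training step. Reduced to its core, with the analyzer object and path treated as placeholders, it looks roughly like this:

import os
import torch

def load_or_fine_tune(sentiment_analyzer, train_df, model_path):
    # Reuse a previously fine-tuned model when it is already on disk.
    if os.path.exists(model_path):
        print("Loading the fine-tuned model from disk...")
        sentiment_analyzer.model = torch.load(model_path)
    else:
        print("Fine-tuning the sentiment analyzer...")
        results = sentiment_analyzer.fine_tune(train_df)
        print(f"Fine-tuning results: {results}")
        # Persist the whole model object, as the patch does with torch.save.
        torch.save(sentiment_analyzer.model, model_path)
    return sentiment_analyzer

Saving the whole module with torch.save, as the patch does, ties the checkpoint to the current class definitions; saving a state_dict, or using the transformers save_pretrained/from_pretrained pair, would be the more portable alternative.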
<126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 14:56:18 +0200 Subject: [PATCH 36/42] tried to fix plotting --- DatasetLoad.py | 3 ++- SentimentAnalyzer.py | 2 +- main.py | 38 +++++++++++++------------------------- 3 files changed, 16 insertions(+), 27 deletions(-) diff --git a/DatasetLoad.py b/DatasetLoad.py index 10a2f42..059a032 100644 --- a/DatasetLoad.py +++ b/DatasetLoad.py @@ -43,7 +43,8 @@ def load_datasets(self): data = data.rename(columns={'clean_comment': 'text'}) # truncate the text in the text column with over 512 characters data['text'] = data['text'].str.slice(0, 512) - + data['category'] = data['category'].map({-1: 0, 0: 1, 1: 2}) + data = data.dropna() elif self.dataset_type == 'tweets': print("Loading Twitter dataset...") diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index 878d115..cd9c727 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -132,7 +132,7 @@ def tokenize_function(examples): # Tokenize the text data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer) # Define training arguments - training_args = TrainingArguments( # Define the training arguments + training_args = TrainingArguments( # output_dir="./results", run_name="finetuning_sentiment_classifier", eval_strategy="epoch", diff --git a/main.py b/main.py index d8bc9e2..a5e7951 100644 --- a/main.py +++ b/main.py @@ -96,8 +96,10 @@ val_true_labels = original_val_data['category'] val_predicted_labels = val_data_with_sentiment['sentiment'] print("\nValidation Classification Report:") + val_report = classification_report(val_true_labels, val_predicted_labels, labels=[0, 1, 2], zero_division=0, + output_dict=True) print(classification_report(val_true_labels, val_predicted_labels, labels=[0, 1, 2], zero_division=0)) - + val_accuracy = val_report['accuracy'] # Initialize the metadata extractor extractor = MetadataExtractor() @@ -194,42 +196,33 @@ def analyze_disparities(subgroups): print(val_analysis) - def weighted_metrics(metrics_df, support_df, metric='accuracy'): + def weighted_metrics(metrics_df, support_df, accuracy, metric='accuracy'): # Join metrics with their respective support counts metrics_df = metrics_df.copy() - # print("Columns before merging with support_df:", metrics_df.columns) - # print("Support DataFrame columns:", support_df.columns) metrics_df = metrics_df.merge(support_df, left_on='topic', right_on='subgroup') - metrics_df['weighted_metric'] = metrics_df[metric] * metrics_df['support'] + metrics_df['weighted_metric'] = (metrics_df[metric] - accuracy) * metrics_df['support'] return metrics_df # Function to get top and bottom topics based on weighted metrics - def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='accuracy'): + def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, accuracy=val_accuracy, metric='accuracy'): # Get support for each topic support_df = test_percentage_analysis_df[['subgroup', 'total']].rename(columns={'total': 'support'}) # Compute weighted metrics - weighted_metrics_df = weighted_metrics(test_metrics_df, support_df, metric) - - # Compute baseline accuracy - baseline_accuracy = weighted_metrics_df['accuracy'].mean() + weighted_metrics_df = weighted_metrics(test_metrics_df, support_df, accuracy, metric) # Sort topics by their weighted metrics - sorted_metrics = weighted_metrics_df.sort_values(by='weighted_metric', ascending=False) # Sort by descending + sorted_metrics = weighted_metrics_df.sort_values(by='weighted_metric', ascending=False) # Sort by descending, so the first 3 
are the most disadvantaged # Get top 3 and bottom 3 topics - top_3_topics = sorted_metrics.head(3)['topic'].tolist() - bottom_3_topics = sorted_metrics.tail(3)['topic'].tolist() + bottom_3_topics = sorted_metrics.head(3)['topic'].tolist() + top_3_topics = sorted_metrics.tail(3)['topic'].tolist() - # Adjust for baseline accuracy - bottom_3_topics_below_baseline = sorted_metrics[sorted_metrics['accuracy'] < baseline_accuracy].tail(3)[ - 'topic'].tolist() + return bottom_3_topics, top_3_topics - return top_3_topics, bottom_3_topics_below_baseline - - top_3_topics, bottom_3_topics = get_top_lower_topics(val_metrics, val_analysis, metric='accuracy') + bottom_3_topics, top_3_topics = get_top_lower_topics(val_metrics, val_analysis, val_accuracy, metric='accuracy') print(f"Bottom 3 validation topics: {bottom_3_topics }") print("Augmenting the training dataset with synthetic data...") @@ -393,7 +386,7 @@ def plot_overall_accuracy_comparison(old_metrics, new_metrics, support_old, supp plt.xlabel('Model', fontweight='bold') plt.ylabel('Overall Accuracy', fontweight='bold') plt.title('Overall Accuracy Comparison', fontweight='bold') - plt.ylim(0, 1) # Assuming accuracy is between 0 and 1 + plt.ylim(0, 1) for i, v in enumerate(accuracies): plt.text(i, v + 0.01, f"{v:.2f}", ha='center', fontweight='bold') @@ -402,10 +395,5 @@ def plot_overall_accuracy_comparison(old_metrics, new_metrics, support_old, supp plt.savefig('overall_accuracy_comparison.png') plt.close() - - # Ensure the 'total' column exists in the metrics DataFrame - test_analysis_v2 = analyze_disparities(test_subgroups_v2) - val_analysis_v2 = analyze_disparities(val_subgroups_v2) - # Calculate and plot the overall accuracy comparison plot_overall_accuracy_comparison(test_metrics, test_metrics_v2, test_analysis, test_analysis_v2) \ No newline at end of file From a54d4d92f69834c3b0ebec252596850b546f3dd1 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 15:01:56 +0200 Subject: [PATCH 37/42] fixed bottom 3 --- main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.py b/main.py index b1d37e1..2068297 100644 --- a/main.py +++ b/main.py @@ -200,7 +200,7 @@ def weighted_metrics(metrics_df, support_df, accuracy, metric='accuracy'): # Join metrics with their respective support counts metrics_df = metrics_df.copy() metrics_df = metrics_df.merge(support_df, left_on='topic', right_on='subgroup') - metrics_df['weighted_metric'] = (metrics_df[metric] - accuracy) * metrics_df['support'] + metrics_df['weighted_metric'] = (accuracy - metrics_df[metric]) * metrics_df['support'] return metrics_df @@ -216,7 +216,7 @@ def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, accuracy= sorted_metrics = weighted_metrics_df.sort_values(by='weighted_metric', ascending=False) # Sort by descending, so the first 3 are the most disadvantaged # Get top 3 and bottom 3 topics - bottom_3_topics = sorted_metrics.head(3)['topic'].tolist() + bottom_3_topics = sorted_metrics[sorted_metrics['weighted_metric'] > 0].head(3)['topic'].tolist() top_3_topics = sorted_metrics.tail(3)['topic'].tolist() return bottom_3_topics, top_3_topics From 820e2833df02c08dd560cc5d697dc5a491325153 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 15:10:35 +0200 Subject: [PATCH 38/42] changed temperature --- SentimentAnalyzer.py | 2 +- main.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/SentimentAnalyzer.py 
b/SentimentAnalyzer.py index cd9c727..dbdd3d4 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -62,7 +62,7 @@ def generate_synthetic_data(self, topic, text, sentiment, n_samples, debug=False num_return_sequences=1, do_sample=True, top_k=50, - temperature=0.9 + temperature=0.7 ) generated_text = self.flan_tokenizer.decode(outputs[0], skip_special_tokens=True) synthetic_data.append(generated_text) diff --git a/main.py b/main.py index 2068297..52ebfea 100644 --- a/main.py +++ b/main.py @@ -223,7 +223,7 @@ def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, accuracy= bottom_3_topics, top_3_topics = get_top_lower_topics(val_metrics, val_analysis, val_accuracy, metric='accuracy') - print(f"Bottom 3 validation topics: {bottom_3_topics }") + print(f"Bottom 3 validation topics: {bottom_3_topics}") print("Augmenting the training dataset with synthetic data...") # Randomly select rows from bottom three topics in the training set From 7fe919977852f3c7affbcc26d43c0f758fceac8b Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 15:27:39 +0200 Subject: [PATCH 39/42] debug for top_lower --- main.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/main.py b/main.py index 52ebfea..7803442 100644 --- a/main.py +++ b/main.py @@ -100,7 +100,7 @@ output_dict=True) print(classification_report(val_true_labels, val_predicted_labels, labels=[0, 1, 2], zero_division=0)) val_accuracy = val_report['accuracy'] - + print(val_accuracy) # Initialize the metadata extractor extractor = MetadataExtractor() @@ -211,14 +211,14 @@ def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, accuracy= # Compute weighted metrics weighted_metrics_df = weighted_metrics(test_metrics_df, support_df, accuracy, metric) - + print(weighted_metrics_df) # Sort topics by their weighted metrics sorted_metrics = weighted_metrics_df.sort_values(by='weighted_metric', ascending=False) # Sort by descending, so the first 3 are the most disadvantaged - + print(sorted_metrics) # Get top 3 and bottom 3 topics bottom_3_topics = sorted_metrics[sorted_metrics['weighted_metric'] > 0].head(3)['topic'].tolist() top_3_topics = sorted_metrics.tail(3)['topic'].tolist() - + print(bottom_3_topics) return bottom_3_topics, top_3_topics From ec0a0a6608efb97720ec4c419317a9e2492c50b6 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 15:35:34 +0200 Subject: [PATCH 40/42] added stratification --- DatasetLoad.py | 4 ++-- main.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/DatasetLoad.py b/DatasetLoad.py index 059a032..23d2f69 100644 --- a/DatasetLoad.py +++ b/DatasetLoad.py @@ -63,8 +63,8 @@ def load_datasets(self): # Ensure the first column is 'text' and the second column is 'category' data = data[['text', 'category'] + [col for col in data.columns if col not in ['text', 'category']]] - train_data, temp_data = train_test_split(data, test_size=0.3, random_state=42) - self.val_data, self.test_data = train_test_split(temp_data, test_size=0.5, random_state=42) + train_data, temp_data = train_test_split(data, test_size=0.3, random_state=42, stratify=data['category']) + self.val_data, self.test_data = train_test_split(temp_data, test_size=0.5, random_state=42, stratify=temp_data['category']) self.train_data = train_data if self.percentage < 100.0: diff --git a/main.py b/main.py index 7803442..cc23c76 100644 --- a/main.py +++ b/main.py @@ -211,14 +211,14 @@ def 
get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, accuracy= # Compute weighted metrics weighted_metrics_df = weighted_metrics(test_metrics_df, support_df, accuracy, metric) - print(weighted_metrics_df) + # print(weighted_metrics_df) # Sort topics by their weighted metrics sorted_metrics = weighted_metrics_df.sort_values(by='weighted_metric', ascending=False) # Sort by descending, so the first 3 are the most disadvantaged - print(sorted_metrics) + # print(sorted_metrics) # Get top 3 and bottom 3 topics bottom_3_topics = sorted_metrics[sorted_metrics['weighted_metric'] > 0].head(3)['topic'].tolist() top_3_topics = sorted_metrics.tail(3)['topic'].tolist() - print(bottom_3_topics) + # print(bottom_3_topics) return bottom_3_topics, top_3_topics From eaff928e97e697bb0542f81ce4c1a69187b2ee69 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 15:41:40 +0200 Subject: [PATCH 41/42] removed stratification --- DatasetLoad.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DatasetLoad.py b/DatasetLoad.py index 23d2f69..059a032 100644 --- a/DatasetLoad.py +++ b/DatasetLoad.py @@ -63,8 +63,8 @@ def load_datasets(self): # Ensure the first column is 'text' and the second column is 'category' data = data[['text', 'category'] + [col for col in data.columns if col not in ['text', 'category']]] - train_data, temp_data = train_test_split(data, test_size=0.3, random_state=42, stratify=data['category']) - self.val_data, self.test_data = train_test_split(temp_data, test_size=0.5, random_state=42, stratify=temp_data['category']) + train_data, temp_data = train_test_split(data, test_size=0.3, random_state=42) + self.val_data, self.test_data = train_test_split(temp_data, test_size=0.5, random_state=42) self.train_data = train_data if self.percentage < 100.0: From bacb3a9295a46778a8ab7dc139f352d9e99bbf51 Mon Sep 17 00:00:00 2001 From: Raul Gatto <126099719+Raoolo@users.noreply.github.com> Date: Sat, 27 Jul 2024 16:35:03 +0200 Subject: [PATCH 42/42] samples changed from 6 to 50 --- SentimentAnalyzer.py | 2 +- main.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py index dbdd3d4..e726486 100644 --- a/SentimentAnalyzer.py +++ b/SentimentAnalyzer.py @@ -75,7 +75,7 @@ def generate_synthetic_data(self, topic, text, sentiment, n_samples, debug=False return synthetic_data # Augment the training data with synthetic data - def generate_training_data(self, topics, texts, sentiments, n_samples=6, debug=False): + def generate_training_data(self, topics, texts, sentiments, n_samples=50, debug=False): print("Generating synthetic data...") generated_data = {'text': [], 'category': []} generated_data_with_topic = {'text': [], 'category': [], 'topic': []} diff --git a/main.py b/main.py index cc23c76..d811b48 100644 --- a/main.py +++ b/main.py @@ -205,7 +205,7 @@ def weighted_metrics(metrics_df, support_df, accuracy, metric='accuracy'): # Function to get top and bottom topics based on weighted metrics - def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, accuracy=val_accuracy, metric='accuracy'): + def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df,k=3, accuracy=val_accuracy, metric='accuracy'): # Get support for each topic support_df = test_percentage_analysis_df[['subgroup', 'total']].rename(columns={'total': 'support'}) @@ -217,12 +217,12 @@ def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, accuracy= # 
print(sorted_metrics) # Get top 3 and bottom 3 topics bottom_3_topics = sorted_metrics[sorted_metrics['weighted_metric'] > 0].head(3)['topic'].tolist() - top_3_topics = sorted_metrics.tail(3)['topic'].tolist() + top_3_topics = sorted_metrics.tail(k)['topic'].tolist() # print(bottom_3_topics) return bottom_3_topics, top_3_topics - bottom_3_topics, top_3_topics = get_top_lower_topics(val_metrics, val_analysis, val_accuracy, metric='accuracy') + bottom_3_topics, top_3_topics = get_top_lower_topics(val_metrics, val_analysis, 3, val_accuracy, metric='accuracy') print(f"Bottom 3 validation topics: {bottom_3_topics}") print("Augmenting the training dataset with synthetic data...")
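After this last patch, the selection of topics to augment works on a support-weighted accuracy gap: topics whose accuracy falls below the overall validation accuracy get a positive gap, scaled by their example count, and the largest positive gaps are augmented first. A condensed sketch with simplified names (get_disadvantaged_topics and weighted_gap stand in for the script's get_top_lower_topics and weighted_metric) and toy numbers:

import pandas as pd

def get_disadvantaged_topics(metrics_df, analysis_df, overall_accuracy, k=3, metric="accuracy"):
    # analysis_df has one row per topic with its example count in 'total'.
    support = analysis_df[["subgroup", "total"]].rename(columns={"total": "support"})
    merged = metrics_df.merge(support, left_on="topic", right_on="subgroup")

    # Positive weighted gap: the topic performs below the overall accuracy, scaled by its size.
    merged["weighted_gap"] = (overall_accuracy - merged[metric]) * merged["support"]
    ranked = merged.sort_values("weighted_gap", ascending=False)

    bottom = ranked[ranked["weighted_gap"] > 0].head(k)["topic"].tolist()
    top = ranked.tail(k)["topic"].tolist()
    return bottom, top

# Toy data: overall accuracy 0.80, so topics 'b' and 'c' sit below it.
metrics = pd.DataFrame({"topic": ["a", "b", "c"], "accuracy": [0.90, 0.70, 0.60]})
analysis = pd.DataFrame({"subgroup": ["a", "b", "c"], "total": [100, 80, 20]})
bottom, top = get_disadvantaged_topics(metrics, analysis, overall_accuracy=0.80, k=2)
print(bottom)  # ['b', 'c'] -> the most disadvantaged topics, by weighted gap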