diff --git a/DatasetLoad.py b/DatasetLoad.py
index dc3f877..059a032 100644
--- a/DatasetLoad.py
+++ b/DatasetLoad.py
@@ -43,7 +43,8 @@ def load_datasets(self):
             data = data.rename(columns={'clean_comment': 'text'})
             # truncate the text in the text column with over 512 characters
             data['text'] = data['text'].str.slice(0, 512)
-
+            data['category'] = data['category'].map({-1: 0, 0: 1, 1: 2})
+            data = data.dropna()
         elif self.dataset_type == 'tweets':
             print("Loading Twitter dataset...")
@@ -57,6 +58,7 @@ def load_datasets(self):
             data = data.rename(columns={'Tweet': 'text'})
             # remove the rows of the text column in which the text is "Not Available"
             data = data[data['text'] != 'Not Available']
+            data = data.dropna()

         # Ensure the first column is 'text' and the second column is 'category'
         data = data[['text', 'category'] + [col for col in data.columns if col not in ['text', 'category']]]
diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py
index 66546c3..e726486 100644
--- a/SentimentAnalyzer.py
+++ b/SentimentAnalyzer.py
@@ -1,7 +1,9 @@
 import pandas as pd
 import torch
 from sklearn.model_selection import train_test_split
-from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, pipeline
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, \
+    DataCollatorWithPadding, pipeline, AutoModelForSeq2SeqLM
+
 from datasets import Dataset
@@ -10,9 +12,15 @@ def __init__(self):
         self.model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
         self.device = 0 if torch.cuda.is_available() else -1  # Use GPU if available
         self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
-        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, ignore_mismatched_sizes=True).to(self.device)
+        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name,
+                                                                        ignore_mismatched_sizes=True).to(self.device)
         self.classifier = pipeline("sentiment-analysis", model=self.model, tokenizer=self.tokenizer, device=self.device)

+        # Initialize FLAN model for synthetic data generation
+        self.flan_model_name = "google/flan-t5-small"
+        self.flan_tokenizer = AutoTokenizer.from_pretrained(self.flan_model_name)
+        self.flan_model = AutoModelForSeq2SeqLM.from_pretrained(self.flan_model_name).to(self.device)
+
     def analyze_sentiment(self, text):
         results = self.classifier(text)
         return results[0]['label']
@@ -28,51 +36,87 @@ def map_label_to_target(self, label):
         else:
             return None

-    # Generate synthetic data using LLMs to be defined
-    def generate_synthetic_data(self, topic, n_samples):
-        openai.api_key = 'YOUR_API_KEY'
+    def map_target_to_label(self, target):
+        # Map the target value to the sentiment label
+        if target == 0:
+            return "negative"
+        elif target == 1:
+            return "neutral"
+        elif target == 2:
+            return "positive"
+        else:
+            return None
+
+    # Generate synthetic data using the FLAN model
+    def generate_synthetic_data(self, topic, text, sentiment, n_samples, debug=False):
         synthetic_data = []
+        # print(f"Generating synthetic data for topic: {topic}, text: {text}, sentiment: {sentiment}")
+        count = 0
         for _ in range(n_samples):
-            prompt = f"Generate six tweets related to {topic} that expresses sentiment."
-            response = openai.Completion.create(
-                engine="text-davinci-003",
-                prompt=prompt,
-                max_tokens=60
+            prompt = f"Generate a tweet related to {topic} that expresses a {sentiment} sentiment and the tweet has to be semantically similar to: '{text}' "
+            inputs = self.flan_tokenizer(prompt, return_tensors="pt").to(self.device)
+            # Use top-k sampling and temperature sampling for more diverse outputs
+            outputs = self.flan_model.generate(
+                inputs.input_ids,
+                max_length=60,
+                num_return_sequences=1,
+                do_sample=True,
+                top_k=50,
+                temperature=0.7
             )
-            synthetic_data.append(response.choices[0].text.strip())
+            generated_text = self.flan_tokenizer.decode(outputs[0], skip_special_tokens=True)
+            synthetic_data.append(generated_text)
+            count += 1
+            if debug:
+                print(f"DEBUG - Generated Text: {generated_text}")
+                print(f"DEBUG - Percentage of Completion: {count / n_samples * 100:.2f}%, {count} of {n_samples}")
+            if int(count % 5) == 0:
+                print(f"Percentage of Completion: {count / n_samples * 100:.2f}%, {count} of {n_samples}")
         return synthetic_data

-    def augment_training_data(self, topics, n_samples=100):
-        augmented_data = {'text': [], 'label': []}
-        augmented_data_with_topics = {'text': [], 'label': [], 'topic': []}
-        for topic in topics:
-            synthetic_texts = self.generate_synthetic_data(topic, n_samples)
-            # Assuming the sentiment label for generated data
-            augmented_data['text'].extend(synthetic_texts)
-            augmented_data['label'].extend([1] * len(synthetic_texts))  # Defaulting to neutral
-            augmented_data_with_topics['text'].extend(synthetic_texts)
-            augmented_data_with_topics['label'].extend([1] * len(synthetic_texts))
-            augmented_data_with_topics['topic'].extend([topic] * len(synthetic_texts))
-
-        augmented_df = pd.DataFrame(augmented_data)
-        augmented_df_with_topics = pd.DaataFrame(augmented_data_with_topics)
-        return augmented_df, augmented_df_with_topics
-
-    def fine_tune_with_augmented_data(self, topics, n_samples=100, epochs=3, batch_size=16, learning_rate=2e-5):
-        augmented_train_data, augmented_train_data_with_topics = self.augment_training_data(topics, n_samples)
-        return self.fine_tune(augmented_train_data, epochs, batch_size, learning_rate), augmented_train_data_with_topics
+    # Augment the training data with synthetic data
+    def generate_training_data(self, topics, texts, sentiments, n_samples=50, debug=False):
+        print("Generating synthetic data...")
+        generated_data = {'text': [], 'category': []}
+        generated_data_with_topic = {'text': [], 'category': [], 'topic': []}
+
+        count = 0
+        total = len(texts)
+        for topic, text, sentiment in zip(topics, texts, sentiments):
+            sentiment_text = self.map_target_to_label(sentiment)
+            synthetic_texts = self.generate_synthetic_data(topic, text, sentiment_text, n_samples,
+                                                           debug)  # List of synthetic texts
+            generated_data['text'].extend(synthetic_texts)
+            generated_data['category'].extend(
+                [sentiment] * len(synthetic_texts))  # append sentiment to texts many times
+            generated_data_with_topic['text'].extend(synthetic_texts)
+            generated_data_with_topic['category'].extend([sentiment] * len(synthetic_texts))
+            generated_data_with_topic['topic'].extend([topic] * len(synthetic_texts))
+            count += 1
+            if debug:
+                print(f"DEBUG - Generated synthetic data for topic: {topic}, text: {text}, sentiment: {sentiment_text}")
+                # Print percentage of completion of total texts
+                percentage_complete = count / total * 100
+                if int(percentage_complete) % 5 == 0:
+                    print(f"DEBUG - Percentage of Completion: {percentage_complete:.2f}%, {count} of {total}")
+
+
+
+        generated_df = pd.DataFrame(generated_data)
+        generated_df_with_topics = pd.DataFrame(generated_data_with_topic)
+        return generated_df, generated_df_with_topics

     # Fine-tune the model on a custom dataset
     def fine_tune(self, df, epochs=3, batch_size=16, learning_rate=2e-5):
         # Preprocess the dataset
-        df = df.rename(columns={"text": "text", "category": "label"}) # Rename the columns
-        df['label'] = df['label'].astype(int) # Ensure the labels are integers
-        train_df, test_df = train_test_split(df, test_size=0.2, random_state=42) # Split the dataset
+        df = df.rename(columns={"text": "text", "category": "label"})  # Rename the columns
+        df['label'] = df['label'].astype(int)  # Ensure the labels are integers
+        train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)  # Split the dataset

-        train_dataset = Dataset.from_pandas(train_df) # Load the dataset
+        train_dataset = Dataset.from_pandas(train_df)  # Load the dataset
         test_dataset = Dataset.from_pandas(test_df)

-        def tokenize_function(examples): # Tokenize the text
+        def tokenize_function(examples):  # Tokenize the text
             return self.tokenizer(examples["text"], padding="max_length", truncation=True)

         train_dataset = train_dataset.map(tokenize_function, batched=True)  # Tokenize the dataset
@@ -81,14 +125,14 @@ def tokenize_function(examples):  # Tokenize the text
         train_dataset = train_dataset.remove_columns(["text"])  # Remove the text column after tokenization
         test_dataset = test_dataset.remove_columns(["text"])
-        train_dataset.set_format("torch") # Set the format to PyTorch
+        train_dataset.set_format("torch")  # Set the format to PyTorch
         test_dataset.set_format("torch")

         # Define the data collator
         data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)

         # Define training arguments
-        training_args = TrainingArguments( # Define the training arguments
+        training_args = TrainingArguments(  #
             output_dir="./results",
             run_name="finetuning_sentiment_classifier",
             eval_strategy="epoch",
diff --git a/extract_stuff.py b/extract_stuff.py
index 2d779e9..706c2af 100644
--- a/extract_stuff.py
+++ b/extract_stuff.py
@@ -28,11 +28,11 @@ def augment_and_extract_metadata(dataset, extractor, topic_labels, file_path, de
         # If debug mode is enabled, print debug information
         percentage_complete = ((count + 1) / total_rows) * 100
         if debug:
-            print(f"Text: {row['text']}")
-            print(f"Generated Metadata: Topic - {topic}")
-            print(f"Percentage of Completion: {percentage_complete:.2f}%, {count + 1} of {total_rows}")
+            print(f"DEBUG - Text: {row['text']}")
+            print(f"DEBUG - Generated Metadata: Topic - {topic}")
+            print(f"DEBUG - Percentage of Completion: {percentage_complete:.2f}%, {count + 1} of {total_rows}")

-        if percentage_complete % 5 == 0:
+        if int(percentage_complete) % 5 == 0:
             print(f"Percentage of Completion: {percentage_complete:.2f}%, {count + 1} of {total_rows}")
         count += 1
@@ -68,11 +68,11 @@ def predict_sentiment(dataset, sentiment_analyzer, file_path, debug=False, batch
         # Extend the sentiments list with the batch sentiments
         sentiments.extend(batch_sentiments)
         # Calculate the percentage of completion
-        percentage_complete = ((end) / total_rows) * 100
+        percentage_complete = (end / total_rows) * 100
         if debug:
-            print(f"Processed batch {start // batch_size + 1}: {start} to {end}")
-            print(f"Percentage of Completion: {percentage_complete:.2f}%, {end} of {total_rows}")
-        if percentage_complete % 5 == 0:
+            print(f"DEBUG - Processed batch {start // batch_size + 1}: {start} to {end}")
+            print(f"DEBUG - Percentage of Completion: {percentage_complete:.2f}%, {end} of {total_rows}")
+        if int(percentage_complete) % 5 == 0:
             print(f"Percentage of Completion: {percentage_complete:.2f}%")

     dataset['sentiment'] = sentiments
diff --git a/main.py b/main.py
index cc7b16e..d811b48 100644
--- a/main.py
+++ b/main.py
@@ -1,24 +1,27 @@
 import argparse
 import os
-import wandb
+
+import matplotlib.pyplot as plt
 import pandas as pd
 import torch
 from sklearn.metrics import classification_report
+
 from DatasetLoad import DatasetLoad
 from MetadataExtractor import MetadataExtractor
 from SentimentAnalyzer import SentimentAnalyzer
 from extract_stuff import augment_and_extract_metadata, predict_sentiment

-os.environ["WANDB_API_KEY"] = "21cb0c9433eeca19401ee01e9b1bc9e4b6f7a696"

 if __name__ == "__main__":
     # Set up argument parser for command-line options
     parser = argparse.ArgumentParser(description='Load dataset')
-    parser.add_argument('--dataset_type', type=str, default='tweets', choices=['tweets', 'TODO'],
+    parser.add_argument('--dataset_type', type=str, default='tweets', choices=['tweets', 'reddit'],
                         help='Type of dataset to load')
     parser.add_argument('--debug', type=bool, default=False,
+                        help='Enable debug mode to print even more additional information')
+    parser.add_argument('--deep_debug', type=bool, default=False,
                         help='Enable debug mode to print additional information')
     parser.add_argument('--percentage', type=float, default=100.0,
                         help='Percentage of the dataset to use (e.g., 0.1 for 0.1%)')
@@ -26,6 +29,7 @@
     # Parse command-line arguments
     args = parser.parse_args()
     print("Debugging is set to: ", args.debug)
+    print("Deep Debugging is set to: ", args.deep_debug)
     print("Percentage is set to: ", args.percentage)

     # Print Torch availability and device information
@@ -35,7 +39,7 @@
     # Initialize dataset loader with the specified type and base path
     base_path = os.path.dirname(os.path.abspath(__file__))
-    dataset_loader = DatasetLoad('tweets', base_path, args.percentage)
+    dataset_loader = DatasetLoad(args.dataset_type, base_path, args.percentage)
     dataset_loader.load_datasets()

     # Load the original train, test, and validation datasets
@@ -48,25 +52,32 @@
     # Initialize the sentiment analyzer
     sentiment_analyzer = SentimentAnalyzer()

-    # Fine-tune the sentiment analyzer with the original dataset
-    fine_tuning_results = sentiment_analyzer.fine_tune(original_train_data)
-    print(f"Fine-tuning results: {fine_tuning_results}")
-
     # Extract metadata for the datasets
     base_path = os.path.dirname(os.path.abspath(__file__))
-
-    # Extract metadata for the datasets
+    model_save_path = os.path.join(base_path, f'sentiment_model_{args.dataset_type}_{args.percentage}.pt')
+    # Check if a saved model exists
+    if os.path.exists(model_save_path):
+        print("Loading the fine-tuned model from disk...")
+        sentiment_analyzer.model = torch.load(model_save_path)
+    else:
+        print("Fine-tuning the sentiment analyzer with the original dataset...")
+        fine_tuning_results = sentiment_analyzer.fine_tune(original_train_data)
+        print(f"Fine-tuning results: {fine_tuning_results}")
+        # Save the fine-tuned model
+        torch.save(sentiment_analyzer.model, model_save_path)
+
+    # Define the file names for the sentiment predictions
     train_sentiment_file_name = os.path.join(base_path, f'train_sentiment_{args.dataset_type}_{args.percentage}.csv')
     test_sentiment_file_name = os.path.join(base_path, f'test_sentiment_{args.dataset_type}_{args.percentage}.csv')
     val_sentiment_file_name = os.path.join(base_path, f'val_sentiment_{args.dataset_type}_{args.percentage}.csv')

     # Predict sentiment for the datasets
     train_data_with_sentiment = predict_sentiment(original_train_data.copy(), sentiment_analyzer,
-                                                  train_sentiment_file_name, args.debug)
+                                                  train_sentiment_file_name, args.deep_debug)
     test_data_with_sentiment = predict_sentiment(original_test_data.copy(), sentiment_analyzer,
-                                                 test_sentiment_file_name, args.debug)
+                                                 test_sentiment_file_name, args.deep_debug)
     val_data_with_sentiment = predict_sentiment(original_val_data.copy(), sentiment_analyzer, val_sentiment_file_name,
-                                                args.debug)
+                                                args.deep_debug)

     # Compute metrics for the train dataset
@@ -85,14 +96,16 @@
     val_true_labels = original_val_data['category']
     val_predicted_labels = val_data_with_sentiment['sentiment']
     print("\nValidation Classification Report:")
+    val_report = classification_report(val_true_labels, val_predicted_labels, labels=[0, 1, 2], zero_division=0,
+                                       output_dict=True)
     print(classification_report(val_true_labels, val_predicted_labels, labels=[0, 1, 2], zero_division=0))
-
-
+    val_accuracy = val_report['accuracy']
+    print(val_accuracy)
     # Initialize the metadata extractor
     extractor = MetadataExtractor()

     # Define topic labels
-    topic_labels = ["politics", "entertainment", "sports", "technology", "health", "education", "finance", "food", "other"]
+    topic_labels = ["news", "entertainment", "sports", "technology", "health", "education", "business", "lifestyle", "opinions", "other"]

     # Define the base path where main.py is located
     base_path = os.path.dirname(os.path.abspath(__file__))
@@ -103,11 +116,11 @@
     val_file_name = os.path.join(base_path, f'val_augmented_{args.dataset_type}_{args.percentage}.csv')

     train_data_with_metadata = augment_and_extract_metadata(train_data_with_sentiment.copy(), extractor,
-                                                            topic_labels, train_file_name, args.debug)
+                                                            topic_labels, train_file_name, args.deep_debug)
     test_data_with_metadata = augment_and_extract_metadata(test_data_with_sentiment.copy(), extractor,
-                                                           topic_labels, test_file_name, args.debug)
+                                                           topic_labels, test_file_name, args.deep_debug)
     val_data_with_metadata = augment_and_extract_metadata(val_data_with_sentiment.copy(), extractor,
-                                                          topic_labels, val_file_name, args.debug)
+                                                          topic_labels, val_file_name, args.deep_debug)


     # Function to create subgroups based on metadata
@@ -153,6 +166,7 @@ def compute_metrics(subgroups, true_labels_column='category', pred_labels_column
     print("\nValidation Metrics per Topic")
     print(val_metrics)

+
     # Function to analyze disparities in sentiment predictions
     def analyze_disparities(subgroups):
         analysis_results = []
@@ -168,7 +182,6 @@ def analyze_disparities(subgroups):
             })
         return pd.DataFrame(analysis_results)

-
     # Analyze disparities for the datasets
     train_analysis = analyze_disparities(train_subgroups)
     test_analysis = analyze_disparities(test_subgroups)
@@ -182,3 +195,205 @@ def analyze_disparities(subgroups):
     print("\nValidation Percentage Analysis")
     print(val_analysis)
+
+    def weighted_metrics(metrics_df, support_df, accuracy, metric='accuracy'):
+        # Join metrics with their respective support counts
+        metrics_df = metrics_df.copy()
+        metrics_df = metrics_df.merge(support_df, left_on='topic', right_on='subgroup')
+        metrics_df['weighted_metric'] = (accuracy - metrics_df[metric]) * metrics_df['support']
+        return metrics_df
+
+
+    # Function to get top and bottom topics based on weighted metrics
+    def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, k=3, accuracy=val_accuracy, metric='accuracy'):
+        # Get support for each topic
+        support_df = test_percentage_analysis_df[['subgroup', 'total']].rename(columns={'total': 'support'})
+
+        # Compute weighted metrics
+        weighted_metrics_df = weighted_metrics(test_metrics_df, support_df, accuracy, metric)
+        # print(weighted_metrics_df)
+        # Sort topics by their weighted metrics
+        sorted_metrics = weighted_metrics_df.sort_values(by='weighted_metric', ascending=False)  # Sort by descending, so the first k are the most disadvantaged
+        # print(sorted_metrics)
+        # Get top k and bottom k topics
+        bottom_3_topics = sorted_metrics[sorted_metrics['weighted_metric'] > 0].head(k)['topic'].tolist()
+        top_3_topics = sorted_metrics.tail(k)['topic'].tolist()
+        # print(bottom_3_topics)
+        return bottom_3_topics, top_3_topics
+
+
+    bottom_3_topics, top_3_topics = get_top_lower_topics(val_metrics, val_analysis, 3, val_accuracy, metric='accuracy')
+    print(f"Bottom 3 validation topics: {bottom_3_topics}")
+
+    print("Augmenting the training dataset with synthetic data...")
+    # Randomly select rows from bottom three topics in the training set
+    train_data_bottom_3 = train_data_with_metadata[train_data_with_metadata['topic'].isin(bottom_3_topics)]
+    selected_samples = train_data_bottom_3.sample(n=50, random_state=42)  # Select n samples from the bottom 3 topics
+
+    # Augment the selected samples using the sentiment analyzer
+    generated_df, generated_df_with_metadata = sentiment_analyzer.generate_training_data(
+        selected_samples['topic'].tolist(),
+        selected_samples['text'].tolist(),
+        selected_samples['sentiment'].tolist(),
+        debug=args.debug
+    )
+
+    # Combine the original and augmented datasets
+    train_original_and_generated_data = pd.concat([original_train_data, generated_df], ignore_index=True)
+    # Save the combined datasets
+    train_original_and_generated_data.to_csv(os.path.join(base_path, 'train_original_and_generated_data.csv'), index=False)
+
+    model_save_path_v2 = os.path.join(base_path, f'finetuned_sentiment_model_{args.dataset_type}_{args.percentage}.pt')
+    if os.path.exists(model_save_path_v2):
+        print("Loading the fine-tuned model from disk...")
+        sentiment_analyzer.model = torch.load(model_save_path_v2)
+    else:
+        print("Fine-tuning the sentiment analyzer with the generated+original dataset...")
+        fine_tuning_results_new = sentiment_analyzer.fine_tune(train_original_and_generated_data)  # TODO: not there yet
+        print(f"Fine-tuning results: {fine_tuning_results_new}")
+        # Save the fine-tuned model
+        torch.save(sentiment_analyzer.model, model_save_path_v2)
+
+    # Predict sentiment for the original dataset to check for improvements
+    test_sentiment_file_name_v2 = os.path.join(base_path, f'test_sentiment_v2_{args.dataset_type}_{args.percentage}.csv')
+    val_sentiment_file_name_v2 = os.path.join(base_path, f'val_sentiment_v2_{args.dataset_type}_{args.percentage}.csv')
+    test_data_with_sentiment_v2 = predict_sentiment(original_test_data.copy(), sentiment_analyzer,
+                                                    test_sentiment_file_name_v2, args.deep_debug)
+    val_data_with_sentiment_v2 = predict_sentiment(original_val_data.copy(), sentiment_analyzer,
+                                                   val_sentiment_file_name_v2, args.deep_debug)
+
+    # Compute metrics for the test dataset
+    test_true_labels = original_test_data['category']
+    test_predicted_labels_v2 = test_data_with_sentiment_v2['sentiment']
+    print("\nTest Classification Report:")
+    print(classification_report(test_true_labels, test_predicted_labels_v2, labels=[0, 1, 2], zero_division=0))
+
+    # Compute metrics for the validation dataset
+    val_true_labels = original_val_data['category']
+    val_predicted_labels_v2 = val_data_with_sentiment_v2['sentiment']
+    print("\nValidation Classification Report:")
+    print(classification_report(val_true_labels, val_predicted_labels_v2, labels=[0, 1, 2], zero_division=0))
+
+    test_file_name_v2 = os.path.join(base_path, f'test_augmented_v2_{args.dataset_type}_{args.percentage}.csv')
+    val_file_name_v2 = os.path.join(base_path, f'val_augmented_v2_{args.dataset_type}_{args.percentage}.csv')
+    test_data_with_metadata_v2 = augment_and_extract_metadata(test_data_with_sentiment_v2.copy(), extractor,
+                                                              topic_labels, test_file_name_v2, args.deep_debug)
+    val_data_with_metadata_v2 = augment_and_extract_metadata(val_data_with_sentiment_v2.copy(), extractor, topic_labels,
+                                                             val_file_name_v2, args.deep_debug)
+
+    # Create subgroups for the datasets
+    test_subgroups_v2 = create_subgroups(test_data_with_metadata_v2)
+    val_subgroups_v2 = create_subgroups(val_data_with_metadata_v2)
+
+
+    test_metrics_v2 = compute_metrics(test_subgroups_v2)
+    val_metrics_v2 = compute_metrics(val_subgroups_v2)
+
+    print("\nTest Metrics per Topic")
+    print(test_metrics_v2)
+    print("\nValidation Metrics per Topic")
+    print(val_metrics_v2)
+
+    test_analysis_v2 = analyze_disparities(test_subgroups_v2)
+    val_analysis_v2 = analyze_disparities(val_subgroups_v2)
+    print("\nTest Percentage Analysis")
+    print(test_analysis_v2)
+    print("\nValidation Percentage Analysis")
+    print(val_analysis_v2)
+
+
+    def plot_metrics_comparison(old_metrics, new_metrics, metric='accuracy'):
+        """
+        Plots a comparison of the given metric before and after fine-tuning.
+
+        Parameters:
+        - old_metrics: DataFrame containing the old metrics
+        - new_metrics: DataFrame containing the new metrics
+        - metric: The metric to compare (default is 'accuracy')
+        """
+        # Merge the old and new metrics on the 'topic' column
+        comparison_df = old_metrics.merge(new_metrics, on='topic', suffixes=('_old', '_new'))
+
+        # Sort the DataFrame by the new metric for better visualization
+        comparison_df = comparison_df.sort_values(by=f'{metric}_new', ascending=False)
+
+        # Plot the comparison
+        plt.figure(figsize=(12, 8))
+        bar_width = 0.4
+
+        # Positioning the bars
+        r1 = range(len(comparison_df))
+        r2 = [x + bar_width for x in r1]
+
+        plt.bar(r1, comparison_df[f'{metric}_old'], color='blue', width=bar_width, edgecolor='grey', label='Old')
+        plt.bar(r2, comparison_df[f'{metric}_new'], color='green', width=bar_width, edgecolor='grey', label='New')
+
+        plt.xlabel('Topics', fontweight='bold')
+        plt.ylabel(metric.capitalize(), fontweight='bold')
+        plt.title(f'Comparison of {metric.capitalize()} by Topic', fontweight='bold')
+        plt.xticks([r + bar_width / 2 for r in range(len(comparison_df))], comparison_df['topic'], rotation=90)
+        plt.legend()
+
+        plt.tight_layout()
+        plt.savefig(f'comparison_{metric}.png')
+        plt.close()
+
+    # Plot the comparison for accuracy, precision, recall, and f1-score
+    plot_metrics_comparison(test_metrics, test_metrics_v2, metric='accuracy')
+    plot_metrics_comparison(test_metrics, test_metrics_v2, metric='precision')
+    plot_metrics_comparison(test_metrics, test_metrics_v2, metric='recall')
+    plot_metrics_comparison(test_metrics, test_metrics_v2, metric='f1-score')
+
+    def calculate_overall_accuracy(metrics_df):
+        """
+        Calculate the overall accuracy from the metrics DataFrame.
+
+        Parameters:
+        - metrics_df: DataFrame containing the metrics
+
+        Returns:
+        - overall_accuracy: The overall accuracy
+        """
+        total_support = metrics_df['total'].sum()
+        weighted_accuracy_sum = (metrics_df['accuracy'] * metrics_df['total']).sum()
+        overall_accuracy = weighted_accuracy_sum / total_support
+        return overall_accuracy
+
+    def plot_overall_accuracy_comparison(old_metrics, new_metrics, support_old, support_new):
+        """
+        Plot the overall accuracy comparison before and after fine-tuning.
+
+        Parameters:
+        - old_metrics: DataFrame containing the old metrics
+        - new_metrics: DataFrame containing the new metrics
+        - support_old: DataFrame containing the support data for the old metrics
+        - support_new: DataFrame containing the support data for the new metrics
+        """
+        old_metrics_with_total = old_metrics.merge(support_old[['subgroup', 'total']], left_on='topic',
+                                                   right_on='subgroup')
+        new_metrics_with_total = new_metrics.merge(support_new[['subgroup', 'total']], left_on='topic',
+                                                   right_on='subgroup')
+
+        overall_accuracy_old = calculate_overall_accuracy(old_metrics_with_total)
+        overall_accuracy_new = calculate_overall_accuracy(new_metrics_with_total)
+
+        accuracies = [overall_accuracy_old, overall_accuracy_new]
+        labels = ['Old Model', 'New Model']
+
+        plt.figure(figsize=(8, 6))
+        plt.bar(labels, accuracies, color=['blue', 'green'], edgecolor='grey')
+
+        plt.xlabel('Model', fontweight='bold')
+        plt.ylabel('Overall Accuracy', fontweight='bold')
+        plt.title('Overall Accuracy Comparison', fontweight='bold')
+        plt.ylim(0, 1)
+
+        for i, v in enumerate(accuracies):
+            plt.text(i, v + 0.01, f"{v:.2f}", ha='center', fontweight='bold')
+
+        plt.tight_layout()
+        plt.savefig('overall_accuracy_comparison.png')
+        plt.close()
+
+    # Calculate and plot the overall accuracy comparison
+    plot_overall_accuracy_comparison(test_metrics, test_metrics_v2, test_analysis, test_analysis_v2)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 9a3e5b1..11143d8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,4 +4,4 @@ scikit_learn==1.2.0
 transformers==4.42.4
 tensorflow
 gdown
-
+matplotlib