diff --git a/DatasetLoad.py b/DatasetLoad.py
index dc3f877..059a032 100644
--- a/DatasetLoad.py
+++ b/DatasetLoad.py
@@ -43,7 +43,8 @@ def load_datasets(self):
             data = data.rename(columns={'clean_comment': 'text'})
             # truncate the text in the text column with over 512 characters
             data['text'] = data['text'].str.slice(0, 512)
-
+            data['category'] = data['category'].map({-1: 0, 0: 1, 1: 2})
+            data = data.dropna()
         elif self.dataset_type == 'tweets':
             print("Loading Twitter dataset...")
@@ -57,6 +58,7 @@ def load_datasets(self):
             data = data.rename(columns={'Tweet': 'text'})
             # remove the rows of the text column in which the text is "Not Available"
             data = data[data['text'] != 'Not Available']
+            data = data.dropna()

         # Ensure the first column is 'text' and the second column is 'category'
         data = data[['text', 'category'] + [col for col in data.columns if col not in ['text', 'category']]]
diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py
index 66546c3..e726486 100644
--- a/SentimentAnalyzer.py
+++ b/SentimentAnalyzer.py
@@ -1,7 +1,9 @@
 import pandas as pd
 import torch
 from sklearn.model_selection import train_test_split
-from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, pipeline
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, \
+    DataCollatorWithPadding, pipeline, AutoModelForSeq2SeqLM
+
 from datasets import Dataset
@@ -10,9 +12,15 @@ def __init__(self):
         self.model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
         self.device = 0 if torch.cuda.is_available() else -1  # Use GPU if available
         self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
-        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, ignore_mismatched_sizes=True).to(self.device)
+        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name,
+                                                                        ignore_mismatched_sizes=True).to(self.device)
         self.classifier = pipeline("sentiment-analysis", model=self.model, tokenizer=self.tokenizer, device=self.device)

+        # Initialize FLAN model for synthetic data generation
+        self.flan_model_name = "google/flan-t5-small"
+        self.flan_tokenizer = AutoTokenizer.from_pretrained(self.flan_model_name)
+        self.flan_model = AutoModelForSeq2SeqLM.from_pretrained(self.flan_model_name).to(self.device)
+
     def analyze_sentiment(self, text):
         results = self.classifier(text)
         return results[0]['label']
@@ -28,51 +36,87 @@ def map_label_to_target(self, label):
         else:
             return None

-    # Generate synthetic data using LLMs to be defined
-    def generate_synthetic_data(self, topic, n_samples):
-        openai.api_key = 'YOUR_API_KEY'
+    def map_target_to_label(self, target):
+        # Map the target value to the sentiment label
+        if target == 0:
+            return "negative"
+        elif target == 1:
+            return "neutral"
+        elif target == 2:
+            return "positive"
+        else:
+            return None
+
+    # Generate synthetic data using the FLAN model
+    def generate_synthetic_data(self, topic, text, sentiment, n_samples, debug=False):
         synthetic_data = []
+        # print(f"Generating synthetic data for topic: {topic}, text: {text}, sentiment: {sentiment}")
+        count = 0
         for _ in range(n_samples):
-            prompt = f"Generate six tweets related to {topic} that expresses sentiment."
-            response = openai.Completion.create(
-                engine="text-davinci-003",
-                prompt=prompt,
-                max_tokens=60
+            prompt = f"Generate a tweet related to {topic} that expresses a {sentiment} sentiment and the tweet has to be semantically similar to: '{text}' "
+            inputs = self.flan_tokenizer(prompt, return_tensors="pt").to(self.device)
+            # Use top-k sampling and temperature sampling for more diverse outputs
+            outputs = self.flan_model.generate(
+                inputs.input_ids,
+                max_length=60,
+                num_return_sequences=1,
+                do_sample=True,
+                top_k=50,
+                temperature=0.7
             )
-            synthetic_data.append(response.choices[0].text.strip())
+            generated_text = self.flan_tokenizer.decode(outputs[0], skip_special_tokens=True)
+            synthetic_data.append(generated_text)
+            count += 1
+            if debug:
+                print(f"DEBUG - Generated Text: {generated_text}")
+                print(f"DEBUG - Percentage of Completion: {count / n_samples * 100:.2f}%, {count} of {n_samples}")
+            if int(count % 5) == 0:
+                print(f"Percentage of Completion: {count / n_samples * 100:.2f}%, {count} of {n_samples}")
         return synthetic_data

-    def augment_training_data(self, topics, n_samples=100):
-        augmented_data = {'text': [], 'label': []}
-        augmented_data_with_topics = {'text': [], 'label': [], 'topic': []}
-        for topic in topics:
-            synthetic_texts = self.generate_synthetic_data(topic, n_samples)
-            # Assuming the sentiment label for generated data
-            augmented_data['text'].extend(synthetic_texts)
-            augmented_data['label'].extend([1] * len(synthetic_texts))  # Defaulting to neutral
-            augmented_data_with_topics['text'].extend(synthetic_texts)
-            augmented_data_with_topics['label'].extend([1] * len(synthetic_texts))
-            augmented_data_with_topics['topic'].extend([topic] * len(synthetic_texts))
-
-        augmented_df = pd.DataFrame(augmented_data)
-        augmented_df_with_topics = pd.DaataFrame(augmented_data_with_topics)
-        return augmented_df, augmented_df_with_topics
-
-    def fine_tune_with_augmented_data(self, topics, n_samples=100, epochs=3, batch_size=16, learning_rate=2e-5):
-        augmented_train_data, augmented_train_data_with_topics = self.augment_training_data(topics, n_samples)
-        return self.fine_tune(augmented_train_data, epochs, batch_size, learning_rate), augmented_train_data_with_topics
+    # Augment the training data with synthetic data
+    def generate_training_data(self, topics, texts, sentiments, n_samples=50, debug=False):
+        print("Generating synthetic data...")
+        generated_data = {'text': [], 'category': []}
+        generated_data_with_topic = {'text': [], 'category': [], 'topic': []}
+
+        count = 0
+        total = len(texts)
+        for topic, text, sentiment in zip(topics, texts, sentiments):
+            sentiment_text = self.map_target_to_label(sentiment)
+            synthetic_texts = self.generate_synthetic_data(topic, text, sentiment_text, n_samples,
+                                                           debug)  # List of synthetic texts
+            generated_data['text'].extend(synthetic_texts)
+            generated_data['category'].extend(
+                [sentiment] * len(synthetic_texts))  # append sentiment to texts many times
+            generated_data_with_topic['text'].extend(synthetic_texts)
+            generated_data_with_topic['category'].extend([sentiment] * len(synthetic_texts))
+            generated_data_with_topic['topic'].extend([topic] * len(synthetic_texts))
+            count += 1
+            if debug:
+                print(f"DEBUG - Generated synthetic data for topic: {topic}, text: {text}, sentiment: {sentiment_text}")
+                # Print percentage of completion of total texts
+                percentage_complete = count / total * 100
+                if int(percentage_complete) % 5 == 0:
+                    print(f"DEBUG - Percentage of Completion: {percentage_complete:.2f}%, {count} of {total}")
+
+
+
+        generated_df = pd.DataFrame(generated_data)
+        generated_df_with_topics = pd.DataFrame(generated_data_with_topic)
+        return generated_df, generated_df_with_topics

     # Fine-tune the model on a custom dataset
     def fine_tune(self, df, epochs=3, batch_size=16, learning_rate=2e-5):
         # Preprocess the dataset
-        df = df.rename(columns={"text": "text", "category": "label"}) # Rename the columns
-        df['label'] = df['label'].astype(int) # Ensure the labels are integers
-        train_df, test_df = train_test_split(df, test_size=0.2, random_state=42) # Split the dataset
+        df = df.rename(columns={"text": "text", "category": "label"})  # Rename the columns
+        df['label'] = df['label'].astype(int)  # Ensure the labels are integers
+        train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)  # Split the dataset

-        train_dataset = Dataset.from_pandas(train_df) # Load the dataset
+        train_dataset = Dataset.from_pandas(train_df)  # Load the dataset
         test_dataset = Dataset.from_pandas(test_df)

-        def tokenize_function(examples): # Tokenize the text
+        def tokenize_function(examples):  # Tokenize the text
             return self.tokenizer(examples["text"], padding="max_length", truncation=True)

         train_dataset = train_dataset.map(tokenize_function, batched=True)  # Tokenize the dataset
@@ -81,14 +125,14 @@ def tokenize_function(examples):  # Tokenize the text
         train_dataset = train_dataset.remove_columns(["text"])  # Remove the text column after tokenization
         test_dataset = test_dataset.remove_columns(["text"])
-        train_dataset.set_format("torch") # Set the format to PyTorch
+        train_dataset.set_format("torch")  # Set the format to PyTorch
         test_dataset.set_format("torch")

         # Define the data collator
         data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)

         # Define training arguments
-        training_args = TrainingArguments( # Define the training arguments
+        training_args = TrainingArguments(  #
             output_dir="./results",
             run_name="finetuning_sentiment_classifier",
             eval_strategy="epoch",
diff --git a/extract_stuff.py b/extract_stuff.py
index 2d779e9..706c2af 100644
--- a/extract_stuff.py
+++ b/extract_stuff.py
@@ -28,11 +28,11 @@ def augment_and_extract_metadata(dataset, extractor, topic_labels, file_path, de
         # If debug mode is enabled, print debug information
         percentage_complete = ((count + 1) / total_rows) * 100
         if debug:
-            print(f"Text: {row['text']}")
-            print(f"Generated Metadata: Topic - {topic}")
-            print(f"Percentage of Completion: {percentage_complete:.2f}%, {count + 1} of {total_rows}")
+            print(f"DEBUG - Text: {row['text']}")
+            print(f"DEBUG - Generated Metadata: Topic - {topic}")
+            print(f"DEBUG - Percentage of Completion: {percentage_complete:.2f}%, {count + 1} of {total_rows}")

-        if percentage_complete % 5 == 0:
+        if int(percentage_complete) % 5 == 0:
             print(f"Percentage of Completion: {percentage_complete:.2f}%, {count + 1} of {total_rows}")
         count += 1
@@ -68,11 +68,11 @@ def predict_sentiment(dataset, sentiment_analyzer, file_path, debug=False, batch
         # Extend the sentiments list with the batch sentiments
         sentiments.extend(batch_sentiments)
         # Calculate the percentage of completion
-        percentage_complete = ((end) / total_rows) * 100
+        percentage_complete = (end / total_rows) * 100
         if debug:
-            print(f"Processed batch {start // batch_size + 1}: {start} to {end}")
-            print(f"Percentage of Completion: {percentage_complete:.2f}%, {end} of {total_rows}")
-        if percentage_complete % 5 == 0:
+            print(f"DEBUG - Processed batch {start // batch_size + 1}: {start} to {end}")
+            print(f"DEBUG - Percentage of Completion: {percentage_complete:.2f}%, {end} of {total_rows}")
+        if int(percentage_complete) % 5 == 0:
             print(f"Percentage of Completion: {percentage_complete:.2f}%")

     dataset['sentiment'] = sentiments
diff --git a/main.py b/main.py
index cc7b16e..d811b48 100644
--- a/main.py
+++ b/main.py
@@ -1,24 +1,27 @@
 import argparse
 import os
-import wandb
+
+import matplotlib.pyplot as plt
 import pandas as pd
 import torch
 from sklearn.metrics import classification_report
+
 from DatasetLoad import DatasetLoad
 from MetadataExtractor import MetadataExtractor
 from SentimentAnalyzer import SentimentAnalyzer
 from extract_stuff import augment_and_extract_metadata, predict_sentiment

-os.environ["WANDB_API_KEY"] = "21cb0c9433eeca19401ee01e9b1bc9e4b6f7a696"

 if __name__ == "__main__":
     # Set up argument parser for command-line options
     parser = argparse.ArgumentParser(description='Load dataset')
-    parser.add_argument('--dataset_type', type=str, default='tweets', choices=['tweets', 'TODO'],
+    parser.add_argument('--dataset_type', type=str, default='tweets', choices=['tweets', 'reddit'],
                         help='Type of dataset to load')
     parser.add_argument('--debug', type=bool, default=False,
+                        help='Enable debug mode to print even more additional information')
+    parser.add_argument('--deep_debug', type=bool, default=False,
                         help='Enable debug mode to print additional information')
     parser.add_argument('--percentage', type=float, default=100.0,
                         help='Percentage of the dataset to use (e.g., 0.1 for 0.1%)')
@@ -26,6 +29,7 @@
     # Parse command-line arguments
     args = parser.parse_args()
     print("Debugging is set to: ", args.debug)
+    print("Deep Debugging is set to: ", args.deep_debug)
     print("Percentage is set to: ", args.percentage)

     # Print Torch availability and device information
@@ -35,7 +39,7 @@
     # Initialize dataset loader with the specified type and base path
     base_path = os.path.dirname(os.path.abspath(__file__))
-    dataset_loader = DatasetLoad('tweets', base_path, args.percentage)
+    dataset_loader = DatasetLoad(args.dataset_type, base_path, args.percentage)
     dataset_loader.load_datasets()

     # Load the original train, test, and validation datasets
@@ -48,25 +52,32 @@
     # Initialize the sentiment analyzer
     sentiment_analyzer = SentimentAnalyzer()

-    # Fine-tune the sentiment analyzer with the original dataset
-    fine_tuning_results = sentiment_analyzer.fine_tune(original_train_data)
-    print(f"Fine-tuning results: {fine_tuning_results}")
-
     # Extract metadata for the datasets
     base_path = os.path.dirname(os.path.abspath(__file__))
-
-    # Extract metadata for the datasets
+    model_save_path = os.path.join(base_path, f'sentiment_model_{args.dataset_type}_{args.percentage}.pt')
+    # Check if a saved model exists
+    if os.path.exists(model_save_path):
+        print("Loading the fine-tuned model from disk...")
+        sentiment_analyzer.model = torch.load(model_save_path)
+    else:
+        print("Fine-tuning the sentiment analyzer with the original dataset...")
+        fine_tuning_results = sentiment_analyzer.fine_tune(original_train_data)
+        print(f"Fine-tuning results: {fine_tuning_results}")
+        # Save the fine-tuned model
+        torch.save(sentiment_analyzer.model, model_save_path)
+
+    # Define the file names for the sentiment predictions
     train_sentiment_file_name = os.path.join(base_path, f'train_sentiment_{args.dataset_type}_{args.percentage}.csv')
     test_sentiment_file_name = os.path.join(base_path, f'test_sentiment_{args.dataset_type}_{args.percentage}.csv')
     val_sentiment_file_name = os.path.join(base_path, f'val_sentiment_{args.dataset_type}_{args.percentage}.csv')

     # Predict sentiment for the datasets
     train_data_with_sentiment = predict_sentiment(original_train_data.copy(), sentiment_analyzer,
-                                                  train_sentiment_file_name, args.debug)
+                                                  train_sentiment_file_name, args.deep_debug)
     test_data_with_sentiment = predict_sentiment(original_test_data.copy(), sentiment_analyzer,
-                                                 test_sentiment_file_name, args.debug)
+                                                 test_sentiment_file_name, args.deep_debug)
     val_data_with_sentiment = predict_sentiment(original_val_data.copy(), sentiment_analyzer, val_sentiment_file_name,
-                                                args.debug)
+                                                args.deep_debug)

     # Compute metrics for the train dataset
@@ -85,14 +96,16 @@
     val_true_labels = original_val_data['category']
     val_predicted_labels = val_data_with_sentiment['sentiment']
     print("\nValidation Classification Report:")
+    val_report = classification_report(val_true_labels, val_predicted_labels, labels=[0, 1, 2], zero_division=0,
+                                       output_dict=True)
     print(classification_report(val_true_labels, val_predicted_labels, labels=[0, 1, 2], zero_division=0))
-
-
+    val_accuracy = val_report['accuracy']
+    print(val_accuracy)
     # Initialize the metadata extractor
     extractor = MetadataExtractor()

     # Define topic labels
-    topic_labels = ["politics", "entertainment", "sports", "technology", "health", "education", "finance", "food", "other"]
+    topic_labels = ["news", "entertainment", "sports", "technology", "health", "education", "business", "lifestyle", "opinions", "other"]

     # Define the base path where main.py is located
     base_path = os.path.dirname(os.path.abspath(__file__))
@@ -103,11 +116,11 @@
     val_file_name = os.path.join(base_path, f'val_augmented_{args.dataset_type}_{args.percentage}.csv')

     train_data_with_metadata = augment_and_extract_metadata(train_data_with_sentiment.copy(), extractor,
-                                                            topic_labels, train_file_name, args.debug)
+                                                            topic_labels, train_file_name, args.deep_debug)
     test_data_with_metadata = augment_and_extract_metadata(test_data_with_sentiment.copy(), extractor,
-                                                           topic_labels, test_file_name, args.debug)
+                                                           topic_labels, test_file_name, args.deep_debug)
     val_data_with_metadata = augment_and_extract_metadata(val_data_with_sentiment.copy(), extractor,
-                                                          topic_labels, val_file_name, args.debug)
+                                                          topic_labels, val_file_name, args.deep_debug)


     # Function to create subgroups based on metadata
@@ -153,6 +166,7 @@ def compute_metrics(subgroups, true_labels_column='category', pred_labels_column
     print("\nValidation Metrics per Topic")
     print(val_metrics)

+
     # Function to analyze disparities in sentiment predictions
     def analyze_disparities(subgroups):
         analysis_results = []
@@ -168,7 +182,6 @@ def analyze_disparities(subgroups):
             })
         return pd.DataFrame(analysis_results)

-
     # Analyze disparities for the datasets
     train_analysis = analyze_disparities(train_subgroups)
     test_analysis = analyze_disparities(test_subgroups)
@@ -182,3 +195,205 @@ def analyze_disparities(subgroups):
     print("\nValidation Percentage Analysis")
     print(val_analysis)
+
+    def weighted_metrics(metrics_df, support_df, accuracy, metric='accuracy'):
+        # Join metrics with their respective support counts
+        metrics_df = metrics_df.copy()
+        metrics_df = metrics_df.merge(support_df, left_on='topic', right_on='subgroup')
+        metrics_df['weighted_metric'] = (accuracy - metrics_df[metric]) * metrics_df['support']
+        return metrics_df
+
+
+    # Function to get top and bottom topics based on weighted metrics
+    def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, k=3, accuracy=val_accuracy, metric='accuracy'):
+        # Get support for each topic
+        support_df = test_percentage_analysis_df[['subgroup', 'total']].rename(columns={'total': 'support'})
+
+        # Compute weighted metrics
+        weighted_metrics_df = weighted_metrics(test_metrics_df, support_df, accuracy, metric)
+        # print(weighted_metrics_df)
+        # Sort topics by their weighted metrics
+        sorted_metrics = weighted_metrics_df.sort_values(by='weighted_metric', ascending=False)  # Sort by descending, so the first k are the most disadvantaged
+        # print(sorted_metrics)
+        # Get top k and bottom k topics
+        bottom_3_topics = sorted_metrics[sorted_metrics['weighted_metric'] > 0].head(k)['topic'].tolist()
+        top_3_topics = sorted_metrics.tail(k)['topic'].tolist()
+        # print(bottom_3_topics)
+        return bottom_3_topics, top_3_topics
+
+
+    bottom_3_topics, top_3_topics = get_top_lower_topics(val_metrics, val_analysis, 3, val_accuracy, metric='accuracy')
+    print(f"Bottom 3 validation topics: {bottom_3_topics}")
+
+    print("Augmenting the training dataset with synthetic data...")
+    # Randomly select rows from bottom three topics in the training set
+    train_data_bottom_3 = train_data_with_metadata[train_data_with_metadata['topic'].isin(bottom_3_topics)]
+    selected_samples = train_data_bottom_3.sample(n=50, random_state=42)  # Select n samples from the bottom 3 topics
+
+    # Augment the selected samples using the sentiment analyzer
+    generated_df, generated_df_with_metadata = sentiment_analyzer.generate_training_data(
+        selected_samples['topic'].tolist(),
+        selected_samples['text'].tolist(),
+        selected_samples['sentiment'].tolist(),
+        debug=args.debug
+    )
+
+    # Combine the original and augmented datasets
+    train_original_and_generated_data = pd.concat([original_train_data, generated_df], ignore_index=True)
+    # Save the combined datasets
+    train_original_and_generated_data.to_csv(os.path.join(base_path, 'train_original_and_generated_data.csv'), index=False)
+
+    model_save_path_v2 = os.path.join(base_path, f'finetuned_sentiment_model_{args.dataset_type}_{args.percentage}.pt')
+    if os.path.exists(model_save_path_v2):
+        print("Loading the fine-tuned model from disk...")
+        sentiment_analyzer.model = torch.load(model_save_path_v2)
+    else:
+        print("Fine-tuning the sentiment analyzer with the generated+original dataset...")
+        fine_tuning_results_new = sentiment_analyzer.fine_tune(train_original_and_generated_data)  # TODO: not there yet
+        print(f"Fine-tuning results: {fine_tuning_results_new}")
+        # Save the fine-tuned model
+        torch.save(sentiment_analyzer.model, model_save_path_v2)
+
+    # Predict sentiment for the original dataset to check for improvements
+    test_sentiment_file_name_v2 = os.path.join(base_path, f'test_sentiment_v2_{args.dataset_type}_{args.percentage}.csv')
+    val_sentiment_file_name_v2 = os.path.join(base_path, f'val_sentiment_v2_{args.dataset_type}_{args.percentage}.csv')
+    test_data_with_sentiment_v2 = predict_sentiment(original_test_data.copy(), sentiment_analyzer,
+                                                    test_sentiment_file_name_v2, args.deep_debug)
+    val_data_with_sentiment_v2 = predict_sentiment(original_val_data.copy(), sentiment_analyzer,
+                                                   val_sentiment_file_name_v2, args.deep_debug)
+
+    # Compute metrics for the test dataset
+    test_true_labels = original_test_data['category']
+    test_predicted_labels_v2 = test_data_with_sentiment_v2['sentiment']
+    print("\nTest Classification Report:")
+    print(classification_report(test_true_labels, test_predicted_labels_v2, labels=[0, 1, 2], zero_division=0))
+
+    # Compute metrics for the validation dataset
+    val_true_labels = original_val_data['category']
+    val_predicted_labels_v2 = val_data_with_sentiment_v2['sentiment']
+    print("\nValidation Classification Report:")
+    print(classification_report(val_true_labels, val_predicted_labels_v2, labels=[0, 1, 2], zero_division=0))
+
+    test_file_name_v2 = os.path.join(base_path, f'test_augmented_v2_{args.dataset_type}_{args.percentage}.csv')
+    val_file_name_v2 = os.path.join(base_path, f'val_augmented_v2_{args.dataset_type}_{args.percentage}.csv')
+    test_data_with_metadata_v2 = augment_and_extract_metadata(test_data_with_sentiment_v2.copy(), extractor,
+                                                              topic_labels, test_file_name_v2, args.deep_debug)
+    val_data_with_metadata_v2 = augment_and_extract_metadata(val_data_with_sentiment_v2.copy(), extractor, topic_labels,
+                                                             val_file_name_v2, args.deep_debug)
+
+    # Create subgroups for the datasets
+    test_subgroups_v2 = create_subgroups(test_data_with_metadata_v2)
+    val_subgroups_v2 = create_subgroups(val_data_with_metadata_v2)
+
+
+    test_metrics_v2 = compute_metrics(test_subgroups_v2)
+    val_metrics_v2 = compute_metrics(val_subgroups_v2)
+
+    print("\nTest Metrics per Topic")
+    print(test_metrics_v2)
+    print("\nValidation Metrics per Topic")
+    print(val_metrics_v2)
+
+    test_analysis_v2 = analyze_disparities(test_subgroups_v2)
+    val_analysis_v2 = analyze_disparities(val_subgroups_v2)
+    print("\nTest Percentage Analysis")
+    print(test_analysis_v2)
+    print("\nValidation Percentage Analysis")
+    print(val_analysis_v2)
+
+
+    def plot_metrics_comparison(old_metrics, new_metrics, metric='accuracy'):
+        """
+        Plots a comparison of the given metric before and after fine-tuning.
+
+        Parameters:
+        - old_metrics: DataFrame containing the old metrics
+        - new_metrics: DataFrame containing the new metrics
+        - metric: The metric to compare (default is 'accuracy')
+        """
+        # Merge the old and new metrics on the 'topic' column
+        comparison_df = old_metrics.merge(new_metrics, on='topic', suffixes=('_old', '_new'))
+
+        # Sort the DataFrame by the new metric for better visualization
+        comparison_df = comparison_df.sort_values(by=f'{metric}_new', ascending=False)
+
+        # Plot the comparison
+        plt.figure(figsize=(12, 8))
+        bar_width = 0.4
+
+        # Positioning the bars
+        r1 = range(len(comparison_df))
+        r2 = [x + bar_width for x in r1]
+
+        plt.bar(r1, comparison_df[f'{metric}_old'], color='blue', width=bar_width, edgecolor='grey', label='Old')
+        plt.bar(r2, comparison_df[f'{metric}_new'], color='green', width=bar_width, edgecolor='grey', label='New')
+
+        plt.xlabel('Topics', fontweight='bold')
+        plt.ylabel(metric.capitalize(), fontweight='bold')
+        plt.title(f'Comparison of {metric.capitalize()} by Topic', fontweight='bold')
+        plt.xticks([r + bar_width / 2 for r in range(len(comparison_df))], comparison_df['topic'], rotation=90)
+        plt.legend()
+
+        plt.tight_layout()
+        plt.savefig(f'comparison_{metric}.png')
+        plt.close()
+
+    # Plot the comparison for accuracy, precision, recall, and f1-score
+    plot_metrics_comparison(test_metrics, test_metrics_v2, metric='accuracy')
+    plot_metrics_comparison(test_metrics, test_metrics_v2, metric='precision')
+    plot_metrics_comparison(test_metrics, test_metrics_v2, metric='recall')
+    plot_metrics_comparison(test_metrics, test_metrics_v2, metric='f1-score')
+
+    def calculate_overall_accuracy(metrics_df):
+        """
+        Calculate the overall accuracy from the metrics DataFrame.
+
+        Parameters:
+        - metrics_df: DataFrame containing the metrics
+
+        Returns:
+        - overall_accuracy: The overall accuracy
+        """
+        total_support = metrics_df['total'].sum()
+        weighted_accuracy_sum = (metrics_df['accuracy'] * metrics_df['total']).sum()
+        overall_accuracy = weighted_accuracy_sum / total_support
+        return overall_accuracy
+
+    def plot_overall_accuracy_comparison(old_metrics, new_metrics, support_old, support_new):
+        """
+        Plot the overall accuracy comparison before and after fine-tuning.
+
+        Parameters:
+        - old_metrics: DataFrame containing the old metrics
+        - new_metrics: DataFrame containing the new metrics
+        - support_old: DataFrame containing the support data for the old metrics
+        - support_new: DataFrame containing the support data for the new metrics
+        """
+        old_metrics_with_total = old_metrics.merge(support_old[['subgroup', 'total']], left_on='topic',
+                                                   right_on='subgroup')
+        new_metrics_with_total = new_metrics.merge(support_new[['subgroup', 'total']], left_on='topic',
+                                                   right_on='subgroup')
+
+        overall_accuracy_old = calculate_overall_accuracy(old_metrics_with_total)
+        overall_accuracy_new = calculate_overall_accuracy(new_metrics_with_total)
+
+        accuracies = [overall_accuracy_old, overall_accuracy_new]
+        labels = ['Old Model', 'New Model']
+
+        plt.figure(figsize=(8, 6))
+        plt.bar(labels, accuracies, color=['blue', 'green'], edgecolor='grey')
+
+        plt.xlabel('Model', fontweight='bold')
+        plt.ylabel('Overall Accuracy', fontweight='bold')
+        plt.title('Overall Accuracy Comparison', fontweight='bold')
+        plt.ylim(0, 1)
+
+        for i, v in enumerate(accuracies):
+            plt.text(i, v + 0.01, f"{v:.2f}", ha='center', fontweight='bold')
+
+        plt.tight_layout()
+        plt.savefig('overall_accuracy_comparison.png')
+        plt.close()
+
+    # Calculate and plot the overall accuracy comparison
+    plot_overall_accuracy_comparison(test_metrics, test_metrics_v2, test_analysis, test_analysis_v2)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 9a3e5b1..11143d8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,4 +4,4 @@ scikit_learn==1.2.0
 transformers==4.42.4
 tensorflow
 gdown
-
+matplotlib