diff --git a/SentimentAnalyzer.py b/SentimentAnalyzer.py
index 3ba9697..804bfd5 100644
--- a/SentimentAnalyzer.py
+++ b/SentimentAnalyzer.py
@@ -1,8 +1,8 @@
-import pandas as pd
+import numpy as np
 import torch
-from sklearn.model_selection import train_test_split
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, pipeline
-from datasets import Dataset
+
+from datasets import Dataset, load_metric
 
 
 class SentimentAnalyzer:
@@ -10,7 +10,7 @@ def __init__(self):
         self.model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
         self.device = 0 if torch.cuda.is_available() else -1  # Use GPU if available
         self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
-        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, ignore_mismatched_sizes=True).to(self.device)
+        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name).to(self.device)
         self.classifier = pipeline("sentiment-analysis", model=self.model, tokenizer=self.tokenizer, device=self.device)
 
     def analyze_sentiment(self, text):
@@ -18,7 +18,6 @@ def analyze_sentiment(self, text):
         return results[0]['label']
 
     def map_label_to_target(self, label):
-        # Map the sentiment label to the target value
         if label == "negative" or label == "Negative":
             return -1
         elif label == "neutral" or label == "Neutral":
@@ -28,84 +27,49 @@
         else:
             return None
 
-    # Generate synthetic data using LLMs to be defined
-    def generate_synthetic_data(self, topic, n_samples):
-        openai.api_key = 'YOUR_API_KEY'
-        synthetic_data = []
-        for _ in range(n_samples):
-            prompt = f"Generate six tweets related to {topic} that expresses sentiment."
-            response = openai.Completion.create(
-                engine="text-davinci-003",
-                prompt=prompt,
-                max_tokens=60
-            )
-            synthetic_data.append(response.choices[0].text.strip())
-        return synthetic_data
-
-    def augment_training_data(self, topics, n_samples=100):
-        augmented_data = {'text': [], 'label': []}
-        augmented_data_with_topics = {'text': [], 'label': [], 'topic': []}
-        for topic in topics:
-            synthetic_texts = self.generate_synthetic_data(topic, n_samples)
-            # Assuming the sentiment label for generated data
-            augmented_data['text'].extend(synthetic_texts)
-            augmented_data['label'].extend([1] * len(synthetic_texts))  # Defaulting to neutral
-            augmented_data_with_topics['text'].extend(synthetic_texts)
-            augmented_data_with_topics['label'].extend([1] * len(synthetic_texts))
-            augmented_data_with_topics['topic'].extend([topic] * len(synthetic_texts))
-
-        augmented_df = pd.DataFrame(augmented_data)
-        augmented_df_with_topics = pd.DaataFrame(augmented_data_with_topics)
-        return augmented_df, augmented_df_with_topics
-
-    def fine_tune_with_augmented_data(self, topics, n_samples=100, epochs=3, batch_size=16, learning_rate=2e-5):
-        augmented_train_data, augmented_train_data_with_topics = self.augment_training_data(topics, n_samples)
-        return self.fine_tune(augmented_train_data, epochs, batch_size, learning_rate), augmented_train_data_with_topics
+    def fine_tune(self, train_data):
+        # Ensure the training data includes labels
+        if 'label' not in train_data.columns:
+            train_data['label'] = train_data['category'].apply(self.map_label_to_target)
 
-    # Fine-tune the model on a custom dataset
-    def fine_tune(self, df, epochs=3, batch_size=16, learning_rate=2e-5):
-        # Preprocess the dataset
-        df = df.rename(columns={"clean_text": "text", "category": "label"})  # Rename the columns
-        train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)  # Split the dataset
+        # Tokenize the data
+        def tokenize_function(examples):
+            return self.tokenizer(examples['text'], padding="max_length", truncation=True)
 
-        train_dataset = Dataset.from_pandas(train_df)  # Load the dataset
-        test_dataset = Dataset.from_pandas(test_df)
+        train_dataset = Dataset.from_pandas(train_data)
+        tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
 
-        def tokenize_function(examples):  # Tokenize the text
-            return self.tokenizer(examples["text"], padding="max_length", truncation=True)
+        # Remove columns not required for training
+        tokenized_train_dataset = tokenized_train_dataset.remove_columns(["text", "category", "__index_level_0__"])
 
-        train_dataset = train_dataset.map(tokenize_function, batched=True)  # Tokenize the dataset
-        test_dataset = test_dataset.map(tokenize_function, batched=True)
-
-        train_dataset = train_dataset.remove_columns(["text"])  # Remove the text column after tokenization
-        test_dataset = test_dataset.remove_columns(["text"])
-
-        train_dataset.set_format("torch")  # Set the format to PyTorch
-        test_dataset.set_format("torch")
-
-        # Define training arguments
-        training_args = TrainingArguments(  # Define the training arguments
+        # Set up training arguments
+        training_args = TrainingArguments(
             output_dir="./results",
-            eval_strategy="epoch",
-            learning_rate=learning_rate,
-            per_device_train_batch_size=batch_size,
-            per_device_eval_batch_size=batch_size,
-            num_train_epochs=epochs,
+            evaluation_strategy="epoch",
+            learning_rate=2e-5,
+            per_device_train_batch_size=16,
+            per_device_eval_batch_size=16,
+            num_train_epochs=3,
             weight_decay=0.01,
         )
 
-        # Define the trainer
+        # Initialize Trainer
         trainer = Trainer(
             model=self.model,
             args=training_args,
-            train_dataset=train_dataset,
-            eval_dataset=test_dataset,
+            train_dataset=tokenized_train_dataset,
+            tokenizer=self.tokenizer,
+            compute_metrics=self.compute_metrics
         )
 
-        # Fine-tune the model
+        # Train the model
         trainer.train()
 
-        # Evaluate the model
-        results = trainer.evaluate()
-        print(results)
-        return results
+        return trainer.evaluate()
+
+    def compute_metrics(self, p):
+        metric = load_metric("accuracy")
+        preds = np.argmax(p.predictions, axis=1)
+        labels = p.label_ids
+        accuracy = metric.compute(predictions=preds, references=labels)
+        return accuracy
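Note on the SentimentAnalyzer.py hunks above: two runtime caveats the patch does not address.

1. `self.device` uses the integer convention of `transformers.pipeline` (`0` = first GPU, `-1` = CPU). The pipeline accepts `-1`, but `nn.Module.to(-1)` raises a RuntimeError, so `from_pretrained(...).to(self.device)` crashes on CPU-only machines; passing a `torch.device("cuda" or "cpu")` to `.to()` and keeping the integer index only for `pipeline(...)` avoids this.

2. The new `fine_tune` sets `evaluation_strategy="epoch"` and returns `trainer.evaluate()`, but the `Trainer` is built without an `eval_dataset`, so the first evaluation raises a ValueError. A minimal sketch of one fix, holding out part of the tokenized data inside `fine_tune` (the 20% fraction and seed are illustrative, not from the patch):

    # Hold out a slice of the tokenized data so the Trainer has an eval set.
    split = tokenized_train_dataset.train_test_split(test_size=0.2, seed=42)

    trainer = Trainer(
        model=self.model,
        args=training_args,
        train_dataset=split["train"],
        eval_dataset=split["test"],  # used by evaluation_strategy="epoch" and trainer.evaluate()
        tokenizer=self.tokenizer,
        compute_metrics=self.compute_metrics,
    )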
diff --git a/main.py b/main.py
index edbce18..2101906 100644
--- a/main.py
+++ b/main.py
@@ -1,16 +1,16 @@
 import argparse
 import os
-import wandb
+
 import pandas as pd
 import torch
 from sklearn.metrics import classification_report
+
 from DatasetLoad import DatasetLoad
 from MetadataExtractor import MetadataExtractor
 from SentimentAnalyzer import SentimentAnalyzer
 from extract_stuff import augment_and_extract_metadata, predict_sentiment
-
-os.environ["WANDB_API_KEY"] = "21cb0c9433eeca19401ee01e9b1bc9e4b6f7a696"
+os.environ["WANDB_DISABLED"] = "true"
 
 if __name__ == "__main__":
@@ -46,9 +46,9 @@
     # Initialize the sentiment analyzer
     sentiment_analyzer = SentimentAnalyzer()
 
-    # Fine-tune the sentiment analyzer with the original dataset
-    fine_tuning_results = sentiment_analyzer.fine_tune(original_train_data)
-    print(f"Fine-tuning results: {fine_tuning_results}")
+    # # Fine-tune the sentiment analyzer with the original dataset
+    # fine_tuning_results = sentiment_analyzer.fine_tune(original_train_data)
+    # print(f"Fine-tuning results: {fine_tuning_results}")
 
     # Extract metadata for the datasets
     base_path = os.path.dirname(os.path.abspath(__file__))
@@ -66,21 +66,24 @@
     val_data_with_sentiment = predict_sentiment(original_val_data.copy(), sentiment_analyzer, val_sentiment_file_name, args.debug)
+    if args.debug:
+        print("\nTrain Data with Sentiment:")
+        print(train_data_with_sentiment.head())
 
     # Compute metrics for the train dataset
-    train_true_labels = original_train_data['target']
+    train_true_labels = original_train_data['category']
     train_predicted_labels = train_data_with_sentiment['sentiment']
     print("\nTrain Classification Report:")
     print(classification_report(train_true_labels, train_predicted_labels, labels=[-1, 0, 1], zero_division=0))
 
     # Compute metrics for the test dataset
-    test_true_labels = original_test_data['target']
+    test_true_labels = original_test_data['category']
     test_predicted_labels = test_data_with_sentiment['sentiment']
     print("\nTest Classification Report:")
     print(classification_report(test_true_labels, test_predicted_labels, labels=[-1, 0, 1], zero_division=0))
 
     # Compute metrics for the validation dataset
-    val_true_labels = original_val_data['target']
+    val_true_labels = original_val_data['category']
     val_predicted_labels = val_data_with_sentiment['sentiment']
     print("\nValidation Classification Report:")
     print(classification_report(val_true_labels, val_predicted_labels, labels=[-1, 0, 1], zero_division=0))
 
@@ -90,7 +93,8 @@
     extractor = MetadataExtractor()
 
     # Define topic labels
-    topic_labels = ["politics", "entertainment", "sports", "technology", "health", "education", "finance", "food", "other"]
+    topic_labels = ["politics", "entertainment", "sports", "technology", "health", "education", "finance", "food",
+                    "other"]
 
     # Define the base path where main.py is located
     base_path = os.path.dirname(os.path.abspath(__file__))
@@ -179,4 +183,3 @@ def analyze_disparities(subgroups):
     print(test_analysis)
     print("\nValidation Percentage Analysis")
     print(val_analysis)
-
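Note on the main.py hunks above: the patch disables Weights & Biases via the `WANDB_DISABLED` environment variable, which also removes the hard-coded API key from the source. transformers offers a more explicit alternative: the `report_to` argument of `TrainingArguments` switches logging integrations off without touching the environment. A sketch using the same arguments as the `fine_tune` hunk:

    from transformers import TrainingArguments

    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        report_to="none",  # disable wandb/tensorboard/etc. without env vars
    )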
diff --git a/requirements.txt b/requirements.txt
index 9a3e5b1..5858c85 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,4 +4,4 @@ scikit_learn==1.2.0
 transformers==4.42.4
 tensorflow
 gdown
-
+datasets
\ No newline at end of file
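Note on requirements.txt: `datasets` is added unpinned, but the `load_metric` helper that the new `compute_metrics` relies on is deprecated and has been removed in recent `datasets` releases (3.0 and later), so the code only runs against an older version. Either pin `datasets<3.0` or switch to the `evaluate` package, the documented replacement. A sketch of the latter (assumes `evaluate` is added to requirements.txt):

    import numpy as np
    import evaluate

    accuracy_metric = evaluate.load("accuracy")  # replacement for datasets.load_metric("accuracy")

    def compute_metrics(p):
        # p.predictions holds logits of shape (n_samples, n_classes)
        preds = np.argmax(p.predictions, axis=1)
        return accuracy_metric.compute(predictions=preds, references=p.label_ids)

Loading the metric once at module scope also avoids re-downloading or re-initializing it on every evaluation pass.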