Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 35 additions & 71 deletions SentimentAnalyzer.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,23 @@
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, pipeline
from datasets import Dataset

from datasets import Dataset, load_metric


class SentimentAnalyzer:
def __init__(self):
    """Load the pretrained Twitter-RoBERTa sentiment model, its tokenizer,
    and a ready-to-use sentiment-analysis pipeline."""
    self.model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    # transformers pipelines use an int device: 0 = first CUDA GPU, -1 = CPU.
    self.device = 0 if torch.cuda.is_available() else -1
    self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
    # BUG FIX: Module.to() expects a torch device (or its string name), not the
    # pipeline's int convention — .to(-1) raises on CPU-only machines.
    self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name).to(
        "cuda" if torch.cuda.is_available() else "cpu"
    )
    self.classifier = pipeline("sentiment-analysis", model=self.model,
                               tokenizer=self.tokenizer, device=self.device)

def analyze_sentiment(self, text):
results = self.classifier(text)
return results[0]['label']

def map_label_to_target(self, label):
    """Map a sentiment label string to its numeric target.

    Returns -1 for negative, 0 for neutral, 1 for positive (case-insensitive),
    and None for any unrecognised label — same contract as the original
    if/elif chain, generalised to any capitalisation.
    """
    mapping = {"negative": -1, "neutral": 0, "positive": 1}
    return mapping.get(label.lower())

# Generate synthetic data using LLMs to be defined
def generate_synthetic_data(self, topic, n_samples):
    """Generate *n_samples* synthetic tweets about *topic* via the OpenAI API.

    Returns a list of n_samples tweet strings. Requires the OPENAI_API_KEY
    environment variable; raises KeyError if it is not set.
    """
    # Local imports: openai is an optional dependency used only for
    # augmentation, and it is not in this module's top-level imports.
    import os
    import openai

    # SECURITY: never hardcode API keys in source — read from the environment.
    openai.api_key = os.environ["OPENAI_API_KEY"]
    synthetic_data = []
    for _ in range(n_samples):
        # BUG FIX: the prompt asked for "six tweets" but only one completion
        # is stored per iteration — request a single tweet per call.
        prompt = f"Generate a tweet related to {topic} that expresses sentiment."
        # NOTE(review): legacy Completions API with text-davinci-003; this
        # only works on openai<1.0 — migrate to the chat API when upgrading.
        response = openai.Completion.create(
            engine="text-davinci-003",
            prompt=prompt,
            max_tokens=60,
        )
        synthetic_data.append(response.choices[0].text.strip())
    return synthetic_data

def augment_training_data(self, topics, n_samples=100):
augmented_data = {'text': [], 'label': []}
augmented_data_with_topics = {'text': [], 'label': [], 'topic': []}
for topic in topics:
synthetic_texts = self.generate_synthetic_data(topic, n_samples)
# Assuming the sentiment label for generated data
augmented_data['text'].extend(synthetic_texts)
augmented_data['label'].extend([1] * len(synthetic_texts)) # Defaulting to neutral
augmented_data_with_topics['text'].extend(synthetic_texts)
augmented_data_with_topics['label'].extend([1] * len(synthetic_texts))
augmented_data_with_topics['topic'].extend([topic] * len(synthetic_texts))

augmented_df = pd.DataFrame(augmented_data)
augmented_df_with_topics = pd.DaataFrame(augmented_data_with_topics)
return augmented_df, augmented_df_with_topics

def fine_tune_with_augmented_data(self, topics, n_samples=100, epochs=3, batch_size=16, learning_rate=2e-5):
    # Augment the training set with synthetic tweets for *topics*, fine-tune on
    # the result, and return (fine_tune results, per-topic augmented DataFrame).
    # NOTE(review): this diff's updated fine_tune() (see below) accepts only
    # train_data — forwarding epochs/batch_size/learning_rate here would raise
    # TypeError against that signature. Confirm which fine_tune is canonical.
    augmented_train_data, augmented_train_data_with_topics = self.augment_training_data(topics, n_samples)
    return self.fine_tune(augmented_train_data, epochs, batch_size, learning_rate), augmented_train_data_with_topics
def fine_tune(self, train_data):
# Ensure the training data includes labels
if 'label' not in train_data.columns:
train_data['label'] = train_data['category'].apply(self.map_label_to_target)

# Fine-tune the model on a custom dataset
def fine_tune(self, df, epochs=3, batch_size=16, learning_rate=2e-5):
# Preprocess the dataset
df = df.rename(columns={"clean_text": "text", "category": "label"}) # Rename the columns
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42) # Split the dataset
# Tokenize the data
def tokenize_function(examples):
return self.tokenizer(examples['text'], padding="max_length", truncation=True)

train_dataset = Dataset.from_pandas(train_df) # Load the dataset
test_dataset = Dataset.from_pandas(test_df)
train_dataset = Dataset.from_pandas(train_data)
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)

def tokenize_function(examples): # Tokenize the text
return self.tokenizer(examples["text"], padding="max_length", truncation=True)
# Remove columns not required for training
tokenized_train_dataset = tokenized_train_dataset.remove_columns(["text", "category", "__index_level_0__"])

train_dataset = train_dataset.map(tokenize_function, batched=True) # Tokenize the dataset
test_dataset = test_dataset.map(tokenize_function, batched=True)

train_dataset = train_dataset.remove_columns(["text"]) # Remove the text column after tokenization
test_dataset = test_dataset.remove_columns(["text"])

train_dataset.set_format("torch") # Set the format to PyTorch
test_dataset.set_format("torch")

# Define training arguments
training_args = TrainingArguments( # Define the training arguments
# Set up training arguments
training_args = TrainingArguments(
output_dir="./results",
eval_strategy="epoch",
learning_rate=learning_rate,
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
num_train_epochs=epochs,
evaluation_strategy="epoch",
learning_rate=2e-5,
per_device_train_batch_size=16,
per_device_eval_batch_size=16,
num_train_epochs=3,
weight_decay=0.01,
)

# Define the trainer
# Initialize Trainer
trainer = Trainer(
model=self.model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=test_dataset,
train_dataset=tokenized_train_dataset,
tokenizer=self.tokenizer,
compute_metrics=self.compute_metrics
)

# Fine-tune the model
# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print(results)
return results
return trainer.evaluate()

def compute_metrics(self, p):
    """Compute accuracy for a transformers.Trainer EvalPrediction.

    *p* must expose `predictions` (per-class logits/scores, shape [n, classes])
    and `label_ids` (gold class indices). Returns {"accuracy": float}, matching
    the output of datasets.load_metric("accuracy").compute(...).
    """
    preds = np.argmax(p.predictions, axis=1)
    labels = np.asarray(p.label_ids)
    # datasets.load_metric is deprecated (removed in datasets>=3.0) and was
    # reloaded on every evaluation; plain numpy is equivalent and dependency-free.
    accuracy = float((preds == labels).mean())
    return {"accuracy": accuracy}
25 changes: 14 additions & 11 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
# Standard library
import argparse
import os

# Third-party
import pandas as pd
import torch
import wandb
from sklearn.metrics import classification_report

# Local
from DatasetLoad import DatasetLoad
from MetadataExtractor import MetadataExtractor
from SentimentAnalyzer import SentimentAnalyzer
from extract_stuff import augment_and_extract_metadata, predict_sentiment

# SECURITY FIX: a hardcoded WANDB_API_KEY was removed from this file — a
# committed secret must be rotated. If wandb is ever re-enabled, supply the
# key via the WANDB_API_KEY environment variable instead.
os.environ["WANDB_DISABLED"] = "true"

if __name__ == "__main__":

Expand Down Expand Up @@ -46,9 +46,9 @@
# Initialize the sentiment analyzer
sentiment_analyzer = SentimentAnalyzer()

# Fine-tune the sentiment analyzer with the original dataset
fine_tuning_results = sentiment_analyzer.fine_tune(original_train_data)
print(f"Fine-tuning results: {fine_tuning_results}")
# # Fine-tune the sentiment analyzer with the original dataset
# fine_tuning_results = sentiment_analyzer.fine_tune(original_train_data)
# print(f"Fine-tuning results: {fine_tuning_results}")

# Extract metadata for the datasets
base_path = os.path.dirname(os.path.abspath(__file__))
Expand All @@ -66,21 +66,24 @@
val_data_with_sentiment = predict_sentiment(original_val_data.copy(), sentiment_analyzer, val_sentiment_file_name,
args.debug)

if args.debug:
print("\nTrain Data with Sentiment:")
print(train_data_with_sentiment.head())

# Compute metrics for the train dataset
train_true_labels = original_train_data['target']
train_true_labels = original_train_data['category']
train_predicted_labels = train_data_with_sentiment['sentiment']
print("\nTrain Classification Report:")
print(classification_report(train_true_labels, train_predicted_labels, labels=[-1, 0, 1], zero_division=0))

# Compute metrics for the test dataset
test_true_labels = original_test_data['target']
test_true_labels = original_test_data['category']
test_predicted_labels = test_data_with_sentiment['sentiment']
print("\nTest Classification Report:")
print(classification_report(test_true_labels, test_predicted_labels, labels=[-1, 0, 1], zero_division=0))

# Compute metrics for the validation dataset
val_true_labels = original_val_data['target']
val_true_labels = original_val_data['category']
val_predicted_labels = val_data_with_sentiment['sentiment']
print("\nValidation Classification Report:")
print(classification_report(val_true_labels, val_predicted_labels, labels=[-1, 0, 1], zero_division=0))
Expand All @@ -90,7 +93,8 @@
extractor = MetadataExtractor()

# Define topic labels
topic_labels = ["politics", "entertainment", "sports", "technology", "health", "education", "finance", "food", "other"]
topic_labels = ["politics", "entertainment", "sports", "technology", "health", "education", "finance", "food",
"other"]

# Define the base path where main.py is located
base_path = os.path.dirname(os.path.abspath(__file__))
Expand Down Expand Up @@ -179,4 +183,3 @@ def analyze_disparities(subgroups):
print(test_analysis)
print("\nValidation Percentage Analysis")
print(val_analysis)

2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ scikit_learn==1.2.0
transformers==4.42.4
tensorflow
gdown

datasets