Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 35 additions & 71 deletions SentimentAnalyzer.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,23 @@
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, pipeline
from datasets import Dataset

from datasets import Dataset, load_metric


class SentimentAnalyzer:
def __init__(self):
    """Load the pretrained Twitter-RoBERTa sentiment model, its tokenizer,
    and a ready-to-use sentiment-analysis pipeline."""
    self.model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    # transformers pipelines use an int device: 0 = first CUDA GPU, -1 = CPU.
    self.device = 0 if torch.cuda.is_available() else -1
    self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
    # BUG FIX: Module.to() expects a torch device (or its string name), not the
    # pipeline's int convention — .to(-1) raises on CPU-only machines.
    self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name).to(
        "cuda" if torch.cuda.is_available() else "cpu"
    )
    self.classifier = pipeline("sentiment-analysis", model=self.model,
                               tokenizer=self.tokenizer, device=self.device)

def analyze_sentiment(self, text):
results = self.classifier(text)
return results[0]['label']

def map_label_to_target(self, label):
    """Map a sentiment label string to its numeric target.

    Returns -1 for negative, 0 for neutral, 1 for positive (case-insensitive),
    and None for any unrecognised label — same contract as the original
    if/elif chain, generalised to any capitalisation.
    """
    mapping = {"negative": -1, "neutral": 0, "positive": 1}
    return mapping.get(label.lower())

# Generate synthetic data using LLMs to be defined
def generate_synthetic_data(self, topic, n_samples):
    """Generate *n_samples* synthetic tweets about *topic* via the OpenAI API.

    Returns a list of n_samples tweet strings. Requires the OPENAI_API_KEY
    environment variable; raises KeyError if it is not set.
    """
    # Local imports: openai is an optional dependency used only for
    # augmentation, and it is not in this module's top-level imports.
    import os
    import openai

    # SECURITY: never hardcode API keys in source — read from the environment.
    openai.api_key = os.environ["OPENAI_API_KEY"]
    synthetic_data = []
    for _ in range(n_samples):
        # BUG FIX: the prompt asked for "six tweets" but only one completion
        # is stored per iteration — request a single tweet per call.
        prompt = f"Generate a tweet related to {topic} that expresses sentiment."
        # NOTE(review): legacy Completions API with text-davinci-003; this
        # only works on openai<1.0 — migrate to the chat API when upgrading.
        response = openai.Completion.create(
            engine="text-davinci-003",
            prompt=prompt,
            max_tokens=60,
        )
        synthetic_data.append(response.choices[0].text.strip())
    return synthetic_data

def augment_training_data(self, topics, n_samples=100):
augmented_data = {'text': [], 'label': []}
augmented_data_with_topics = {'text': [], 'label': [], 'topic': []}
for topic in topics:
synthetic_texts = self.generate_synthetic_data(topic, n_samples)
# Assuming the sentiment label for generated data
augmented_data['text'].extend(synthetic_texts)
augmented_data['label'].extend([1] * len(synthetic_texts)) # Defaulting to neutral
augmented_data_with_topics['text'].extend(synthetic_texts)
augmented_data_with_topics['label'].extend([1] * len(synthetic_texts))
augmented_data_with_topics['topic'].extend([topic] * len(synthetic_texts))

augmented_df = pd.DataFrame(augmented_data)
augmented_df_with_topics = pd.DaataFrame(augmented_data_with_topics)
return augmented_df, augmented_df_with_topics

def fine_tune_with_augmented_data(self, topics, n_samples=100, epochs=3, batch_size=16, learning_rate=2e-5):
    # Augment the training set with synthetic tweets for *topics*, fine-tune on
    # the result, and return (fine_tune results, per-topic augmented DataFrame).
    # NOTE(review): this diff's updated fine_tune() (see below) accepts only
    # train_data — forwarding epochs/batch_size/learning_rate here would raise
    # TypeError against that signature. Confirm which fine_tune is canonical.
    augmented_train_data, augmented_train_data_with_topics = self.augment_training_data(topics, n_samples)
    return self.fine_tune(augmented_train_data, epochs, batch_size, learning_rate), augmented_train_data_with_topics
def fine_tune(self, train_data):
# Ensure the training data includes labels
if 'label' not in train_data.columns:
train_data['label'] = train_data['category'].apply(self.map_label_to_target)

# Fine-tune the model on a custom dataset
def fine_tune(self, df, epochs=3, batch_size=16, learning_rate=2e-5):
# Preprocess the dataset
df = df.rename(columns={"clean_text": "text", "category": "label"}) # Rename the columns
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42) # Split the dataset
# Tokenize the data
def tokenize_function(examples):
return self.tokenizer(examples['text'], padding="max_length", truncation=True)

train_dataset = Dataset.from_pandas(train_df) # Load the dataset
test_dataset = Dataset.from_pandas(test_df)
train_dataset = Dataset.from_pandas(train_data)
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)

def tokenize_function(examples): # Tokenize the text
return self.tokenizer(examples["text"], padding="max_length", truncation=True)
# Remove columns not required for training
tokenized_train_dataset = tokenized_train_dataset.remove_columns(["text", "category", "__index_level_0__"])

train_dataset = train_dataset.map(tokenize_function, batched=True) # Tokenize the dataset
test_dataset = test_dataset.map(tokenize_function, batched=True)

train_dataset = train_dataset.remove_columns(["text"]) # Remove the text column after tokenization
test_dataset = test_dataset.remove_columns(["text"])

train_dataset.set_format("torch") # Set the format to PyTorch
test_dataset.set_format("torch")

# Define training arguments
training_args = TrainingArguments( # Define the training arguments
# Set up training arguments
training_args = TrainingArguments(
output_dir="./results",
eval_strategy="epoch",
learning_rate=learning_rate,
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
num_train_epochs=epochs,
evaluation_strategy="epoch",
learning_rate=2e-5,
per_device_train_batch_size=16,
per_device_eval_batch_size=16,
num_train_epochs=3,
weight_decay=0.01,
)

# Define the trainer
# Initialize Trainer
trainer = Trainer(
model=self.model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=test_dataset,
train_dataset=tokenized_train_dataset,
tokenizer=self.tokenizer,
compute_metrics=self.compute_metrics
)

# Fine-tune the model
# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print(results)
return results
return trainer.evaluate()

def compute_metrics(self, p):
    """Compute accuracy for a transformers.Trainer EvalPrediction.

    *p* must expose `predictions` (per-class logits/scores, shape [n, classes])
    and `label_ids` (gold class indices). Returns {"accuracy": float}, matching
    the output of datasets.load_metric("accuracy").compute(...).
    """
    preds = np.argmax(p.predictions, axis=1)
    labels = np.asarray(p.label_ids)
    # datasets.load_metric is deprecated (removed in datasets>=3.0) and was
    # reloaded on every evaluation; plain numpy is equivalent and dependency-free.
    accuracy = float((preds == labels).mean())
    return {"accuracy": accuracy}
25 changes: 14 additions & 11 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
# Standard library
import argparse
import os

# Third-party
import pandas as pd
import torch
import wandb
from sklearn.metrics import classification_report

# Local
from DatasetLoad import DatasetLoad
from MetadataExtractor import MetadataExtractor
from SentimentAnalyzer import SentimentAnalyzer
from extract_stuff import augment_and_extract_metadata, predict_sentiment

# SECURITY FIX: a hardcoded WANDB_API_KEY was removed from this file — a
# committed secret must be rotated. If wandb is ever re-enabled, supply the
# key via the WANDB_API_KEY environment variable instead.
os.environ["WANDB_DISABLED"] = "true"

if __name__ == "__main__":

Expand Down Expand Up @@ -46,9 +46,9 @@
# Initialize the sentiment analyzer
sentiment_analyzer = SentimentAnalyzer()

# Fine-tune the sentiment analyzer with the original dataset
fine_tuning_results = sentiment_analyzer.fine_tune(original_train_data)
print(f"Fine-tuning results: {fine_tuning_results}")
# # Fine-tune the sentiment analyzer with the original dataset
# fine_tuning_results = sentiment_analyzer.fine_tune(original_train_data)
# print(f"Fine-tuning results: {fine_tuning_results}")

# Extract metadata for the datasets
base_path = os.path.dirname(os.path.abspath(__file__))
Expand All @@ -66,21 +66,24 @@
val_data_with_sentiment = predict_sentiment(original_val_data.copy(), sentiment_analyzer, val_sentiment_file_name,
args.debug)

if args.debug:
print("\nTrain Data with Sentiment:")
print(train_data_with_sentiment.head())

# Compute metrics for the train dataset
train_true_labels = original_train_data['target']
train_true_labels = original_train_data['category']
train_predicted_labels = train_data_with_sentiment['sentiment']
print("\nTrain Classification Report:")
print(classification_report(train_true_labels, train_predicted_labels, labels=[-1, 0, 1], zero_division=0))

# Compute metrics for the test dataset
test_true_labels = original_test_data['target']
test_true_labels = original_test_data['category']
test_predicted_labels = test_data_with_sentiment['sentiment']
print("\nTest Classification Report:")
print(classification_report(test_true_labels, test_predicted_labels, labels=[-1, 0, 1], zero_division=0))

# Compute metrics for the validation dataset
val_true_labels = original_val_data['target']
val_true_labels = original_val_data['category']
val_predicted_labels = val_data_with_sentiment['sentiment']
print("\nValidation Classification Report:")
print(classification_report(val_true_labels, val_predicted_labels, labels=[-1, 0, 1], zero_division=0))
Expand All @@ -90,7 +93,8 @@
extractor = MetadataExtractor()

# Define topic labels
topic_labels = ["politics", "entertainment", "sports", "technology", "health", "education", "finance", "food", "other"]
topic_labels = ["politics", "entertainment", "sports", "technology", "health", "education", "finance", "food",
"other"]

# Define the base path where main.py is located
base_path = os.path.dirname(os.path.abspath(__file__))
Expand Down Expand Up @@ -179,4 +183,3 @@ def analyze_disparities(subgroups):
print(test_analysis)
print("\nValidation Percentage Analysis")
print(val_analysis)

2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ scikit_learn==1.2.0
transformers==4.42.4
tensorflow
gdown

datasets