46 commits
b6197fa
changed from target to category in line original_train_data
Raoolo Jul 26, 2024
7c9ae48
First implementation in main of generating new data
Raoolo Jul 26, 2024
cf24a15
Check if saved model exists or not
Raoolo Jul 26, 2024
36cc3d9
implemented model_save_path
omblivion Jul 26, 2024
61c687c
first implementation fo the bottom topics
omblivion Jul 26, 2024
4af3e44
missing import
omblivion Jul 26, 2024
8b0adbf
use val instead of test
omblivion Jul 26, 2024
929c732
Solving raul pull problems
Raoolo Jul 26, 2024
dbab91d
Merge branch 'raul_tries_things' of https://github.com/omblivion/expl…
Raoolo Jul 26, 2024
077b9a4
use val instead of test
omblivion Jul 26, 2024
e5eb93f
is this the end?
Raoolo Jul 26, 2024
63f353b
is this the end?
Raoolo Jul 26, 2024
ea46935
is this the end?
Raoolo Jul 26, 2024
7b27fb1
is this the end?
Raoolo Jul 26, 2024
3a062fc
is this the end?
Raoolo Jul 26, 2024
a74fb86
is this the end?
Raoolo Jul 27, 2024
3290b60
is this the end?
Raoolo Jul 27, 2024
c8e22cc
is this the end?
Raoolo Jul 27, 2024
74b0b84
count was initialized
omblivion Jul 27, 2024
e12f205
added matplotlib to reqs
omblivion Jul 27, 2024
c0997dd
is this the end?
Raoolo Jul 27, 2024
1b911b5
is this the end?
Raoolo Jul 27, 2024
b4df0ca
temperature set to 0.9
omblivion Jul 27, 2024
b3bca14
Merge remote-tracking branch 'origin/raul_tries_things_v2' into raul_…
omblivion Jul 27, 2024
bda7149
max lenght 60
omblivion Jul 27, 2024
9eaddb6
implemenmted deep debug
omblivion Jul 27, 2024
fe5f47c
small fix
omblivion Jul 27, 2024
359cde1
small fix of print statements 5%
omblivion Jul 27, 2024
1ac3ec8
tried to fix plotting
Raoolo Jul 27, 2024
58d00e2
small fix of print statements 5%, more print statements
omblivion Jul 27, 2024
37daa30
tried to fix plotting
Raoolo Jul 27, 2024
ae0bbc0
Merge remote-tracking branch 'origin/raul_tries_things_v2' into raul_…
Raoolo Jul 27, 2024
e7bd36f
tried to fix plotting
Raoolo Jul 27, 2024
671040b
tried to fix plotting
Raoolo Jul 27, 2024
c318f9f
tried to fix plotting
Raoolo Jul 27, 2024
bbc65bc
tried to fix plotting
Raoolo Jul 27, 2024
34beb4f
tried to fix plotting
Raoolo Jul 27, 2024
724a4f2
reddit dataset is now enabled
omblivion Jul 27, 2024
23b8907
tried to fix plotting
Raoolo Jul 27, 2024
92847d2
Merge remote-tracking branch 'origin/raul_tries_things_v2' into raul_…
Raoolo Jul 27, 2024
a54d4d9
fixed bottom 3
Raoolo Jul 27, 2024
820e283
changed temperature
Raoolo Jul 27, 2024
7fe9199
debug for top_lower
Raoolo Jul 27, 2024
ec0a0a6
added stratification
Raoolo Jul 27, 2024
eaff928
removed stratification
Raoolo Jul 27, 2024
bacb3a9
samples changed from 6 to 50
Raoolo Jul 27, 2024
DatasetLoad.py (4 changes: 3 additions & 1 deletion)
@@ -43,7 +43,8 @@ def load_datasets(self):
data = data.rename(columns={'clean_comment': 'text'})
# truncate the text in the text column with over 512 characters
data['text'] = data['text'].str.slice(0, 512)

data['category'] = data['category'].map({-1: 0, 0: 1, 1: 2})
data = data.dropna()

elif self.dataset_type == 'tweets':
print("Loading Twitter dataset...")
@@ -57,6 +58,7 @@ def load_datasets(self):
data = data.rename(columns={'Tweet': 'text'})
# remove the rows of the text column in which the text is "Not Available"
data = data[data['text'] != 'Not Available']
data = data.dropna()

# Ensure the first column is 'text' and the second column is 'category'
data = data[['text', 'category'] + [col for col in data.columns if col not in ['text', 'category']]]
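The added lines above remap the Reddit labels from {-1, 0, 1} to the {0, 1, 2} index space the classifier head expects, and dropna() discards rows left with missing text or an unmapped category. A minimal standalone sketch of the same preprocessing, using a hypothetical toy frame in place of the real Reddit CSV:

import pandas as pd

# Toy stand-in for the Reddit data loaded above (hypothetical rows)
data = pd.DataFrame({
    "clean_comment": ["great post", "meh", "terrible take", None],
    "category": [1, 0, -1, 1],
})

data = data.rename(columns={"clean_comment": "text"})
data["text"] = data["text"].str.slice(0, 512)                 # truncate overly long comments to 512 characters
data["category"] = data["category"].map({-1: 0, 0: 1, 1: 2})  # shift labels into 0/1/2
data = data.dropna()                                          # drop rows with missing text or category
print(data)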
SentimentAnalyzer.py (118 changes: 81 additions & 37 deletions)
@@ -1,7 +1,9 @@
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, \
DataCollatorWithPadding, pipeline, AutoModelForSeq2SeqLM

from datasets import Dataset


@@ -10,9 +12,15 @@ def __init__(self):
self.model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
self.device = 0 if torch.cuda.is_available() else -1 # Use GPU if available
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, ignore_mismatched_sizes=True).to(self.device)
self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name,
ignore_mismatched_sizes=True).to(self.device)
self.classifier = pipeline("sentiment-analysis", model=self.model, tokenizer=self.tokenizer, device=self.device)

# Initialize FLAN model for synthetic data generation
self.flan_model_name = "google/flan-t5-small"
self.flan_tokenizer = AutoTokenizer.from_pretrained(self.flan_model_name)
self.flan_model = AutoModelForSeq2SeqLM.from_pretrained(self.flan_model_name).to(self.device)

def analyze_sentiment(self, text):
results = self.classifier(text)
return results[0]['label']
@@ -28,51 +36,87 @@ def map_label_to_target(self, label):
else:
return None

# Generate synthetic data using LLMs to be defined
def generate_synthetic_data(self, topic, n_samples):
openai.api_key = 'YOUR_API_KEY'
def map_target_to_label(self, target):
# Map the target value to the sentiment label
if target == 0:
return "negative"
elif target == 1:
return "neutral"
elif target == 2:
return "positive"
else:
return None

# Generate synthetic data using the FLAN model
def generate_synthetic_data(self, topic, text, sentiment, n_samples, debug=False):
synthetic_data = []
# print(f"Generating synthetic data for topic: {topic}, text: {text}, sentiment: {sentiment}")
count = 0
for _ in range(n_samples):
prompt = f"Generate six tweets related to {topic} that expresses sentiment."
response = openai.Completion.create(
engine="text-davinci-003",
prompt=prompt,
max_tokens=60
prompt = f"Generate a tweet related to {topic} that expresses a {sentiment} sentiment and the tweet has to be semantically similar to: '{text}' "
inputs = self.flan_tokenizer(prompt, return_tensors="pt").to(self.device)
# Use top-k sampling and temperature sampling for more diverse outputs
outputs = self.flan_model.generate(
inputs.input_ids,
max_length=60,
num_return_sequences=1,
do_sample=True,
top_k=50,
temperature=0.7
)
synthetic_data.append(response.choices[0].text.strip())
generated_text = self.flan_tokenizer.decode(outputs[0], skip_special_tokens=True)
synthetic_data.append(generated_text)
count += 1
if debug:
print(f"DEBUG - Generated Text: {generated_text}")
print(f"DEBUG - Percentage of Completion: {count / n_samples * 100:.2f}%, {count} of {n_samples}")
if int(count % 5) == 0:
print(f"Percentage of Completion: {count / n_samples * 100:.2f}%, {count} of {n_samples}")
return synthetic_data

def augment_training_data(self, topics, n_samples=100):
augmented_data = {'text': [], 'label': []}
augmented_data_with_topics = {'text': [], 'label': [], 'topic': []}
for topic in topics:
synthetic_texts = self.generate_synthetic_data(topic, n_samples)
# Assuming the sentiment label for generated data
augmented_data['text'].extend(synthetic_texts)
augmented_data['label'].extend([1] * len(synthetic_texts)) # Defaulting to neutral
augmented_data_with_topics['text'].extend(synthetic_texts)
augmented_data_with_topics['label'].extend([1] * len(synthetic_texts))
augmented_data_with_topics['topic'].extend([topic] * len(synthetic_texts))

augmented_df = pd.DataFrame(augmented_data)
augmented_df_with_topics = pd.DataFrame(augmented_data_with_topics)
return augmented_df, augmented_df_with_topics

def fine_tune_with_augmented_data(self, topics, n_samples=100, epochs=3, batch_size=16, learning_rate=2e-5):
augmented_train_data, augmented_train_data_with_topics = self.augment_training_data(topics, n_samples)
return self.fine_tune(augmented_train_data, epochs, batch_size, learning_rate), augmented_train_data_with_topics
# Augment the training data with synthetic data
def generate_training_data(self, topics, texts, sentiments, n_samples=50, debug=False):
print("Generating synthetic data...")
generated_data = {'text': [], 'category': []}
generated_data_with_topic = {'text': [], 'category': [], 'topic': []}

count = 0
total = len(texts)
for topic, text, sentiment in zip(topics, texts, sentiments):
sentiment_text = self.map_target_to_label(sentiment)
synthetic_texts = self.generate_synthetic_data(topic, text, sentiment_text, n_samples,
debug) # List of synthetic texts
generated_data['text'].extend(synthetic_texts)
generated_data['category'].extend(
[sentiment] * len(synthetic_texts)) # append sentiment to texts many times
generated_data_with_topic['text'].extend(synthetic_texts)
generated_data_with_topic['category'].extend([sentiment] * len(synthetic_texts))
generated_data_with_topic['topic'].extend([topic] * len(synthetic_texts))
count += 1
if debug:
print(f"DEBUG - Generated synthetic data for topic: {topic}, text: {text}, sentiment: {sentiment_text}")
# Print percentage of completion of total texts
percentage_complete = count / total * 100
if int(percentage_complete) % 5 == 0:
print(f"DEBUG - Percentage of Completion: {percentage_complete:.2f}%, {count} of {total}")



generated_df = pd.DataFrame(generated_data)
generated_df_with_topics = pd.DataFrame(generated_data_with_topic)
return generated_df, generated_df_with_topics

# Fine-tune the model on a custom dataset
def fine_tune(self, df, epochs=3, batch_size=16, learning_rate=2e-5):
# Preprocess the dataset
df = df.rename(columns={"text": "text", "category": "label"}) # Rename the columns
df['label'] = df['label'].astype(int) # Ensure the labels are integers
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42) # Split the dataset
df = df.rename(columns={"text": "text", "category": "label"}) # Rename the columns
df['label'] = df['label'].astype(int) # Ensure the labels are integers
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42) # Split the dataset

train_dataset = Dataset.from_pandas(train_df) # Load the dataset
train_dataset = Dataset.from_pandas(train_df) # Load the dataset
test_dataset = Dataset.from_pandas(test_df)

def tokenize_function(examples): # Tokenize the text
def tokenize_function(examples): # Tokenize the text
return self.tokenizer(examples["text"], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True) # Tokenize the dataset
@@ -81,14 +125,14 @@ def tokenize_function(examples): # Tokenize the text
train_dataset = train_dataset.remove_columns(["text"]) # Remove the text column after tokenization
test_dataset = test_dataset.remove_columns(["text"])

train_dataset.set_format("torch") # Set the format to PyTorch
train_dataset.set_format("torch") # Set the format to PyTorch
test_dataset.set_format("torch")

# Define the data collator
data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)

# Define training arguments
training_args = TrainingArguments( # Define the training arguments
training_args = TrainingArguments( #
output_dir="./results",
run_name="finetuning_sentiment_classifier",
eval_strategy="epoch",
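The main change in SentimentAnalyzer.py replaces the earlier OpenAI completion call with local generation from google/flan-t5-small, sampled with do_sample=True, top_k=50, and a temperature of 0.7 (later commits adjust it to 0.9). A minimal sketch of that generation path in isolation; the topic, text, and sentiment values here are made up for illustration, and the device handling uses torch device strings rather than the 0/-1 integers used in the class:

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

device = "cuda" if torch.cuda.is_available() else "cpu"

model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# Illustrative inputs only; in the PR these come from the rows being augmented
topic, text, sentiment = "airlines", "my flight got cancelled again", "negative"
prompt = (f"Generate a tweet related to {topic} that expresses a {sentiment} sentiment "
          f"and the tweet has to be semantically similar to: '{text}'")

inputs = tokenizer(prompt, return_tensors="pt").to(device)
outputs = model.generate(
    inputs.input_ids,
    max_length=60,           # cap the generated length, as in the diff
    num_return_sequences=1,
    do_sample=True,          # sample instead of greedy decoding
    top_k=50,                # keep only the 50 most likely tokens at each step
    temperature=0.7,         # soften the distribution for more varied tweets
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))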
extract_stuff.py (16 changes: 8 additions & 8 deletions)
@@ -28,11 +28,11 @@ def augment_and_extract_metadata(dataset, extractor, topic_labels, file_path, de
# If debug mode is enabled, print debug information
percentage_complete = ((count + 1) / total_rows) * 100
if debug:
print(f"Text: {row['text']}")
print(f"Generated Metadata: Topic - {topic}")
print(f"Percentage of Completion: {percentage_complete:.2f}%, {count + 1} of {total_rows}")
print(f"DEBUG - Text: {row['text']}")
print(f"DEBUG - Generated Metadata: Topic - {topic}")
print(f"DEBUG - Percentage of Completion: {percentage_complete:.2f}%, {count + 1} of {total_rows}")

if percentage_complete % 5 == 0:
if int(percentage_complete) % 5 == 0:
print(f"Percentage of Completion: {percentage_complete:.2f}%, {count + 1} of {total_rows}")

count += 1
@@ -68,11 +68,11 @@ def predict_sentiment(dataset, sentiment_analyzer, file_path, debug=False, batch
# Extend the sentiments list with the batch sentiments
sentiments.extend(batch_sentiments)
# Calculate the percentage of completion
percentage_complete = ((end) / total_rows) * 100
percentage_complete = (end / total_rows) * 100
if debug:
print(f"Processed batch {start // batch_size + 1}: {start} to {end}")
print(f"Percentage of Completion: {percentage_complete:.2f}%, {end} of {total_rows}")
if percentage_complete % 5 == 0:
print(f"DEBUG - Processed batch {start // batch_size + 1}: {start} to {end}")
print(f"DEBUG - Percentage of Completion: {percentage_complete:.2f}%, {end} of {total_rows}")
if int(percentage_complete) % 5 == 0:
print(f"Percentage of Completion: {percentage_complete:.2f}%")

dataset['sentiment'] = sentiments
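The substantive fix in extract_stuff.py is the progress gating: percentage_complete is a float, so percentage_complete % 5 == 0 is almost never exactly zero and the periodic progress line rarely printed; truncating with int() first lets any batch whose integer percentage lands on a multiple of five report. A small sketch comparing the two conditions on toy numbers:

total_rows = 37          # toy value; the real count comes from the dataset
batch_size = 4

for start in range(0, total_rows, batch_size):
    end = min(start + batch_size, total_rows)
    percentage_complete = end / total_rows * 100

    old_fires = percentage_complete % 5 == 0        # float remainder: almost never exactly 0
    new_fires = int(percentage_complete) % 5 == 0   # integer part on a multiple of 5

    print(f"{end:>2}/{total_rows}  {percentage_complete:6.2f}%  old={old_fires}  new={new_fires}")

Note that this is still a coarse heartbeat: if one batch jumps the percentage from, say, 14% to 16%, no message is printed for that step.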