46 commits
b6197fa
changed from target to category in line original_train_data
Raoolo Jul 26, 2024
7c9ae48
First implementation in main of generating new data
Raoolo Jul 26, 2024
cf24a15
Check if saved model exists or not
Raoolo Jul 26, 2024
36cc3d9
implemented model_save_path
omblivion Jul 26, 2024
61c687c
first implementation fo the bottom topics
omblivion Jul 26, 2024
4af3e44
missing import
omblivion Jul 26, 2024
8b0adbf
use val instead of test
omblivion Jul 26, 2024
929c732
Solving raul pull problems
Raoolo Jul 26, 2024
dbab91d
Merge branch 'raul_tries_things' of https://github.com/omblivion/expl…
Raoolo Jul 26, 2024
077b9a4
use val instead of test
omblivion Jul 26, 2024
e5eb93f
is this the end?
Raoolo Jul 26, 2024
63f353b
is this the end?
Raoolo Jul 26, 2024
ea46935
is this the end?
Raoolo Jul 26, 2024
7b27fb1
is this the end?
Raoolo Jul 26, 2024
3a062fc
is this the end?
Raoolo Jul 26, 2024
a74fb86
is this the end?
Raoolo Jul 27, 2024
3290b60
is this the end?
Raoolo Jul 27, 2024
c8e22cc
is this the end?
Raoolo Jul 27, 2024
74b0b84
count was initialized
omblivion Jul 27, 2024
e12f205
added matplotlib to reqs
omblivion Jul 27, 2024
c0997dd
is this the end?
Raoolo Jul 27, 2024
1b911b5
is this the end?
Raoolo Jul 27, 2024
b4df0ca
temperature set to 0.9
omblivion Jul 27, 2024
b3bca14
Merge remote-tracking branch 'origin/raul_tries_things_v2' into raul_…
omblivion Jul 27, 2024
bda7149
max lenght 60
omblivion Jul 27, 2024
9eaddb6
implemenmted deep debug
omblivion Jul 27, 2024
fe5f47c
small fix
omblivion Jul 27, 2024
359cde1
small fix of print statements 5%
omblivion Jul 27, 2024
1ac3ec8
tried to fix plotting
Raoolo Jul 27, 2024
58d00e2
small fix of print statements 5%, more print statements
omblivion Jul 27, 2024
37daa30
tried to fix plotting
Raoolo Jul 27, 2024
ae0bbc0
Merge remote-tracking branch 'origin/raul_tries_things_v2' into raul_…
Raoolo Jul 27, 2024
e7bd36f
tried to fix plotting
Raoolo Jul 27, 2024
671040b
tried to fix plotting
Raoolo Jul 27, 2024
c318f9f
tried to fix plotting
Raoolo Jul 27, 2024
bbc65bc
tried to fix plotting
Raoolo Jul 27, 2024
34beb4f
tried to fix plotting
Raoolo Jul 27, 2024
724a4f2
reddit dataset is now enabled
omblivion Jul 27, 2024
23b8907
tried to fix plotting
Raoolo Jul 27, 2024
92847d2
Merge remote-tracking branch 'origin/raul_tries_things_v2' into raul_…
Raoolo Jul 27, 2024
a54d4d9
fixed bottom 3
Raoolo Jul 27, 2024
820e283
changed temperature
Raoolo Jul 27, 2024
7fe9199
debug for top_lower
Raoolo Jul 27, 2024
ec0a0a6
added stratification
Raoolo Jul 27, 2024
eaff928
removed stratification
Raoolo Jul 27, 2024
bacb3a9
samples changed from 6 to 50
Raoolo Jul 27, 2024
DatasetLoad.py (4 changes: 3 additions & 1 deletion)
@@ -43,7 +43,8 @@ def load_datasets(self):
data = data.rename(columns={'clean_comment': 'text'})
# truncate the text in the text column with over 512 characters
data['text'] = data['text'].str.slice(0, 512)

data['category'] = data['category'].map({-1: 0, 0: 1, 1: 2})
data = data.dropna()

elif self.dataset_type == 'tweets':
print("Loading Twitter dataset...")
@@ -57,6 +58,7 @@ def load_datasets(self):
data = data.rename(columns={'Tweet': 'text'})
# remove the rows of the text column in which the text is "Not Available"
data = data[data['text'] != 'Not Available']
data = data.dropna()

# Ensure the first column is 'text' and the second column is 'category'
data = data[['text', 'category'] + [col for col in data.columns if col not in ['text', 'category']]]
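The added lines above remap the Reddit labels from {-1, 0, 1} to the {0, 1, 2} index space the classifier head expects, and dropna() discards rows left with missing text or an unmapped category. A minimal standalone sketch of the same preprocessing, using a hypothetical toy frame in place of the real Reddit CSV:

import pandas as pd

# Toy stand-in for the Reddit data loaded above (hypothetical rows)
data = pd.DataFrame({
    "clean_comment": ["great post", "meh", "terrible take", None],
    "category": [1, 0, -1, 1],
})

data = data.rename(columns={"clean_comment": "text"})
data["text"] = data["text"].str.slice(0, 512)                 # truncate overly long comments to 512 characters
data["category"] = data["category"].map({-1: 0, 0: 1, 1: 2})  # shift labels into 0/1/2
data = data.dropna()                                          # drop rows with missing text or category
print(data)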
SentimentAnalyzer.py (118 changes: 81 additions & 37 deletions)
@@ -1,7 +1,9 @@
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, \
DataCollatorWithPadding, pipeline, AutoModelForSeq2SeqLM

from datasets import Dataset


@@ -10,9 +12,15 @@ def __init__(self):
self.model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
self.device = 0 if torch.cuda.is_available() else -1 # Use GPU if available
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, ignore_mismatched_sizes=True).to(self.device)
self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name,
ignore_mismatched_sizes=True).to(self.device)
self.classifier = pipeline("sentiment-analysis", model=self.model, tokenizer=self.tokenizer, device=self.device)

# Initialize FLAN model for synthetic data generation
self.flan_model_name = "google/flan-t5-small"
self.flan_tokenizer = AutoTokenizer.from_pretrained(self.flan_model_name)
self.flan_model = AutoModelForSeq2SeqLM.from_pretrained(self.flan_model_name).to(self.device)

def analyze_sentiment(self, text):
results = self.classifier(text)
return results[0]['label']
@@ -28,51 +36,87 @@ def map_label_to_target(self, label):
else:
return None

# Generate synthetic data using LLMs to be defined
def generate_synthetic_data(self, topic, n_samples):
openai.api_key = 'YOUR_API_KEY'
def map_target_to_label(self, target):
# Map the target value to the sentiment label
if target == 0:
return "negative"
elif target == 1:
return "neutral"
elif target == 2:
return "positive"
else:
return None

# Generate synthetic data using the FLAN model
def generate_synthetic_data(self, topic, text, sentiment, n_samples, debug=False):
synthetic_data = []
# print(f"Generating synthetic data for topic: {topic}, text: {text}, sentiment: {sentiment}")
count = 0
for _ in range(n_samples):
prompt = f"Generate six tweets related to {topic} that expresses sentiment."
response = openai.Completion.create(
engine="text-davinci-003",
prompt=prompt,
max_tokens=60
prompt = f"Generate a tweet related to {topic} that expresses a {sentiment} sentiment and the tweet has to be semantically similar to: '{text}' "
inputs = self.flan_tokenizer(prompt, return_tensors="pt").to(self.device)
# Use top-k sampling and temperature sampling for more diverse outputs
outputs = self.flan_model.generate(
inputs.input_ids,
max_length=60,
num_return_sequences=1,
do_sample=True,
top_k=50,
temperature=0.7
)
synthetic_data.append(response.choices[0].text.strip())
generated_text = self.flan_tokenizer.decode(outputs[0], skip_special_tokens=True)
synthetic_data.append(generated_text)
count += 1
if debug:
print(f"DEBUG - Generated Text: {generated_text}")
print(f"DEBUG - Percentage of Completion: {count / n_samples * 100:.2f}%, {count} of {n_samples}")
if int(count % 5) == 0:
print(f"Percentage of Completion: {count / n_samples * 100:.2f}%, {count} of {n_samples}")
return synthetic_data

def augment_training_data(self, topics, n_samples=100):
augmented_data = {'text': [], 'label': []}
augmented_data_with_topics = {'text': [], 'label': [], 'topic': []}
for topic in topics:
synthetic_texts = self.generate_synthetic_data(topic, n_samples)
# Assuming the sentiment label for generated data
augmented_data['text'].extend(synthetic_texts)
augmented_data['label'].extend([1] * len(synthetic_texts)) # Defaulting to neutral
augmented_data_with_topics['text'].extend(synthetic_texts)
augmented_data_with_topics['label'].extend([1] * len(synthetic_texts))
augmented_data_with_topics['topic'].extend([topic] * len(synthetic_texts))

augmented_df = pd.DataFrame(augmented_data)
augmented_df_with_topics = pd.DataFrame(augmented_data_with_topics)
return augmented_df, augmented_df_with_topics

def fine_tune_with_augmented_data(self, topics, n_samples=100, epochs=3, batch_size=16, learning_rate=2e-5):
augmented_train_data, augmented_train_data_with_topics = self.augment_training_data(topics, n_samples)
return self.fine_tune(augmented_train_data, epochs, batch_size, learning_rate), augmented_train_data_with_topics
# Augment the training data with synthetic data
def generate_training_data(self, topics, texts, sentiments, n_samples=50, debug=False):
print("Generating synthetic data...")
generated_data = {'text': [], 'category': []}
generated_data_with_topic = {'text': [], 'category': [], 'topic': []}

count = 0
total = len(texts)
for topic, text, sentiment in zip(topics, texts, sentiments):
sentiment_text = self.map_target_to_label(sentiment)
synthetic_texts = self.generate_synthetic_data(topic, text, sentiment_text, n_samples,
debug) # List of synthetic texts
generated_data['text'].extend(synthetic_texts)
generated_data['category'].extend(
[sentiment] * len(synthetic_texts)) # append sentiment to texts many times
generated_data_with_topic['text'].extend(synthetic_texts)
generated_data_with_topic['category'].extend([sentiment] * len(synthetic_texts))
generated_data_with_topic['topic'].extend([topic] * len(synthetic_texts))
count += 1
if debug:
print(f"DEBUG - Generated synthetic data for topic: {topic}, text: {text}, sentiment: {sentiment_text}")
# Print percentage of completion of total texts
percentage_complete = count / total * 100
if int(percentage_complete) % 5 == 0:
print(f"DEBUG - Percentage of Completion: {percentage_complete:.2f}%, {count} of {total}")



generated_df = pd.DataFrame(generated_data)
generated_df_with_topics = pd.DataFrame(generated_data_with_topic)
return generated_df, generated_df_with_topics

# Fine-tune the model on a custom dataset
def fine_tune(self, df, epochs=3, batch_size=16, learning_rate=2e-5):
# Preprocess the dataset
df = df.rename(columns={"text": "text", "category": "label"}) # Rename the columns
df['label'] = df['label'].astype(int) # Ensure the labels are integers
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42) # Split the dataset
df = df.rename(columns={"text": "text", "category": "label"}) # Rename the columns
df['label'] = df['label'].astype(int) # Ensure the labels are integers
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42) # Split the dataset

train_dataset = Dataset.from_pandas(train_df) # Load the dataset
train_dataset = Dataset.from_pandas(train_df) # Load the dataset
test_dataset = Dataset.from_pandas(test_df)

def tokenize_function(examples): # Tokenize the text
def tokenize_function(examples): # Tokenize the text
return self.tokenizer(examples["text"], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True) # Tokenize the dataset
@@ -81,14 +125,14 @@ def tokenize_function(examples): # Tokenize the text
train_dataset = train_dataset.remove_columns(["text"]) # Remove the text column after tokenization
test_dataset = test_dataset.remove_columns(["text"])

train_dataset.set_format("torch") # Set the format to PyTorch
train_dataset.set_format("torch") # Set the format to PyTorch
test_dataset.set_format("torch")

# Define the data collator
data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)

# Define training arguments
training_args = TrainingArguments( # Define the training arguments
training_args = TrainingArguments( #
output_dir="./results",
run_name="finetuning_sentiment_classifier",
eval_strategy="epoch",
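The main change in SentimentAnalyzer.py replaces the earlier OpenAI completion call with local generation from google/flan-t5-small, sampled with do_sample=True, top_k=50, and a temperature of 0.7 (later commits adjust it to 0.9). A minimal sketch of that generation path in isolation; the topic, text, and sentiment values here are made up for illustration, and the device handling uses torch device strings rather than the 0/-1 integers used in the class:

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

device = "cuda" if torch.cuda.is_available() else "cpu"

model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# Illustrative inputs only; in the PR these come from the rows being augmented
topic, text, sentiment = "airlines", "my flight got cancelled again", "negative"
prompt = (f"Generate a tweet related to {topic} that expresses a {sentiment} sentiment "
          f"and the tweet has to be semantically similar to: '{text}'")

inputs = tokenizer(prompt, return_tensors="pt").to(device)
outputs = model.generate(
    inputs.input_ids,
    max_length=60,           # cap the generated length, as in the diff
    num_return_sequences=1,
    do_sample=True,          # sample instead of greedy decoding
    top_k=50,                # keep only the 50 most likely tokens at each step
    temperature=0.7,         # soften the distribution for more varied tweets
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))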
extract_stuff.py (16 changes: 8 additions & 8 deletions)
@@ -28,11 +28,11 @@ def augment_and_extract_metadata(dataset, extractor, topic_labels, file_path, de
# If debug mode is enabled, print debug information
percentage_complete = ((count + 1) / total_rows) * 100
if debug:
print(f"Text: {row['text']}")
print(f"Generated Metadata: Topic - {topic}")
print(f"Percentage of Completion: {percentage_complete:.2f}%, {count + 1} of {total_rows}")
print(f"DEBUG - Text: {row['text']}")
print(f"DEBUG - Generated Metadata: Topic - {topic}")
print(f"DEBUG - Percentage of Completion: {percentage_complete:.2f}%, {count + 1} of {total_rows}")

if percentage_complete % 5 == 0:
if int(percentage_complete) % 5 == 0:
print(f"Percentage of Completion: {percentage_complete:.2f}%, {count + 1} of {total_rows}")

count += 1
@@ -68,11 +68,11 @@ def predict_sentiment(dataset, sentiment_analyzer, file_path, debug=False, batch
# Extend the sentiments list with the batch sentiments
sentiments.extend(batch_sentiments)
# Calculate the percentage of completion
percentage_complete = ((end) / total_rows) * 100
percentage_complete = (end / total_rows) * 100
if debug:
print(f"Processed batch {start // batch_size + 1}: {start} to {end}")
print(f"Percentage of Completion: {percentage_complete:.2f}%, {end} of {total_rows}")
if percentage_complete % 5 == 0:
print(f"DEBUG - Processed batch {start // batch_size + 1}: {start} to {end}")
print(f"DEBUG - Percentage of Completion: {percentage_complete:.2f}%, {end} of {total_rows}")
if int(percentage_complete) % 5 == 0:
print(f"Percentage of Completion: {percentage_complete:.2f}%")

dataset['sentiment'] = sentiments
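The substantive fix in extract_stuff.py is the progress gating: percentage_complete is a float, so percentage_complete % 5 == 0 is almost never exactly zero and the periodic progress line rarely printed; truncating with int() first lets any batch whose integer percentage lands on a multiple of five report. A small sketch comparing the two conditions on toy numbers:

total_rows = 37          # toy value; the real count comes from the dataset
batch_size = 4

for start in range(0, total_rows, batch_size):
    end = min(start + batch_size, total_rows)
    percentage_complete = end / total_rows * 100

    old_fires = percentage_complete % 5 == 0        # float remainder: almost never exactly 0
    new_fires = int(percentage_complete) % 5 == 0   # integer part on a multiple of 5

    print(f"{end:>2}/{total_rows}  {percentage_complete:6.2f}%  old={old_fires}  new={new_fires}")

Note that this is still a coarse heartbeat: if one batch jumps the percentage from, say, 14% to 16%, no message is printed for that step.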