From d9eaa457e779144503a6deca5fb33f4f76c98748 Mon Sep 17 00:00:00 2001 From: Martin Nguyen Date: Tue, 10 Jun 2025 00:44:10 +0700 Subject: [PATCH 01/27] Initialize code for iterative finetuning --- finetune/commands.py | 41 +++++++ finetune/configs/base_export.yaml | 11 ++ finetune/configs/base_finetune.yaml | 47 +++++++ finetune/main.py | 182 ++++++++++++++++++++++++++++ finetune/utils.py | 21 ++++ 5 files changed, 302 insertions(+) create mode 100644 finetune/commands.py create mode 100644 finetune/configs/base_export.yaml create mode 100644 finetune/configs/base_finetune.yaml create mode 100644 finetune/main.py create mode 100644 finetune/utils.py diff --git a/finetune/commands.py b/finetune/commands.py new file mode 100644 index 0000000..c2f38c8 --- /dev/null +++ b/finetune/commands.py @@ -0,0 +1,41 @@ +LF_FINETUNE = """export DISABLE_VERSION_CHECK=1 +llamafactory-cli train {train_config} +llamafactory-cli export {export_config} +""" + +HELM_EVALUTE = """ +# Binary Classification +helm-run --run-entries \ + ultra_suite_classification:model={model_name} \ + --suite binary-suite \ + --output-path {evaluation_dir} \ + --max-eval-instances 1000 + +# ASR Classification +helm-run --run-entries \ + ultra_suite_classification:model={model_name} \ + --suite asr-suite \ + --output-path {evaluation_dir} \ + --max-eval-instances 1000 + +# ASR Transcription +helm-run --run-entries \ + ultra_suite_asr_transcription:model={model_name} \ + --suite trans-suite \ + --output-path {evaluation_dir} \ + --max-eval-instances 1000 + +# Type Classification +helm-run --run-entries \ + ultra_suite_classification_breakdown:model={model_name} \ + --suite type-suite \ + --output-path {evaluation_dir} \ + --max-eval-instances 1000 + +# Symptom Classification +helm-run --run-entries \ + ultra_suite_disorder_symptoms:model={model_name} \ + --suite symp-suite \ + --output-path {evaluation_dir} \ + --max-eval-instances 1000 +""" diff --git a/finetune/configs/base_export.yaml b/finetune/configs/base_export.yaml new file mode 100644 index 0000000..5bbd621 --- /dev/null +++ b/finetune/configs/base_export.yaml @@ -0,0 +1,11 @@ +### model +model_name_or_path: Qwen/Qwen2.5-Omni-3B +adapter_name_or_path: saves/qwen2_5omni-3b/lora/sft +template: qwen2_omni +trust_remote_code: true + +### export +export_dir: output/qwen2_5omni-3b +export_size: 5 +export_device: cpu # choices: [cpu, auto] +export_legacy_format: false \ No newline at end of file diff --git a/finetune/configs/base_finetune.yaml b/finetune/configs/base_finetune.yaml new file mode 100644 index 0000000..7b796de --- /dev/null +++ b/finetune/configs/base_finetune.yaml @@ -0,0 +1,47 @@ +### model +model_name_or_path: Qwen/Qwen2.5-Omni-3B +trust_remote_code: true + +### method +stage: sft +do_train: true +finetuning_type: lora +lora_rank: 32 +lora_dropout: 0.1 +lora_alpha: 64 +lora_target: all + +### dataset +dataset: ultrasuite +template: qwen2_omni +cutoff_len: 4096 +max_samples: 1200 +overwrite_cache: true +preprocessing_num_workers: 16 +dataloader_num_workers: 4 + +### output +output_dir: saves/qwen2_5omni-3b/lora/sft +logging_steps: 1 +save_steps: 500 +plot_loss: true +overwrite_output_dir: true +save_only_model: false +report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow] + +### train +per_device_train_batch_size: 4 +gradient_accumulation_steps: 8 +learning_rate: 1.0e-4 +num_train_epochs: 3.0 +lr_scheduler_type: cosine +warmup_ratio: 0.1 +bf16: true +ddp_timeout: 180000000 +resume_from_checkpoint: null + +### eval +# val_size: 0.1 +# 
per_device_eval_batch_size: 1 +# eval_strategy: steps +# eval_steps: 500 diff --git a/finetune/main.py b/finetune/main.py new file mode 100644 index 0000000..9f1cb77 --- /dev/null +++ b/finetune/main.py @@ -0,0 +1,182 @@ +import os +import argparse +import yaml + +from datetime import datetime +from commands import LF_FINETUNE, HELM_EVALUTE +from utils import create_dataset + + +def main(args): + # Set environment variables + os.environ["DISABLE_VERSION_CHECK"] = "1" + + # Get number of available GPUs + num_gpus = ( + os.environ.get("CUDA_VISIBLE_DEVICES", "").count(",") + 1 + if os.environ.get("CUDA_VISIBLE_DEVICES") + else 0 + ) + if num_gpus == 0: + raise RuntimeError("Please set CUDA_VISIBLE_DEVICES.") + + current_model_name = args.model_name + current_dataset = None + current_adapter_name = None + current_finetune_config = None + current_export_config = None + + for iter in range(args.niteration): + print( + f"Starting finetuning iteration {iter + 1} at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}" + ) + + # Prepare dataset + current_dataset = create_dataset( + model_name=current_model_name, + num_samples=args.nsample, + ) + + # Read the training and export configurations + train_config = yaml.safe_load(open(args.train_config, "r", encoding="utf-8")) + export_config = yaml.safe_load(open(args.export_config, "r", encoding="utf-8")) + + # Update configurations with command line arguments + train_config["model_name_or_path"] = current_model_name + train_config["dataset"] = current_dataset + train_config["max_samples"] = args.nsample + train_config["output_dir"] = f"{args.finetuning_dir}/iter_{iter + 1}/ckpts" + train_config["per_device_train_batch_size"] = min( + args.batch_size // num_gpus, 4 + ) + train_config["gradient_accumulation_steps"] = max( + 1, + args.batch_size // (train_config["per_device_train_batch_size"] * num_gpus), + ) + current_adapter_name = train_config["output_dir"] + + # Update export configuration + export_config["model_name_or_path"] = current_model_name + export_config["adapter_name_or_path"] = current_adapter_name + export_config["export_dir"] = f"{args.finetuning_dir}/iter_{iter + 1}/model" + current_model_name = export_config["export_dir"] + + # Save the updated configurations back to the files + current_finetune_config = "configs/finetune_iter_{}.yaml".format(iter) + current_export_config = "configs/export_iter_{}.yaml".format(iter) + with open(current_finetune_config, "w", encoding="utf-8") as f: + yaml.safe_dump(train_config, f, default_flow_style=False) + with open(current_export_config, "w", encoding="utf-8") as f: + yaml.safe_dump(export_config, f, default_flow_style=False) + + # Log the current configuration + print(f"Using model: {args.model_name}") + print(f"Using dataset: {args.dataset}") + print(f"Evaluation directory: {args.evaluation_dir}") + print(f"Finetuning directory: {args.finetuning_dir}") + print(f"Number of samples per iteration: {args.nsample}") + print( + f"Finetuning iteration {iter + 1} started at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}" + ) + # Log the training configuration + print(f"Training configuration: {current_finetune_config}") + print(f"Export configuration: {current_export_config}") + # Log the environment variables + print(f"Environment variables set for iteration {iter + 1}:") + print(f"MODEL_NAME: {os.environ['MODEL_NAME']}") + print(f"DATASET: {os.environ['DATASET']}") + print(f"EVALUATION_DIR: {os.environ['EVALUATION_DIR']}") + print(f"FINETUNING_DIR: {os.environ['FINETUNING_DIR']}") + print(f"NSAMPLE: 
{os.environ['NSAMPLE']}") + + # Log the finetuning command + print( + f"Running finetuning command: {LF_FINETUNE.format(train_config=current_finetune_config, export_config=current_export_config)}" + ) + # Run finetuning command + os.system( + LF_FINETUNE.format( + train_config=current_finetune_config, + export_config=current_export_config, + ) + ) + + # Log the finetuning completion + print(f"Finetuning completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + + # Log the evaluation command + print( + f"Running evaluation command: {HELM_EVALUTE.format(evaluation_dir=f'{args.evaluation_dir}/iter_{iter + 1}', finetuning_dir=f'{args.finetuning_dir}/iter_{iter + 1}')}" + ) + + # Run evaluation command + os.system( + HELM_EVALUTE.format( + model_name=current_model_name, + evaluation_dir=f"{args.evaluation_dir}/iter_{iter + 1}", + ) + ) + + # Log the evaluation completion + print(f"Evaluation completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Run finetuning with LlamaFactory CLI." + ) + parser.add_argument( + "--train_config", + type=str, + required=True, + default="./configs/base_finetune.yaml", + help="Path to the training configuration file.", + ) + parser.add_argument( + "--export_config", + type=str, + required=True, + default="./configs/base_export.yaml", + help="Path to the export configuration file.", + ) + parser.add_argument( + "--model_name", + type=str, + default="Qwen/Qwen2.5-Omni-3B", + help="Name of the model to finetune.", + ) + parser.add_argument( + "--evaluation_dir", + type=str, + default="./evaluation_results", + help="Directory to save evaluation results.", + ) + parser.add_argument( + "--finetuning_dir", + type=str, + default="./finetuning_results", + help="Directory to save finetuning results.", + ) + parser.add_argument( + "--niteration", + type=int, + default=5, + help="Number of finetuning iterations to perform.", + ) + parser.add_argument( + "--batch_size", type=int, default=32, help="Batch size for training." + ) + parser.add_argument( + "--nsample", + type=int, + default=1000, + help="Number of samples to use for each finetuning iteration.", + ) + args = parser.parse_args() + + # Ensure directories exist + os.makedirs(args.evaluation_dir, exist_ok=True) + os.makedirs(args.finetuning_dir, exist_ok=True) + + # Run the main function + main(args) diff --git a/finetune/utils.py b/finetune/utils.py new file mode 100644 index 0000000..03d9af2 --- /dev/null +++ b/finetune/utils.py @@ -0,0 +1,21 @@ +def create_dataset(model_name, num_samples): + """ + Create a dataset for the given model name and number of samples. + + Args: + model_name (str): The name of the model. + num_samples (int): The number of samples to create. + + Returns: + str: The path to the created dataset. + """ + # Placeholder for dataset creation logic + # In a real implementation, this function would create or load a dataset + # based on the model name and number of samples. 
+    dataset_path = f"datasets/{model_name}_dataset_{num_samples}.json"
+
+    # Simulate dataset creation
+    with open(dataset_path, "w", encoding="utf-8") as f:
+        f.write(f"Dataset for {model_name} with {num_samples} samples.")
+
+    return dataset_path

From f8dd429e1b7d28d6c623eb0271179868cc2e613a Mon Sep 17 00:00:00 2001
From: Martin Nguyen
Date: Thu, 12 Jun 2025 23:45:06 +0700
Subject: [PATCH 02/27] WIP: Creating dataset by prompting the LLM

---
 finetune/{commands.py => constants.py} |  14 +++
 finetune/main.py                       |   2 +-
 finetune/utils.py                      | 157 ++++++++++++++++++++++++-
 3 files changed, 171 insertions(+), 2 deletions(-)
 rename finetune/{commands.py => constants.py} (51%)

diff --git a/finetune/commands.py b/finetune/constants.py
similarity index 51%
rename from finetune/commands.py
rename to finetune/constants.py
index c2f38c8..3959ebe 100644
--- a/finetune/commands.py
+++ b/finetune/constants.py
@@ -39,3 +39,17 @@
     --output-path {evaluation_dir} \
     --max-eval-instances 1000
 """
+
+SYSTEM_PROMPT = "You are a helpful assistant."
+TRANSCRIPT_GENERATION_PROMPT = "Assume you are a speech therapist who is helping children with speech disorders. Generate a sentence containing 3-10 words that is suitable for testing children's speech disorders. Return only the sentence and nothing else."
+SPEECH_GENERATION_PROMPT = """Assume you are a child with a speech disorder. {disorder_description} Generate the corresponding transcript with the `{disorder_type} disorder` when the child says "{transcript}". Return only the transcript text and nothing else."""
+
+DISORDER_DESCRIPTION = {
+    "addition": "The children with addition disorder will insert an extra sound into a word.",
+    "substitution": "The children with substitution disorder will replace a sound in a word with another sound.",
+    "omission": "The children with omission disorder will leave out a sound in a word.",
+    "stuttering": "The children with stuttering disorder will repeat a sound or syllable in a word.",
+}
+
+# set use audio in video
+USE_AUDIO_IN_VIDEO = True
diff --git a/finetune/main.py b/finetune/main.py
index 9f1cb77..4fe5160 100644
--- a/finetune/main.py
+++ b/finetune/main.py
@@ -3,7 +3,7 @@
 import yaml
 
 from datetime import datetime
-from commands import LF_FINETUNE, HELM_EVALUTE
+from finetune.constants import LF_FINETUNE, HELM_EVALUTE
 from utils import create_dataset
 
 
diff --git a/finetune/utils.py b/finetune/utils.py
index 03d9af2..a14769f 100644
--- a/finetune/utils.py
+++ b/finetune/utils.py
@@ -1,4 +1,105 @@
-def create_dataset(model_name, num_samples):
+from constants import (
+    SYSTEM_PROMPT,
+    TRANSCRIPT_GENERATION_PROMPT,
+    SPEECH_GENERATION_PROMPT,
+    DISORDER_DESCRIPTION,
+    USE_AUDIO_IN_VIDEO,
+)
+from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
+from qwen_omni_utils import process_mm_info
+import soundfile as sf
+import random
+
+
+def load_model(model_name, attn_implementation="flash_attention_2"):
+    if "qwen2.5-omni" in model_name.lower():
+        model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
+            model_name,
+            torch_dtype="auto",
+            device_map="auto",
+            attn_implementation=attn_implementation,
+        )
+
+        processor = Qwen2_5OmniProcessor.from_pretrained(model_name)
+    else:
+        raise ValueError(f"Unsupported model name: {model_name}")
+    return model, processor
+
+
+def inference(model, processor, prompts, speaker="Ethan", **kwargs):
+    """
+    Perform inference using the model and processor.
+
+    Args:
+        model: The loaded model.
+        processor: The processor for the model.
+ prompts (list): A list of input prompts for inference. + speaker (str): The speaker's name or identifier. + + Returns: + str: The generated output from the model. + """ + + conversations = [] + for prompt in prompts: + conversation = [ + { + "role": "system", + "content": [ + {"type": "text", "text": SYSTEM_PROMPT}, + ], + }, + { + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + ], + }, + ] + conversations.append(conversation) + + # Preparation for inference + input_texts = processor.apply_chat_template( + conversations, add_generation_prompt=True, tokenize=False + ) + audios, images, videos = process_mm_info( + conversations, use_audio_in_video=USE_AUDIO_IN_VIDEO + ) + inputs = processor( + text=input_texts, + audio=audios, + images=images, + videos=videos, + return_tensors="pt", + padding=True, + use_audio_in_video=USE_AUDIO_IN_VIDEO, + ) + inputs = inputs.to(model.device).to(model.dtype) + + # Inference: Generation of the output text and audio + text_ids, audios = model.generate( + **inputs, use_audio_in_video=USE_AUDIO_IN_VIDEO, speaker=speaker, **kwargs + ) + + output_texts = processor.batch_decode( + text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False + ) + + return output_texts, audios + + +def save_sound_file(audio, file_path): + """ + Save the audio data to a file. + + Args: + audio (np.ndarray): The audio data to save. + file_path (str): The path where the audio file will be saved. + """ + sf.write(file_path, audio, samplerate=24000) + + +def create_dataset(model_name, num_samples, batch_size=32): """ Create a dataset for the given model name and number of samples. @@ -9,6 +110,60 @@ def create_dataset(model_name, num_samples): Returns: str: The path to the created dataset. """ + + model, processor = load_model(model_name) + + list_of_disorder_types = list(DISORDER_DESCRIPTION.keys()) + num_disorder_types = len(list_of_disorder_types) + + list_speakers = ["Chelsie", "Ethan"] + + # Randomly select disorder types for each sample + disorder_types = [ + list_of_disorder_types[i % num_disorder_types] for i in range(num_samples) + ] + + normal_transcripts, normal_audios = [], [] + breakpoint() + for speaker in list_speakers: + transcripts, audios = inference( + model, + processor, + prompts=[TRANSCRIPT_GENERATION_PROMPT], + speaker=speaker, + num_return_sequences=num_samples, + do_sample=True, + max_new_tokens=32, + ) + normal_transcripts.extend(transcripts) + normal_audios.extend(audios) + + speech_generation_prompts = [ + SPEECH_GENERATION_PROMPT.format( + disorder_description=DISORDER_DESCRIPTION[disorder_type], + disorder_type=disorder_type, + transcript=transcript, + ) + for disorder_type, transcript in zip(disorder_types, normal_transcripts) + ] + disorderd_transcripts, disorder_audios = [], [] + + # inference for disordered speech by batch + for i in range(0, num_samples, batch_size): + batch_prompts = speech_generation_prompts[i : i + batch_size] + speaker = random.choice(list_speakers) + transcripts, audios = inference( + model, + processor, + prompts=batch_prompts, + speaker=speaker, + num_return_sequences=1, + do_sample=False, + max_new_tokens=32, + ) + disorderd_transcripts.extend(transcripts) + disorder_audios.extend(audios) + # Placeholder for dataset creation logic # In a real implementation, this function would create or load a dataset # based on the model name and number of samples. 
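
A note on the dataset pipeline introduced above: create_dataset() now works in two prompting stages. The model first writes clean candidate sentences (TRANSCRIPT_GENERATION_PROMPT), then produces a disordered rendition of each (SPEECH_GENERATION_PROMPT). A minimal sketch of how the new helpers compose, assuming the finetune/ directory is on PYTHONPATH, a GPU large enough for Qwen2.5-Omni-3B, and an illustrative transcript that is not taken from any dataset:

    # Sketch only: exercises the helpers added in PATCH 02 end to end.
    # Assumptions: finetune/ is on PYTHONPATH and a GPU is available; the
    # transcript below is invented for illustration.
    from utils import load_model, inference, save_sound_file
    from constants import SPEECH_GENERATION_PROMPT, DISORDER_DESCRIPTION

    model, processor = load_model("Qwen/Qwen2.5-Omni-3B")

    prompt = SPEECH_GENERATION_PROMPT.format(
        disorder_description=DISORDER_DESCRIPTION["stuttering"],
        disorder_type="stuttering",
        transcript="The sun is warm today.",  # illustrative input
    )
    texts, audios = inference(
        model,
        processor,
        prompts=[prompt],
        speaker="Ethan",
        do_sample=False,
        max_new_tokens=32,
    )
    print(texts[0])  # generated disordered transcript
    save_sound_file(audios[0], "stuttering_sample.wav")  # 24 kHz WAV

Two caveats at this point in the series: inference() decodes the whole sequence, prompt included, and model.generate() returns the audio as a torch tensor, which soundfile may refuse to write. PATCH 07 below addresses both, slicing the output with n_input_tokens and converting the audio via .detach().cpu().numpy().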
From 20ab384e81b6e197e9c55bab28c0b49568ad900 Mon Sep 17 00:00:00 2001
From: Martin Nguyen
Date: Fri, 13 Jun 2025 00:03:42 +0700
Subject: [PATCH 03/27] Fix import bug

---
 finetune/main.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/finetune/main.py b/finetune/main.py
index 4fe5160..11b3c7a 100644
--- a/finetune/main.py
+++ b/finetune/main.py
@@ -3,7 +3,7 @@
 import yaml
 
 from datetime import datetime
-from finetune.constants import LF_FINETUNE, HELM_EVALUTE
+from constants import LF_FINETUNE, HELM_EVALUTE
 from utils import create_dataset
 
 
@@ -38,8 +38,10 @@ def main(args):
         )
 
         # Read the training and export configurations
-        train_config = yaml.safe_load(open(args.train_config, "r", encoding="utf-8"))
-        export_config = yaml.safe_load(open(args.export_config, "r", encoding="utf-8"))
+        train_config = yaml.safe_load(
+            open(args.train_config, "r", encoding="utf-8"))
+        export_config = yaml.safe_load(
+            open(args.export_config, "r", encoding="utf-8"))
 
         # Update configurations with command line arguments
         train_config["model_name_or_path"] = current_model_name
@@ -51,7 +53,8 @@ def main(args):
         )
         train_config["gradient_accumulation_steps"] = max(
             1,
-            args.batch_size // (train_config["per_device_train_batch_size"] * num_gpus),
+            args.batch_size // (
+                train_config["per_device_train_batch_size"] * num_gpus),
         )
         current_adapter_name = train_config["output_dir"]
 
@@ -102,7 +105,8 @@ def main(args):
         )
 
         # Log the finetuning completion
-        print(f"Finetuning completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+        print(
+            f"Finetuning completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
 
         # Log the evaluation command
         print(
@@ -118,7 +122,8 @@ def main(args):
         )
 
         # Log the evaluation completion
-        print(f"Evaluation completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+        print(
+            f"Evaluation completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

From 346ec50088a2b6bb7b33d465ae152d634cd9299c Mon Sep 17 00:00:00 2001
From: Martin Nguyen
Date: Fri, 13 Jun 2025 00:05:11 +0700
Subject: [PATCH 04/27] Remove unnecessary required arguments

---
 finetune/main.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/finetune/main.py b/finetune/main.py
index 11b3c7a..b24a847 100644
--- a/finetune/main.py
+++ b/finetune/main.py
@@ -133,14 +133,12 @@ def main(args):
     parser.add_argument(
         "--train_config",
         type=str,
-        required=True,
         default="./configs/base_finetune.yaml",
         help="Path to the training configuration file.",
     )
     parser.add_argument(
         "--export_config",
         type=str,
-        required=True,
         default="./configs/base_export.yaml",
         help="Path to the export configuration file.",
     )

From 6c4c8ba4a87af7a3a7d4ebcbae1eef47a93be159 Mon Sep 17 00:00:00 2001
From: Martin Nguyen
Date: Fri, 13 Jun 2025 00:12:58 +0700
Subject: [PATCH 05/27] Remove default flash_attention_2

---
 finetune/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/finetune/utils.py b/finetune/utils.py
index a14769f..0d09434 100644
--- a/finetune/utils.py
+++ b/finetune/utils.py
@@ -11,7 +11,7 @@
 import random
 
 
-def load_model(model_name, attn_implementation="flash_attention_2"):
+def load_model(model_name, attn_implementation=None):
     if "qwen2.5-omni" in model_name.lower():
         model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
             model_name,
             torch_dtype="auto",
             device_map="auto",
             attn_implementation=attn_implementation,
         )

From c078e8fb1a4ea29b775c95e3f75a1db4bebd72b0 Mon Sep 17 00:00:00 2001
From: Martin Nguyen
Date: Fri, 13 Jun 2025 00:16:48 +0700
Subject: [PATCH 06/27] Fix system prompt

---
 finetune/constants.py | 2 +-
 finetune/main.py      | 1 +
 finetune/utils.py     | 2 +-
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/finetune/constants.py b/finetune/constants.py
index 3959ebe..82403b5 100644
--- a/finetune/constants.py
+++ b/finetune/constants.py
@@ -40,7 +40,7 @@
     --max-eval-instances 1000
 """
 
-SYSTEM_PROMPT = "You are a helpful assistant."
+SYSTEM_PROMPT = "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."
 TRANSCRIPT_GENERATION_PROMPT = "Assume you are a speech therapist who is helping children with speech disorders. Generate a sentence containing 3-10 words that is suitable for testing children's speech disorders. Return only the sentence and nothing else."
 SPEECH_GENERATION_PROMPT = """Assume you are a child with a speech disorder. {disorder_description} Generate the corresponding transcript with the `{disorder_type} disorder` when the child says "{transcript}". Return only the transcript text and nothing else."""
 
diff --git a/finetune/main.py b/finetune/main.py
index b24a847..2ba2a99 100644
--- a/finetune/main.py
+++ b/finetune/main.py
@@ -35,6 +35,7 @@ def main(args):
         current_dataset = create_dataset(
             model_name=current_model_name,
             num_samples=args.nsample,
+            batch_size=args.batch_size,
         )
 
         # Read the training and export configurations
diff --git a/finetune/utils.py b/finetune/utils.py
index 0d09434..83fdeb6 100644
--- a/finetune/utils.py
+++ b/finetune/utils.py
@@ -150,7 +150,7 @@ def create_dataset(model_name, num_samples, batch_size=32):
 
     # inference for disordered speech by batch
     for i in range(0, num_samples, batch_size):
-        batch_prompts = speech_generation_prompts[i : i + batch_size]
+        batch_prompts = speech_generation_prompts[i: i + batch_size]
         speaker = random.choice(list_speakers)
         transcripts, audios = inference(
             model,

From 5864992b31f71d30e86af5c9915cdb1f7e774ebd Mon Sep 17 00:00:00 2001
From: Martin Nguyen
Date: Sat, 14 Jun 2025 13:19:03 -0700
Subject: [PATCH 07/27] Update finetuning code

---
 finetune/constants.py           |  15 +-
 finetune/data/dataset_info.json |  16 ++
 finetune/main.py                |  31 +--
 finetune/utils.py               | 362 +++++++++++++++++++++++++++-----
 4 files changed, 342 insertions(+), 82 deletions(-)
 create mode 100644 finetune/data/dataset_info.json

diff --git a/finetune/constants.py b/finetune/constants.py
index 82403b5..e152ff0 100644
--- a/finetune/constants.py
+++ b/finetune/constants.py
@@ -41,14 +41,17 @@
 """
 
 SYSTEM_PROMPT = "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."
-TRANSCRIPT_GENERATION_PROMPT = "Assume you are a speech therapist who is helping children with speech disorders. Generate a sentence containing 3-10 words that is suitable for testing children's speech disorders. Return only the sentence and nothing else."
-SPEECH_GENERATION_PROMPT = """Assume you are a child with a speech disorder. {disorder_description} Generate the corresponding transcript with the `{disorder_type} disorder` when the child says "{transcript}". Return only the transcript text and nothing else."""
+TRANSCRIPT_GENERATION_PROMPT = (
+    "Generate one simple sentence. Return only the sentence and nothing else."
+)
+NORMAL_SPEECH_GENERATION_PROMPT = "Repeat the following sentence. Return only the sentence and nothing else. The sentence is: {transcript}"
+DISORDERED_SPEECH_GENERATION_PROMPT = """Assume you are a child with a speech disorder. {disorder_description} Generate the corresponding transcript with the `{disorder_type} disorder` when the child says: "{transcript}". Return only the transcript text and nothing else."""
 
 DISORDER_DESCRIPTION = {
-    "addition": "The children with addition disorder will insert an extra sound into a word.",
-    "substitution": "The children with substitution disorder will replace a sound in a word with another sound.",
-    "omission": "The children with omission disorder will leave out a sound in a word.",
-    "stuttering": "The children with stuttering disorder will repeat a sound or syllable in a word.",
+    "addition": "The children with `addition disorder` will insert an extra sound into a word.",
+    "substitution": "The children with `substitution disorder` will replace a sound in a word with another sound.",
+    "omission": "The children with `omission disorder` will leave out a sound in a word.",
+    "stuttering": "The children with `stuttering disorder` will repeat a sound or syllable in a word.",
 }
 
 # set use audio in video
diff --git a/finetune/data/dataset_info.json b/finetune/data/dataset_info.json
new file mode 100644
index 0000000..1569541
--- /dev/null
+++ b/finetune/data/dataset_info.json
@@ -0,0 +1,16 @@
+{
+    "qwen2.5-omni-3b_dataset_4_iter0": {
+        "file_name": "qwen2.5-omni-3b_dataset_4_iter0/qwen2.5-omni-3b_dataset_4_iter0.json",
+        "formatting": "sharegpt",
+        "columns": {
+            "messages": "messages",
+            "audios": "audios"
+        },
+        "tags": {
+            "role_tag": "role",
+            "content_tag": "content",
+            "user_tag": "user",
+            "assistant_tag": "assistant"
+        }
+    }
+}
\ No newline at end of file
diff --git a/finetune/main.py b/finetune/main.py
index 2ba2a99..b0010cd 100644
--- a/finetune/main.py
+++ b/finetune/main.py
@@ -36,13 +36,12 @@ def main(args):
             model_name=current_model_name,
             num_samples=args.nsample,
             batch_size=args.batch_size,
+            iteration=iter,
         )
 
         # Read the training and export configurations
-        train_config = yaml.safe_load(
-            open(args.train_config, "r", encoding="utf-8"))
-        export_config = yaml.safe_load(
-            open(args.export_config, "r", encoding="utf-8"))
+        train_config = yaml.safe_load(open(args.train_config, "r", encoding="utf-8"))
+        export_config = yaml.safe_load(open(args.export_config, "r", encoding="utf-8"))
 
         # Update configurations with command line arguments
         train_config["model_name_or_path"] = current_model_name
@@ -54,15 +53,14 @@ def main(args):
         )
         train_config["gradient_accumulation_steps"] = max(
             1,
-            args.batch_size // (
-                train_config["per_device_train_batch_size"] * num_gpus),
+            args.batch_size // (train_config["per_device_train_batch_size"] * num_gpus),
         )
         current_adapter_name = train_config["output_dir"]
 
         # Update export configuration
         export_config["model_name_or_path"] = current_model_name
         export_config["adapter_name_or_path"] = current_adapter_name
-        export_config["export_dir"] = f"{args.finetuning_dir}/iter_{iter + 1}/model"
+        export_config["export_dir"] = f"{args.finetuning_dir}/iter_{iter + 1}"
         current_model_name = export_config["export_dir"]
 
@@ -74,8 +72,8 @@ def main(args):
         # Log the current configuration
-        print(f"Using model: {args.model_name}")
-        print(f"Using dataset: {args.dataset}")
+        print(f"Using model: {current_model_name}")
+        print(f"Using dataset: {current_dataset}")
         print(f"Evaluation directory: {args.evaluation_dir}")
         print(f"Finetuning directory: {args.finetuning_dir}")
         print(f"Number of samples per iteration: {args.nsample}")
@@ -85,13 +83,6 @@ def 
main(args): # Log the training configuration print(f"Training configuration: {current_finetune_config}") print(f"Export configuration: {current_export_config}") - # Log the environment variables - print(f"Environment variables set for iteration {iter + 1}:") - print(f"MODEL_NAME: {os.environ['MODEL_NAME']}") - print(f"DATASET: {os.environ['DATASET']}") - print(f"EVALUATION_DIR: {os.environ['EVALUATION_DIR']}") - print(f"FINETUNING_DIR: {os.environ['FINETUNING_DIR']}") - print(f"NSAMPLE: {os.environ['NSAMPLE']}") # Log the finetuning command print( @@ -106,12 +97,11 @@ def main(args): ) # Log the finetuning completion - print( - f"Finetuning completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + print(f"Finetuning completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") # Log the evaluation command print( - f"Running evaluation command: {HELM_EVALUTE.format(evaluation_dir=f'{args.evaluation_dir}/iter_{iter + 1}', finetuning_dir=f'{args.finetuning_dir}/iter_{iter + 1}')}" + f"Running evaluation command: {HELM_EVALUTE.format(evaluation_dir=f'{args.evaluation_dir}/iter_{iter + 1}', model_name=current_model_name)}" ) # Run evaluation command @@ -123,8 +113,7 @@ def main(args): ) # Log the evaluation completion - print( - f"Evaluation completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + print(f"Evaluation completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") if __name__ == "__main__": diff --git a/finetune/utils.py b/finetune/utils.py index 83fdeb6..cc69816 100644 --- a/finetune/utils.py +++ b/finetune/utils.py @@ -1,32 +1,42 @@ from constants import ( SYSTEM_PROMPT, TRANSCRIPT_GENERATION_PROMPT, - SPEECH_GENERATION_PROMPT, + NORMAL_SPEECH_GENERATION_PROMPT, + DISORDERED_SPEECH_GENERATION_PROMPT, DISORDER_DESCRIPTION, USE_AUDIO_IN_VIDEO, ) -from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor +from transformers import ( + Qwen2_5OmniForConditionalGeneration, + Qwen2_5OmniProcessor, + GenerationConfig, +) from qwen_omni_utils import process_mm_info +from nltk import tokenize +from tqdm import tqdm import soundfile as sf import random +import re +import os +import json def load_model(model_name, attn_implementation=None): if "qwen2.5-omni" in model_name.lower(): + generation_config = GenerationConfig.from_pretrained(model_name) model = Qwen2_5OmniForConditionalGeneration.from_pretrained( model_name, torch_dtype="auto", device_map="auto", attn_implementation=attn_implementation, ) - processor = Qwen2_5OmniProcessor.from_pretrained(model_name) else: raise ValueError(f"Unsupported model name: {model_name}") - return model, processor + return generation_config, model, processor -def inference(model, processor, prompts, speaker="Ethan", **kwargs): +def inference(model, processor, prompts, speaker="Ethan", return_audio=False, **kwargs): """ Perform inference using the model and processor. 
@@ -75,17 +85,36 @@ def inference(model, processor, prompts, speaker="Ethan", **kwargs): use_audio_in_video=USE_AUDIO_IN_VIDEO, ) inputs = inputs.to(model.device).to(model.dtype) + n_input_tokens = inputs.input_ids.shape[1] # Inference: Generation of the output text and audio - text_ids, audios = model.generate( - **inputs, use_audio_in_video=USE_AUDIO_IN_VIDEO, speaker=speaker, **kwargs - ) + if return_audio: + text_ids, audios = model.generate( + **inputs, + use_audio_in_video=USE_AUDIO_IN_VIDEO, + speaker=speaker, + return_audio=return_audio, + **kwargs, + ) + else: + text_ids = model.generate( + **inputs, + use_audio_in_video=USE_AUDIO_IN_VIDEO, + speaker=speaker, + return_audio=return_audio, + **kwargs, + ) output_texts = processor.batch_decode( - text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False + text_ids[:, n_input_tokens:], + skip_special_tokens=True, + clean_up_tokenization_spaces=False, ) - return output_texts, audios + if return_audio: + return output_texts, audios.detach().cpu().numpy() + + return output_texts def save_sound_file(audio, file_path): @@ -99,7 +128,120 @@ def save_sound_file(audio, file_path): sf.write(file_path, audio, samplerate=24000) -def create_dataset(model_name, num_samples, batch_size=32): +def verify_transcript(transcripts): + """ + Verify and clean up the generated transcripts. + + Args: + transcripts (list): A list of generated transcripts. + + Returns: + list: A list of verified transcripts. + """ + verified = [] + for transcript in transcripts: + # Simple verification: check if the transcript is not empty + if transcript.strip(): + list_sents = tokenize.sent_tokenize( + transcript + ) # Ensure it can be tokenized + for sent in list_sents: + if len(sent.split()) >= 3: + verified.append(sent.strip()) + break + return verified + + +def post_process_transcripts(transcripts): + """ + Post-process the transcripts to remove unnecessary characters and whitespace. + + Args: + transcripts (list): A list of generated transcripts. + + Returns: + list: A list of post-processed transcripts. 
+ """ + processed = [] + for transcript in transcripts: + # Remove leading/trailing whitespace and ensure it's not empty + cleaned = transcript.strip() + + # Remove " or ' at the beginning and end + if cleaned.startswith('"'): + cleaned = cleaned.replace('"', "").strip() + elif cleaned.startswith("'"): + cleaned = cleaned.replace("'", "").strip() + + processed.append(cleaned) + return processed + + +def make_disorder_transcript(normal_transcript, disorder_type, error_rate=0.4): + vowels = "aeiou" + consonants = "bcdfghjklmnpqrstvwxyz" + words = normal_transcript.split() + substitution_map = {} + + # Build a consistent substitution map per sentence + def apply_substitution(word): + new_chars = list(word) + for i, c in enumerate(word): + cl = c.lower() + if cl in consonants: + if cl not in substitution_map: + substitute_options = [x for x in consonants if x != cl] + substitution_map[cl] = random.choice(substitute_options) + new_char = substitution_map[cl] + new_chars[i] = new_char.upper() if c.isupper() else new_char + break # only one substitution per word + return "".join(new_chars) + + def apply_addition(word): + if len(word) <= 1: + return word + # Insert a random vowel somewhere in the word + insert_pos = random.randint(1, len(word) - 1) + insert_char = random.choice(vowels) + return word[:insert_pos] + insert_char + word[insert_pos:] + + def apply_omission(word): + if len(word) <= 1: + return word + # Drop a random character (not punctuation) + omit_pos = random.randint(0, len(word) - 1) + return word[:omit_pos] + word[omit_pos + 1 :] + + def apply_stuttering(word): + # Repeat first phoneme-like unit (simplified to first letter or first consonant-vowel combo) + match = re.match(r"^([^aeiou]*[aeiou]?)", word, re.IGNORECASE) + prefix = match.group(0) if match else word[0] + return f"{prefix}-{prefix}-{word}" + + # Mapping disorder to function + disorder_fn = { + "addition": apply_addition, + "substitution": apply_substitution, + "omission": apply_omission, + "stuttering": apply_stuttering, + } + + if disorder_type not in disorder_fn: + raise ValueError(f"Unsupported disorder type: {disorder_type}") + + apply_fn = disorder_fn[disorder_type] + + # Apply the disorder randomly across words + disordered_words = [ + apply_fn(word) if random.random() < error_rate else word for word in words + ] + + return " ".join(disordered_words) + + +def create_dataset( + model_name, num_samples, batch_size=32, data_dir="data", iteration=0 +): """ Create a dataset for the given model name and number of samples. @@ -110,67 +252,177 @@ def create_dataset(model_name, num_samples, batch_size=32): Returns: str: The path to the created dataset. 
""" + # Load the model and processor + generation_config, model, processor = load_model(model_name) - model, processor = load_model(model_name) + # Create the dataset directory + dataset_name = ( + f"{model_name.split('/')[-1].lower()}_dataset_{num_samples}_iter{iteration}" + ) + data_dir = os.path.join(data_dir, dataset_name) + os.makedirs(data_dir, exist_ok=True) + # Define prompts and disorder types list_of_disorder_types = list(DISORDER_DESCRIPTION.keys()) num_disorder_types = len(list_of_disorder_types) - list_speakers = ["Chelsie", "Ethan"] - # Randomly select disorder types for each sample - disorder_types = [ - list_of_disorder_types[i % num_disorder_types] for i in range(num_samples) - ] + # Generate normal speech transcripts + normal_transcripts = inference( + model, + processor, + prompts=[TRANSCRIPT_GENERATION_PROMPT], + num_return_sequences=num_samples, + do_sample=True, + max_new_tokens=32, + return_audio=False, + generation_config=generation_config, + ) + normal_transcripts = verify_transcript(normal_transcripts) - normal_transcripts, normal_audios = [], [] - breakpoint() - for speaker in list_speakers: - transcripts, audios = inference( - model, - processor, - prompts=[TRANSCRIPT_GENERATION_PROMPT], - speaker=speaker, - num_return_sequences=num_samples, - do_sample=True, - max_new_tokens=32, - ) - normal_transcripts.extend(transcripts) - normal_audios.extend(audios) - - speech_generation_prompts = [ - SPEECH_GENERATION_PROMPT.format( - disorder_description=DISORDER_DESCRIPTION[disorder_type], - disorder_type=disorder_type, - transcript=transcript, - ) - for disorder_type, transcript in zip(disorder_types, normal_transcripts) + # Generate normal speech audios + normal_audios = [] + normal_speech_generation_prompts = [ + NORMAL_SPEECH_GENERATION_PROMPT.format(transcript=transcript) + for transcript in normal_transcripts ] - disorderd_transcripts, disorder_audios = [], [] - # inference for disordered speech by batch - for i in range(0, num_samples, batch_size): - batch_prompts = speech_generation_prompts[i: i + batch_size] + # Inference for normal speech by batch + for transcript in tqdm(normal_transcripts, desc="Generating normal speech audios"): + speaker = random.choice(list_speakers) + for _ in range(5): # Retry up to 5 times to ensure transcript matches + dc_transcript, audio = inference( + model, + processor, + prompts=[NORMAL_SPEECH_GENERATION_PROMPT.format(transcript=transcript)], + speaker=speaker, + num_return_sequences=1, + do_sample=False, + max_new_tokens=32, + return_audio=True, + generation_config=generation_config, + ) + dc_transcript = post_process_transcripts(dc_transcript) + if dc_transcript[0] == transcript: + normal_audios.append(audio) + break + + # Save normal speech audios + for i, audio in enumerate(normal_audios): + audio_file_path = os.path.join(data_dir, f"normal_speech_{i}.wav") + save_sound_file(audio, audio_file_path) + + # Generate disordered speech transcripts + disordered_transcripts, disordered_audios = [], [] + for transcript in tqdm( + normal_transcripts, desc="Generating disordered speech audios" + ): speaker = random.choice(list_speakers) - transcripts, audios = inference( + disorder_type = random.choice(list_of_disorder_types) + dc_transcripts, audios = inference( model, processor, - prompts=batch_prompts, + prompts=[ + DISORDERED_SPEECH_GENERATION_PROMPT.format( + disorder_description=DISORDER_DESCRIPTION[disorder_type], + disorder_type=disorder_type, + transcript=transcript, + ) + ], speaker=speaker, num_return_sequences=1, 
do_sample=False, max_new_tokens=32, + return_audio=True, + ) + dc_transcripts = post_process_transcripts(dc_transcripts) + if dc_transcripts[0] != transcript: + disordered_transcripts.append(dc_transcripts[0]) + disordered_audios.append(audios) + else: + # If the disordered transcript is the same as the normal one, retry + for _ in range(10): # Retry up to 10 times + disordered_transcript = make_disorder_transcript( + transcript, disorder_type + ) + if disordered_transcript != transcript: + disordered_transcripts.append(disordered_transcript) + break + else: + disorder_type = random.choice(list_of_disorder_types) + + _, audio = inference( + model, + processor, + prompts=[NORMAL_SPEECH_GENERATION_PROMPT.format(transcript=transcript)], + speaker=speaker, + num_return_sequences=1, + do_sample=False, + max_new_tokens=32, + return_audio=True, + generation_config=generation_config, + ) + disordered_audios.append(audio) + # Save disordered speech audios + for i, audio in enumerate(disordered_audios): + audio_file_path = os.path.join(data_dir, f"disordered_speech_{i}.wav") + save_sound_file(audio, audio_file_path) + + # Save transcripts to a file + list_samples = [] + + for i, transcript in enumerate(normal_transcripts): + wav_path = os.path.join(dataset_name, f"normal_speech_{i}.wav") + list_samples.append( + { + "messages": [ + { + "content": "
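
For reference, finetune/data/dataset_info.json above registers the generated file in LLaMA-Factory's ShareGPT layout, with "messages" and "audios" columns and user/assistant role tags. Each entry appended to list_samples should therefore take roughly the shape sketched below; the question and answer strings are hypothetical placeholders, not text from the patch:

    # Sketch of one plausible list_samples entry, inferred from
    # dataset_info.json ("sharegpt" formatting, "messages"/"audios" columns)
    # and from wav_path = os.path.join(dataset_name, f"normal_speech_{i}.wav")
    # above. The message contents are invented for illustration.
    import os

    dataset_name = "qwen2.5-omni-3b_dataset_4_iter0"  # example name from dataset_info.json
    sample = {
        "messages": [
            {
                "role": "user",
                # "<audio>" is LLaMA-Factory's placeholder for an attached audio file.
                "content": "<audio>Does this recording contain disordered speech?",
            },
            {"role": "assistant", "content": "No, this is typical speech."},
        ],
        "audios": [os.path.join(dataset_name, "normal_speech_0.wav")],
    }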