diff --git a/.gitignore b/.gitignore index 4c8ddb1..aa03c6a 100644 --- a/.gitignore +++ b/.gitignore @@ -163,4 +163,8 @@ cython_debug/ logs/* -*.ipynb \ No newline at end of file +*.ipynb +finetune/data/ +finetune/*results +finetune/configs +finetune/prod_env \ No newline at end of file diff --git a/README.md b/README.md index dd2942c..4de0e5f 100644 --- a/README.md +++ b/README.md @@ -1,27 +1,58 @@ -# SpeechEval +# SLPHelm -## 0. Env Set up: -This project uses a Conda environment defined in conda_env.yml. To create and activate the environment: -```sh -# Create the environment -conda env create -f environment.yml +This repository contains scripts and instructions to run the SLPHelm benchmark. -# Activate the environment -conda activate SpeechEval -``` +There are two sub-folders: +- `finetune`: scripts to finetune models with self-generated data. +- `finetune-ultrasuite`: instructions to create UltraSuite dataset and finetune models with LLaMa-Factory framework. -Alternatively, if you prefer using pip directly, a requirements.txt file is provided: -```sh -# (Optional) create or activate your own environment, then: -pip install -r requirements.txt +## How to run the benchmark +1. Install Helm: +```bash +git clone https://github.com/martinakaduc/helm/ -b slp_helm +cd helm +pip install -e . ``` -## 1. Get model list from huggingface: -We first iterate the models on huggingface. Filter out the model satisfied the following requirement: -1. Has model tag of: any-to-any, audio-text-to-text -2. Has vllm support (test by run `vllm serve MODEL_ID`) -3. Accept audio, text input and output text. (test by run a sample request) +2. 
Run the benchmark: +```bash +# Binary Classification +helm-run --run-entries \ + ultra_suite_classification:model={model_name} \ + --suite binary-suite \ + --output-path {evaluation_dir} \ + --disable-cache \ + --max-eval-instances 1000 + +# ASR Classification +helm-run --run-entries \ + ultra_suite_asr_classification:model={model_name} \ + --suite asr-suite \ + --output-path {evaluation_dir} \ + --disable-cache \ + --max-eval-instances 1000 + +# ASR Transcription +helm-run --run-entries \ + ultra_suite_asr_transcription:model={model_name} \ + --suite trans-suite \ + --output-path {evaluation_dir} \ + --disable-cache \ + --max-eval-instances 1000 + +# Type Classification +helm-run --run-entries \ + ultra_suite_classification_breakdown:model={model_name} \ + --suite type-suite \ + --output-path {evaluation_dir} \ + --disable-cache \ + --max-eval-instances 1000 -```sh - cd tools && python get_model_list.py +# Symptom Classification +helm-run --run-entries \ + ultra_suite_disorder_symptoms:model={model_name} \ + --suite symp-suite \ + --output-path {evaluation_dir} \ + --disable-cache \ + --max-eval-instances 1000 ``` \ No newline at end of file diff --git a/audio/alloy.wav b/audio/alloy.wav deleted file mode 100644 index c6d0edd..0000000 Binary files a/audio/alloy.wav and /dev/null differ diff --git a/audio/atypical.wav b/audio/atypical.wav deleted file mode 100644 index 6789e48..0000000 Binary files a/audio/atypical.wav and /dev/null differ diff --git a/audio/typical.wav b/audio/typical.wav deleted file mode 100644 index fdfba19..0000000 Binary files a/audio/typical.wav and /dev/null differ diff --git a/environment.yml b/environment.yml deleted file mode 100644 index 515361e..0000000 --- a/environment.yml +++ /dev/null @@ -1,308 +0,0 @@ -name: SpeechEval -channels: - - conda-forge - - https://software.repos.intel.com/python/conda/ - - defaults -dependencies: - - _libgcc_mutex=0.1=conda_forge - - _openmp_mutex=4.5=2_gnu - - aom=3.6.0=h6a678d5_0 - - 
asttokens=2.4.1=pyhd8ed1ab_0 - - bzip2=1.0.8=h4bc722e_7 - - c-ares=1.19.1=hd590300_0 - - ca-certificates=2024.12.31=h06a4308_0 - - cairo=1.16.0=hb05425b_5 - - comm=0.2.2=pyhd8ed1ab_1 - - curl=8.4.0=hca28451_0 - - dav1d=1.2.1=hd590300_0 - - debugpy=1.8.9=py310hf71b8c6_0 - - decorator=5.1.1=pyhd8ed1ab_1 - - exceptiongroup=1.2.2=pyhd8ed1ab_1 - - executing=2.1.0=pyhd8ed1ab_1 - - expat=2.6.3=h5888daf_0 - - ffmpeg=6.1.1=he32514a_1 - - fontconfig=2.14.1=h55d465d_3 - - freetype=2.12.1=h267a509_2 - - gdbm=1.18=h0a1914f_2 - - gettext=0.21.0=hedfda30_2 - - gh=2.67.0=h76a2195_0 - - giflib=5.2.2=hd590300_0 - - git=2.45.2=pl5340h9abc3c3_1 - - glib=2.78.4=h6a678d5_0 - - glib-tools=2.78.4=h6a678d5_0 - - graphite2=1.3.14=h295c915_1 - - harfbuzz=4.3.0=hf52aaf7_2 - - icu=73.1=h6a678d5_0 - - importlib-metadata=8.5.0=pyha770c72_1 - - ipykernel=6.29.5=pyh3099207_0 - - ipython=8.30.0=pyh707e725_0 - - jedi=0.19.2=pyhd8ed1ab_1 - - jpeg=9e=h0b41bf4_3 - - jupyter_client=8.6.3=pyhd8ed1ab_1 - - jupyter_core=5.7.2=pyh31011fe_1 - - krb5=1.21.3=h143b758_0 - - lame=3.100=h166bdaf_1003 - - ld_impl_linux-64=2.40=hf3520f5_7 - - leptonica=1.82.0=h42c8aad_2 - - lerc=4.0.0=h6a678d5_0 - - libarchive=3.7.7=hfab0078_0 - - libcurl=8.4.0=hca28451_0 - - libdeflate=1.22=hb9d3cd8_0 - - libedit=3.1.20230828=h5eee18b_0 - - libev=4.33=hd590300_2 - - libevent=2.1.12=hf998b51_1 - - libexpat=2.6.3=h5888daf_0 - - libffi=3.4.4=h6a678d5_1 - - libgcc=14.2.0=h77fa898_1 - - libgcc-ng=14.2.0=h69a702a_1 - - libglib=2.78.4=hdc74915_0 - - libgomp=14.2.0=h77fa898_1 - - libiconv=1.16=h516909a_0 - - libnghttp2=1.57.0=h2d74bed_0 - - libogg=1.3.5=h4ab18f5_0 - - libopus=1.3.1=h7f98852_1 - - libpng=1.6.39=h753d276_0 - - libsodium=1.0.20=h4ab18f5_0 - - libsqlite=3.45.3=h2797004_0 - - libssh2=1.11.1=h251f7ec_0 - - libstdcxx=14.2.0=hc0a3c3a_1 - - libstdcxx-ng=11.2.0=he4da1e4_16 - - libtheora=1.1.1=h4ab18f5_1006 - - libtiff=4.5.1=hffd6297_1 - - libuuid=1.41.5=h5eee18b_0 - - libvorbis=1.3.7=h9c3ff4c_0 - - libvpx=1.13.1=h6a678d5_0 - - 
libwebp=1.3.2=h11a3e52_0 - - libwebp-base=1.3.2=hd590300_1 - - libxcb=1.15=h0b41bf4_0 - - libxml2=2.13.5=hfdd30dd_0 - - libzlib=1.2.13=h4ab18f5_6 - - lz4-c=1.9.4=h6a678d5_1 - - matplotlib-inline=0.1.7=pyhd8ed1ab_1 - - ncurses=6.4.20240210=h59595ed_0 - - nest-asyncio=1.6.0=pyhd8ed1ab_1 - - openh264=2.1.1=h780b84a_0 - - openjpeg=2.5.2=he7f1fd0_0 - - openssl=3.4.0=h7b32b05_1 - - packaging=24.2=pyhd8ed1ab_2 - - parso=0.8.4=pyhd8ed1ab_1 - - pcre2=10.42=hcad00b1_0 - - perl=5.34.0=h5eee18b_2 - - pexpect=4.9.0=pyhd8ed1ab_1 - - pickleshare=0.7.5=pyhd8ed1ab_1004 - - pip=24.2=pyh8b19718_1 - - pixman=0.40.0=h36c2ea0_0 - - platformdirs=4.3.6=pyhd8ed1ab_1 - - prompt-toolkit=3.0.48=pyha770c72_1 - - psutil=6.1.0=py310ha75aee5_0 - - pthread-stubs=0.4=hb9d3cd8_1002 - - ptyprocess=0.7.0=pyhd8ed1ab_1 - - pure_eval=0.2.3=pyhd8ed1ab_1 - - pygments=2.18.0=pyhd8ed1ab_1 - - python=3.10.15=he870216_1 - - python-dateutil=2.9.0.post0=pyhff2d567_1 - - python_abi=3.10=2_cp310 - - pyzmq=26.2.0=py310h71f11fc_3 - - readline=8.2=h8228510_1 - - setuptools=75.1.0=pyhd8ed1ab_0 - - six=1.16.0=pyhd8ed1ab_1 - - sqlite=3.45.3=h2c6b66d_0 - - stack_data=0.6.2=pyhd8ed1ab_0 - - tesseract=5.2.0=h6a678d5_2 - - tk=8.6.14=h39e8969_0 - - tmux=3.3a=hb25038b_1 - - tornado=6.4.2=py310ha75aee5_0 - - traitlets=5.14.3=pyhd8ed1ab_1 - - typing_extensions=4.12.2=pyha770c72_1 - - wcwidth=0.2.13=pyhd8ed1ab_1 - - wheel=0.44.0=pyhd8ed1ab_0 - - xorg-libxau=1.0.12=hb9d3cd8_0 - - xorg-libxdmcp=1.1.5=hb9d3cd8_0 - - xz=5.4.6=h5eee18b_1 - - zeromq=4.3.5=h3b0a872_7 - - zipp=3.21.0=pyhd8ed1ab_1 - - zlib=1.2.13=h4ab18f5_6 - - zstd=1.5.6=hc292b87_0 - - pip: - - accelerate==1.2.1 - - aiohappyeyeballs==2.4.3 - - aiohttp==3.11.8 - - aiohttp-cors==0.7.0 - - aiosignal==1.3.1 - - airportsdata==20241001 - - annotated-types==0.7.0 - - anyio==4.6.2.post1 - - astor==0.8.1 - - async-timeout==5.0.1 - - attrs==24.2.0 - - audioread==3.0.1 - - black==24.10.0 - - blake3==1.0.2 - - cachetools==5.5.1 - - certifi==2024.8.30 - - cffi==1.17.1 - - 
charset-normalizer==3.4.0 - - click==8.1.7 - - cloudpickle==3.1.0 - - colorful==0.5.6 - - compressed-tensors==0.9.1 - - contourpy==1.3.1 - - cycler==0.12.1 - - datasets==3.1.0 - - depyf==0.18.0 - - dill==0.3.8 - - diskcache==5.6.3 - - distlib==0.3.9 - - distro==1.9.0 - - docker-pycreds==0.4.0 - - einops==0.8.0 - - fastapi==0.115.5 - - filelock==3.16.1 - - flake8==7.1.1 - - fonttools==4.55.0 - - frozenlist==1.5.0 - - fsspec==2024.9.0 - - gguf==0.10.0 - - gitdb==4.0.11 - - gitpython==3.1.43 - - google-api-core==2.24.0 - - google-auth==2.37.0 - - googleapis-common-protos==1.66.0 - - grpcio==1.69.0 - - h11==0.14.0 - - httpcore==1.0.7 - - httptools==0.6.4 - - httpx==0.28.0 - - huggingface-hub==0.26.3 - - idna==3.10 - - iniconfig==2.0.0 - - interegular==0.3.3 - - ipywidgets==8.1.5 - - jinja2==3.1.4 - - jiter==0.8.0 - - joblib==1.4.2 - - jsonschema==4.23.0 - - jsonschema-specifications==2024.10.1 - - jupyterlab-widgets==3.0.13 - - kiwisolver==1.4.7 - - lark==1.2.2 - - lazy-loader==0.4 - - librosa==0.10.2.post1 - - linkify-it-py==2.0.3 - - llvmlite==0.43.0 - - lm-format-enforcer==0.10.9 - - markdown-it-py==3.0.0 - - markupsafe==3.0.2 - - matplotlib==3.9.2 - - mccabe==0.7.0 - - mdit-py-plugins==0.4.2 - - mdurl==0.1.2 - - memray==1.15.0 - - mistral-common==1.5.1 - - mpmath==1.3.0 - - msgpack==1.1.0 - - msgspec==0.18.6 - - multidict==6.1.0 - - multiprocess==0.70.16 - - mypy-extensions==1.0.0 - - networkx==3.4.2 - - numba==0.60.0 - - numpy==1.26.4 - - nvidia-cublas-cu12==12.4.5.8 - - nvidia-cuda-cupti-cu12==12.4.127 - - nvidia-cuda-nvrtc-cu12==12.4.127 - - nvidia-cuda-runtime-cu12==12.4.127 - - nvidia-cudnn-cu12==9.1.0.70 - - nvidia-cufft-cu12==11.2.1.3 - - nvidia-curand-cu12==10.3.5.147 - - nvidia-cusolver-cu12==11.6.1.9 - - nvidia-cusparse-cu12==12.3.1.170 - - nvidia-ml-py==12.560.30 - - nvidia-nccl-cu12==2.21.5 - - nvidia-nvjitlink-cu12==12.4.127 - - nvidia-nvtx-cu12==12.4.127 - - openai==1.55.3 - - opencensus==0.11.4 - - opencensus-context==0.1.3 - - 
opencv-python-headless==4.10.0.84 - - outlines==0.1.11 - - outlines-core==0.1.26 - - pandas==2.2.3 - - partial-json-parser==0.2.1.1.post4 - - pathspec==0.12.1 - - peft==0.13.2 - - pillow==10.4.0 - - pluggy==1.5.0 - - pooch==1.8.2 - - prometheus-client==0.21.0 - - prometheus-fastapi-instrumentator==7.0.0 - - propcache==0.2.0 - - proto-plus==1.25.0 - - protobuf==5.29.0 - - py-cpuinfo==9.0.0 - - py-spy==0.4.0 - - pyairports==2.1.1 - - pyarrow==18.1.0 - - pyasn1==0.6.1 - - pyasn1-modules==0.4.1 - - pybind11==2.13.6 - - pycodestyle==2.12.1 - - pycountry==24.6.1 - - pycparser==2.22 - - pydantic==2.10.2 - - pydantic-core==2.27.1 - - pyflakes==3.2.0 - - pyparsing==3.2.0 - - pytest==8.3.4 - - python-dotenv==1.0.1 - - pytz==2024.2 - - pyyaml==6.0.2 - - ray==2.39.0 - - referencing==0.35.1 - - regex==2024.11.6 - - requests==2.32.3 - - rich==13.9.4 - - rpds-py==0.21.0 - - rsa==4.9 - - safetensors==0.4.5 - - scikit-learn==1.5.2 - - scipy==1.14.1 - - sentencepiece==0.2.0 - - sentry-sdk==2.19.0 - - setproctitle==1.3.4 - - smart-open==7.1.0 - - smmap==5.0.1 - - sniffio==1.3.1 - - soundfile==0.12.1 - - soxr==0.5.0.post1 - - starlette==0.41.3 - - sympy==1.13.1 - - textual==1.0.0 - - threadpoolctl==3.5.0 - - tiktoken==0.7.0 - - tokenizers==0.21.0 - - tomli==2.2.1 - - torch==2.5.1 - - torchaudio==2.5.1 - - torchvision==0.20.1 - - tqdm==4.67.1 - - transformers==4.49.0.dev0 - - triton==3.1.0 - - tueplots==0.0.17 - - tzdata==2024.2 - - uc-micro-py==1.0.3 - - urllib3==2.2.3 - - uvicorn==0.32.1 - - uvloop==0.21.0 - - virtualenv==20.29.1 - - vllm==0.7.2 - - wandb==0.18.7 - - watchfiles==1.0.0 - - websockets==14.1 - - widgetsnbextension==4.0.13 - - wrapt==1.17.2 - - xformers==0.0.28.post3 - - xgrammar==0.1.10 - - xxhash==3.5.0 - - yarl==1.18.0 - diff --git a/evaluate.py b/evaluate.py deleted file mode 100644 index 09d57f2..0000000 --- a/evaluate.py +++ /dev/null @@ -1,25 +0,0 @@ -import subprocess -import os - -# Dataset list -dataset_list = [ - "MagicLuke/UltraSuite-UXTD-Audio", - 
"MagicLuke/UltraSuite-UPX-Audio", - "MagicLuke/UltraSuite-UX2020-Audio", - "MagicLuke/UltraSuite-UXSSD-Audio" -] -prompt_version = "v2.3" - -# Ensure the logs directory exists -os.makedirs("logs", exist_ok=True) - -for dataset in dataset_list: - print(f"Processing dataset: {dataset}") - log_file = f"logs/{dataset.replace('/', '_')}.log" - command = [ - "python", "main.py", - "--dataset_name", dataset, - "--prompt_version", prompt_version, - "--log_file", log_file - ] - subprocess.run(command, check=True) diff --git a/finetune-ultrasuite/README.md b/finetune-ultrasuite/README.md new file mode 100644 index 0000000..82488dc --- /dev/null +++ b/finetune-ultrasuite/README.md @@ -0,0 +1,29 @@ +# Finetune with UltraSuite dataset + +## How to create UltraSuite dataset +Follow below instructions to create data to train models with LLaMa-Factory framework. + +```bash +git clone https://huggingface.co/datasets/SAA-Lab/SLPHelmUltraSuite +git clone https://huggingface.co/datasets/SAA-Lab/SLPHelmUltraSuitePlus +python process_data.py UltraSuite +python process_data.py UltraSuitePlus +python create_dataset.py ultrasuite.json ultrasuiteplus.json 1000 +``` + +## How to finetune +Below is an example to finetune Qwen2.5-Omni-7B model with UltraSuite dataset. 
+ +```bash +export CUDA_VISIBLE_DEVICES=1 +export DISABLE_VERSION_CHECK=1 + +# Run the training +llamafactory-cli train configs/qwen2_5omni_7b_lora_sft.yaml + +# Merge the LoRA weights with the base model +python scripts/qwen_omni_merge.py merge_lora \ + Qwen/Qwen2.5-Omni-7B \ + saves/qwen2_5omni-7b-ultrasuite/lora/sft \ + --save_path output/qwen2_5omni-7b-ultrasuite +``` \ No newline at end of file diff --git a/finetune-ultrasuite/configs/qwen2_5omni_3b_lora_sft.yaml b/finetune-ultrasuite/configs/qwen2_5omni_3b_lora_sft.yaml new file mode 100644 index 0000000..b1c4057 --- /dev/null +++ b/finetune-ultrasuite/configs/qwen2_5omni_3b_lora_sft.yaml @@ -0,0 +1,47 @@ +### model +model_name_or_path: Qwen/Qwen2.5-Omni-3B +trust_remote_code: true + +### method +stage: sft +do_train: true +finetuning_type: lora +lora_rank: 32 +lora_dropout: 0.1 +lora_alpha: 64 +lora_target: all + +### dataset +dataset: ultrasuite +template: qwen2_omni +cutoff_len: 4096 +max_samples: 1200 +overwrite_cache: true +preprocessing_num_workers: 16 +dataloader_num_workers: 4 + +### output +output_dir: saves/qwen2_5omni-3b-ultrasuite/lora/sft +logging_steps: 1 +save_steps: 500 +plot_loss: true +overwrite_output_dir: true +save_only_model: false +report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow] + +### train +per_device_train_batch_size: 4 +gradient_accumulation_steps: 8 +learning_rate: 1.0e-4 +num_train_epochs: 3.0 +lr_scheduler_type: cosine +warmup_ratio: 0.1 +bf16: true +ddp_timeout: 180000000 +resume_from_checkpoint: null + +### eval +# val_size: 0.1 +# per_device_eval_batch_size: 1 +# eval_strategy: steps +# eval_steps: 500 diff --git a/finetune-ultrasuite/configs/qwen2_5omni_3b_lora_sft_woa.yaml b/finetune-ultrasuite/configs/qwen2_5omni_3b_lora_sft_woa.yaml new file mode 100644 index 0000000..1648b51 --- /dev/null +++ b/finetune-ultrasuite/configs/qwen2_5omni_3b_lora_sft_woa.yaml @@ -0,0 +1,48 @@ +### model +model_name_or_path: Qwen/Qwen2.5-Omni-3B 
+trust_remote_code: true + +### method +stage: sft +do_train: true +finetuning_type: lora +lora_rank: 32 +lora_dropout: 0.1 +lora_alpha: 64 +lora_target: all + +### dataset +dataset: ultrasuite_woa +template: qwen2_omni +cutoff_len: 4096 +max_samples: 1200 +overwrite_cache: true +preprocessing_num_workers: 16 +dataloader_num_workers: 4 + +### output +output_dir: saves/qwen2_5omni-3b-ultrasuite_woa/lora/sft +logging_steps: 1 +save_steps: 500 +plot_loss: true +overwrite_output_dir: true +save_only_model: false +report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow] + +### train +per_device_train_batch_size: 4 +gradient_accumulation_steps: 8 +learning_rate: 1.0e-4 +num_train_epochs: 3.0 +lr_scheduler_type: cosine +warmup_ratio: 0.1 +bf16: false +fp16: true +ddp_timeout: 180000000 +resume_from_checkpoint: null + +### eval +# val_size: 0.1 +# per_device_eval_batch_size: 1 +# eval_strategy: steps +# eval_steps: 500 diff --git a/finetune-ultrasuite/configs/qwen2_5omni_7b_lora_sft.yaml b/finetune-ultrasuite/configs/qwen2_5omni_7b_lora_sft.yaml new file mode 100644 index 0000000..f4d231f --- /dev/null +++ b/finetune-ultrasuite/configs/qwen2_5omni_7b_lora_sft.yaml @@ -0,0 +1,47 @@ +### model +model_name_or_path: Qwen/Qwen2.5-Omni-7B +trust_remote_code: true + +### method +stage: sft +do_train: true +finetuning_type: lora +lora_rank: 32 +lora_dropout: 0.1 +lora_alpha: 64 +lora_target: all + +### dataset +dataset: ultrasuite +template: qwen2_omni +cutoff_len: 4096 +max_samples: 1200 +overwrite_cache: true +preprocessing_num_workers: 16 +dataloader_num_workers: 4 + +### output +output_dir: saves/qwen2_5omni-7b-ultrasuite/lora/sft +logging_steps: 1 +save_steps: 500 +plot_loss: true +overwrite_output_dir: true +save_only_model: false +report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow] + +### train +per_device_train_batch_size: 4 +gradient_accumulation_steps: 8 +learning_rate: 1.0e-4 +num_train_epochs: 3.0 +lr_scheduler_type: cosine 
+warmup_ratio: 0.1 +bf16: true +ddp_timeout: 180000000 +resume_from_checkpoint: null + +### eval +# val_size: 0.1 +# per_device_eval_batch_size: 1 +# eval_strategy: steps +# eval_steps: 500 diff --git a/finetune-ultrasuite/configs/qwen2_5omni_7b_lora_sft_woa.yaml b/finetune-ultrasuite/configs/qwen2_5omni_7b_lora_sft_woa.yaml new file mode 100644 index 0000000..06fb431 --- /dev/null +++ b/finetune-ultrasuite/configs/qwen2_5omni_7b_lora_sft_woa.yaml @@ -0,0 +1,48 @@ +### model +model_name_or_path: Qwen/Qwen2.5-Omni-7B +trust_remote_code: true + +### method +stage: sft +do_train: true +finetuning_type: lora +lora_rank: 32 +lora_dropout: 0.1 +lora_alpha: 64 +lora_target: all + +### dataset +dataset: ultrasuite_woa +template: qwen2_omni +cutoff_len: 4096 +max_samples: 1200 +overwrite_cache: true +preprocessing_num_workers: 16 +dataloader_num_workers: 4 + +### output +output_dir: saves/qwen2_5omni-7b-ultrasuite_woa/lora/sft +logging_steps: 1 +save_steps: 500 +plot_loss: true +overwrite_output_dir: true +save_only_model: false +report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow] + +### train +per_device_train_batch_size: 4 +gradient_accumulation_steps: 8 +learning_rate: 1.0e-4 +num_train_epochs: 3.0 +lr_scheduler_type: cosine +warmup_ratio: 0.1 +bf16: false +fp16: true +ddp_timeout: 180000000 +resume_from_checkpoint: null + +### eval +# val_size: 0.1 +# per_device_eval_batch_size: 1 +# eval_strategy: steps +# eval_steps: 500 diff --git a/finetune-ultrasuite/configs/qwen2_audio_7b_lora_sft.yaml b/finetune-ultrasuite/configs/qwen2_audio_7b_lora_sft.yaml new file mode 100644 index 0000000..dc6f5a5 --- /dev/null +++ b/finetune-ultrasuite/configs/qwen2_audio_7b_lora_sft.yaml @@ -0,0 +1,47 @@ +### model +model_name_or_path: Qwen/Qwen2-Audio-7B-Instruct +trust_remote_code: true + +### method +stage: sft +do_train: true +finetuning_type: lora +lora_rank: 32 +lora_dropout: 0.1 +lora_alpha: 64 +lora_target: all + +### dataset +dataset: ultrasuite +template: 
qwen2_audio +cutoff_len: 4096 +max_samples: 1200 +overwrite_cache: true +preprocessing_num_workers: 16 +dataloader_num_workers: 4 + +### output +output_dir: saves/qwen2_audio_7b-ultrasuite/lora/sft +logging_steps: 1 +save_steps: 500 +plot_loss: true +overwrite_output_dir: true +save_only_model: false +report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow] + +### train +per_device_train_batch_size: 4 +gradient_accumulation_steps: 8 +learning_rate: 1.0e-4 +num_train_epochs: 3.0 +lr_scheduler_type: cosine +warmup_ratio: 0.1 +bf16: true +ddp_timeout: 180000000 +resume_from_checkpoint: null + +### eval +# val_size: 0.1 +# per_device_eval_batch_size: 1 +# eval_strategy: steps +# eval_steps: 500 diff --git a/finetune-ultrasuite/configs/qwen2_audio_7b_lora_sft_woa.yaml b/finetune-ultrasuite/configs/qwen2_audio_7b_lora_sft_woa.yaml new file mode 100644 index 0000000..6b9ad65 --- /dev/null +++ b/finetune-ultrasuite/configs/qwen2_audio_7b_lora_sft_woa.yaml @@ -0,0 +1,48 @@ +### model +model_name_or_path: Qwen/Qwen2-Audio-7B-Instruct +trust_remote_code: true + +### method +stage: sft +do_train: true +finetuning_type: lora +lora_rank: 32 +lora_dropout: 0.1 +lora_alpha: 64 +lora_target: all + +### dataset +dataset: ultrasuite_woa +template: qwen2_audio +cutoff_len: 4096 +max_samples: 1200 +overwrite_cache: true +preprocessing_num_workers: 16 +dataloader_num_workers: 4 + +### output +output_dir: saves/qwen2_audio_7b-ultrasuite_woa/lora/sft +logging_steps: 1 +save_steps: 500 +plot_loss: true +overwrite_output_dir: true +save_only_model: false +report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow] + +### train +per_device_train_batch_size: 4 +gradient_accumulation_steps: 8 +learning_rate: 1.0e-4 +num_train_epochs: 3.0 +lr_scheduler_type: cosine +warmup_ratio: 0.1 +bf16: false +fp16: true +ddp_timeout: 180000000 +resume_from_checkpoint: null + +### eval +# val_size: 0.1 +# per_device_eval_batch_size: 1 +# eval_strategy: steps +# eval_steps: 500 diff 
--git a/finetune-ultrasuite/create_dataset.py b/finetune-ultrasuite/create_dataset.py new file mode 100644 index 0000000..2eeada4 --- /dev/null +++ b/finetune-ultrasuite/create_dataset.py @@ -0,0 +1,121 @@ +import os +import json +import sys + +if __name__ == "__main__": + first_dataset_file = sys.argv[1] + second_dataset_file = sys.argv[2] + if not os.path.exists(first_dataset_file): + print(f"File {first_dataset_file} does not exist.") + sys.exit(1) + if not os.path.exists(second_dataset_file): + print(f"File {second_dataset_file} does not exist.") + sys.exit(1) + + num_samples = sys.argv[3] + if not num_samples.isdigit() or int(num_samples) <= 0: + print(f"Number of samples {num_samples} is not a valid number.") + sys.exit(1) + + num_samples = int(num_samples) + if num_samples % 2 != 0: + print(f"Number of samples {num_samples} is not even.") + sys.exit(1) + + # Read the first dataset + with open(first_dataset_file, 'r') as f: + first_dataset = json.load(f) + + # Read the second dataset + with open(second_dataset_file, 'r') as f: + second_dataset = json.load(f) + + # Remove the duplicated samples in the first dataset by the second one + # by using "audio" as the key + # Sample format + # { + # "messages": [ + # { + # "content": "