From 1a0c84a892c4fec3f10b70aca966cb00103e44b1 Mon Sep 17 00:00:00 2001 From: Vadim Date: Sun, 15 Feb 2026 23:54:41 +0100 Subject: [PATCH 1/4] fix BPM, whisper force cpu and librosa upgrade --- pyproject.toml | 2 +- src/UltraSinger.py | 7 ++++--- src/modules/Audio/bpm.py | 11 +++++++++-- src/modules/Audio/youtube.py | 2 -- src/modules/ProcessData.py | 2 +- 5 files changed, 15 insertions(+), 9 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index cd7a328..36bc18d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ dependencies = [ "langcodes", "language_data", "packaging", - "librosa>=0.10.2", + "librosa", "numba>=0.59.0", "swift-f0", "pydub", diff --git a/src/UltraSinger.py b/src/UltraSinger.py index 2b52d0c..698a3d0 100644 --- a/src/UltraSinger.py +++ b/src/UltraSinger.py @@ -171,6 +171,9 @@ def run() -> tuple[str, Score, Score]: # Create process audio process_data.process_data_paths.processing_audio_path = CreateProcessAudio(process_data) + # Get BPM from wav file + process_data.media_info.bpm = get_bpm_from_file(process_data.process_data_paths.processing_audio_path) + # Detect key detected_key, detected_mode = detect_key_from_audio(process_data.process_data_paths.processing_audio_path) if process_data.media_info.music_key is None: @@ -619,7 +622,7 @@ def transcribe_audio(cache_folder_path: str, processing_audio_path: str) -> Tran transcription_result = transcribe_with_whisper( processing_audio_path, settings.whisper_model, - settings.pytorch_device, + "cpu" if settings.force_whisper_cpu else settings.pytorch_device, settings.whisper_align_model, settings.whisper_batch_size, settings.whisper_compute_type, @@ -684,7 +687,6 @@ def infos_from_audio_video_input_file() -> tuple[str, str, str, MediaInfo]: if song_info.cover_image_data is not None: save_image(song_info.cover_image_data, basename_without_ext, song_folder_output_path) - real_bpm = get_bpm_from_file(ultrastar_audio_input_path) return ( basename_without_ext, song_folder_output_path, @@ -694,7 +696,6 @@ def infos_from_audio_video_input_file() -> tuple[str, str, str, MediaInfo]: title=song_info.title, year=song_info.year, genre=song_info.genres, - bpm=real_bpm, cover_url=song_info.cover_url, audio_extension=audio_ext, video_extension=video_ext diff --git a/src/modules/Audio/bpm.py b/src/modules/Audio/bpm.py index ac438f6..ebaa1ce 100644 --- a/src/modules/Audio/bpm.py +++ b/src/modules/Audio/bpm.py @@ -1,4 +1,5 @@ import librosa +import soundfile as sf from modules.console_colors import ULTRASINGER_HEAD, blue_highlighted @@ -6,7 +7,7 @@ def get_bpm_from_data(data, sampling_rate): """Get real bpm from audio data""" onset_env = librosa.onset.onset_strength(y=data, sr=sampling_rate) - wav_tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sampling_rate) + wav_tempo = librosa.feature.tempo(onset_envelope=onset_env, sr=sampling_rate) print(f"{ULTRASINGER_HEAD} BPM is {blue_highlighted(str(round(wav_tempo[0], 2)))}") return wav_tempo[0] @@ -14,5 +15,11 @@ def get_bpm_from_data(data, sampling_rate): def get_bpm_from_file(wav_file: str) -> float: """Get real bpm from audio file""" - data, sampling_rate = librosa.load(wav_file, sr=None) + data, sampling_rate = sf.read(wav_file, dtype='float32') + # Transpose if stereo to match librosa's expected format + if len(data.shape) > 1: + data = data.T + # Convert to mono if stereo + if data.ndim > 1: + data = librosa.to_mono(data) return get_bpm_from_data(data, sampling_rate) diff --git a/src/modules/Audio/youtube.py b/src/modules/Audio/youtube.py index cd3dfaf..0c0ece9 100644 --- a/src/modules/Audio/youtube.py +++ b/src/modules/Audio/youtube.py @@ -116,7 +116,6 @@ def download_from_youtube(input_url: str, output_folder_path: str, cookiefile: s input_url, basename_without_ext, song_output, cookiefile ) - real_bpm = get_bpm_from_file(audio_file_path) return ( basename_without_ext, song_output, @@ -126,7 +125,6 @@ def download_from_youtube(input_url: str, output_folder_path: str, cookiefile: s title=song_info.title, year=song_info.year, genre=song_info.genres, - bpm=real_bpm, cover_url=cover_url, video_url=input_url, audio_extension=audio_ext, diff --git a/src/modules/ProcessData.py b/src/modules/ProcessData.py index 4be775a..cc2f66b 100644 --- a/src/modules/ProcessData.py +++ b/src/modules/ProcessData.py @@ -19,7 +19,7 @@ class MediaInfo: """Media Info""" title: str artist: str - bpm: float + bpm: Optional[float] = None year: Optional[str] = None genre: Optional[str] = None language: Optional[str] = None From a5f95003231f4cd1981b14d075146d3a2fbec658 Mon Sep 17 00:00:00 2001 From: Vadim Date: Mon, 16 Feb 2026 00:08:28 +0100 Subject: [PATCH 2/4] fix: remove numba dependency and adjust BPM extraction for non-ultrastar files --- pyproject.toml | 1 - src/UltraSinger.py | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 36bc18d..e0b5e9b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,6 @@ dependencies = [ "language_data", "packaging", "librosa", - "numba>=0.59.0", "swift-f0", "pydub", "demucs", diff --git a/src/UltraSinger.py b/src/UltraSinger.py index 698a3d0..8a48fec 100644 --- a/src/UltraSinger.py +++ b/src/UltraSinger.py @@ -172,7 +172,8 @@ def run() -> tuple[str, Score, Score]: process_data.process_data_paths.processing_audio_path = CreateProcessAudio(process_data) # Get BPM from wav file - process_data.media_info.bpm = get_bpm_from_file(process_data.process_data_paths.processing_audio_path) + if not settings.input_file_is_ultrastar_txt: + process_data.media_info.bpm = get_bpm_from_file(process_data.process_data_paths.processing_audio_path) # Detect key detected_key, detected_mode = detect_key_from_audio(process_data.process_data_paths.processing_audio_path) From 857728d27ab0902d2ee090cd80e21ba4682c35e8 Mon Sep 17 00:00:00 2001 From: Vadim Date: Mon, 16 Feb 2026 00:27:51 +0100 Subject: [PATCH 3/4] fix: update whisper device handling and transcription config --- src/UltraSinger.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/UltraSinger.py b/src/UltraSinger.py index 8a48fec..5fbf43f 100644 --- a/src/UltraSinger.py +++ b/src/UltraSinger.py @@ -615,15 +615,17 @@ def transcribe_audio(cache_folder_path: str, processing_audio_path: str) -> Tran transcription_result = None whisper_align_model_string = None if settings.transcriber == "whisper": - if not settings.whisper_align_model is None: whisper_align_model_string = settings.whisper_align_model.replace("/", "_") - transcription_config = f"{settings.transcriber}_{settings.whisper_model.value}_{settings.pytorch_device}_{whisper_align_model_string}_{settings.whisper_batch_size}_{settings.whisper_compute_type}_{settings.language}" + if not settings.whisper_align_model is None: + whisper_align_model_string = settings.whisper_align_model.replace("/", "_") + whisper_device = "cpu" if settings.force_whisper_cpu else settings.pytorch_device + transcription_config = f"{settings.transcriber}_{settings.whisper_model.value}_{whisper_device}_{whisper_align_model_string}_{settings.whisper_batch_size}_{settings.whisper_compute_type}_{settings.language}" transcription_path = os.path.join(cache_folder_path, f"{transcription_config}.json") cached_transcription_available = check_file_exists(transcription_path) if settings.skip_cache_transcription or not cached_transcription_available: transcription_result = transcribe_with_whisper( processing_audio_path, settings.whisper_model, - "cpu" if settings.force_whisper_cpu else settings.pytorch_device, + whisper_device, settings.whisper_align_model, settings.whisper_batch_size, settings.whisper_compute_type, From 1d30dc7826c6bc7b0728517d31c0f020573a4314 Mon Sep 17 00:00:00 2001 From: Vadim Date: Mon, 16 Feb 2026 00:29:20 +0100 Subject: [PATCH 4/4] fix: bump version to 0.0.13.dev15 in pyproject.toml and Settings.py --- pyproject.toml | 2 +- src/Settings.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e0b5e9b..cd24b82 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "UltraSinger" -version = "0.0.13.dev14" +version = "0.0.13.dev15" description = "A tool to create UltraStar karaoke files from audio files" requires-python = ">=3.12" dependencies = [ diff --git a/src/Settings.py b/src/Settings.py index 7b2f66f..540c1c9 100644 --- a/src/Settings.py +++ b/src/Settings.py @@ -11,7 +11,7 @@ @dataclass class Settings: - APP_VERSION = "0.0.13.dev14" + APP_VERSION = "0.0.13.dev15" CONFIDENCE_THRESHOLD = 0.6 CONFIDENCE_PROMPT_TIMEOUT = 4