diff --git a/pyproject.toml b/pyproject.toml
index cd7a328..cd24b82 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "UltraSinger"
-version = "0.0.13.dev14"
+version = "0.0.13.dev15"
 description = "A tool to create UltraStar karaoke files from audio files"
 requires-python = ">=3.12"
 dependencies = [
@@ -16,8 +16,7 @@ dependencies = [
     "langcodes",
     "language_data",
     "packaging",
-    "librosa>=0.10.2",
-    "numba>=0.59.0",
+    "librosa",
     "swift-f0",
     "pydub",
     "demucs",
diff --git a/src/Settings.py b/src/Settings.py
index 7b2f66f..540c1c9 100644
--- a/src/Settings.py
+++ b/src/Settings.py
@@ -11,7 +11,7 @@
 
 @dataclass
 class Settings:
-    APP_VERSION = "0.0.13.dev14"
+    APP_VERSION = "0.0.13.dev15"
 
     CONFIDENCE_THRESHOLD = 0.6
     CONFIDENCE_PROMPT_TIMEOUT = 4
diff --git a/src/UltraSinger.py b/src/UltraSinger.py
index 2b52d0c..5fbf43f 100644
--- a/src/UltraSinger.py
+++ b/src/UltraSinger.py
@@ -171,6 +171,10 @@ def run() -> tuple[str, Score, Score]:
     # Create process audio
     process_data.process_data_paths.processing_audio_path = CreateProcessAudio(process_data)
 
+    # Get BPM from wav file
+    if not settings.input_file_is_ultrastar_txt:
+        process_data.media_info.bpm = get_bpm_from_file(process_data.process_data_paths.processing_audio_path)
+
     # Detect key
     detected_key, detected_mode = detect_key_from_audio(process_data.process_data_paths.processing_audio_path)
     if process_data.media_info.music_key is None:
@@ -611,15 +615,17 @@ def transcribe_audio(cache_folder_path: str, processing_audio_path: str) -> Tran
     transcription_result = None
     whisper_align_model_string = None
     if settings.transcriber == "whisper":
-        if not settings.whisper_align_model is None: whisper_align_model_string = settings.whisper_align_model.replace("/", "_")
-        transcription_config = f"{settings.transcriber}_{settings.whisper_model.value}_{settings.pytorch_device}_{whisper_align_model_string}_{settings.whisper_batch_size}_{settings.whisper_compute_type}_{settings.language}"
+        if not settings.whisper_align_model is None:
+            whisper_align_model_string = settings.whisper_align_model.replace("/", "_")
+        whisper_device = "cpu" if settings.force_whisper_cpu else settings.pytorch_device
+        transcription_config = f"{settings.transcriber}_{settings.whisper_model.value}_{whisper_device}_{whisper_align_model_string}_{settings.whisper_batch_size}_{settings.whisper_compute_type}_{settings.language}"
         transcription_path = os.path.join(cache_folder_path, f"{transcription_config}.json")
         cached_transcription_available = check_file_exists(transcription_path)
         if settings.skip_cache_transcription or not cached_transcription_available:
             transcription_result = transcribe_with_whisper(
                 processing_audio_path,
                 settings.whisper_model,
-                settings.pytorch_device,
+                whisper_device,
                 settings.whisper_align_model,
                 settings.whisper_batch_size,
                 settings.whisper_compute_type,
@@ -684,7 +690,6 @@ def infos_from_audio_video_input_file() -> tuple[str, str, str, MediaInfo]:
     if song_info.cover_image_data is not None:
         save_image(song_info.cover_image_data, basename_without_ext, song_folder_output_path)
 
-    real_bpm = get_bpm_from_file(ultrastar_audio_input_path)
     return (
         basename_without_ext,
         song_folder_output_path,
@@ -694,7 +699,6 @@ def infos_from_audio_video_input_file() -> tuple[str, str, str, MediaInfo]:
             title=song_info.title,
             year=song_info.year,
             genre=song_info.genres,
-            bpm=real_bpm,
             cover_url=song_info.cover_url,
             audio_extension=audio_ext,
             video_extension=video_ext
diff --git a/src/modules/Audio/bpm.py b/src/modules/Audio/bpm.py
index ac438f6..ebaa1ce 100644
--- a/src/modules/Audio/bpm.py
+++ b/src/modules/Audio/bpm.py
@@ -1,4 +1,5 @@
 import librosa
+import soundfile as sf
 
 from modules.console_colors import ULTRASINGER_HEAD, blue_highlighted
 
@@ -6,7 +7,7 @@
 def get_bpm_from_data(data, sampling_rate):
     """Get real bpm from audio data"""
     onset_env = librosa.onset.onset_strength(y=data, sr=sampling_rate)
-    wav_tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sampling_rate)
+    wav_tempo = librosa.feature.tempo(onset_envelope=onset_env, sr=sampling_rate)
 
     print(f"{ULTRASINGER_HEAD} BPM is {blue_highlighted(str(round(wav_tempo[0], 2)))}")
     return wav_tempo[0]
@@ -14,5 +15,11 @@ def get_bpm_from_data(data, sampling_rate):
 
 def get_bpm_from_file(wav_file: str) -> float:
     """Get real bpm from audio file"""
-    data, sampling_rate = librosa.load(wav_file, sr=None)
+    data, sampling_rate = sf.read(wav_file, dtype='float32')
+    # Transpose if stereo to match librosa's expected format
+    if len(data.shape) > 1:
+        data = data.T
+    # Convert to mono if stereo
+    if data.ndim > 1:
+        data = librosa.to_mono(data)
     return get_bpm_from_data(data, sampling_rate)
diff --git a/src/modules/Audio/youtube.py b/src/modules/Audio/youtube.py
index cd3dfaf..0c0ece9 100644
--- a/src/modules/Audio/youtube.py
+++ b/src/modules/Audio/youtube.py
@@ -116,7 +116,6 @@ def download_from_youtube(input_url: str, output_folder_path: str, cookiefile: s
         input_url, basename_without_ext, song_output, cookiefile
     )
 
-    real_bpm = get_bpm_from_file(audio_file_path)
     return (
         basename_without_ext,
         song_output,
@@ -126,7 +125,6 @@ def download_from_youtube(input_url: str, output_folder_path: str, cookiefile: s
             title=song_info.title,
             year=song_info.year,
             genre=song_info.genres,
-            bpm=real_bpm,
             cover_url=cover_url,
             video_url=input_url,
             audio_extension=audio_ext,
diff --git a/src/modules/ProcessData.py b/src/modules/ProcessData.py
index 4be775a..cc2f66b 100644
--- a/src/modules/ProcessData.py
+++ b/src/modules/ProcessData.py
@@ -19,7 +19,7 @@ class MediaInfo:
    """Media Info"""

    title: str
    artist: str
-    bpm: float
+    bpm: Optional[float] = None
    year: Optional[str] = None
    genre: Optional[str] = None
    language: Optional[str] = None