From da434684844c0757373a3ed11a211b8ea3d456e9 Mon Sep 17 00:00:00 2001 From: Benedikt Wagener Date: Thu, 14 Sep 2023 17:51:49 +0200 Subject: [PATCH 01/19] add loudness threshold to filter silent regions from pitched data --- LICENSE | 1 + pytest/modules/Pitcher/test_pitcher.py | 2 +- src/UltraSinger.py | 76 ++++++++++++++++++- src/modules/Pitcher/core.py | 1 + src/modules/Pitcher/loudness.py | 69 +++++++++++++++++ src/modules/Pitcher/pitched_data.py | 1 + src/modules/Pitcher/pitcher.py | 34 ++++++--- .../Speech_Recognition/TranscribedData.py | 26 ++++--- src/modules/plot.py | 2 +- 9 files changed, 186 insertions(+), 26 deletions(-) create mode 100644 src/modules/Pitcher/core.py create mode 100644 src/modules/Pitcher/loudness.py diff --git a/LICENSE b/LICENSE index dfe8101..a346333 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,7 @@ MIT License Copyright (c) 2023 Vadim Rangnau +Copyright (c) 2020 Max Morrison (torchcrepe code adapted for crepe output filtering abd thresholding) Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/pytest/modules/Pitcher/test_pitcher.py b/pytest/modules/Pitcher/test_pitcher.py index e623986..ad5b253 100644 --- a/pytest/modules/Pitcher/test_pitcher.py +++ b/pytest/modules/Pitcher/test_pitcher.py @@ -8,7 +8,7 @@ class PitcherTest(unittest.TestCase): - @pytest.mark.skip(reason="Skipping this FUNCTION level test, can be used for manual tests") + # @pytest.mark.skip(reason="Skipping this FUNCTION level test, can be used for manual tests") def test_get_pitch_with_crepe_file(self): # Arrange test_dir = os.path.dirname(os.path.abspath(__file__)) diff --git a/src/UltraSinger.py b/src/UltraSinger.py index b1cb8a4..47610c2 100644 --- a/src/UltraSinger.py +++ b/src/UltraSinger.py @@ -55,7 +55,8 @@ from modules.musicbrainz_client import get_music_infos settings = Settings() - +SYLLABLE_SEGMENT_SIZE = 0.1 +SYLLABLE_SEGMENT_MAX_GAP_FOR_MERGE = 0.1 def convert_midi_notes_to_ultrastar_notes(midi_notes: list[str]) -> list[int]: """Convert midi notes to ultrastar notes""" @@ -255,6 +256,73 @@ def print_support() -> None: ) +def split_syllables_into_segments(transcribed_data: list[TranscribedData]) -> list[TranscribedData]: + """Split every syllable into sub-segments""" + segment_size_decimal_points = len(str(SYLLABLE_SEGMENT_SIZE).split(".")[1]) + new_data = [] + + for i, data in enumerate(transcribed_data): + + duration = data.end - data.start + if duration <= SYLLABLE_SEGMENT_SIZE: + new_data.append(data) + continue + + has_space = str(data.word).endswith(" ") + first_segment = copy.deepcopy(data) + filler_words_start = data.start + SYLLABLE_SEGMENT_SIZE + remainder = data.end - (filler_words_start) + first_segment.end = filler_words_start + if has_space: + first_segment.word = first_segment.word[:-1] + + new_data.append(first_segment) + + full_segments, partial_segment = divmod(remainder, SYLLABLE_SEGMENT_SIZE) + + if full_segments >= 1: + for i in range(int(full_segments)): + segment = TranscribedData() + segment.word = "~" + segment.start = filler_words_start + round(i * SYLLABLE_SEGMENT_SIZE, segment_size_decimal_points) + segment.end = segment.start + SYLLABLE_SEGMENT_SIZE + new_data.append(segment) + + if partial_segment >= 0.01: + segment = TranscribedData() + segment.word = "~" + segment.start = filler_words_start + round(full_segments * SYLLABLE_SEGMENT_SIZE, segment_size_decimal_points) + segment.end = segment.start + partial_segment + 
new_data.append(segment) + + if has_space: + new_data[-1].word += " " + return new_data + + +def merge_syllable_segments( + transcribed_data: list[TranscribedData], + midi_notes: list[str], + us_notes = list[int] +) -> tuple[list[TranscribedData], list[str], list[int]]: + """Merge sub-segments of a syllable where the pitch is the same""" + new_data = [] + new_midi_notes = [] + new_us_notes = [] + + previous_data = None + + for i, data in enumerate(transcribed_data): + if str(data.word).startswith("~") and previous_data is not None and midi_notes[i] == midi_notes[i-1] and data.start - previous_data.end <= SYLLABLE_SEGMENT_MAX_GAP_FOR_MERGE: + new_data[-1].end = data.end + else: + new_data.append(data) + new_midi_notes.append(midi_notes[i]) + new_us_notes.append(us_notes[i]) + previous_data = data + return new_data, new_midi_notes, new_us_notes + + def run() -> None: """The processing function of this program""" is_audio = ".txt" not in settings.input_file_path @@ -330,6 +398,8 @@ def run() -> None: # lyric = 'input/faber_lyric.txt' # --corrected_words = correct_words(vosk_speech, lyric) + transcribed_data = split_syllables_into_segments(transcribed_data) + # Create audio chunks if settings.create_audio_chunks: create_audio_chunks( @@ -345,6 +415,8 @@ def run() -> None: is_audio, transcribed_data, ultrastar_class ) + transcribed_data, midi_notes, ultrastar_note_numbers = merge_syllable_segments(transcribed_data, midi_notes, ultrastar_note_numbers) + # Create plot if settings.create_plot: plot(pitched_data, song_output, transcribed_data, midi_notes) @@ -706,7 +778,7 @@ def pitch_audio(is_audio: bool, transcribed_data: list[TranscribedData], ultrast settings.mono_audio_path, settings.crepe_model_capacity, settings.crepe_step_size, - settings.tensorflow_device, + settings.tensorflow_device ) if is_audio: start_times = [] diff --git a/src/modules/Pitcher/core.py b/src/modules/Pitcher/core.py new file mode 100644 index 0000000..7b252a6 --- /dev/null +++ b/src/modules/Pitcher/core.py @@ -0,0 +1 @@ +CREPE_MODEL_SAMPLE_RATE = 16000 \ No newline at end of file diff --git a/src/modules/Pitcher/loudness.py b/src/modules/Pitcher/loudness.py new file mode 100644 index 0000000..f72d8a5 --- /dev/null +++ b/src/modules/Pitcher/loudness.py @@ -0,0 +1,69 @@ +import warnings + +import librosa +import numpy as np +from modules.Pitcher.core import CREPE_MODEL_SAMPLE_RATE + +############################################################################### +# Constants +############################################################################### + +WINDOW_SIZE = 1024 +TIMES_DECIMAL_PLACES: int = 3 +# Minimum decibel level +MIN_DB = -100. + +# Reference decibel level +REF_DB = 20. + +def set_confidence_to_zero_in_silent_regions(confidence, audio, threshold=-60, step_size=10, pad=True): + # Don't modify in-place + confidence = confidence[:] + + # Compute loudness + loudness = a_weighted(audio, step_size, pad) + + # Threshold silence + confidence[loudness < threshold] = 0. 
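+    # Caveat: if `confidence` is a numpy array, the slice `confidence[:]`
+    # above creates a view rather than a copy, so this masked assignment
+    # still writes into the caller's array; numpy.copy(confidence) would be
+    # needed to truly avoid modifying it in place.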
+ + return confidence, loudness + +def a_weighted(audio, step_size=10, pad=True): + """Retrieve the per-frame loudness""" + step_size_seconds = round(step_size / 1000, TIMES_DECIMAL_PLACES) + steps_per_second = 1 / step_size_seconds + hop_length = int(CREPE_MODEL_SAMPLE_RATE // steps_per_second) + + a_perceptual_weights = perceptual_weights() + + # Take stft + stft = librosa.stft(audio, + n_fft=WINDOW_SIZE, + hop_length=hop_length, + win_length=WINDOW_SIZE, + center=pad, + pad_mode='constant') + + # Compute magnitude on db scale + db = librosa.amplitude_to_db(np.abs(stft)) + + # Apply A-weighting + weighted = db + a_perceptual_weights + + # Threshold + weighted[weighted < MIN_DB] = MIN_DB + + # Average over weighted frequencies + return weighted.mean(axis=0) + + +def perceptual_weights(): + """A-weighted frequency-dependent perceptual loudness weights""" + frequencies = librosa.fft_frequencies(sr=CREPE_MODEL_SAMPLE_RATE, + n_fft=WINDOW_SIZE) + + # A warning is raised for nearly inaudible frequencies, but it ends up + # defaulting to -100 db. That default is fine for our purposes. + with warnings.catch_warnings(): + warnings.simplefilter('ignore', RuntimeWarning) + return librosa.A_weighting(frequencies)[:, None] - REF_DB \ No newline at end of file diff --git a/src/modules/Pitcher/pitched_data.py b/src/modules/Pitcher/pitched_data.py index 13d828c..f2d32df 100644 --- a/src/modules/Pitcher/pitched_data.py +++ b/src/modules/Pitcher/pitched_data.py @@ -9,3 +9,4 @@ class PitchedData: times: list[float] frequencies: list[float] confidence: list[float] + perceived_loudness_db: list[float] diff --git a/src/modules/Pitcher/pitcher.py b/src/modules/Pitcher/pitcher.py index 5bf9f77..c3fc81b 100644 --- a/src/modules/Pitcher/pitcher.py +++ b/src/modules/Pitcher/pitcher.py @@ -1,10 +1,13 @@ """Pitcher module""" import crepe -from scipy.io import wavfile +import librosa from modules.console_colors import ULTRASINGER_HEAD, blue_highlighted, red_highlighted +from modules.Pitcher.core import CREPE_MODEL_SAMPLE_RATE +from modules.Pitcher.loudness import set_confidence_to_zero_in_silent_regions from modules.Pitcher.pitched_data import PitchedData +import modules.timer as timer def get_pitch_with_crepe_file( @@ -15,26 +18,37 @@ def get_pitch_with_crepe_file( print( f"{ULTRASINGER_HEAD} Pitching with {blue_highlighted('crepe')} and model {blue_highlighted(model_capacity)} and {red_highlighted(device)} as worker" ) - sample_rate, audio = wavfile.read(filename) + timer.log('Load file for pitch detection start') + audio, sample_rate = librosa.load(filename) + timer.log('Load file for pitch detection end') return get_pitch_with_crepe(audio, sample_rate, model_capacity, step_size) -def get_pitch_with_crepe( - audio, sample_rate: int, model_capacity: str, step_size: int = 10 -) -> PitchedData: +def get_pitch_with_crepe(audio, sample_rate: int, model_capacity: str, step_size: int = 10) -> PitchedData: """Pitch with crepe""" - times, frequencies, confidence, activation = crepe.predict( - audio, sample_rate, model_capacity, step_size=step_size, viterbi=True - ) - return PitchedData(times, frequencies, confidence) + + if sample_rate != CREPE_MODEL_SAMPLE_RATE: + from resampy import resample + audio = resample(audio, sample_rate, CREPE_MODEL_SAMPLE_RATE) + sample_rate = CREPE_MODEL_SAMPLE_RATE + + timer.log('Crepe pitch detection start') + times, frequencies, confidence, activation = crepe.predict(audio, sample_rate, model_capacity, step_size=step_size, viterbi=True) + timer.log('Crepe pitch detection end') + + 
timer.log('Computing loudness start') + confidence, perceived_loudness = set_confidence_to_zero_in_silent_regions(confidence, audio, step_size=step_size) + timer.log('Computing loudness end') + + return PitchedData(times, frequencies, confidence, perceived_loudness) def get_pitched_data_with_high_confidence( pitched_data: PitchedData, threshold=0.4 ) -> PitchedData: """Get frequency with high confidence""" - new_pitched_data = PitchedData([], [], []) + new_pitched_data = PitchedData([], [], [], []) for i, conf in enumerate(pitched_data.confidence): if conf > threshold: new_pitched_data.times.append(pitched_data.times[i]) diff --git a/src/modules/Speech_Recognition/TranscribedData.py b/src/modules/Speech_Recognition/TranscribedData.py index 8ae2f4a..5962d9a 100644 --- a/src/modules/Speech_Recognition/TranscribedData.py +++ b/src/modules/Speech_Recognition/TranscribedData.py @@ -4,15 +4,17 @@ class TranscribedData: """Transcribed data from json file""" - def __init__(self, transcribed_json): - # Vosk = conf, Whisper = confidence - self.conf = transcribed_json.get( - "conf", transcribed_json.get("confidence", None) - ) - # Vosk = word, Whisper = text - self.word = transcribed_json.get( - "word", transcribed_json.get("text", None) - ) - self.end = transcribed_json.get("end", None) - self.start = transcribed_json.get("start", None) - self.is_hyphen = None + def __init__(self, transcribed_json = None): + + if transcribed_json: + # Vosk = conf, Whisper = confidence + self.conf = transcribed_json.get( + "conf", transcribed_json.get("confidence", None) + ) + # Vosk = word, Whisper = text + self.word = transcribed_json.get( + "word", transcribed_json.get("text", None) + ) + self.end = transcribed_json.get("end", None) + self.start = transcribed_json.get("start", None) + self.is_hyphen = None diff --git a/src/modules/plot.py b/src/modules/plot.py index 881a453..01b121e 100644 --- a/src/modules/plot.py +++ b/src/modules/plot.py @@ -187,7 +187,7 @@ def create_gaps(pitched_data: PitchedData, step_size: float) -> PitchedData: This way the graph is only continuous where it should be. 
""" - pitched_data_with_gaps = PitchedData([], [], []) + pitched_data_with_gaps = PitchedData([], [], [], []) previous_time = 0 for i, time in enumerate(pitched_data.times): From b52945be200d649d992678f12ede9ff4a490ddf3 Mon Sep 17 00:00:00 2001 From: Benedikt Wagener Date: Tue, 3 Oct 2023 07:52:01 +0200 Subject: [PATCH 02/19] wip --- pytest/modules/Pitcher/test_pitcher.py | 64 ++++++++++++++++++++++++++ src/modules/plot.py | 4 +- 2 files changed, 66 insertions(+), 2 deletions(-) diff --git a/pytest/modules/Pitcher/test_pitcher.py b/pytest/modules/Pitcher/test_pitcher.py index ad5b253..a028843 100644 --- a/pytest/modules/Pitcher/test_pitcher.py +++ b/pytest/modules/Pitcher/test_pitcher.py @@ -3,6 +3,14 @@ import os import unittest import src.modules.Pitcher.pitcher as test_subject + +import numpy as np +import pandas as pd +from matplotlib import pyplot as plt +from sklearn.cluster import KMeans +from sklearn import preprocessing as p +from sklearn.decomposition import PCA + import pytest from src.modules.plot import plot @@ -21,7 +29,63 @@ def test_get_pitch_with_crepe_file(self): pitched_data = test_subject.get_pitch_with_crepe_file(test_file_abs_path, 'full', device="cuda") # test_subject.get_pitch_with_crepe_file(test_file_abs_path, 'full', 'cpu', batch_size=1024) plot(pitched_data, test_output, title="pitching test") + print("done") + + # @pytest.mark.skip(reason="Skipping this FUNCTION level test, can be used for manual tests") + def test_pitch_clustering(self): + # Arrange + times = [0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.7, 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8, 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1, 1.01, 1.02, 1.03, 1.04, 1.05, 1.06, 1.07, 1.08, 1.09, 1.1, 1.11, 1.12, 1.13, 1.14, 1.15, 1.16, 1.17, 1.18, 1.19, 1.2, 1.21, 1.22, 1.23, 1.24, 1.25, 1.26, 1.27, 1.28, 1.29, 1.3, 1.31, 1.32, 1.33, 1.34, 1.35, 1.36, 1.37, 1.38, 1.39, 1.4, 1.41, 1.42, 1.43, 1.44, 1.45, 1.46, 1.47, 1.48, 1.49, 1.5, 1.51, 1.52, 1.53, 1.54, 1.55, 1.56, 1.57, 1.58, 1.59, 1.6, 1.61, 1.62, 1.63, 1.64, 1.65, 1.66, 1.67, 1.68, 1.69, 1.7, 1.71, 1.72, 1.73, 1.74, 1.75, 1.76, 1.77, 1.78, 1.79, 1.8, 1.81, 1.82, 1.83, 1.84, 1.85, 1.86, 1.87, 1.88, 1.89, 1.9, 1.91, 1.92, 1.93, 1.94, 1.95, 1.96, 1.97, 1.98, 1.99, 2, 2.01, 2.02, 2.03, 2.04, 2.05, 2.06, 2.07, 2.08, 2.09, 2.1, 2.11, 2.12, 2.13, 2.14, 2.15, 2.16, 2.17, 2.18, 2.19, 2.2, 2.21, 2.22, 2.23, 2.24, 2.25, 2.26, 2.27, 2.28, 2.29, 2.3, 2.31, 2.32, 2.33, 2.34, 2.35, 2.36, 2.37, 2.38, 2.39, 2.4, 2.41, 2.42, 2.43, 2.44, 2.45, 2.46, 2.47, 2.48, 2.49, 2.5, 2.51, 2.52, 2.53, 2.54, 2.55, 2.56, 2.57, 2.58, 2.59, 2.6, 2.61, 2.62, 2.63, 2.64, 2.65, 2.66, 2.67, 2.68, 2.69, 2.7, 2.71, 2.72, 2.73, 2.74, 2.75, 2.76, 2.77, 2.78, 2.79, 2.8, 2.81, 2.82, 2.83, 2.84, 2.85, 2.86, 2.87, 2.88, 2.89, 2.9, 2.91, 2.92, 2.93, 2.94, 2.95, 2.96, 2.97, 2.98, 2.99, 3, 3.01, 3.02, 3.03, 3.04, 3.05, 3.06, 3.07, 3.08, 3.09, 3.1, 3.11, 3.12, 3.13, 3.14, 3.15, 3.16, 3.17, 3.18, 3.19, 3.2, 3.21, 3.22, 3.23, 3.24, 3.25, 3.26, 3.27, 3.28, 3.29, 3.3, 3.31, 3.32, 3.33, 3.34, 3.35, 3.36, 3.37, 3.38, 3.39, 3.4, 3.41, 3.42, 3.43, 3.44, 3.45, 3.46, 
3.47, 3.48, 3.49, 3.5, 3.51, 3.52, 3.53, 3.54, 3.55, 3.56, 3.57, 3.58, 3.59, 3.6, 3.61, 3.62, 3.63, 3.64, 3.65, 3.66, 3.67, 3.68, 3.69, 3.7, 3.71, 3.72, 3.73, 3.74, 3.75, 3.76, 3.77, 3.78, 3.79, 3.8, 3.81, 3.82, 3.83, 3.84, 3.85, 3.86, 3.87, 3.88, 3.89, 3.9, 3.91, 3.92, 3.93, 3.94, 3.95, 3.96, 3.97, 3.98, 3.99, 4, 4.01, 4.02, 4.03, 4.04, 4.05, 4.06, 4.07, 4.08, 4.09, 4.1, 4.11, 4.12, 4.13, 4.14, 4.15, 4.16, 4.17, 4.18, 4.19, 4.2, 4.21, 4.22, 4.23, 4.24, 4.25, 4.26, 4.27, 4.28, 4.29, 4.3, 4.31, 4.32, 4.33, 4.34, 4.35, 4.36, 4.37, 4.38, 4.39, 4.4, 4.41, 4.42, 4.43, 4.44, 4.45, 4.46, 4.47, 4.48, 4.49, 4.5, 4.51, 4.52, 4.53, 4.54, 4.55, 4.56, 4.57, 4.58, 4.59, 4.6, 4.61, 4.62, 4.63, 4.64, 4.65, 4.66, 4.67, 4.68, 4.69, 4.7, 4.71, 4.72, 4.73, 4.74, 4.75, 4.76, 4.77, 4.78, 4.79, 4.8, 4.81, 4.82, 4.83, 4.84, 4.85, 4.86, 4.87, 4.88, 4.89, 4.9, 4.91, 4.92, 4.93, 4.94, 4.95, 4.96, 4.97, 4.98, 4.99, 5, 5.01, 5.02, 5.03, 5.04, 5.05, 5.06, 5.07, 5.08, 5.09, 5.1, 5.11, 5.12, 5.13, 5.14, 5.15, 5.16, 5.17, 5.18, 5.19, 5.2, 5.21, 5.22, 5.23, 5.24, 5.25, 5.26, 5.27, 5.28, 5.29, 5.3, 5.31, 5.32, 5.33, 5.34, 5.35, 5.36, 5.37, 5.38, 5.39, 5.4, 5.41, 5.42, 5.43, 5.44, 5.45, 5.46, 5.47, 5.48, 5.49, 5.5, 5.51, 5.52, 5.53, 5.54, 5.55, 5.56, 5.57, 5.58, 5.59, 5.6, 5.61, 5.62, 5.63, 5.64, 5.65, 5.66, 5.67, 5.68, 5.69, 5.7, 5.71, 5.72, 5.73, 5.74, 5.75, 5.76, 5.77, 5.78, 5.79, 5.8, 5.81, 5.82, 5.83, 5.84, 5.85, 5.86, 5.87, 5.88, 5.89, 5.9, 5.91, 5.92, 5.93, 5.94, 5.95, 5.96, 5.97, 5.98, 5.99, 6, 6.01, 6.02, 6.03, 6.04, 6.05, 6.06, 6.07, 6.08, 6.09, 6.1, 6.11, 6.12, 6.13, 6.14, 6.15, 6.16, 6.17, 6.18, 6.19, 6.2, 6.21, 6.22, 6.23, 6.24, 6.25, 6.26, 6.27, 6.28, 6.29, 6.3, 6.31, 6.32, 6.33, 6.34, 6.35, 6.36, 6.37, 6.38, 6.39, 6.4, 6.41, 6.42, 6.43, 6.44, 6.45, 6.46, 6.47, 6.48, 6.49, 6.5, 6.51, 6.52, 6.53, 6.54, 6.55, 6.56, 6.57, 6.58, 6.59, 6.6, 6.61, 6.62, 6.63, 6.64, 6.65, 6.66, 6.67, 6.68, 6.69, 6.7, 6.71, 6.72, 6.73, 6.74, 6.75, 6.76, 6.77, 6.78, 6.79, 6.8, 6.81, 6.82, 6.83, 6.84, 6.85, 6.86, 6.87, 6.88, 6.89, 6.9, 6.91, 6.92, 6.93, 6.94, 6.95, 6.96, 6.97, 6.98, 6.99, 7, 7.01, 7.02, 7.03, 7.04, 7.05, 7.06, 7.07, 7.08, 7.09, 7.1, 7.11, 7.12, 7.13, 7.14, 7.15, 7.16, 7.17, 7.18, 7.19, 7.2, 7.21, 7.22, 7.23, 7.24, 7.25, 7.26, 7.27, 7.28, 7.29, 7.3, 7.31, 7.32, 7.33, 7.34, 7.35, 7.36, 7.37, 7.38, 7.39, 7.4, 7.41, 7.42, 7.43, 7.44, 7.45, 7.46, 7.47, 7.48, 7.49, 7.5, 7.51, 7.52, 7.53, 7.54, 7.55, 7.56, 7.57, 7.58, 7.59, 7.6, 7.61, 7.62, 7.63, 7.64, 7.65, 7.66, 7.67, 7.68, 7.69, 7.7, 7.71, 7.72, 7.73, 7.74, 7.75, 7.76, 7.77, 7.78, 7.79, 7.8, 7.81, 7.82, 7.83, 7.84, 7.85, 7.86, 7.87, 7.88, 7.89, 7.9, 7.91, 7.92, 7.93, 7.94, 7.95, 7.96, 7.97, 7.98, 7.99, 8, 8.01, 8.02, 8.03, 8.04, 8.05, 8.06, 8.07, 8.08, 8.09, 8.1, 8.11, 8.12, 8.13, 8.14, 8.15, 8.16, 8.17, 8.18, 8.19, 8.2, 8.21, 8.22, 8.23, 8.24, 8.25, 8.26, 8.27, 8.28, 8.29, 8.3, 8.31, 8.32, 8.33, 8.34, 8.35, 8.36, 8.37, 8.38, 8.39, 8.4, 8.41, 8.42, 8.43] + frequencies = [665.03, 659.52, 646.07, 572.62, 590.38, 649.3, 600.02, 624.6, 646.16, 650.34, 646.06, 651.35, 650.49, 589.08, 603.26, 625.12, 627.36, 636.09, 660.45, 659.91, 648.32, 657.78, 597.3, 595.76, 594.63, 659.18, 625.98, 645.65, 645.76, 650.01, 652.28, 653.14, 664.93, 662.59, 660.76, 642.67, 644.79, 649.94, 625.44, 627.31, 645.4, 645.22, 652.12, 598.35, 623.99, 644.58, 645.09, 650.61, 650.01, 598.85, 656.88, 636.74, 652.16, 650.66, 649.87, 657.12, 625.48, 634.35, 661.24, 651.33, 655.77, 658.05, 661.75, 660.07, 661.37, 662.0, 662.29, 662.33, 664.63, 661.54, 661.31, 630.51, 590.5, 658.83, 625.16, 635.46, 661.08, 
659.04, 659.97, 666.0, 670.29, 666.15, 658.62, 662.12, 662.78, 656.71, 662.49, 661.36, 641.94, 647.05, 652.86, 598.11, 599.27, 656.18, 625.69, 659.14, 657.94, 659.05, 657.8, 656.69, 653.63, 638.82, 631.24, 625.38, 635.76, 658.93, 657.88, 660.1, 660.92, 613.83, 593.21, 608.63, 624.15, 633.75, 659.97, 658.17, 657.64, 590.49, 651.47, 656.66, 657.47, 637.08, 658.92, 659.45, 590.42, 592.88, 592.52, 596.55, 657.24, 660.3, 634.03, 635.35, 647.72, 648.55, 648.52, 647.42, 645.35, 648.98, 653.16, 650.04, 656.79, 644.61, 646.96, 648.74, 646.2, 642.01, 632.67, 698.0, 660.9, 635.71, 637.32, 646.88, 645.85, 644.92, 647.01, 646.71, 645.62, 645.52, 653.7, 660.06, 626.52, 646.6, 652.7, 653.21, 597.68, 658.95, 660.67, 636.1, 657.82, 659.44, 653.16, 652.2, 657.95, 660.01, 627.14, 636.14, 644.73, 649.03, 651.96, 662.32, 675.07, 679.81, 683.4, 692.69, 699.38, 695.89, 697.21, 698.53, 700.52, 704.75, 707.07, 691.78, 682.75, 677.22, 669.76, 660.38, 656.93, 645.32, 643.54, 647.62, 592.21, 658.68, 658.62, 660.63, 634.49, 642.43, 654.18, 659.81, 671.21, 678.8, 684.99, 690.6, 648.73, 598.17, 658.13, 659.56, 690.23, 642.85, 647.82, 648.53, 650.73, 647.39, 645.03, 647.35, 650.27, 649.18, 647.71, 646.93, 646.93, 640.58, 636.02, 632.72, 626.74, 619.02, 612.84, 597.52, 632.17, 663.0, 690.65, 643.34, 653.35, 647.88, 653.15, 652.6, 658.93, 658.99, 659.11, 658.06, 659.11, 660.6, 641.79, 610.65, 598.81, 659.95, 635.14, 660.15, 662.85, 651.13, 635.97, 658.73, 658.91, 659.11, 690.8, 695.01, 683.24, 681.22, 674.65, 663.59, 659.06, 649.47, 640.65, 632.87, 624.67, 645.13, 661.67, 646.04, 647.7, 651.95, 652.31, 659.4, 692.03, 635.43, 628.9, 624.13, 604.23, 597.85, 624.24, 624.9, 645.21, 645.66, 646.12, 649.43, 598.26, 657.7, 635.79, 652.29, 657.12, 659.24, 654.06, 646.01, 645.04, 584.87, 532.52, 537.38, 470.4, 419.16, 376.77, 331.49, 300.82, 266.99, 235.5, 208.51, 187.03, 166.8, 145.7, 128.59, 114.6, 111.88, 111.34, 111.4, 112.1, 112.58, 112.77, 112.31, 111.91, 111.61, 111.35, 111.24, 111.36, 111.71, 112.28, 112.58, 112.6, 112.72, 112.64, 112.59, 112.58, 112.61, 112.7, 112.64, 112.4, 112.35, 112.19, 112.12, 112.28, 112.71, 113.53, 114.07, 114.72, 115.76, 116.9, 118.35, 120.27, 122.07, 123.86, 125.5, 127.17, 128.75, 130.15, 130.75, 131.19, 131.61, 132.12, 132.39, 132.65, 133.09, 133.35, 133.27, 133.47, 133.67, 133.72, 134.03, 134.05, 133.88, 133.98, 133.74, 133.51, 133.39, 133.18, 132.97, 132.75, 132.42, 132.22, 132.33, 132.47, 132.95, 133.73, 134.7, 136.04, 137.57, 138.72, 140.38, 142.22, 144.34, 146.13, 147.59, 149.4, 151.91, 154.03, 155.77, 157.2, 158.19, 158.3, 157.86, 157.38, 156.9, 156.33, 155.93, 155.5, 155.25, 155.1, 154.86, 154.75, 154.79, 154.81, 154.89, 154.93, 154.99, 155.17, 155.21, 155.27, 155.36, 155.26, 155.27, 155.32, 155.59, 155.95, 157.1, 159.24, 161.65, 163.48, 165.11, 166.87, 169.52, 172.43, 174.91, 177.59, 180.13, 182.91, 185.0, 186.43, 187.73, 188.14, 188.07, 187.49, 186.78, 186.54, 186.58, 186.74, 186.89, 186.78, 186.7, 186.94, 187.54, 188.14, 188.32, 188.13, 187.78, 187.68, 187.78, 187.92, 188.01, 187.96, 188.35, 189.13, 190.21, 190.87, 191.13, 190.8, 190.31, 189.41, 188.83, 188.26, 187.96, 187.35, 186.75, 186.32, 185.88, 185.66, 185.5, 185.51, 185.99, 186.39, 187.09, 187.52, 187.75, 187.83, 188.01, 188.66, 189.73, 190.58, 190.67, 190.03, 189.38, 188.8, 188.28, 188.16, 188.06, 187.99, 187.94, 187.91, 188.05, 188.52, 189.14, 189.76, 190.26, 190.42, 190.45, 190.43, 190.29, 190.09, 189.81, 189.93, 189.84, 189.58, 189.03, 188.63, 188.51, 188.85, 189.62, 190.32, 190.56, 190.57, 190.23, 189.8, 189.29, 189.1, 
188.58, 187.76, 185.73, 183.73, 179.79, 174.77, 167.02, 164.36, 163.86, 164.43, 165.23, 166.1, 166.97, 167.8, 168.58, 169.3, 169.57, 169.86, 169.71, 169.03, 168.81, 168.58, 168.81, 169.13, 169.9, 170.26, 170.77, 171.23, 171.33, 171.52, 171.68, 171.48, 170.94, 170.4, 169.77, 169.56, 169.39, 169.32, 169.49, 169.51, 169.93, 170.61, 171.28, 171.92, 172.58, 172.45, 172.17, 172.0, 171.76, 171.48, 171.1, 170.78, 170.49, 170.16, 170.08, 170.66, 171.31, 172.1, 172.42, 172.75, 173.04, 173.32, 173.54, 173.76, 173.83, 173.66, 173.12, 172.18, 170.25, 166.95, 164.22, 159.63, 153.98, 149.11, 147.08, 147.28, 147.78, 148.69, 149.42, 149.85, 150.08, 150.11, 150.21, 150.23, 150.26, 150.09, 149.83, 149.6, 149.44, 149.46, 149.45, 149.46, 149.54, 149.83, 150.41, 151.16, 152.16, 152.9, 153.67, 154.13, 154.6, 154.88, 155.06, 155.09, 154.94, 154.86, 154.6, 154.1, 153.51, 152.86, 152.64, 152.4, 152.49, 152.71, 152.89, 153.26, 153.54, 154.2, 154.68, 155.2, 155.77, 156.31, 156.93, 157.24, 157.01, 157.61, 156.31, 170.0, 193.15, 209.91, 234.64, 255.53, 284.74, 306.29, 333.92, 377.15, 401.52, 448.67, 492.14, 535.32, 591.44, 589.15, 595.15, 607.96, 625.84, 636.75, 646.5, 644.68, 652.56, 658.79, 649.39, 631.42, 633.55, 645.17, 644.46, 649.97, 581.36, 581.38, 590.79, 601.73, 592.23, 591.62, 594.09, 633.63, 633.25, 627.58, 658.55, 663.0, 662.13, 648.45, 640.45, 634.05, 645.37, 659.85, 662.4, 667.63, 668.55, 662.2, 662.92, 661.72, 657.75, 653.06, 640.97, 628.55, 626.07, 629.33, 628.84, 635.41, 635.19, 645.97, 650.58, 655.78, 657.31, 657.48, 646.94, 645.51, 651.18, 655.65, 627.13, 647.02, 652.56, 651.73, 643.98, 649.71, 659.13, 689.11, 672.49, 653.14, 646.99, 647.11, 599.3, 624.89, 626.45, 634.17, 646.55, 653.63, 658.17, 662.79, 683.29, 691.02, 635.07, 615.7, 598.63, 616.46, 633.13, 658.27, 643.39, 647.15, 650.84, 656.71, 625.19, 646.25, 657.19, 647.88, 634.7, 636.86, 646.4, 649.72, 596.76, 597.36, 656.17, 626.22, 644.55, 644.42, 651.81, 625.18, 626.33, 634.01, 644.88, 651.92, 595.9, 652.5, 623.37, 656.65, 646.16, 645.91, 651.86, 596.94, 656.35, 658.52, 635.42, 652.11, 660.49, 590.83, 603.91, 613.4, 632.95, 646.0, 658.46, 646.69, 647.74, 590.35, 591.16, 650.7, 596.67, 657.69, 660.8, 689.3, 636.49, 658.31, 647.31, 645.17, 645.15, 649.0, 657.39, 636.03, 647.63, 657.81, 591.34, 596.66, 656.91, 658.68, 641.36, 648.52, 659.13, 590.37, 591.02, 650.28, 656.05, 624.6, 657.67, 652.16, 650.86, 650.79, 657.52, 634.04, 641.58, 645.91, 658.51, 625.29, 634.09, 645.1, 642.77, 634.01, 626.52, 645.07, 650.76101509] + frequencies_log_10 = [freq * 10 for freq in np.log10(frequencies)] + confidence = [0.04, 0.044, 0.109, 0.033, 0.094, 0.078, 0.085, 0.093, 0.099, 0.125, 0.156, 0.168, 0.094, 0.153, 0.063, 0.06, 0.095, 0.119, 0.121, 0.04, 0.098, 0.102, 0.076, 0.089, 0.076, 0.058, 0.075, 0.089, 0.139, 0.157, 0.144, 0.095, 0.032, 0.041, 0.094, 0.124, 0.112, 0.103, 0.104, 0.113, 0.096, 0.177, 0.149, 0.086, 0.079, 0.088, 0.134, 0.111, 0.071, 0.082, 0.097, 0.109, 0.149, 0.142, 0.154, 0.132, 0.117, 0.071, 0.071, 0.10, 0.098, 0.106, 0.087, 0.103, 0.067, 0.069, 0.078, 0.094, 0.303, 0.365, 0.056, 0.014, 0.037, 0.068, 0.106, 0.097, 0.09, 0.092, 0.034, 0.078, 0.028, 0.037, 0.016, 0.009, 0.042, 0.042, 0.041, 0.06, 0.115, 0.151, 0.132, 0.103, 0.092, 0.094, 0.08, 0.106, 0.138, 0.083, 0.077, 0.233, 0.273, 0.074, 0.073, 0.106, 0.103, 0.117, 0.081, 0.084, 0.051, 0.08, 0.036, 0.027, 0.047, 0.108, 0.085, 0.117, 0.099, 0.085, 0.084, 0.092, 0.105, 0.104, 0.086, 0.14, 0.083, 0.04, 0.083, 0.057, 0.08, 0.083, 0.058, 0.106, 0.089, 0.095, 0.046, 0.034, 0.039, 0.138, 0.23, 
0.628, 0.397, 0.106, 0.036, 0.034, 0.039, 0.048, 0.056, 0.067, 0.066, 0.047, 0.022, 0.078, 0.04, 0.026, 0.028, 0.069, 0.061, 0.105, 0.056, 0.056, 0.085, 0.097, 0.093, 0.093, 0.075, 0.061, 0.066, 0.10, 0.102, 0.147, 0.093, 0.083, 0.088, 0.083, 0.069, 0.051, 0.04, 0.05, 0.133, 0.075, 0.051, 0.051, 0.108, 0.229, 0.038, 0.03, 0.052, 0.043, 0.068, 0.056, 0.081, 0.131, 0.104, 0.072, 0.056, 0.098, 0.025, 0.047, 0.074, 0.063, 0.068, 0.067, 0.072, 0.084, 0.085, 0.105, 0.051, 0.047, 0.04, 0.116, 0.038, 0.073, 0.037, 0.072, 0.087, 0.083, 0.08, 0.081, 0.075, 0.057, 0.076, 0.046, 0.038, 0.015, 0.03, 0.259, 0.571, 0.455, 0.101, 0.131, 0.033, 0.08, 0.069, 0.094, 0.111, 0.046, 0.037, 0.042, 0.046, 0.083, 0.108, 0.129, 0.101, 0.083, 0.062, 0.091, 0.095, 0.105, 0.123, 0.046, 0.046, 0.045, 0.048, 0.075, 0.07, 0.064, 0.032, 0.045, 0.069, 0.058, 0.091, 0.096, 0.051, 0.028, 0.04, 0.021, 0.048, 0.067, 0.102, 0.071, 0.019, 0.041, 0.048, 0.084, 0.075, 0.108, 0.122, 0.108, 0.087, 0.088, 0.051, 0.079, 0.042, 0.016, 0.028, 0.09, 0.105, 0.084, 0.073, 0.122, 0.125, 0.073, 0.07, 0.099, 0.094, 0.137, 0.112, 0.068, 0.023, 0.069, 0.087, 0.04, 0.079, 0.026, 0.092, 0.027, 0.05, 0.12, 0.112, 0.094, 0.063, 0.041, 0.129, 0.758, 0.529, 0.106, 0.117, 0.432, 0.798, 0.908, 0.898, 0.879, 0.889, 0.89, 0.919, 0.926, 0.923, 0.918, 0.921, 0.916, 0.893, 0.891, 0.893, 0.899, 0.901, 0.904, 0.897, 0.891, 0.895, 0.892, 0.887, 0.893, 0.909, 0.916, 0.902, 0.889, 0.889, 0.918, 0.911, 0.937, 0.936, 0.912, 0.895, 0.915, 0.926, 0.891, 0.884, 0.893, 0.90, 0.934, 0.943, 0.942, 0.925, 0.924, 0.936, 0.945, 0.931, 0.937, 0.931, 0.938, 0.94, 0.952, 0.949, 0.942, 0.95, 0.941, 0.929, 0.936, 0.937, 0.945, 0.95, 0.932, 0.927, 0.938, 0.935, 0.945, 0.945, 0.94, 0.902, 0.915, 0.912, 0.88, 0.912, 0.915, 0.953, 0.959, 0.933, 0.922, 0.939, 0.955, 0.937, 0.959, 0.956, 0.961, 0.953, 0.938, 0.961, 0.967, 0.959, 0.95, 0.95, 0.951, 0.965, 0.958, 0.958, 0.96, 0.955, 0.955, 0.948, 0.95, 0.951, 0.957, 0.948, 0.956, 0.952, 0.962, 0.966, 0.927, 0.928, 0.936, 0.953, 0.966, 0.942, 0.897, 0.911, 0.923, 0.931, 0.921, 0.935, 0.953, 0.923, 0.932, 0.924, 0.927, 0.94, 0.924, 0.935, 0.929, 0.921, 0.93, 0.922, 0.918, 0.931, 0.94, 0.928, 0.92, 0.919, 0.938, 0.938, 0.935, 0.932, 0.933, 0.932, 0.92, 0.915, 0.928, 0.911, 0.901, 0.916, 0.931, 0.924, 0.903, 0.919, 0.933, 0.939, 0.918, 0.94, 0.953, 0.958, 0.963, 0.964, 0.955, 0.928, 0.93, 0.936, 0.933, 0.935, 0.932, 0.895, 0.929, 0.916, 0.91, 0.93, 0.917, 0.894, 0.918, 0.926, 0.928, 0.933, 0.935, 0.927, 0.929, 0.906, 0.916, 0.924, 0.926, 0.925, 0.917, 0.92, 0.924, 0.928, 0.931, 0.928, 0.93, 0.921, 0.905, 0.90, 0.905, 0.896, 0.921, 0.917, 0.912, 0.909, 0.924, 0.92, 0.916, 0.912, 0.898, 0.928, 0.944, 0.934, 0.815, 0.639, 0.781, 0.924, 0.956, 0.952, 0.976, 0.961, 0.955, 0.949, 0.909, 0.927, 0.935, 0.92, 0.906, 0.915, 0.916, 0.912, 0.927, 0.932, 0.915, 0.90, 0.918, 0.932, 0.941, 0.932, 0.926, 0.934, 0.939, 0.899, 0.906, 0.924, 0.929, 0.927, 0.928, 0.918, 0.901, 0.90, 0.937, 0.931, 0.95, 0.951, 0.934, 0.917, 0.924, 0.942, 0.937, 0.923, 0.918, 0.912, 0.909, 0.912, 0.926, 0.925, 0.943, 0.954, 0.958, 0.958, 0.956, 0.953, 0.953, 0.956, 0.96, 0.925, 0.861, 0.893, 0.864, 0.779, 0.709, 0.846, 0.944, 0.963, 0.957, 0.931, 0.939, 0.93, 0.926, 0.929, 0.929, 0.932, 0.931, 0.927, 0.925, 0.937, 0.936, 0.941, 0.935, 0.94, 0.932, 0.925, 0.931, 0.931, 0.938, 0.944, 0.939, 0.955, 0.958, 0.951, 0.951, 0.946, 0.953, 0.957, 0.96, 0.951, 0.931, 0.941, 0.944, 0.941, 0.942, 0.946, 0.94, 0.936, 0.93, 0.954, 0.954, 0.943, 0.954, 0.938, 0.876, 0.728, 0.592, 0.365, 0.058, 
0.025, 0.043, 0.031, 0.041, 0.029, 0.025, 0.012, 0.009, 0.018, 0.029, 0.033, 0.026, 0.018, 0.04, 0.069, 0.045, 0.078, 0.01, 0.064, 0.327, 0.034, 0.012, 0.091, 0.08, 0.082, 0.09, 0.117, 0.148, 0.123, 0.315, 0.07, 0.005, 0.008, 0.039, 0.13, 0.084, 0.081, 0.14, 0.102, 0.053, 0.043, 0.163, 0.067, 0.037, 0.04, 0.058, 0.029, 0.034, 0.03, 0.025, 0.261, 0.122, 0.047, 0.081, 0.047, 0.184, 0.10, 0.177, 0.092, 0.052, 0.032, 0.039, 0.127, 0.084, 0.072, 0.09, 0.125, 0.127, 0.128, 0.118, 0.097, 0.124, 0.114, 0.148, 0.124, 0.081, 0.095, 0.097, 0.11, 0.074, 0.14, 0.117, 0.089, 0.09, 0.089, 0.101, 0.122, 0.128, 0.123, 0.094, 0.062, 0.038, 0.035, 0.064, 0.032, 0.033, 0.045, 0.074, 0.109, 0.126, 0.112, 0.127, 0.093, 0.093, 0.059, 0.091, 0.026, 0.053, 0.10, 0.132, 0.115, 0.065, 0.071, 0.063, 0.081, 0.126, 0.143, 0.125, 0.093, 0.112, 0.102, 0.17, 0.128, 0.077, 0.087, 0.069, 0.061, 0.102, 0.139, 0.143, 0.07, 0.076, 0.124, 0.114, 0.14, 0.099, 0.033, 0.062, 0.056, 0.091, 0.041, 0.112, 0.116, 0.145, 0.077, 0.08, 0.073, 0.074, 0.09, 0.091, 0.105, 0.045, 0.034, 0.164, 0.131, 0.102, 0.082, 0.115, 0.085, 0.149, 0.081, 0.04, 0.064, 0.103, 0.135, 0.09, 0.13, 0.109, 0.071, 0.082, 0.08, 0.088, 0.099, 0.105, 0.133, 0.136, 0.133, 0.117, 0.098, 0.023, 0.105, 0.089, 0.102, 0.029, 0.097, 0.034, 0.076, 0.095, 0.103, 0.15038174] + + matrix = [[times[i], frequencies_log_10[i], confidence[i]] for i, _ in enumerate(times)] + # Act + df = pd.DataFrame(matrix) + df.columns = ['time', 'log 10 frequency', 'confidence'] + df_ss = pd.DataFrame(p.minmax_scale(df)) + df_ss.columns = ['time', 'log 10 frequency', 'confidence'] + + # apply custom weight to frequency + df_ss['log 10 frequency'] = df_ss['log 10 frequency'] / 2 + + clusters = 20 + labels = fit_kmeans(df_ss, clusters) + figure, axis = plt.subplots(2, 2) + axis[0][0].scatter(df['time'], df['log 10 frequency'], c=labels, cmap='Set1', s=5) + axis[0][0].set_title("Ratio 1:1:1") + + # apply custom weight to frequency + df_ss['log 10 frequency'] = df['log 10 frequency'] / 5 + labels = fit_kmeans(df_ss, clusters) + axis[1][0].scatter(df['time'], df['log 10 frequency'], c=labels, cmap='Set1', s=5) + axis[1][0].set_title("Ratio 1:5:1") + + # apply custom weight to frequency + df_ss['confidence'] = df['confidence'] / 100 + labels = fit_kmeans(df_ss, clusters) + axis[0][1].scatter(df['time'], df['log 10 frequency'], c=labels, cmap='Set1', s=5) + axis[0][1].set_title("Ratio 1:1:100") + + # apply custom weight to frequency + df_ss['time'] = df['time'] / 100 + labels = fit_kmeans(df_ss, clusters) + axis[1][1].scatter(df['time'], df['log 10 frequency'], c=labels, cmap='Set1', s=5) + axis[1][1].set_title("Ratio 100:1:1") + + figure.set_figwidth(12.8) + plt.show() + print("done") + + +def fit_kmeans(data, centers): + kmeans = KMeans(centers) + labels = kmeans.fit_predict(data) + return labels + + + if __name__ == "__main__": unittest.main() diff --git a/src/modules/plot.py b/src/modules/plot.py index 01b121e..01a3e7f 100644 --- a/src/modules/plot.py +++ b/src/modules/plot.py @@ -175,7 +175,7 @@ def determine_bounds(frequency_log_10: list[float]) -> tuple[float, float]: def set_figure_dimensions(time_range, frequency_log_10_range): """Dynamically scale the figure dimensions based on the duration/frequency amplitude of the song""" height = frequency_log_10_range / 0.06 - width = time_range / 2 + width = time_range / 4 plt.figure(1).set_figwidth(max(6.4, width)) plt.figure(1).set_figheight(max(4, height)) @@ -212,7 +212,7 @@ def create_gaps(pitched_data: PitchedData, step_size: float) -> 
PitchedData: def draw_words(transcribed_data, midi_notes): """Draw rectangles for each word""" - if transcribed_data is not None: + if transcribed_data is not None and len(transcribed_data) > 0: for i, data in enumerate(transcribed_data): note_frequency = librosa.note_to_hz(midi_notes[i]) frequency_range = get_frequency_range(midi_notes[i]) From 442c5b1e56183716fc980968771b3241959fc5a0 Mon Sep 17 00:00:00 2001 From: Benedikt Wagener Date: Tue, 3 Oct 2023 13:39:51 +0200 Subject: [PATCH 03/19] UltraSinger evaluation wip --- README.md | 1 + pytest/modules/UltraSinger.py | 30 +++++ src/Settings.py | 5 + src/UltraSinger.py | 45 +++++--- src/modules/Audio/separation.py | 17 +-- src/modules/Research/TestSong.py | 12 ++ src/modules/Research/UltraSingerEvaluation.py | 109 ++++++++++++++++++ src/modules/Ultrastar/ultrastar_converter.py | 4 +- src/modules/Ultrastar/ultrastar_parser.py | 23 +++- src/modules/Ultrastar/ultrastar_txt.py | 6 +- src/modules/Ultrastar/ultrastar_writer.py | 9 +- 11 files changed, 226 insertions(+), 35 deletions(-) create mode 100644 pytest/modules/UltraSinger.py create mode 100644 src/modules/Research/TestSong.py create mode 100644 src/modules/Research/UltraSingerEvaluation.py diff --git a/README.md b/README.md index 63dbea5..1895469 100644 --- a/README.md +++ b/README.md @@ -158,6 +158,7 @@ _Not all options working now!_ --hyphenation True|False >> ((default) is True) --disable_separation True|False >> ((default) is False) --disable_karaoke True|False >> ((default) is False) + --ignore_audio True|False >> ((default) is False) --create_audio_chunks True|False >> ((default) is False) --plot True|False >> ((default) is False) --force_cpu True|False >> ((default) is False) diff --git a/pytest/modules/UltraSinger.py b/pytest/modules/UltraSinger.py new file mode 100644 index 0000000..2aa2d27 --- /dev/null +++ b/pytest/modules/UltraSinger.py @@ -0,0 +1,30 @@ +"""Tests for UltraSinger.py""" + +import os +import unittest +import src.modules.Pitcher.pitcher as test_subject + +import pytest +from src.modules.plot import plot + + +class PitcherTest(unittest.TestCase): + # @pytest.mark.skip(reason="Skipping this FUNCTION level test, can be used for manual tests") + def test_get_pitch_with_crepe_file(self): + # Arrange + test_dir = os.path.dirname(os.path.abspath(__file__)) + root_dir = os.path.abspath(test_dir + "/../../..") + # test_file_abs_path = os.path.abspath(root_dir + "/test_input/audio_denoised.wav") + test_file_abs_path = os.path.abspath(root_dir + "/test_input/test_denoised.wav") + test_output = root_dir + "/test_output" + + # Act + # pitched_data = test_subject.get_pitch_with_crepe_file(test_file_abs_path, 'full', device="cuda") + # test_subject.get_pitch_with_crepe_file(test_file_abs_path, 'full', 'cpu', batch_size=1024) + # plot(pitched_data, test_output, title="pitching test") + + print("done") + + +if __name__ == "__main__": + unittest.main() diff --git a/src/Settings.py b/src/Settings.py index 6884d5d..b698681 100644 --- a/src/Settings.py +++ b/src/Settings.py @@ -5,6 +5,8 @@ class Settings: hyphenation = True use_separated_vocal = True create_karaoke = True + ignore_audio = False + input_file_is_ultrastar_txt = False input_file_path = "" output_file_path = "" @@ -31,3 +33,6 @@ class Settings: pytorch_device = 'cpu' # cpu|cuda tensorflow_device = 'cpu' # cpu|cuda force_cpu = False + + # UltraSinger Evaluation Configuration + test_songs_input_folder = None diff --git a/src/UltraSinger.py b/src/UltraSinger.py index 628b446..ded0f85 100644 --- a/src/UltraSinger.py +++ 
b/src/UltraSinger.py @@ -330,15 +330,13 @@ def merge_syllable_segments( def run() -> None: """The processing function of this program""" - is_audio = ".txt" not in settings.input_file_path + settings.input_file_is_ultrastar_txt = settings.input_file_path.endswith(".txt") + ultrastar_class = None real_bpm = None (title, artist, year, genre) = (None, None, None, None) - if not is_audio: # Parse Ultrastar txt - print( - f"{ULTRASINGER_HEAD} {gold_highlighted('re-pitch mode')}" - ) + if settings.input_file_is_ultrastar_txt: # Parse Ultrastar txt ( basename_without_ext, real_bpm, @@ -346,6 +344,13 @@ def run() -> None: ultrastar_audio_input_path, ultrastar_class, ) = parse_ultrastar_txt() + + if not ultrastar_class.mp3: + print( + f"{ULTRASINGER_HEAD} {red_highlighted('Error!')} The provided text file does not have a reference to " + f"an audio file." + ) + exit(1) elif settings.input_file_path.startswith("https:"): # Youtube print( f"{ULTRASINGER_HEAD} {gold_highlighted('full automatic mode')}" @@ -384,7 +389,7 @@ def run() -> None: # Audio transcription transcribed_data = None language = settings.language - if is_audio: + if not settings.ignore_audio: detected_language, transcribed_data = transcribe_audio() if language is None: language = detected_language @@ -409,7 +414,6 @@ def run() -> None: if settings.create_audio_chunks: create_audio_chunks( cache_path, - is_audio, transcribed_data, ultrastar_audio_input_path, ultrastar_class, @@ -417,7 +421,7 @@ def run() -> None: # Pitch the audio midi_notes, pitched_data, ultrastar_note_numbers = pitch_audio( - is_audio, transcribed_data, ultrastar_class + transcribed_data, ultrastar_class ) transcribed_data, midi_notes, ultrastar_note_numbers = merge_syllable_segments(transcribed_data, midi_notes, ultrastar_note_numbers) @@ -427,7 +431,7 @@ def run() -> None: plot(pitched_data, song_output, transcribed_data, midi_notes) # Write Ultrastar txt - if is_audio: + if not settings.ignore_audio: real_bpm, ultrastar_file_output = create_ultrastar_txt_from_automation( audio_separation_path, basename_without_ext, @@ -448,7 +452,7 @@ def run() -> None: # Calc Points ultrastar_class, simple_score, accurate_score = calculate_score_points( - is_audio, pitched_data, ultrastar_class, ultrastar_file_output + pitched_data, ultrastar_class, ultrastar_file_output ) # Add calculated score to Ultrastar txt @@ -506,7 +510,7 @@ def separate_vocal_from_audio( ) -> str: """Separate vocal from audio""" audio_separation_path = os.path.join( - cache_path, "separated", "htdemucs", basename_without_ext + cache_path, "separated", "htdemucs", os.path.splitext(os.path.basename(ultrastar_audio_input_path))[0] ) if settings.use_separated_vocal or settings.create_karaoke: @@ -522,10 +526,10 @@ def separate_vocal_from_audio( def calculate_score_points( - is_audio: bool, pitched_data: PitchedData, ultrastar_class: UltrastarTxtValue, ultrastar_file_output: str + pitched_data: PitchedData, ultrastar_class: UltrastarTxtValue, ultrastar_file_output: str ): """Calculate score points""" - if is_audio: + if not settings.ignore_audio: ultrastar_class = ultrastar_parser.parse_ultrastar_txt( ultrastar_file_output ) @@ -735,12 +739,13 @@ def parse_ultrastar_txt() -> tuple[str, float, str, str, UltrastarTxtValue]: float(ultrastar_class.bpm.replace(",", ".")) ) ultrastar_mp3_name = ultrastar_class.mp3 - basename_without_ext = os.path.splitext(ultrastar_mp3_name)[0] + + basename_without_ext = f"{ultrastar_class.artist} - {ultrastar_class.title}" dirname = os.path.dirname(settings.input_file_path) 
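+    # The song output folder created below is now named after the artist and
+    # title tags instead of the mp3 file name.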
ultrastar_audio_input_path = os.path.join(dirname, ultrastar_mp3_name) song_output = os.path.join( settings.output_file_path, - ultrastar_class.artist + " - " + ultrastar_class.title, + basename_without_ext, ) song_output = get_unused_song_output_dir(song_output) os_helper.create_folder(song_output) @@ -771,7 +776,7 @@ def create_midi_file(real_bpm: float, ) -def pitch_audio(is_audio: bool, transcribed_data: list[TranscribedData], ultrastar_class: UltrastarTxtValue) -> tuple[ +def pitch_audio(transcribed_data: list[TranscribedData], ultrastar_class: UltrastarTxtValue) -> tuple[ list[str], PitchedData, list[int]]: """Pitch audio""" # todo: chunk pitching as option? @@ -782,7 +787,7 @@ def pitch_audio(is_audio: bool, transcribed_data: list[TranscribedData], ultrast settings.crepe_step_size, settings.tensorflow_device ) - if is_audio: + if not settings.ignore_audio: start_times = [] end_times = [] for i, data in enumerate(transcribed_data): @@ -802,7 +807,6 @@ def pitch_audio(is_audio: bool, transcribed_data: list[TranscribedData], ultrast def create_audio_chunks( cache_path: str, - is_audio: bool, transcribed_data: list[TranscribedData], ultrastar_audio_input_path: str, ultrastar_class: UltrastarTxtValue @@ -812,7 +816,7 @@ def create_audio_chunks( cache_path, settings.audio_chunk_folder_name ) os_helper.create_folder(audio_chunks_path) - if is_audio: # and csv + if not settings.ignore_audio: # and csv csv_filename = os.path.join(audio_chunks_path, "_chunks.csv") export_chunks_from_transcribed_data( settings.mono_audio_path, transcribed_data, audio_chunks_path @@ -883,6 +887,8 @@ def init_settings(argv: list[str]) -> None: settings.create_karaoke = not arg elif opt in ("--create_audio_chunks"): settings.create_audio_chunks = arg + elif opt in ("--ignore_audio"): + settings.ignore_audio = arg in ["True", "true"] elif opt in ("--force_cpu"): settings.force_cpu = arg if settings.force_cpu: @@ -917,6 +923,7 @@ def arg_options(): "disable_separation=", "disable_karaoke=", "create_audio_chunks=", + "ignore_audio=", "force_cpu=", ] return long, short diff --git a/src/modules/Audio/separation.py b/src/modules/Audio/separation.py index 42885c4..bc2dd37 100644 --- a/src/modules/Audio/separation.py +++ b/src/modules/Audio/separation.py @@ -1,7 +1,10 @@ """Separate vocals from audio""" - +import os +import shlex import subprocess +import demucs.separate + from modules.console_colors import ( ULTRASINGER_HEAD, blue_highlighted, @@ -10,17 +13,17 @@ from modules.os_helper import current_executor_path, move, path_join -def separate_audio(input_file_path: str, output_file: str, device="cpu") -> None: +def separate_audio(input_file_path: str, output_folder: str, device="cpu") -> None: """Separate vocals from audio with demucs.""" print( f"{ULTRASINGER_HEAD} Separating vocals from audio with {blue_highlighted('demucs')} and {red_highlighted(device)} as worker." ) + + demucs.separate.main(shlex.split(f'--two-stems vocals -d {device} --out "{os.path.join(output_folder, "separated")}" "{input_file_path}"')) # Model selection? 
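+    # demucs.separate.main is the programmatic entry point documented by
+    # demucs v4; it accepts the same argument list as the CLI, so another
+    # model could be selected by appending e.g.: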
# -n mdx_q # -n htdemucs_ft - subprocess.run( - ["demucs", "-d", device, "--two-stems=vocals", input_file_path] - ) - separated_folder = path_join(current_executor_path(), "separated") - move(separated_folder, output_file) \ No newline at end of file + # subprocess.run( + # ["demucs", "-d", device, "--two-stems=vocals", input_file_path.replace("\\", "/")] + # ) \ No newline at end of file diff --git a/src/modules/Research/TestSong.py b/src/modules/Research/TestSong.py new file mode 100644 index 0000000..3aa262d --- /dev/null +++ b/src/modules/Research/TestSong.py @@ -0,0 +1,12 @@ +from dataclasses import dataclass + +from modules.Ultrastar.ultrastar_txt import UltrastarTxtValue + + +@dataclass +class TestSong: + """Test song""" + + txt: str + audio: float + ultrastar_class: UltrastarTxtValue \ No newline at end of file diff --git a/src/modules/Research/UltraSingerEvaluation.py b/src/modules/Research/UltraSingerEvaluation.py new file mode 100644 index 0000000..17cfcf6 --- /dev/null +++ b/src/modules/Research/UltraSingerEvaluation.py @@ -0,0 +1,109 @@ +import copy +import os +import sys +from datetime import datetime +from pathlib import Path +from typing import List + +import UltraSinger +from Settings import Settings +from modules.DeviceDetection.device_detection import check_gpu_support +from modules.Research.TestSong import TestSong +from modules.Ultrastar import ultrastar_parser +from modules.console_colors import ULTRASINGER_HEAD, red_highlighted + +test_input_folder = os.path.normpath( + os.path.abspath(__file__ + "../../../../../test_input") +) +test_output_folder = os.path.normpath( + os.path.abspath(__file__ + "../../../../../test_output") +) +test_run_folder = os.path.join( + test_output_folder, datetime.now().strftime("%Y-%m-%d_%H-%M-%S") +) + + +def main() -> None: + """Main function""" + test_input_folder_path = Path(test_input_folder) + test_input_folder_path.mkdir(parents=True, exist_ok=True) + + test_output_folder_path = Path(test_output_folder) + test_output_folder_path.mkdir(parents=True, exist_ok=True) + + test_run_folder_path = Path(test_run_folder) + test_run_folder_path.mkdir(parents=True) + + base_settings = initialize_settings() + base_settings.output_file_path = test_run_folder + + base_settings.test_songs_input_folder = os.path.normpath( + base_settings.test_songs_input_folder + ) + if not os.path.isdir(base_settings.test_songs_input_folder): + print( + f"{ULTRASINGER_HEAD} {red_highlighted('Error!')} No test songs input folder configured (refer to " + f"evaluation section in readme)." + ) + exit(1) + + test_songs: List[TestSong] = [] + for dir_entry in os.listdir(base_settings.test_songs_input_folder): + dir_entry_path = os.path.join(base_settings.test_songs_input_folder, dir_entry) + if os.path.isdir(dir_entry_path): + for sub_dir_entry in os.listdir(dir_entry_path): + if sub_dir_entry.endswith(".txt") and sub_dir_entry != "license.txt": + txt_file = os.path.join( + base_settings.test_songs_input_folder, dir_entry, sub_dir_entry + ) + ultrastar_class = ultrastar_parser.parse_ultrastar_txt(txt_file) + + if ultrastar_class.mp3: + test_song = TestSong( + txt_file, ultrastar_class.mp3, ultrastar_class + ) + test_songs.append(test_song) + break + else: + print( + f"{ULTRASINGER_HEAD} {red_highlighted('Warning.')} {base_settings.test_songs_input_folder} contains an UltraStar text file but has no audio referenced in it. Skipping." 
+ ) + + if len(test_songs) == 0: + print( + f"{ULTRASINGER_HEAD} {red_highlighted('Error!')} No test songs found in {base_settings.test_songs_input_folder}." + ) + exit(1) + + print(f"{ULTRASINGER_HEAD} Running evaluation for {len(test_songs)} songs") + + for index, test_song in enumerate(test_songs): + print(f"{ULTRASINGER_HEAD} ========================") + print( + f"{ULTRASINGER_HEAD} {index+1}/{len(test_songs)}: {os.path.basename(test_song.txt)}" + ) + + test_song_settings = copy.deepcopy(base_settings) + test_song_settings.input_file_path = test_song.txt + UltraSinger.settings = test_song_settings + UltraSinger.run() + + +def initialize_settings(): + s = Settings() + user_config_file = os.path.normpath( + os.path.join(test_input_folder, "config/local.py") + ) + if os.path.isfile(user_config_file): + sys.path.append(os.path.join(user_config_file, "..")) + import local + + s = local.user_settings + + if not s.force_cpu: + s.tensorflow_device, s.pytorch_device = check_gpu_support() + return s + + +if __name__ == "__main__": + main() diff --git a/src/modules/Ultrastar/ultrastar_converter.py b/src/modules/Ultrastar/ultrastar_converter.py index d9978eb..97f79ee 100644 --- a/src/modules/Ultrastar/ultrastar_converter.py +++ b/src/modules/Ultrastar/ultrastar_converter.py @@ -51,7 +51,7 @@ def ultrastar_note_to_midi_note(ultrastar_note: int) -> int: def get_start_time_from_ultrastar(ultrastar_class: UltrastarTxtValue, pos: int) -> float: """Calculates the start time from the Ultrastar txt""" - gap = int(ultrastar_class.gap) / 1000 + gap = int(float(ultrastar_class.gap) / 1000) real_bpm = ultrastar_bpm_to_real_bpm( float(ultrastar_class.bpm.replace(",", ".")) ) @@ -64,7 +64,7 @@ def get_start_time_from_ultrastar(ultrastar_class: UltrastarTxtValue, pos: int) def get_end_time_from_ultrastar(ultrastar_class: UltrastarTxtValue, pos: int) -> float: """Calculates the end time from the Ultrastar txt""" - gap = int(ultrastar_class.gap) / 1000 + gap = int(float(ultrastar_class.gap) / 1000) real_bpm = ultrastar_bpm_to_real_bpm( float(ultrastar_class.bpm.replace(",", ".")) ) diff --git a/src/modules/Ultrastar/ultrastar_parser.py b/src/modules/Ultrastar/ultrastar_parser.py index 9bba162..f1aaca2 100644 --- a/src/modules/Ultrastar/ultrastar_parser.py +++ b/src/modules/Ultrastar/ultrastar_parser.py @@ -5,7 +5,13 @@ get_end_time_from_ultrastar, get_start_time_from_ultrastar, ) -from modules.Ultrastar.ultrastar_txt import UltrastarTxtValue, UltrastarTxtTag, UltrastarTxtNoteTypeTag, FILE_ENCODING +from modules.Ultrastar.ultrastar_txt import ( + UltrastarTxtValue, + UltrastarTxtTag, + UltrastarTxtNoteTypeTag, + FILE_ENCODING, +) + def parse_ultrastar_txt(input_file: str) -> UltrastarTxtValue: """Parse ultrastar txt file to UltrastarTxt class""" @@ -31,12 +37,23 @@ def parse_ultrastar_txt(input_file: str) -> UltrastarTxtValue: ultrastar_class.gap = line.split(":")[1].replace("\n", "") elif line.startswith(f"#{UltrastarTxtTag.BPM}"): ultrastar_class.bpm = line.split(":")[1].replace("\n", "") - elif line.startswith(( + elif line.startswith(f"#{UltrastarTxtTag.VIDEO}"): + ultrastar_class.video = line.split(":")[1].replace("\n", "") + elif line.startswith(f"#{UltrastarTxtTag.VIDEOGAP}"): + ultrastar_class.videoGap = line.split(":")[1].replace("\n", "") + elif line.startswith(f"#{UltrastarTxtTag.COVER}"): + ultrastar_class.cover = line.split(":")[1].replace("\n", "") + elif line.startswith(f"#{UltrastarTxtTag.BACKGROUND}"): + ultrastar_class.background = line.split(":")[1].replace("\n", "") + elif line.startswith( + ( 
f"{UltrastarTxtNoteTypeTag.FREESTYLE} ", f"{UltrastarTxtNoteTypeTag.NORMAL} ", f"{UltrastarTxtNoteTypeTag.GOLDEN} ", f"{UltrastarTxtNoteTypeTag.RAP} ", - f"{UltrastarTxtNoteTypeTag.RAP_GOLDEN} ")): + f"{UltrastarTxtNoteTypeTag.RAP_GOLDEN} ", + ) + ): parts = line.split() # [0] F : * R G # [1] start beat diff --git a/src/modules/Ultrastar/ultrastar_txt.py b/src/modules/Ultrastar/ultrastar_txt.py index d22f91d..3e21273 100644 --- a/src/modules/Ultrastar/ultrastar_txt.py +++ b/src/modules/Ultrastar/ultrastar_txt.py @@ -14,9 +14,11 @@ class UltrastarTxtTag(str, Enum): BPM = 'BPM' LANGUAGE = 'LANGUAGE' COVER = 'COVER' # Path to cover. Should end with `*[CO].jpg` + BACKGROUND = 'BACKGROUND' # Path to background. Is shown when there is no video. Should end with `*[BG].jpg` CREATOR = 'CREATOR' COMMENT = 'COMMENT' VIDEO = 'VIDEO' + VIDEOGAP = 'VIDEOGAP' FILE_END = 'E' LINEBREAK = '-' @@ -24,8 +26,6 @@ class UltrastarTxtTag(str, Enum): FIXER = 'FIXER' # Unused - BACKGROUND = 'BACKGROUND' # Path to background. Is shown when there is no video. Should end with `*[BG].jpg` - VIDEOGAP = 'VIDEOGAP' GENRE = 'GENRE' EDITION = 'EDITION' YEAR = 'YEAR' @@ -63,10 +63,12 @@ class UltrastarTxtValue: genre = "" mp3 = "" video = None + videoGap = None gap = "" bpm = "" language = None cover = None + background = None creator = "UltraSinger [GitHub]" comment = "UltraSinger [GitHub]" startBeat = [] diff --git a/src/modules/Ultrastar/ultrastar_writer.py b/src/modules/Ultrastar/ultrastar_writer.py index ecd616b..9dee2e3 100644 --- a/src/modules/Ultrastar/ultrastar_writer.py +++ b/src/modules/Ultrastar/ultrastar_writer.py @@ -65,10 +65,15 @@ def create_ultrastar_txt_from_automation( file.write(f"#{UltrastarTxtTag.GENRE}:{ultrastar_class.genre}\n") if ultrastar_class.cover is not None: file.write(f"#{UltrastarTxtTag.COVER}:{ultrastar_class.cover}\n") + if ultrastar_class.background is not None: + file.write(f"#{UltrastarTxtTag.BACKGROUND}:{ultrastar_class.background}\n") file.write(f"#{UltrastarTxtTag.MP3}:{ultrastar_class.mp3}\n") - file.write(f"#{UltrastarTxtTag.VIDEO}:{ultrastar_class.video}\n") - file.write(f"#{UltrastarTxtTag.BPM}:{round(ultrastar_bpm, 2)}\n") # not the real BPM! file.write(f"#{UltrastarTxtTag.GAP}:{int(gap * 1000)}\n") + if ultrastar_class.video is not None: + file.write(f"#{UltrastarTxtTag.VIDEO}:{ultrastar_class.video}\n") + if ultrastar_class.videoGap is not None: + file.write(f"#{UltrastarTxtTag.VIDEOGAP}:{ultrastar_class.videoGap}\n") + file.write(f"#{UltrastarTxtTag.BPM}:{round(ultrastar_bpm, 2)}\n") # not the real BPM! 
file.write(f"#{UltrastarTxtTag.CREATOR}:{ultrastar_class.creator}\n") file.write(f"#{UltrastarTxtTag.FIXER}:{ultrastar_class.fixer}\n") file.write(f"#{UltrastarTxtTag.COMMENT}:{ultrastar_class.comment}\n") From 63ad200990fbd1ccb98858431064f232ff31b2d3 Mon Sep 17 00:00:00 2001 From: Benedikt Wagener Date: Fri, 6 Oct 2023 17:36:32 +0200 Subject: [PATCH 04/19] add caching for separation, denoise, transcription and pitching --- .../Speech_Recognition/test_Whisper.py | 15 +- .../UltraStar/test_ultrastar_writer.py | 23 +- src/Settings.py | 5 + src/UltraSinger.py | 358 ++++++++++-------- src/modules/Pitcher/PitchingResult.py | 15 + src/modules/Pitcher/pitched_data.py | 3 + src/modules/Pitcher/pitcher.py | 3 + src/modules/Research/TestSong.py | 1 + src/modules/Research/UltraSingerEvaluation.py | 31 +- .../Speech_Recognition/TranscribedData.py | 36 +- .../Speech_Recognition/TranscriptionResult.py | 14 + src/modules/Speech_Recognition/Whisper.py | 10 +- src/modules/Ultrastar/ultrastar_writer.py | 2 +- src/modules/console_colors.py | 6 + src/modules/csv_handler.py | 2 +- 15 files changed, 314 insertions(+), 210 deletions(-) create mode 100644 src/modules/Pitcher/PitchingResult.py create mode 100644 src/modules/Speech_Recognition/TranscriptionResult.py diff --git a/pytest/modules/Speech_Recognition/test_Whisper.py b/pytest/modules/Speech_Recognition/test_Whisper.py index 8f78701..cbb0d07 100644 --- a/pytest/modules/Speech_Recognition/test_Whisper.py +++ b/pytest/modules/Speech_Recognition/test_Whisper.py @@ -30,14 +30,13 @@ def test_convert_to_transcribed_data(self): # Words should have space at the end expected_output = [ - TranscribedData( - {"word": "UltraSinger ", "start": 1.23, "end": 2.34, "is_hyphen": None, "confidence": 0.95}), - TranscribedData({"word": "is ", "start": 2.34, "end": 3.45, "is_hyphen": None, "confidence": 0.9}), - TranscribedData({"word": "cool! ", "start": 3.45, "end": 4.56, "is_hyphen": None, "confidence": 0.85}), - TranscribedData({"word": "And ", "start": 4.56, "end": 5.67, "is_hyphen": None, "confidence": 0.95}), - TranscribedData({"word": "will ", "start": 5.67, "end": 6.78, "is_hyphen": None, "confidence": 0.9}), - TranscribedData({"word": "be ", "start": 6.78, "end": 7.89, "is_hyphen": None, "confidence": 0.85}), - TranscribedData({"word": "better! ", "start": 7.89, "end": 9.01, "is_hyphen": None, "confidence": 0.8}), + TranscribedData.from_dict({"word": "UltraSinger ", "start": 1.23, "end": 2.34, "is_hyphen": None, "confidence": 0.95}), + TranscribedData.from_dict({"word": "is ", "start": 2.34, "end": 3.45, "is_hyphen": None, "confidence": 0.9}), + TranscribedData.from_dict({"word": "cool! ", "start": 3.45, "end": 4.56, "is_hyphen": None, "confidence": 0.85}), + TranscribedData.from_dict({"word": "And ", "start": 4.56, "end": 5.67, "is_hyphen": None, "confidence": 0.95}), + TranscribedData.from_dict({"word": "will ", "start": 5.67, "end": 6.78, "is_hyphen": None, "confidence": 0.9}), + TranscribedData.from_dict({"word": "be ", "start": 6.78, "end": 7.89, "is_hyphen": None, "confidence": 0.85}), + TranscribedData.from_dict({"word": "better! 
", "start": 7.89, "end": 9.01, "is_hyphen": None, "confidence": 0.8}), ] # Act diff --git a/pytest/modules/UltraStar/test_ultrastar_writer.py b/pytest/modules/UltraStar/test_ultrastar_writer.py index 4e82f4f..e614cb8 100644 --- a/pytest/modules/UltraStar/test_ultrastar_writer.py +++ b/pytest/modules/UltraStar/test_ultrastar_writer.py @@ -42,20 +42,20 @@ def test_create_ultrastar_txt_from_automation_full_values(self): def arrange(self): # Arrange transcribed_data = [ - TranscribedData({ - "conf": 0.95, + TranscribedData.from_dict({ + "confidence": 0.95, "word": "UltraSinger ", "end": 2.5, "start": 0.5 }), - TranscribedData({ - "conf": 0.9, + TranscribedData.from_dict({ + "confidence": 0.9, "word": "is ", "end": 4.5, "start": 3.0 }), - TranscribedData({ - "conf": 0.85, + TranscribedData.from_dict({ + "confidence": 0.85, "word": "cool! ", "end": 7.5, "start": 5.5 @@ -85,7 +85,14 @@ def default_values(default_ultrastar_class): f"#{UltrastarTxtTag.ARTIST}:{default_ultrastar_class.artist}\n", f"#{UltrastarTxtTag.TITLE}:{default_ultrastar_class.title}\n", f"#{UltrastarTxtTag.MP3}:{default_ultrastar_class.mp3}\n", - f"#{UltrastarTxtTag.VIDEO}:{default_ultrastar_class.video}\n", # todo: video is optional + ] + + if default_ultrastar_class.video is not None: + expected_calls += [ + f"#{UltrastarTxtTag.VIDEO}:{default_ultrastar_class.video}\n", + ] + + expected_calls += [ f"#{UltrastarTxtTag.BPM}:390.0\n", f"#{UltrastarTxtTag.GAP}:500\n", f"#{UltrastarTxtTag.CREATOR}:{default_ultrastar_class.creator}\n", @@ -96,7 +103,7 @@ def default_values(default_ultrastar_class): ": 65 39 2 is \n", "- 104\n", ": 130 52 3 cool! \n", - "E" + "E", ] return expected_calls diff --git a/src/Settings.py b/src/Settings.py index b698681..efcbf9e 100644 --- a/src/Settings.py +++ b/src/Settings.py @@ -36,3 +36,8 @@ class Settings: # UltraSinger Evaluation Configuration test_songs_input_folder = None + cache_override_path = None + skip_cache_vocal_separation = False + skip_cache_denoise_vocal_audio = False + skip_cache_transcription = False + skip_cache_pitch_detection = False diff --git a/src/UltraSinger.py b/src/UltraSinger.py index ded0f85..06b5d6f 100644 --- a/src/UltraSinger.py +++ b/src/UltraSinger.py @@ -2,6 +2,7 @@ import copy import getopt +import json import os import sys @@ -17,6 +18,8 @@ export_chunks_from_ultrastar_data, ) from modules.Audio.silence_processing import remove_silence_from_transcription_data +from modules.Pitcher.PitchingResult import PitchingResult +from modules.Speech_Recognition.TranscriptionResult import TranscriptionResult from modules.csv_handler import export_transcribed_data_to_csv from modules.Audio.convert_audio import convert_audio_to_mono_wav, convert_wav_to_mp3 from modules.Audio.youtube import ( @@ -32,6 +35,7 @@ gold_highlighted, light_blue_highlighted, red_highlighted, + green_highlighted, ) from modules.Midi import midi_creator from modules.Midi.midi_creator import ( @@ -44,12 +48,22 @@ get_pitch_with_crepe_file, ) from modules.Pitcher.pitched_data import PitchedData -from modules.Speech_Recognition.hyphenation import hyphenation, language_check, create_hyphenator +from modules.Speech_Recognition.hyphenation import ( + hyphenation, + language_check, + create_hyphenator, +) from modules.Speech_Recognition.Whisper import transcribe_with_whisper -from modules.Ultrastar import ultrastar_score_calculator, ultrastar_writer, ultrastar_converter, ultrastar_parser -from modules.Ultrastar.ultrastar_txt import UltrastarTxtValue +from modules.Ultrastar import ( + ultrastar_score_calculator, + 
ultrastar_writer, + ultrastar_converter, + ultrastar_parser, +) +from modules.Ultrastar.ultrastar_txt import UltrastarTxtValue, FILE_ENCODING from Settings import Settings from modules.Speech_Recognition.TranscribedData import TranscribedData +from modules.os_helper import check_file_exists from modules.plot import plot from modules.musicbrainz_client import get_music_infos @@ -57,6 +71,7 @@ SYLLABLE_SEGMENT_SIZE = 0.1 SYLLABLE_SEGMENT_MAX_GAP_FOR_MERGE = 0.1 + def convert_midi_notes_to_ultrastar_notes(midi_notes: list[str]) -> list[int]: """Convert midi notes to ultrastar notes""" print(f"{ULTRASINGER_HEAD} Creating Ultrastar notes from midi data") @@ -65,9 +80,7 @@ def convert_midi_notes_to_ultrastar_notes(midi_notes: list[str]) -> list[int]: for i in enumerate(midi_notes): pos = i[0] note_number_librosa = librosa.note_to_midi(midi_notes[pos]) - pitch = ultrastar_converter.midi_note_to_ultrastar_note( - note_number_librosa - ) + pitch = ultrastar_converter.midi_note_to_ultrastar_note(note_number_librosa) ultrastar_note_numbers.append(pitch) # todo: Progress? # print( @@ -78,9 +91,7 @@ def convert_midi_notes_to_ultrastar_notes(midi_notes: list[str]) -> list[int]: def pitch_each_chunk_with_crepe(directory: str) -> list[str]: """Pitch each chunk with crepe and return midi notes""" - print( - f"{ULTRASINGER_HEAD} Pitching each chunk with {blue_highlighted('crepe')}" - ) + print(f"{ULTRASINGER_HEAD} Pitching each chunk with {blue_highlighted('crepe')}") midi_notes = [] for filename in sorted( @@ -109,7 +120,9 @@ def pitch_each_chunk_with_crepe(directory: str) -> list[str]: return midi_notes -def add_hyphen_to_data(transcribed_data: list[TranscribedData], hyphen_words: list[list[str]]): +def add_hyphen_to_data( + transcribed_data: list[TranscribedData], hyphen_words: list[list[str]] +): """Add hyphen to transcribed data return new data list""" new_data = [] @@ -141,9 +154,7 @@ def get_bpm_from_data(data, sampling_rate): onset_env = librosa.onset.onset_strength(y=data, sr=sampling_rate) wav_tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sampling_rate) - print( - f"{ULTRASINGER_HEAD} BPM is {blue_highlighted(str(round(wav_tempo[0], 2)))}" - ) + print(f"{ULTRASINGER_HEAD} BPM is {blue_highlighted(str(round(wav_tempo[0], 2)))}") return wav_tempo[0] @@ -223,12 +234,12 @@ def remove_unecessary_punctuations(transcribed_data: list[TranscribedData]) -> N """Remove unecessary punctuations from transcribed data""" punctuation = ".," for i, data in enumerate(transcribed_data): - data.word = data.word.translate( - {ord(i): None for i in punctuation} - ) + data.word = data.word.translate({ord(i): None for i in punctuation}) -def hyphenate_each_word(language: str, transcribed_data: list[TranscribedData]) -> list[list[str]] | None: +def hyphenate_each_word( + language: str, transcribed_data: list[TranscribedData] +) -> list[list[str]] | None: """Hyphenate each word in the transcribed data.""" hyphenated_word = [] lang_region = language_check(language) @@ -241,9 +252,7 @@ def hyphenate_each_word(language: str, transcribed_data: list[TranscribedData]) hyphenator = create_hyphenator(lang_region) for i in tqdm(enumerate(transcribed_data)): pos = i[0] - hyphenated_word.append( - hyphenation(transcribed_data[pos].word, hyphenator) - ) + hyphenated_word.append(hyphenation(transcribed_data[pos].word, hyphenator)) return hyphenated_word @@ -261,18 +270,19 @@ def print_support() -> None: ) -def split_syllables_into_segments(transcribed_data: list[TranscribedData]) -> list[TranscribedData]: +def 
split_syllables_into_segments(
+    transcribed_data: list[TranscribedData],
+) -> list[TranscribedData]:
     """Split every syllable into sub-segments"""
     segment_size_decimal_points = len(str(SYLLABLE_SEGMENT_SIZE).split(".")[1])
     new_data = []

     for i, data in enumerate(transcribed_data):
-
         duration = data.end - data.start
         if duration <= SYLLABLE_SEGMENT_SIZE:
             new_data.append(data)
             continue
-
+
         has_space = str(data.word).endswith(" ")
         first_segment = copy.deepcopy(data)
         filler_words_start = data.start + SYLLABLE_SEGMENT_SIZE
@@ -289,26 +299,28 @@ def split_syllables_into_segments(transcribed_data: list[TranscribedData]) -> li
         for i in range(int(full_segments)):
             segment = TranscribedData()
             segment.word = "~"
-            segment.start = filler_words_start + round(i * SYLLABLE_SEGMENT_SIZE, segment_size_decimal_points)
+            segment.start = filler_words_start + round(
+                i * SYLLABLE_SEGMENT_SIZE, segment_size_decimal_points
+            )
             segment.end = segment.start + SYLLABLE_SEGMENT_SIZE
             new_data.append(segment)
-
+
         if partial_segment >= 0.01:
             segment = TranscribedData()
             segment.word = "~"
-            segment.start = filler_words_start + round(full_segments * SYLLABLE_SEGMENT_SIZE, segment_size_decimal_points)
+            segment.start = filler_words_start + round(
+                full_segments * SYLLABLE_SEGMENT_SIZE, segment_size_decimal_points
+            )
             segment.end = segment.start + partial_segment
             new_data.append(segment)
-
+
         if has_space:
             new_data[-1].word += " "
     return new_data


 def merge_syllable_segments(
-    transcribed_data: list[TranscribedData],
-    midi_notes: list[str],
-    us_notes = list[int]
+    transcribed_data: list[TranscribedData], midi_notes: list[str], us_notes: list[int]
 ) -> tuple[list[TranscribedData], list[str], list[int]]:
     """Merge sub-segments of a syllable where the pitch is the same"""
     new_data = []
@@ -318,7 +330,12 @@ def merge_syllable_segments(
     previous_data = None

     for i, data in enumerate(transcribed_data):
-        if str(data.word).startswith("~") and previous_data is not None and midi_notes[i] == midi_notes[i-1] and data.start - previous_data.end <= SYLLABLE_SEGMENT_MAX_GAP_FOR_MERGE:
+        if (
+            str(data.word).startswith("~")
+            and previous_data is not None
+            and midi_notes[i] == midi_notes[i - 1]
+            and data.start - previous_data.end <= SYLLABLE_SEGMENT_MAX_GAP_FOR_MERGE
+        ):
             new_data[-1].end = data.end
         else:
             new_data.append(data)
@@ -352,36 +369,31 @@ def run() -> None:
         )
         exit(1)
     elif settings.input_file_path.startswith("https:"):  # Youtube
-        print(
-            f"{ULTRASINGER_HEAD} {gold_highlighted('full automatic mode')}"
-        )
+        print(f"{ULTRASINGER_HEAD} {gold_highlighted('full automatic mode')}")
         (
             basename_without_ext,
             song_output,
             ultrastar_audio_input_path,
-            (title, artist, year, genre)
+            (title, artist, year, genre),
         ) = download_from_youtube()
     else:  # Audio File
-        print(
-            f"{ULTRASINGER_HEAD} {gold_highlighted('full automatic mode')}"
-        )
+        print(f"{ULTRASINGER_HEAD} {gold_highlighted('full automatic mode')}")
         (
             basename_without_ext,
             song_output,
             ultrastar_audio_input_path,
-            (title, artist, year, genre)
+            (title, artist, year, genre),
         ) = infos_from_audio_input_file()

-    cache_path = os.path.join(song_output, "cache")
-    settings.mono_audio_path = os.path.join(
-        cache_path, basename_without_ext + ".wav"
+    cache_path = (
+        os.path.join(song_output, "cache")
+        if settings.cache_override_path is None
+        else settings.cache_override_path
     )
-    os_helper.create_folder(cache_path)
+    settings.mono_audio_path = os.path.join(cache_path, basename_without_ext + ".wav")

     # Separate vocal from audio
-    audio_separation_path = separate_vocal_from_audio(
-
basename_without_ext, cache_path, ultrastar_audio_input_path
-    )
+    audio_separation_path = separate_vocal_from_audio(basename_without_ext, cache_path, ultrastar_audio_input_path)

     # Denoise vocal audio
     denoise_vocal_audio(basename_without_ext, cache_path)
@@ -390,13 +402,13 @@ def run() -> None:
     transcribed_data = None
     language = settings.language
     if not settings.ignore_audio:
-        detected_language, transcribed_data = transcribe_audio()
+        transcription_result = transcribe_audio(cache_path)

         if language is None:
-            language = detected_language
+            language = transcription_result.detected_language

-        remove_unecessary_punctuations(transcribed_data)
+        remove_unecessary_punctuations(transcription_result.transcribed_data)
         transcribed_data = remove_silence_from_transcription_data(
-            settings.mono_audio_path, transcribed_data
+            settings.mono_audio_path, transcription_result.transcribed_data
         )

         if settings.hyphenation:
@@ -421,10 +433,12 @@ def run() -> None:

     # Pitch the audio
     midi_notes, pitched_data, ultrastar_note_numbers = pitch_audio(
-        transcribed_data, ultrastar_class
+        transcribed_data, ultrastar_class, cache_path
     )

-    transcribed_data, midi_notes, ultrastar_note_numbers = merge_syllable_segments(transcribed_data, midi_notes, ultrastar_note_numbers)
+    transcribed_data, midi_notes, ultrastar_note_numbers = merge_syllable_segments(
+        transcribed_data, midi_notes, ultrastar_note_numbers
+    )

     # Create plot
     if settings.create_plot:
         plot(pitched_data, song_output, transcribed_data, midi_notes)
@@ -443,7 +457,7 @@ def run() -> None:
             title,
             artist,
             year,
-            genre
+            genre,
         )
     else:
         ultrastar_file_output = create_ultrastar_txt_from_ultrastar_data(
@@ -456,9 +470,7 @@ def run() -> None:
     )

     # Add calculated score to Ultrastar txt
-    ultrastar_writer.add_score_to_ultrastar_txt(
-        ultrastar_file_output, simple_score
-    )
+    ultrastar_writer.add_score_to_ultrastar_txt(ultrastar_file_output, simple_score)

     # Midi
     if settings.create_midi:
@@ -488,36 +500,60 @@ def get_unused_song_output_dir(path: str) -> str:
     return path


-def transcribe_audio() -> (str, list[TranscribedData]):
+def transcribe_audio(cache_path: str) -> TranscriptionResult:
     """Transcribe audio with AI"""
+    transcription_result = None
     if settings.transcriber == "whisper":
-        transcribed_data, detected_language = transcribe_with_whisper(
-            settings.mono_audio_path,
-            settings.whisper_model,
-            settings.pytorch_device,
-            settings.whisper_align_model,
-            settings.whisper_batch_size,
-            settings.whisper_compute_type,
-            settings.language,
-        )
+        transcription_config = f"{settings.transcriber}_{settings.whisper_model}_{settings.pytorch_device}_{settings.whisper_align_model}_{settings.whisper_batch_size}_{settings.whisper_compute_type}_{settings.language}"
+        transcription_path = os.path.join(cache_path, f"{transcription_config}.json")
+        cached_transcription_available = check_file_exists(transcription_path)
+        if settings.skip_cache_transcription or not cached_transcription_available:
+            transcription_result = transcribe_with_whisper(
+                settings.mono_audio_path,
+                settings.whisper_model,
+                settings.pytorch_device,
+                settings.whisper_align_model,
+                settings.whisper_batch_size,
+                settings.whisper_compute_type,
+                settings.language,
+            )
+            with open(transcription_path, "w", encoding=FILE_ENCODING) as file:
+                file.write(transcription_result.to_json())
+        else:
+            print(f"{ULTRASINGER_HEAD} {green_highlighted('cache')} reusing cached transcribed data")
+            with open(transcription_path, encoding=FILE_ENCODING) as file:
+                json = file.read()
+                transcription_result = TranscriptionResult.from_json(json)
     else:
         raise NotImplementedError
-    return
detected_language, transcribed_data + return transcription_result def separate_vocal_from_audio( - basename_without_ext: str, cache_path: str, ultrastar_audio_input_path: str + basename_without_ext: str, cache_path: str, ultrastar_audio_input_path: str ) -> str: """Separate vocal from audio""" + demcus_output_folder = os.path.splitext( + os.path.basename(ultrastar_audio_input_path) + )[0] audio_separation_path = os.path.join( - cache_path, "separated", "htdemucs", os.path.splitext(os.path.basename(ultrastar_audio_input_path))[0] + cache_path, "separated", "htdemucs", demcus_output_folder ) + vocals_path = os.path.join(audio_separation_path, "vocals.wav") + instrumental_path = os.path.join(audio_separation_path, "no_vocals.wav") if settings.use_separated_vocal or settings.create_karaoke: - separate_audio(ultrastar_audio_input_path, cache_path, settings.pytorch_device) + cache_available = (check_file_exists(vocals_path) + and check_file_exists(instrumental_path)) + if settings.skip_cache_vocal_separation or not cache_available: + separate_audio( + ultrastar_audio_input_path, cache_path, settings.pytorch_device + ) + else: + print(f"{ULTRASINGER_HEAD} {green_highlighted('cache')} reusing cached separated vocals") if settings.use_separated_vocal: - input_path = os.path.join(audio_separation_path, "vocals.wav") + input_path = vocals_path else: input_path = ultrastar_audio_input_path @@ -526,22 +562,18 @@ def separate_vocal_from_audio( def calculate_score_points( - pitched_data: PitchedData, ultrastar_class: UltrastarTxtValue, ultrastar_file_output: str + pitched_data: PitchedData, + ultrastar_class: UltrastarTxtValue, + ultrastar_file_output: str, ): """Calculate score points""" if not settings.ignore_audio: - ultrastar_class = ultrastar_parser.parse_ultrastar_txt( - ultrastar_file_output - ) + ultrastar_class = ultrastar_parser.parse_ultrastar_txt(ultrastar_file_output) ( simple_score, accurate_score, - ) = ultrastar_score_calculator.calculate_score( - pitched_data, ultrastar_class - ) - ultrastar_score_calculator.print_score_calculation( - simple_score, accurate_score - ) + ) = ultrastar_score_calculator.calculate_score(pitched_data, ultrastar_class) + ultrastar_score_calculator.print_score_calculation(simple_score, accurate_score) else: print( f"{ULTRASINGER_HEAD} {blue_highlighted('Score of original Ultrastar txt')}" @@ -549,32 +581,24 @@ def calculate_score_points( ( simple_score, accurate_score, - ) = ultrastar_score_calculator.calculate_score( - pitched_data, ultrastar_class - ) - ultrastar_score_calculator.print_score_calculation( - simple_score, accurate_score - ) + ) = ultrastar_score_calculator.calculate_score(pitched_data, ultrastar_class) + ultrastar_score_calculator.print_score_calculation(simple_score, accurate_score) print( f"{ULTRASINGER_HEAD} {blue_highlighted('Score of re-pitched Ultrastar txt')}" ) - ultrastar_class = ultrastar_parser.parse_ultrastar_txt( - ultrastar_file_output - ) + ultrastar_class = ultrastar_parser.parse_ultrastar_txt(ultrastar_file_output) ( simple_score, accurate_score, - ) = ultrastar_score_calculator.calculate_score( - pitched_data, ultrastar_class - ) - ultrastar_score_calculator.print_score_calculation( - simple_score, accurate_score - ) + ) = ultrastar_score_calculator.calculate_score(pitched_data, ultrastar_class) + ultrastar_score_calculator.print_score_calculation(simple_score, accurate_score) return ultrastar_class, simple_score, accurate_score def create_ultrastar_txt_from_ultrastar_data( - song_output: str, ultrastar_class: 
UltrastarTxtValue, ultrastar_note_numbers: list[int] + song_output: str, + ultrastar_class: UltrastarTxtValue, + ultrastar_note_numbers: list[int], ) -> str: """Create Ultrastar txt from Ultrastar data""" output_repitched_ultrastar = os.path.join( @@ -599,7 +623,7 @@ def create_ultrastar_txt_from_automation( title: str, artist: str, year: str, - genre: str + genre: str, ): """Create Ultrastar txt from automation""" ultrastar_header = UltrastarTxtValue() @@ -610,9 +634,7 @@ def create_ultrastar_txt_from_automation( ultrastar_header.language = language cover = basename_without_ext + " [CO].jpg" ultrastar_header.cover = ( - cover - if os_helper.check_file_exists(os.path.join(song_output, cover)) - else None + cover if os_helper.check_file_exists(os.path.join(song_output, cover)) else None ) # Additional data @@ -626,9 +648,7 @@ def create_ultrastar_txt_from_automation( ultrastar_header.genre = genre real_bpm = get_bpm_from_file(ultrastar_audio_input_path) - ultrastar_file_output = os.path.join( - song_output, basename_without_ext + ".txt" - ) + ultrastar_file_output = os.path.join(song_output, basename_without_ext + ".txt") ultrastar_writer.create_ultrastar_txt_from_automation( transcribed_data, ultrastar_note_numbers, @@ -683,9 +703,17 @@ def infos_from_audio_input_file() -> tuple[str, str, str, tuple[str, str, str, s song_output = get_unused_song_output_dir(song_output) os_helper.create_folder(song_output) os_helper.copy(settings.input_file_path, song_output) - os_helper.rename(os.path.join(song_output, os.path.basename(settings.input_file_path)), os.path.join(song_output, basename)) + os_helper.rename( + os.path.join(song_output, os.path.basename(settings.input_file_path)), + os.path.join(song_output, basename), + ) ultrastar_audio_input_path = os.path.join(song_output, basename) - return basename_without_ext, song_output, ultrastar_audio_input_path, (title, artist, year_info, genre_info) + return ( + basename_without_ext, + song_output, + ultrastar_audio_input_path, + (title, artist, year_info, genre_info), + ) FILENAME_REPLACEMENTS = (('?:"', ""), ("<", "("), (">", ")"), ("/\\|*", "-")) @@ -706,7 +734,9 @@ def download_from_youtube() -> tuple[str, str, str, tuple[str, str, str, str]]: (artist, title) = get_youtube_title(settings.input_file_path) # Get additional data for song - (title_info, artist_info, year_info, genre_info) = get_music_infos(f"{artist} - {title}") + (title_info, artist_info, year_info, genre_info) = get_music_infos( + f"{artist} - {title}" + ) if title_info is not None: title = title_info @@ -717,24 +747,23 @@ def download_from_youtube() -> tuple[str, str, str, tuple[str, str, str, str]]: song_output = os.path.join(settings.output_file_path, basename_without_ext) song_output = get_unused_song_output_dir(song_output) os_helper.create_folder(song_output) - download_youtube_audio( - settings.input_file_path, basename_without_ext, song_output - ) - download_youtube_video( - settings.input_file_path, basename_without_ext, song_output - ) + download_youtube_audio(settings.input_file_path, basename_without_ext, song_output) + download_youtube_video(settings.input_file_path, basename_without_ext, song_output) download_youtube_thumbnail( settings.input_file_path, basename_without_ext, song_output ) ultrastar_audio_input_path = os.path.join(song_output, basename) - return basename_without_ext, song_output, ultrastar_audio_input_path, (title, artist, year_info, genre_info) + return ( + basename_without_ext, + song_output, + ultrastar_audio_input_path, + (title, artist, year_info, 
genre_info),
+    )


 def parse_ultrastar_txt() -> tuple[str, float, str, str, UltrastarTxtValue]:
     """Parse Ultrastar txt"""
-    ultrastar_class = ultrastar_parser.parse_ultrastar_txt(
-        settings.input_file_path
-    )
+    ultrastar_class = ultrastar_parser.parse_ultrastar_txt(settings.input_file_path)
     real_bpm = ultrastar_converter.ultrastar_bpm_to_real_bpm(
         float(ultrastar_class.bpm.replace(",", "."))
     )
@@ -758,63 +787,78 @@ def parse_ultrastar_txt() -> tuple[str, float, str, str, UltrastarTxtValue]:
     )


-def create_midi_file(real_bpm: float,
-                     song_output: str,
-                     ultrastar_class: UltrastarTxtValue,
-                     basename_without_ext: str) -> None:
+def create_midi_file(
+    real_bpm: float,
+    song_output: str,
+    ultrastar_class: UltrastarTxtValue,
+    basename_without_ext: str,
+) -> None:
     """Create midi file"""
-    print(
-        f"{ULTRASINGER_HEAD} Creating Midi with {blue_highlighted('pretty_midi')}"
-    )
+    print(f"{ULTRASINGER_HEAD} Creating Midi with {blue_highlighted('pretty_midi')}")

     voice_instrument = [
         midi_creator.convert_ultrastar_to_midi_instrument(ultrastar_class)
     ]
     midi_output = os.path.join(song_output, f"{basename_without_ext}.mid")
-    midi_creator.instruments_to_midi(
-        voice_instrument, real_bpm, midi_output
-    )
+    midi_creator.instruments_to_midi(voice_instrument, real_bpm, midi_output)


-def pitch_audio(transcribed_data: list[TranscribedData], ultrastar_class: UltrastarTxtValue) -> tuple[
-    list[str], PitchedData, list[int]]:
+def pitch_audio(
+    transcribed_data: list[TranscribedData], ultrastar_class: UltrastarTxtValue, cache_path: str) -> tuple[
+    list[str], PitchedData, list[int]]:
     """Pitch audio"""
     # todo: chunk pitching as option?
     # midi_notes = pitch_each_chunk_with_crepe(chunk_folder_name)
-    pitched_data = get_pitch_with_crepe_file(
-        settings.mono_audio_path,
-        settings.crepe_model_capacity,
-        settings.crepe_step_size,
-        settings.tensorflow_device
-    )
-    if not settings.ignore_audio:
-        start_times = []
-        end_times = []
-        for i, data in enumerate(transcribed_data):
-            start_times.append(data.start)
-            end_times.append(data.end)
-        midi_notes = create_midi_notes_from_pitched_data(
-            start_times, end_times, pitched_data
+
+    pitching_config = f"crepe_{settings.ignore_audio}_{settings.crepe_model_capacity}_{settings.crepe_step_size}_{settings.tensorflow_device}"
+    pitching_path = os.path.join(cache_path, f"{pitching_config}.json")
+    cache_available = check_file_exists(pitching_path)
+
+    if settings.skip_cache_pitch_detection or not cache_available:
+        pitched_data = get_pitch_with_crepe_file(
+            settings.mono_audio_path,
+            settings.crepe_model_capacity,
+            settings.crepe_step_size,
+            settings.tensorflow_device,
+        )
+        if not settings.ignore_audio:
+            start_times = []
+            end_times = []
+            for i, data in enumerate(transcribed_data):
+                start_times.append(data.start)
+                end_times.append(data.end)
+            midi_notes = create_midi_notes_from_pitched_data(
+                start_times, end_times, pitched_data
+            )
+        else:
+            midi_notes = create_midi_notes_from_pitched_data(
+                ultrastar_class.startTimes, ultrastar_class.endTimes, pitched_data
+            )
+        ultrastar_note_numbers = convert_midi_notes_to_ultrastar_notes(midi_notes)
+
+        pitching_result = PitchingResult(midi_notes, pitched_data, ultrastar_note_numbers)
+
+        pitching_result_json = pitching_result.to_json()
+        with open(pitching_path, "w", encoding=FILE_ENCODING) as file:
+            file.write(pitching_result_json)
     else:
-        midi_notes = create_midi_notes_from_pitched_data(
-            ultrastar_class.startTimes, ultrastar_class.endTimes, pitched_data
-        )
-    ultrastar_note_numbers = convert_midi_notes_to_ultrastar_notes(midi_notes)
-    return midi_notes,
pitched_data, ultrastar_note_numbers + print(f"{ULTRASINGER_HEAD} {green_highlighted('cache')} reusing cached pitch data") + with open(pitching_path) as file: + json = file.read() + pitching_result = PitchingResult.from_json(json) + + return pitching_result.midi_notes, pitching_result.pitched_data, pitching_result.ultrastar_note_numbers def create_audio_chunks( cache_path: str, transcribed_data: list[TranscribedData], ultrastar_audio_input_path: str, - ultrastar_class: UltrastarTxtValue + ultrastar_class: UltrastarTxtValue, ) -> None: """Create audio chunks""" - audio_chunks_path = os.path.join( - cache_path, settings.audio_chunk_folder_name - ) + audio_chunks_path = os.path.join(cache_path, settings.audio_chunk_folder_name) os_helper.create_folder(audio_chunks_path) if not settings.ignore_audio: # and csv csv_filename = os.path.join(audio_chunks_path, "_chunks.csv") @@ -830,10 +874,14 @@ def create_audio_chunks( def denoise_vocal_audio(basename_without_ext: str, cache_path: str) -> None: """Denoise vocal audio""" - denoised_path = os.path.join( - cache_path, basename_without_ext + "_denoised.wav" - ) - ffmpeg_reduce_noise(settings.mono_audio_path, denoised_path) + denoised_path = os.path.join(cache_path, basename_without_ext + "_denoised.wav") + cache_available = check_file_exists(denoised_path) + + if settings.skip_cache_denoise_vocal_audio or not cache_available: + ffmpeg_reduce_noise(settings.mono_audio_path, denoised_path) + else: + print(f"{ULTRASINGER_HEAD} {green_highlighted('cache')} reusing cached denoised audio") + settings.mono_audio_path = denoised_path diff --git a/src/modules/Pitcher/PitchingResult.py b/src/modules/Pitcher/PitchingResult.py new file mode 100644 index 0000000..47a74c3 --- /dev/null +++ b/src/modules/Pitcher/PitchingResult.py @@ -0,0 +1,15 @@ +from dataclasses import dataclass + +from dataclasses_json import dataclass_json + +from modules.Pitcher.pitched_data import PitchedData + + +@dataclass_json +@dataclass +class PitchingResult: + """Pitching result""" + + midi_notes: list[str] + pitched_data: PitchedData + ultrastar_note_numbers: list[int] diff --git a/src/modules/Pitcher/pitched_data.py b/src/modules/Pitcher/pitched_data.py index f2d32df..3edb088 100644 --- a/src/modules/Pitcher/pitched_data.py +++ b/src/modules/Pitcher/pitched_data.py @@ -1,7 +1,10 @@ """Pitched data""" from dataclasses import dataclass +from dataclasses_json import dataclass_json + +@dataclass_json @dataclass class PitchedData: """Pitched data from crepe""" diff --git a/src/modules/Pitcher/pitcher.py b/src/modules/Pitcher/pitcher.py index c3fc81b..80506c7 100644 --- a/src/modules/Pitcher/pitcher.py +++ b/src/modules/Pitcher/pitcher.py @@ -41,6 +41,9 @@ def get_pitch_with_crepe(audio, sample_rate: int, model_capacity: str, step_size confidence, perceived_loudness = set_confidence_to_zero_in_silent_regions(confidence, audio, step_size=step_size) timer.log('Computing loudness end') + # convert to native float for serialization + confidence = [float(x) for x in confidence] + return PitchedData(times, frequencies, confidence, perceived_loudness) diff --git a/src/modules/Research/TestSong.py b/src/modules/Research/TestSong.py index 3aa262d..fc0d72e 100644 --- a/src/modules/Research/TestSong.py +++ b/src/modules/Research/TestSong.py @@ -8,5 +8,6 @@ class TestSong: """Test song""" txt: str + folder: str audio: float ultrastar_class: UltrastarTxtValue \ No newline at end of file diff --git a/src/modules/Research/UltraSingerEvaluation.py b/src/modules/Research/UltraSingerEvaluation.py index 
17cfcf6..d0108c0 100644 --- a/src/modules/Research/UltraSingerEvaluation.py +++ b/src/modules/Research/UltraSingerEvaluation.py @@ -25,14 +25,9 @@ def main() -> None: """Main function""" - test_input_folder_path = Path(test_input_folder) - test_input_folder_path.mkdir(parents=True, exist_ok=True) - - test_output_folder_path = Path(test_output_folder) - test_output_folder_path.mkdir(parents=True, exist_ok=True) - - test_run_folder_path = Path(test_run_folder) - test_run_folder_path.mkdir(parents=True) + Path(test_input_folder).mkdir(parents=True, exist_ok=True) + Path(test_output_folder).mkdir(parents=True, exist_ok=True) + Path(test_run_folder).mkdir(parents=True) base_settings = initialize_settings() base_settings.output_file_path = test_run_folder @@ -49,19 +44,15 @@ def main() -> None: test_songs: List[TestSong] = [] for dir_entry in os.listdir(base_settings.test_songs_input_folder): - dir_entry_path = os.path.join(base_settings.test_songs_input_folder, dir_entry) - if os.path.isdir(dir_entry_path): - for sub_dir_entry in os.listdir(dir_entry_path): - if sub_dir_entry.endswith(".txt") and sub_dir_entry != "license.txt": - txt_file = os.path.join( - base_settings.test_songs_input_folder, dir_entry, sub_dir_entry - ) + song_folder = os.path.join(base_settings.test_songs_input_folder, dir_entry) + if os.path.isdir(song_folder): + for song_folder_item in os.listdir(song_folder): + if song_folder_item.endswith(".txt") and song_folder_item != "license.txt": + txt_file = os.path.join(song_folder, song_folder_item) ultrastar_class = ultrastar_parser.parse_ultrastar_txt(txt_file) if ultrastar_class.mp3: - test_song = TestSong( - txt_file, ultrastar_class.mp3, ultrastar_class - ) + test_song = TestSong(txt_file, song_folder, ultrastar_class.mp3, ultrastar_class) test_songs.append(test_song) break else: @@ -83,8 +74,12 @@ def main() -> None: f"{ULTRASINGER_HEAD} {index+1}/{len(test_songs)}: {os.path.basename(test_song.txt)}" ) + # prepare cache directory + song_cache_path = os.path.join(test_song.folder, "cache") + Path(song_cache_path).mkdir(parents=True, exist_ok=True) test_song_settings = copy.deepcopy(base_settings) test_song_settings.input_file_path = test_song.txt + test_song_settings.cache_override_path = song_cache_path UltraSinger.settings = test_song_settings UltraSinger.run() diff --git a/src/modules/Speech_Recognition/TranscribedData.py b/src/modules/Speech_Recognition/TranscribedData.py index 5962d9a..d006204 100644 --- a/src/modules/Speech_Recognition/TranscribedData.py +++ b/src/modules/Speech_Recognition/TranscribedData.py @@ -1,20 +1,28 @@ -"""Docstring""" +from dataclasses import dataclass +from dataclasses_json import dataclass_json + +@dataclass_json +@dataclass class TranscribedData: """Transcribed data from json file""" - def __init__(self, transcribed_json = None): + confidence: float = 0 + word: str = "" + start: float = 0 + end: float = 0 + is_hyphen: bool = False + - if transcribed_json: - # Vosk = conf, Whisper = confidence - self.conf = transcribed_json.get( - "conf", transcribed_json.get("confidence", None) - ) - # Vosk = word, Whisper = text - self.word = transcribed_json.get( - "word", transcribed_json.get("text", None) - ) - self.end = transcribed_json.get("end", None) - self.start = transcribed_json.get("start", None) - self.is_hyphen = None +def from_whisper(whisper_dict) -> TranscribedData: + transcribed_data = TranscribedData() + if "score" in whisper_dict: + transcribed_data.confidence = whisper_dict["score"] + if "word" in whisper_dict: + transcribed_data.word 
= whisper_dict["word"] + if "start" in whisper_dict: + transcribed_data.start = whisper_dict["start"] + if "end" in whisper_dict: + transcribed_data.end = whisper_dict["end"] + return transcribed_data diff --git a/src/modules/Speech_Recognition/TranscriptionResult.py b/src/modules/Speech_Recognition/TranscriptionResult.py new file mode 100644 index 0000000..1fa055f --- /dev/null +++ b/src/modules/Speech_Recognition/TranscriptionResult.py @@ -0,0 +1,14 @@ +from dataclasses import dataclass + +from dataclasses_json import dataclass_json + +from modules.Speech_Recognition.TranscribedData import TranscribedData + + +@dataclass_json +@dataclass +class TranscriptionResult: + """Transcription result""" + + transcribed_data: list[TranscribedData] + detected_language: str diff --git a/src/modules/Speech_Recognition/Whisper.py b/src/modules/Speech_Recognition/Whisper.py index 0545ff7..565b123 100644 --- a/src/modules/Speech_Recognition/Whisper.py +++ b/src/modules/Speech_Recognition/Whisper.py @@ -5,8 +5,9 @@ import whisperx from torch.cuda import OutOfMemoryError +from modules.Speech_Recognition.TranscriptionResult import TranscriptionResult from modules.console_colors import ULTRASINGER_HEAD, blue_highlighted, red_highlighted -from modules.Speech_Recognition.TranscribedData import TranscribedData +from modules.Speech_Recognition.TranscribedData import TranscribedData, from_whisper def transcribe_with_whisper( @@ -17,7 +18,7 @@ def transcribe_with_whisper( batch_size: int = 16, compute_type: str = None, language: str = None, -) -> (list[TranscribedData], str): +) -> TranscriptionResult: """Transcribe with whisper""" print( @@ -90,20 +91,19 @@ def transcribe_with_whisper( transcribed_data = convert_to_transcribed_data(result_aligned) - return transcribed_data, detected_language + return TranscriptionResult(transcribed_data, detected_language) def convert_to_transcribed_data(result_aligned): transcribed_data = [] for segment in result_aligned["segments"]: for obj in segment["words"]: - vtd = TranscribedData(obj) # create custom Word object + vtd = from_whisper(obj) # create custom Word object vtd.word = vtd.word + " " # add space to end of word if len(obj) < 4: previous = transcribed_data[-1] if not previous: previous.end = 0 - previous.end = "" vtd.start = previous.end + 0.1 vtd.end = previous.end + 0.2 msg = f'Error: There is no timestamp for word: {obj["word"]}. ' \ diff --git a/src/modules/Ultrastar/ultrastar_writer.py b/src/modules/Ultrastar/ultrastar_writer.py index 9dee2e3..f6497c9 100644 --- a/src/modules/Ultrastar/ultrastar_writer.py +++ b/src/modules/Ultrastar/ultrastar_writer.py @@ -68,12 +68,12 @@ def create_ultrastar_txt_from_automation( if ultrastar_class.background is not None: file.write(f"#{UltrastarTxtTag.BACKGROUND}:{ultrastar_class.background}\n") file.write(f"#{UltrastarTxtTag.MP3}:{ultrastar_class.mp3}\n") - file.write(f"#{UltrastarTxtTag.GAP}:{int(gap * 1000)}\n") if ultrastar_class.video is not None: file.write(f"#{UltrastarTxtTag.VIDEO}:{ultrastar_class.video}\n") if ultrastar_class.videoGap is not None: file.write(f"#{UltrastarTxtTag.VIDEOGAP}:{ultrastar_class.videoGap}\n") file.write(f"#{UltrastarTxtTag.BPM}:{round(ultrastar_bpm, 2)}\n") # not the real BPM! 
+    file.write(f"#{UltrastarTxtTag.GAP}:{int(gap * 1000)}\n")
     file.write(f"#{UltrastarTxtTag.CREATOR}:{ultrastar_class.creator}\n")
     file.write(f"#{UltrastarTxtTag.FIXER}:{ultrastar_class.fixer}\n")
     file.write(f"#{UltrastarTxtTag.COMMENT}:{ultrastar_class.comment}\n")
diff --git a/src/modules/console_colors.py b/src/modules/console_colors.py
index c4cc6f8..59328ff 100644
--- a/src/modules/console_colors.py
+++ b/src/modules/console_colors.py
@@ -8,6 +8,11 @@ def blue_highlighted(text: str) -> str:
     return f"{Bcolors.blue}{text}{Bcolors.endc}"


+def green_highlighted(text: str) -> str:
+    """Returns a green highlighted text"""
+    return f"{Bcolors.dark_green}{text}{Bcolors.endc}"
+
+
 def gold_highlighted(text: str) -> str:
     """Returns a gold highlighted text"""
     return f"{Bcolors.gold}{text}{Bcolors.endc}"
@@ -37,6 +42,7 @@ class Bcolors:
     """Colors for the console"""

     blue = "\033[94m"
+    dark_green = "\033[32m"
     red = "\033[91m"
     light_blue = "\033[96m"
     cyan = "\033[36m"
diff --git a/src/modules/csv_handler.py b/src/modules/csv_handler.py
index 58cceeb..3bb01f4 100644
--- a/src/modules/csv_handler.py
+++ b/src/modules/csv_handler.py
@@ -20,7 +20,7 @@ def export_transcribed_data_to_csv(transcribed_data: list[TranscribedData], file
                 data.word,
                 data.start,
                 data.end,
-                data.conf,
+                data.confidence,
             ]
         )

From cb4aa4e5ead28234f307eccdc2c940b5e823fdf2 Mon Sep 17 00:00:00 2001
From: Benedikt Wagener
Date: Fri, 6 Oct 2023 22:57:14 +0200
Subject: [PATCH 05/19] move UltraSingerEvaluation to top level

---
 .../Research => }/UltraSingerEvaluation.py    | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)
 rename src/{modules/Research => }/UltraSingerEvaluation.py (87%)

diff --git a/src/modules/Research/UltraSingerEvaluation.py b/src/UltraSingerEvaluation.py
similarity index 87%
rename from src/modules/Research/UltraSingerEvaluation.py
rename to src/UltraSingerEvaluation.py
index d0108c0..4e159ff 100644
--- a/src/modules/Research/UltraSingerEvaluation.py
+++ b/src/UltraSingerEvaluation.py
@@ -1,9 +1,9 @@
 import copy
 import os
-import sys
 from datetime import datetime
 from pathlib import Path
 from typing import List
+import importlib.util

 import UltraSinger
 from Settings import Settings
@@ -13,10 +13,10 @@
 from modules.console_colors import ULTRASINGER_HEAD, red_highlighted

 test_input_folder = os.path.normpath(
-    os.path.abspath(__file__ + "../../../../../test_input")
+    os.path.abspath(__file__ + "/../../test_input")
 )
 test_output_folder = os.path.normpath(
-    os.path.abspath(__file__ + "../../../../../test_output")
+    os.path.abspath(__file__ + "/../../test_output")
 )
 test_run_folder = os.path.join(
     test_output_folder, datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
 )
@@ -90,10 +90,15 @@ def initialize_settings():
         os.path.join(test_input_folder, "config/local.py")
     )
     if os.path.isfile(user_config_file):
-        sys.path.append(os.path.join(user_config_file, ".."))
-        import local
+        print(f"{ULTRASINGER_HEAD} Using custom settings found under {user_config_file}")

-        s = local.user_settings
+        spec = importlib.util.spec_from_file_location("custom_settings", user_config_file)
+        module = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(module)
+
+        s = module.user_settings
+    else:
+        print(f"{ULTRASINGER_HEAD} No custom settings found under {user_config_file}")

     if not s.force_cpu:
         s.tensorflow_device, s.pytorch_device = check_gpu_support()

From 7d57225a517c8e05f41c01c055435a590116301b Mon Sep 17 00:00:00 2001
From: Benedikt Wagener
Date: Sat, 7 Oct 2023 10:37:25 +0200
Subject: [PATCH 06/19] make loudness
threshold configurable --- requirements.txt | 8 ++-- src/Settings.py | 1 + src/UltraSinger.py | 60 +++++++++++++-------------- src/modules/Pitcher/PitchingResult.py | 15 ------- src/modules/Pitcher/pitcher.py | 8 ++-- 5 files changed, 39 insertions(+), 53 deletions(-) delete mode 100644 src/modules/Pitcher/PitchingResult.py diff --git a/requirements.txt b/requirements.txt index b939eb4..d5763a7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -crepe~=0.0.13 +crepe~=0.0.14 demucs~=4.0.0 ffmpeg_python~=0.2.0 git+https://github.com/m-bain/whisperx.git @@ -14,13 +14,13 @@ pydub~=0.25.1 PyHyphen~=4.0.3 python_Levenshtein~=0.21.1 scipy~=1.11.2 -tensorflow<2.11 +tensorflow==2.10.1 tqdm~=4.65.2 whisperx~=3.1.1 -yt_dlp~=2023.7.6 +yt_dlp~=2023.9.24 isort~=5.12 black~=23.3 pylint~=2.17 pytest~=7.3.1 -protobuf==3.20.* \ No newline at end of file +protobuf==3.19.6 \ No newline at end of file diff --git a/src/Settings.py b/src/Settings.py index efcbf9e..c46aaf5 100644 --- a/src/Settings.py +++ b/src/Settings.py @@ -28,6 +28,7 @@ class Settings: # Pitch crepe_model_capacity = "full" # tiny|small|medium|large|full crepe_step_size = 10 # in miliseconds + pitch_loudness_threshold = -60 # Device pytorch_device = 'cpu' # cpu|cuda diff --git a/src/UltraSinger.py b/src/UltraSinger.py index 06b5d6f..d280dcb 100644 --- a/src/UltraSinger.py +++ b/src/UltraSinger.py @@ -2,7 +2,6 @@ import copy import getopt -import json import os import sys @@ -18,7 +17,6 @@ export_chunks_from_ultrastar_data, ) from modules.Audio.silence_processing import remove_silence_from_transcription_data -from modules.Pitcher.PitchingResult import PitchingResult from modules.Speech_Recognition.TranscriptionResult import TranscriptionResult from modules.csv_handler import export_transcribed_data_to_csv from modules.Audio.convert_audio import convert_audio_to_mono_wav, convert_wav_to_mp3 @@ -392,6 +390,7 @@ def run() -> None: ) settings.mono_audio_path = os.path.join(cache_path, basename_without_ext + ".wav") + os_helper.create_folder(cache_path) # Separate vocal from audio audio_separation_path = separate_vocal_from_audio(basename_without_ext, cache_path, ultrastar_audio_input_path) @@ -533,11 +532,11 @@ def separate_vocal_from_audio( basename_without_ext: str, cache_path: str, ultrastar_audio_input_path: str ) -> str: """Separate vocal from audio""" - demcus_output_folder = os.path.splitext( + demucs_output_folder = os.path.splitext( os.path.basename(ultrastar_audio_input_path) )[0] audio_separation_path = os.path.join( - cache_path, "separated", "htdemucs", demcus_output_folder + cache_path, "separated", "htdemucs", demucs_output_folder ) vocals_path = os.path.join(audio_separation_path, "vocals.wav") @@ -810,9 +809,10 @@ def pitch_audio( # todo: chunk pitching as option? 
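+    # the cache file name encodes every pitch detection setting, so a cached result is only reused when all settings match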
# midi_notes = pitch_each_chunk_with_crepe(chunk_folder_name)

-    pitching_config = f"crepe_{settings.ignore_audio}_{settings.crepe_model_capacity}_{settings.crepe_step_size}_{settings.tensorflow_device}"
-    pitching_path = os.path.join(cache_path, f"{pitching_config}.json")
-    cache_available = check_file_exists(pitching_path)
+    pitching_config = f"crepe_{settings.ignore_audio}_{settings.crepe_model_capacity}_{settings.crepe_step_size}_{settings.tensorflow_device}_{settings.pitch_loudness_threshold}"
+    pitched_data_path = os.path.join(cache_path, f"{pitching_config}.json")
+    cache_available = check_file_exists(pitched_data_path)
+    pitched_data = None

     if settings.skip_cache_pitch_detection or not cache_available:
         pitched_data = get_pitch_with_crepe_file(
@@ -820,35 +820,35 @@ def pitch_audio(
             settings.crepe_model_capacity,
             settings.crepe_step_size,
             settings.tensorflow_device,
+            settings.pitch_loudness_threshold
         )
-        if not settings.ignore_audio:
-            start_times = []
-            end_times = []
-            for i, data in enumerate(transcribed_data):
-                start_times.append(data.start)
-                end_times.append(data.end)
-            midi_notes = create_midi_notes_from_pitched_data(
-                start_times, end_times, pitched_data
-            )
-
-        else:
-            midi_notes = create_midi_notes_from_pitched_data(
-                ultrastar_class.startTimes, ultrastar_class.endTimes, pitched_data
-            )
-        ultrastar_note_numbers = convert_midi_notes_to_ultrastar_notes(midi_notes)
-
-        pitching_result = PitchingResult(midi_notes, pitched_data, ultrastar_note_numbers)

-        pitching_result_json = pitching_result.to_json()
-        with open(pitching_path, "w", encoding=FILE_ENCODING) as file:
-            file.write(pitching_result_json)
+        pitched_data_json = pitched_data.to_json()
+        with open(pitched_data_path, "w", encoding=FILE_ENCODING) as file:
+            file.write(pitched_data_json)
     else:
         print(f"{ULTRASINGER_HEAD} {green_highlighted('cache')} reusing cached pitch data")
-        with open(pitching_path) as file:
+        with open(pitched_data_path, encoding=FILE_ENCODING) as file:
             json = file.read()
-            pitching_result = PitchingResult.from_json(json)
+            pitched_data = PitchedData.from_json(json)
+
+    if not settings.ignore_audio:
+        start_times = []
+        end_times = []
+        for i, data in enumerate(transcribed_data):
+            start_times.append(data.start)
+            end_times.append(data.end)
+        midi_notes = create_midi_notes_from_pitched_data(
+            start_times, end_times, pitched_data
+        )
+    else:
+        midi_notes = create_midi_notes_from_pitched_data(
+            ultrastar_class.startTimes, ultrastar_class.endTimes, pitched_data
+        )
+
+    ultrastar_note_numbers = convert_midi_notes_to_ultrastar_notes(midi_notes)

-    return pitching_result.midi_notes, pitching_result.pitched_data, pitching_result.ultrastar_note_numbers
+    return midi_notes, pitched_data, ultrastar_note_numbers


 def create_audio_chunks(
diff --git a/src/modules/Pitcher/PitchingResult.py b/src/modules/Pitcher/PitchingResult.py
deleted file mode 100644
index 47a74c3..0000000
--- a/src/modules/Pitcher/PitchingResult.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from dataclasses import dataclass
-
-from dataclasses_json import dataclass_json
-
-from modules.Pitcher.pitched_data import PitchedData
-
-
-@dataclass_json
-@dataclass
-class PitchingResult:
-    """Pitching result"""
-
-    midi_notes: list[str]
-    pitched_data: PitchedData
-    ultrastar_note_numbers: list[int]
diff --git a/src/modules/Pitcher/pitcher.py b/src/modules/Pitcher/pitcher.py
index 80506c7..d979c12 100644
--- a/src/modules/Pitcher/pitcher.py
+++ b/src/modules/Pitcher/pitcher.py
@@ -11,7 +11,7 @@

 def get_pitch_with_crepe_file(
-    filename: str, model_capacity: str, step_size: int = 10,
device: str = "cpu" + filename: str, model_capacity: str, step_size: int = 10, device: str = "cpu", filter_silence_threshold: int = -60 ) -> PitchedData: """Pitch with crepe""" @@ -22,10 +22,10 @@ def get_pitch_with_crepe_file( audio, sample_rate = librosa.load(filename) timer.log('Load file for pitch detection end') - return get_pitch_with_crepe(audio, sample_rate, model_capacity, step_size) + return get_pitch_with_crepe(audio, sample_rate, model_capacity, step_size, filter_silence_threshold) -def get_pitch_with_crepe(audio, sample_rate: int, model_capacity: str, step_size: int = 10) -> PitchedData: +def get_pitch_with_crepe(audio, sample_rate: int, model_capacity: str, step_size: int = 10, filter_silence_threshold: int = -60) -> PitchedData: """Pitch with crepe""" if sample_rate != CREPE_MODEL_SAMPLE_RATE: @@ -38,7 +38,7 @@ def get_pitch_with_crepe(audio, sample_rate: int, model_capacity: str, step_size timer.log('Crepe pitch detection end') timer.log('Computing loudness start') - confidence, perceived_loudness = set_confidence_to_zero_in_silent_regions(confidence, audio, step_size=step_size) + confidence, perceived_loudness = set_confidence_to_zero_in_silent_regions(confidence, audio, threshold=filter_silence_threshold, step_size=step_size) timer.log('Computing loudness end') # convert to native float for serialization From 3bf7a06bb6b4cef34fd0eb7d16c4d54626cd4cd3 Mon Sep 17 00:00:00 2001 From: Benedikt Wagener Date: Sun, 8 Oct 2023 10:32:30 +0200 Subject: [PATCH 07/19] add pitch comparison --- src/Settings.py | 8 ++ src/UltraSinger.py | 53 +++++--- src/UltraSingerEvaluation.py | 115 +++++++++++++---- src/modules/Research/TestRun.py | 29 +++++ src/modules/Research/TestSong.py | 7 +- src/modules/Ultrastar/ultrastar_converter.py | 117 +++++++++++++++--- .../Ultrastar/ultrastar_score_calculator.py | 5 + 7 files changed, 273 insertions(+), 61 deletions(-) create mode 100644 src/modules/Research/TestRun.py diff --git a/src/Settings.py b/src/Settings.py index c46aaf5..709622a 100644 --- a/src/Settings.py +++ b/src/Settings.py @@ -1,3 +1,10 @@ +from dataclasses import dataclass + +from dataclasses_json import dataclass_json + + +@dataclass_json +@dataclass class Settings: create_midi = True create_plot = False @@ -42,3 +49,4 @@ class Settings: skip_cache_denoise_vocal_audio = False skip_cache_transcription = False skip_cache_pitch_detection = False + calculate_score = True \ No newline at end of file diff --git a/src/UltraSinger.py b/src/UltraSinger.py index d280dcb..d7bd992 100644 --- a/src/UltraSinger.py +++ b/src/UltraSinger.py @@ -4,6 +4,7 @@ import getopt import os import sys +from typing import Tuple, Any import Levenshtein import librosa @@ -18,6 +19,7 @@ ) from modules.Audio.silence_processing import remove_silence_from_transcription_data from modules.Speech_Recognition.TranscriptionResult import TranscriptionResult +from modules.Ultrastar.ultrastar_score_calculator import Score from modules.csv_handler import export_transcribed_data_to_csv from modules.Audio.convert_audio import convert_audio_to_mono_wav, convert_wav_to_mp3 from modules.Audio.youtube import ( @@ -343,7 +345,7 @@ def merge_syllable_segments( return new_data, new_midi_notes, new_us_notes -def run() -> None: +def run() -> tuple[str, Score, Score]: """The processing function of this program""" settings.input_file_is_ultrastar_txt = settings.input_file_path.endswith(".txt") @@ -392,7 +394,9 @@ def run() -> None: os_helper.create_folder(cache_path) # Separate vocal from audio - audio_separation_path = 
separate_vocal_from_audio(basename_without_ext, cache_path, ultrastar_audio_input_path) + audio_separation_path = separate_vocal_from_audio( + basename_without_ext, cache_path, ultrastar_audio_input_path + ) # Denoise vocal audio denoise_vocal_audio(basename_without_ext, cache_path) @@ -463,13 +467,16 @@ def run() -> None: song_output, ultrastar_class, ultrastar_note_numbers ) - # Calc Points - ultrastar_class, simple_score, accurate_score = calculate_score_points( - pitched_data, ultrastar_class, ultrastar_file_output - ) + simple_score = None + accurate_score = None + if settings.calculate_score: + # Calc Points + ultrastar_class, simple_score, accurate_score = calculate_score_points( + pitched_data, ultrastar_class, ultrastar_file_output + ) - # Add calculated score to Ultrastar txt - ultrastar_writer.add_score_to_ultrastar_txt(ultrastar_file_output, simple_score) + # Add calculated score to Ultrastar txt + ultrastar_writer.add_score_to_ultrastar_txt(ultrastar_file_output, simple_score) # Midi if settings.create_midi: @@ -477,6 +484,7 @@ def run() -> None: # Print Support print_support() + return ultrastar_file_output, simple_score, accurate_score def get_unused_song_output_dir(path: str) -> str: @@ -519,7 +527,9 @@ def transcribe_audio(cache_path: str) -> TranscriptionResult: with open(transcription_path, "w", encoding=FILE_ENCODING) as file: file.write(transcription_result.to_json()) else: - print(f"{ULTRASINGER_HEAD} {green_highlighted('cache')} reusing cached transcribed data") + print( + f"{ULTRASINGER_HEAD} {green_highlighted('cache')} reusing cached transcribed data" + ) with open(transcription_path) as file: json = file.read() transcription_result = TranscriptionResult.from_json(json) @@ -542,14 +552,17 @@ def separate_vocal_from_audio( vocals_path = os.path.join(audio_separation_path, "vocals.wav") instrumental_path = os.path.join(audio_separation_path, "no_vocals.wav") if settings.use_separated_vocal or settings.create_karaoke: - cache_available = (check_file_exists(vocals_path) - and check_file_exists(instrumental_path)) + cache_available = check_file_exists(vocals_path) and check_file_exists( + instrumental_path + ) if settings.skip_cache_vocal_separation or not cache_available: separate_audio( ultrastar_audio_input_path, cache_path, settings.pytorch_device ) else: - print(f"{ULTRASINGER_HEAD} {green_highlighted('cache')} reusing cached separated vocals") + print( + f"{ULTRASINGER_HEAD} {green_highlighted('cache')} reusing cached separated vocals" + ) if settings.use_separated_vocal: input_path = vocals_path @@ -803,8 +816,10 @@ def create_midi_file( def pitch_audio( - transcribed_data: list[TranscribedData], ultrastar_class: UltrastarTxtValue, cache_path: str) -> tuple[ - list[str], PitchedData, list[int]]: + transcribed_data: list[TranscribedData], + ultrastar_class: UltrastarTxtValue, + cache_path: str, +) -> tuple[list[str], PitchedData, list[int]]: """Pitch audio""" # todo: chunk pitching as option? 
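+    # returns the midi note names, the raw pitched data and the UltraStar note numbers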
# midi_notes = pitch_each_chunk_with_crepe(chunk_folder_name) @@ -820,14 +835,16 @@ def pitch_audio( settings.crepe_model_capacity, settings.crepe_step_size, settings.tensorflow_device, - settings.pitch_loudness_threshold + settings.pitch_loudness_threshold, ) pitched_data_json = pitched_data.to_json() with open(pitched_data_path, "w", encoding=FILE_ENCODING) as file: file.write(pitched_data_json) else: - print(f"{ULTRASINGER_HEAD} {green_highlighted('cache')} reusing cached pitch data") + print( + f"{ULTRASINGER_HEAD} {green_highlighted('cache')} reusing cached pitch data" + ) with open(pitched_data_path) as file: json = file.read() pitched_data = PitchedData.from_json(json) @@ -880,7 +897,9 @@ def denoise_vocal_audio(basename_without_ext: str, cache_path: str) -> None: if settings.skip_cache_denoise_vocal_audio or not cache_available: ffmpeg_reduce_noise(settings.mono_audio_path, denoised_path) else: - print(f"{ULTRASINGER_HEAD} {green_highlighted('cache')} reusing cached denoised audio") + print( + f"{ULTRASINGER_HEAD} {green_highlighted('cache')} reusing cached denoised audio" + ) settings.mono_audio_path = denoised_path diff --git a/src/UltraSingerEvaluation.py b/src/UltraSingerEvaluation.py index 4e159ff..7b9eebe 100644 --- a/src/UltraSingerEvaluation.py +++ b/src/UltraSingerEvaluation.py @@ -8,16 +8,16 @@ import UltraSinger from Settings import Settings from modules.DeviceDetection.device_detection import check_gpu_support +from modules.Research.TestRun import TestRun, TestedSong from modules.Research.TestSong import TestSong from modules.Ultrastar import ultrastar_parser +from modules.Ultrastar.ultrastar_converter import compare_pitches +from modules.Ultrastar.ultrastar_parser import parse_ultrastar_txt +from modules.Ultrastar.ultrastar_txt import UltrastarTxtValue, FILE_ENCODING from modules.console_colors import ULTRASINGER_HEAD, red_highlighted -test_input_folder = os.path.normpath( - os.path.abspath(__file__ + "/../../test_input") -) -test_output_folder = os.path.normpath( - os.path.abspath(__file__ + "/../../test_output") -) +test_input_folder = os.path.normpath(os.path.abspath(__file__ + "/../../test_input")) +test_output_folder = os.path.normpath(os.path.abspath(__file__ + "/../../test_output")) test_run_folder = os.path.join( test_output_folder, datetime.now().strftime("%Y-%m-%d_%H-%M-%S") ) @@ -45,20 +45,11 @@ def main() -> None: test_songs: List[TestSong] = [] for dir_entry in os.listdir(base_settings.test_songs_input_folder): song_folder = os.path.join(base_settings.test_songs_input_folder, dir_entry) - if os.path.isdir(song_folder): - for song_folder_item in os.listdir(song_folder): - if song_folder_item.endswith(".txt") and song_folder_item != "license.txt": - txt_file = os.path.join(song_folder, song_folder_item) - ultrastar_class = ultrastar_parser.parse_ultrastar_txt(txt_file) - - if ultrastar_class.mp3: - test_song = TestSong(txt_file, song_folder, ultrastar_class.mp3, ultrastar_class) - test_songs.append(test_song) - break - else: - print( - f"{ULTRASINGER_HEAD} {red_highlighted('Warning.')} {base_settings.test_songs_input_folder} contains an UltraStar text file but has no audio referenced in it. Skipping." 
-                        )
+        found_song = find_ultrastar_song(song_folder)
+        if found_song is None:
+            continue
+
+        test_songs.append(TestSong(found_song[0], song_folder, found_song[1]))

     if len(test_songs) == 0:
         print(
@@ -68,20 +59,88 @@ def main() -> None:

     print(f"{ULTRASINGER_HEAD} Running evaluation for {len(test_songs)} songs")

+    test_run = TestRun(base_settings)
     for index, test_song in enumerate(test_songs):
         print(f"{ULTRASINGER_HEAD} ========================")
         print(
-            f"{ULTRASINGER_HEAD} {index+1}/{len(test_songs)}: {os.path.basename(test_song.txt)}"
+            f"{ULTRASINGER_HEAD} {index+1}/{len(test_songs)}: {os.path.basename(test_song.input_txt)}"
         )

         # prepare cache directory
-        song_cache_path = os.path.join(test_song.folder, "cache")
+        song_cache_path = os.path.join(test_song.input_folder, "cache")
         Path(song_cache_path).mkdir(parents=True, exist_ok=True)
+
         test_song_settings = copy.deepcopy(base_settings)
-        test_song_settings.input_file_path = test_song.txt
+        test_song_settings.input_file_path = test_song.input_txt
         test_song_settings.cache_override_path = song_cache_path
         UltraSinger.settings = test_song_settings
-        UltraSinger.run()
+
+        output_txt = None
+        simple_score = None
+        accurate_score = None
+        tested_song = TestedSong(test_song.input_txt)
+        test_run.tested_songs.append(tested_song)
+        try:
+            output_txt, simple_score, accurate_score = UltraSinger.run()
+        except Exception as error:
+            print(
+                f"{ULTRASINGER_HEAD} {red_highlighted('Error!')} Failed to process {test_song.input_txt}\n{error}."
+            )
+            continue
+
+        output_folder_name = f"{test_song.input_ultrastar_class.artist} - {test_song.input_ultrastar_class.title}"
+        output_folder = os.path.join(test_run_folder, output_folder_name)
+
+        if not os.path.isfile(output_txt):
+            print(
+                f"{ULTRASINGER_HEAD} {red_highlighted('Error!')} Could not find song txt in '{output_folder}'."
+            )
+            continue
+
+        ultrastar_class = parse_ultrastar_txt(output_txt)
+        (
+            input_match_ratio,
+            output_match_ratio,
+            pitch_where_should_be_no_pitch_ratio,
+            no_pitch_where_should_be_pitch_ratio,
+        ) = compare_pitches(test_song.input_ultrastar_class, ultrastar_class)
+
+        tested_song.output_path = output_txt
+        tested_song.success = True
+        tested_song.input_match_ratio = input_match_ratio
+        tested_song.output_match_ratio = output_match_ratio
+        tested_song.pitch_where_should_be_no_pitch_ratio = pitch_where_should_be_no_pitch_ratio
+        tested_song.no_pitch_where_should_be_pitch_ratio = no_pitch_where_should_be_pitch_ratio
+        tested_song.output_score_simple = simple_score
+        tested_song.output_score_accurate = accurate_score
+
+    test_run_result_file = os.path.join(test_run_folder, "run.json")
+    test_run_json = test_run.to_json()
+    with open(test_run_result_file, "w", encoding=FILE_ENCODING) as file:
+        file.write(test_run_json)
+
+
+def find_ultrastar_song(
+    song_folder, require_audio: bool = True
+) -> tuple[str, UltrastarTxtValue] | None:
+    if os.path.isdir(song_folder):
+        for song_folder_item in os.listdir(song_folder):
+            if (
+                song_folder_item.endswith(".txt")
+                and song_folder_item != "license.txt"
+                and not song_folder_item.endswith("[Karaoke].txt")
+            ):
+                txt_file = os.path.join(song_folder, song_folder_item)
+                ultrastar_class = ultrastar_parser.parse_ultrastar_txt(txt_file)
+
+                if ultrastar_class.mp3 != "" or not require_audio:
+                    return txt_file, ultrastar_class
+                else:
+                    print(
+                        f"{ULTRASINGER_HEAD} {red_highlighted('Warning.')} {song_folder} contains an UltraStar text file but has no audio referenced in it. Skipping."
+ ) def initialize_settings(): @@ -90,9 +149,13 @@ def initialize_settings(): os.path.join(test_input_folder, "config/local.py") ) if os.path.isfile(user_config_file): - print(f"{ULTRASINGER_HEAD} Using custom settings found under {user_config_file}") + print( + f"{ULTRASINGER_HEAD} Using custom settings found under {user_config_file}" + ) - spec = importlib.util.spec_from_file_location("custom_settings", user_config_file) + spec = importlib.util.spec_from_file_location( + "custom_settings", user_config_file + ) module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module) diff --git a/src/modules/Research/TestRun.py b/src/modules/Research/TestRun.py new file mode 100644 index 0000000..be60a78 --- /dev/null +++ b/src/modules/Research/TestRun.py @@ -0,0 +1,29 @@ +from dataclasses import dataclass, field + +from dataclasses_json import dataclass_json + +from Settings import Settings + + +@dataclass_json +@dataclass +class TestedSong: + """Tested song""" + + input_path: str + output_path: str = "" + success: bool = False + input_match_ratio: float = 0.0 + output_match_ratio: float = 0.0 + no_pitch_where_should_be_pitch_ratio: float = 0.0 + pitch_where_should_be_no_pitch_ratio: float = 0.0 + output_score_simple: int = 0 + output_score_accurate: int = 0 + + +@dataclass_json +@dataclass +class TestRun: + """Test run""" + settings: Settings + tested_songs: list[TestedSong] = field(default_factory=lambda: []) diff --git a/src/modules/Research/TestSong.py b/src/modules/Research/TestSong.py index fc0d72e..be3a1b4 100644 --- a/src/modules/Research/TestSong.py +++ b/src/modules/Research/TestSong.py @@ -7,7 +7,6 @@ class TestSong: """Test song""" - txt: str - folder: str - audio: float - ultrastar_class: UltrastarTxtValue \ No newline at end of file + input_txt: str + input_folder: str + input_ultrastar_class: UltrastarTxtValue diff --git a/src/modules/Ultrastar/ultrastar_converter.py b/src/modules/Ultrastar/ultrastar_converter.py index 97f79ee..01a143f 100644 --- a/src/modules/Ultrastar/ultrastar_converter.py +++ b/src/modules/Ultrastar/ultrastar_converter.py @@ -1,7 +1,13 @@ """Ultrastar Converter""" +from typing import Tuple + +import numpy from modules.Ultrastar.ultrastar_txt import UltrastarTxtValue +NO_PITCH = -1000 + + def real_bpm_to_ultrastar_bpm(real_bpm: float) -> float: """Converts real BPM to UltraStar BPM""" # The UltraStar BPM info is a fourth beat of the real BPM @@ -48,32 +54,115 @@ def ultrastar_note_to_midi_note(ultrastar_note: int) -> int: return midi_note -def get_start_time_from_ultrastar(ultrastar_class: UltrastarTxtValue, pos: int) -> float: +def get_start_time_from_ultrastar( + ultrastar_class: UltrastarTxtValue, pos: int +) -> float: """Calculates the start time from the Ultrastar txt""" - gap = int(float(ultrastar_class.gap) / 1000) - real_bpm = ultrastar_bpm_to_real_bpm( - float(ultrastar_class.bpm.replace(",", ".")) - ) - start_time = ( - beat_to_second(int(ultrastar_class.startBeat[pos]), real_bpm) + gap - ) + gap = int(float(ultrastar_class.gap.replace(",", ".")) / 1000) + real_bpm = ultrastar_bpm_to_real_bpm(float(ultrastar_class.bpm.replace(",", "."))) + start_time = beat_to_second(int(ultrastar_class.startBeat[pos]), real_bpm) + gap return start_time def get_end_time_from_ultrastar(ultrastar_class: UltrastarTxtValue, pos: int) -> float: """Calculates the end time from the Ultrastar txt""" - gap = int(float(ultrastar_class.gap) / 1000) - real_bpm = ultrastar_bpm_to_real_bpm( - float(ultrastar_class.bpm.replace(",", ".")) - ) + gap = 
int(float(ultrastar_class.gap.replace(",", ".")) / 1000) + real_bpm = ultrastar_bpm_to_real_bpm(float(ultrastar_class.bpm.replace(",", "."))) end_time = ( beat_to_second( - int(ultrastar_class.startBeat[pos]) - + int(ultrastar_class.durations[pos]), + int(ultrastar_class.startBeat[pos]) + int(ultrastar_class.durations[pos]), real_bpm, ) + gap ) return end_time + + +def map_to_datapoints( + ultrastar_class: UltrastarTxtValue, step_size: int = 10 +) -> list[int]: + gap = int(float(ultrastar_class.gap.replace(",", "."))) + + data = [] + + previous_step = -step_size + for pos, pitch in enumerate(ultrastar_class.pitches): + if ultrastar_class.noteType[pos] == "F": + continue + + start_time = int(get_start_time_from_ultrastar(ultrastar_class, pos) * 1000) + gap + end_time = int(get_end_time_from_ultrastar(ultrastar_class, pos) * 1000) + gap + + start_nearest_step = (start_time + step_size - 1) // step_size * step_size + end_nearest_step = (end_time + step_size - 1) // step_size * step_size + + if previous_step == start_nearest_step: + start_nearest_step += step_size + + duration = end_nearest_step - start_nearest_step + + if duration < 10: + continue + + # pad gaps between pitches with empty datapoints + gap_steps_count = (start_nearest_step - previous_step - step_size) // step_size + data += [NO_PITCH] * gap_steps_count + + pitch_steps_count = duration // step_size + data += [pitch] * pitch_steps_count + previous_step = end_nearest_step + + return data + + +def compare_pitches(input_ultrastar_class, output_ultrastar_class) -> tuple[float, float, float, float]: + step_size = 10 + + input_datapoints = map_to_datapoints(input_ultrastar_class, step_size) + output_datapoints = map_to_datapoints(output_ultrastar_class, step_size) + + longest = max(len(input_datapoints), len(output_datapoints)) + for datapoints in [input_datapoints, output_datapoints]: + length = len(datapoints) + if length < longest: + gap_steps_count = longest - length + # pad gaps between pitches with empty datapoints + datapoints += [NO_PITCH] * gap_steps_count + + input_pitched_datapoints = len([x for x in input_datapoints if x != NO_PITCH]) + output_pitched_datapoints = len([x for x in output_datapoints if x != NO_PITCH]) + + matches = 0 + pitch_where_should_be_no_pitch = 0 + no_pitch_where_should_be_pitch = 0 + for index, _ in enumerate(input_datapoints): + input_pitch = input_datapoints[index] + output_pitch = output_datapoints[index] + if input_pitch != NO_PITCH and output_pitch != NO_PITCH: + continue + + if input_pitch == output_pitch: + matches += 1 + elif input_pitch == NO_PITCH: + pitch_where_should_be_no_pitch += 1 + else: + no_pitch_where_should_be_pitch += 1 + + input_match_ratio = matches / input_pitched_datapoints + output_match_ratio = matches / output_pitched_datapoints + + output_pitch_where_should_be_no_pitch_ratio = pitch_where_should_be_no_pitch / output_pitched_datapoints + output_no_pitch_where_should_be_pitch_ratio = no_pitch_where_should_be_pitch / input_pitched_datapoints + + + return input_match_ratio, output_match_ratio, output_pitch_where_should_be_no_pitch_ratio, output_no_pitch_where_should_be_pitch_ratio + + +def determine_nearest_end_step(input_ultrastar_class, step_size) -> int: + pitches_count = len(input_ultrastar_class.pitches) - 1 + end_time = int( + get_end_time_from_ultrastar(input_ultrastar_class, pitches_count) * 1000 + ) + int(input_ultrastar_class.gap) + return (end_time + step_size - 1) // step_size * step_size diff --git a/src/modules/Ultrastar/ultrastar_score_calculator.py 
b/src/modules/Ultrastar/ultrastar_score_calculator.py index a02194c..60fdd06 100644 --- a/src/modules/Ultrastar/ultrastar_score_calculator.py +++ b/src/modules/Ultrastar/ultrastar_score_calculator.py @@ -1,4 +1,7 @@ """Ultrastar score calculator.""" +from dataclasses import dataclass + +from dataclasses_json import dataclass_json import librosa @@ -48,6 +51,8 @@ def add_point(note_type: str, points: Points) -> Points: return points +@dataclass_json +@dataclass class Score: """Docstring""" From eae51e7598c4eb902679c0c19fc7d89b5a2e022f Mon Sep 17 00:00:00 2001 From: Benedikt Wagener Date: Sun, 8 Oct 2023 12:34:03 +0200 Subject: [PATCH 08/19] add cross octave pitch comparison --- src/UltraSingerEvaluation.py | 6 ++++++ src/modules/Research/TestRun.py | 2 ++ src/modules/Ultrastar/ultrastar_converter.py | 19 +++++++++++++++++-- src/modules/Ultrastar/ultrastar_txt.py | 2 +- 4 files changed, 26 insertions(+), 3 deletions(-) diff --git a/src/UltraSingerEvaluation.py b/src/UltraSingerEvaluation.py index 7b9eebe..1393722 100644 --- a/src/UltraSingerEvaluation.py +++ b/src/UltraSingerEvaluation.py @@ -103,6 +103,8 @@ def main() -> None: ( input_match_ratio, output_match_ratio, + cross_octave_input_match_ratio, + cross_octave_output_match_ratio, pitch_where_should_be_no_pitch_ratio, no_pitch_where_should_be_pitch_ratio, ) = compare_pitches(test_song.input_ultrastar_class, ultrastar_class) @@ -111,6 +113,8 @@ def main() -> None: tested_song.success = True tested_song.input_match_ratio = input_match_ratio tested_song.output_match_ratio = output_match_ratio + tested_song.cross_octave_input_match_ratio = cross_octave_input_match_ratio + tested_song.cross_octave_output_match_ratio = cross_octave_output_match_ratio tested_song.pitch_where_should_be_no_pitch_ratio = pitch_where_should_be_no_pitch_ratio tested_song.no_pitch_where_should_be_pitch_ratio = no_pitch_where_should_be_pitch_ratio tested_song.output_score_simple = simple_score @@ -131,6 +135,8 @@ def find_ultrastar_song( song_folder_item.endswith(".txt") and song_folder_item != "license.txt" and not song_folder_item.endswith("[Karaoke].txt") + and not song_folder_item.endswith("[MULTI].txt") + and not song_folder_item.endswith("[DUET].txt") ): txt_file = os.path.join(song_folder, song_folder_item) ultrastar_class = ultrastar_parser.parse_ultrastar_txt(txt_file) diff --git a/src/modules/Research/TestRun.py b/src/modules/Research/TestRun.py index be60a78..7070c28 100644 --- a/src/modules/Research/TestRun.py +++ b/src/modules/Research/TestRun.py @@ -15,6 +15,8 @@ class TestedSong: success: bool = False input_match_ratio: float = 0.0 output_match_ratio: float = 0.0 + cross_octave_input_match_ratio: float = 0.0 + cross_octave_output_match_ratio: float = 0.0 no_pitch_where_should_be_pitch_ratio: float = 0.0 pitch_where_should_be_no_pitch_ratio: float = 0.0 output_score_simple: int = 0 diff --git a/src/modules/Ultrastar/ultrastar_converter.py b/src/modules/Ultrastar/ultrastar_converter.py index 01a143f..8ef002c 100644 --- a/src/modules/Ultrastar/ultrastar_converter.py +++ b/src/modules/Ultrastar/ultrastar_converter.py @@ -135,6 +135,7 @@ def compare_pitches(input_ultrastar_class, output_ultrastar_class) -> tuple[floa output_pitched_datapoints = len([x for x in output_datapoints if x != NO_PITCH]) matches = 0 + cross_octave_matches = 0 pitch_where_should_be_no_pitch = 0 no_pitch_where_should_be_pitch = 0 for index, _ in enumerate(input_datapoints): @@ -147,17 +148,31 @@ def compare_pitches(input_ultrastar_class, output_ultrastar_class) -> tuple[floa matches 
+= 1 elif input_pitch == NO_PITCH: pitch_where_should_be_no_pitch += 1 - else: + elif output_pitch == NO_PITCH: no_pitch_where_should_be_pitch += 1 + else: + _, input_pitch_remainder = divmod(input_pitch, 12) + _, output_pitch_remainder = divmod(output_pitch, 12) + if input_pitch_remainder == output_pitch_remainder: + cross_octave_matches += 1 input_match_ratio = matches / input_pitched_datapoints output_match_ratio = matches / output_pitched_datapoints + cross_octave_input_match_ratio = (matches + cross_octave_matches) / input_pitched_datapoints + cross_octave_output_match_ratio = (matches + cross_octave_matches) / output_pitched_datapoints + output_pitch_where_should_be_no_pitch_ratio = pitch_where_should_be_no_pitch / output_pitched_datapoints output_no_pitch_where_should_be_pitch_ratio = no_pitch_where_should_be_pitch / input_pitched_datapoints - return input_match_ratio, output_match_ratio, output_pitch_where_should_be_no_pitch_ratio, output_no_pitch_where_should_be_pitch_ratio + return (input_match_ratio, + output_match_ratio, + cross_octave_input_match_ratio, + cross_octave_output_match_ratio, + output_pitch_where_should_be_no_pitch_ratio, + output_no_pitch_where_should_be_pitch_ratio + ) def determine_nearest_end_step(input_ultrastar_class, step_size) -> int: diff --git a/src/modules/Ultrastar/ultrastar_txt.py b/src/modules/Ultrastar/ultrastar_txt.py index 3e21273..cdddca0 100644 --- a/src/modules/Ultrastar/ultrastar_txt.py +++ b/src/modules/Ultrastar/ultrastar_txt.py @@ -64,7 +64,7 @@ class UltrastarTxtValue: mp3 = "" video = None videoGap = None - gap = "" + gap = "0" bpm = "" language = None cover = None From c139c74775f36c80f387d47fd98109464ee99fe3 Mon Sep 17 00:00:00 2001 From: Benedikt Wagener Date: Sun, 8 Oct 2023 19:25:31 +0200 Subject: [PATCH 09/19] fix some bugs --- src/UltraSingerEvaluation.py | 6 +- src/modules/Speech_Recognition/Whisper.py | 92 +++++++++++--------- src/modules/Ultrastar/ultrastar_converter.py | 9 +- src/modules/Ultrastar/ultrastar_parser.py | 48 +++++----- 4 files changed, 82 insertions(+), 73 deletions(-) diff --git a/src/UltraSingerEvaluation.py b/src/UltraSingerEvaluation.py index 1393722..03230b3 100644 --- a/src/UltraSingerEvaluation.py +++ b/src/UltraSingerEvaluation.py @@ -1,5 +1,6 @@ import copy import os +import traceback from datetime import datetime from pathlib import Path from typing import List @@ -75,9 +76,6 @@ def main() -> None: test_song_settings.cache_override_path = song_cache_path UltraSinger.settings = test_song_settings - output_txt = None - simple_score = None - accurate_score = None tested_song = TestedSong(test_song.input_txt) test_run.tested_songs.append(tested_song) try: @@ -86,6 +84,7 @@ def main() -> None: print( f"{ULTRASINGER_HEAD} {red_highlighted('Error!')} Failed to process {test_song.input_txt}\n{error}." 
) + traceback.print_exc() continue @@ -137,6 +136,7 @@ def find_ultrastar_song( and not song_folder_item.endswith("[Karaoke].txt") and not song_folder_item.endswith("[MULTI].txt") and not song_folder_item.endswith("[DUET].txt") + and not song_folder_item.endswith("instrumental.txt") ): txt_file = os.path.join(song_folder, song_folder_item) ultrastar_class = ultrastar_parser.parse_ultrastar_txt(txt_file) diff --git a/src/modules/Speech_Recognition/Whisper.py b/src/modules/Speech_Recognition/Whisper.py index 565b123..daa0123 100644 --- a/src/modules/Speech_Recognition/Whisper.py +++ b/src/modules/Speech_Recognition/Whisper.py @@ -10,6 +10,8 @@ from modules.Speech_Recognition.TranscribedData import TranscribedData, from_whisper +MEMORY_ERROR_MESSAGE = f"{ULTRASINGER_HEAD} {blue_highlighted('whisper')} ran out of GPU memory; reduce --whisper_batch_size or force usage of cpu with --force_cpu" + def transcribe_with_whisper( audio_path: str, model: str, @@ -34,6 +36,46 @@ def transcribe_with_whisper( loaded_whisper_model = whisperx.load_model( model, language=language, device=device, compute_type=compute_type ) + + audio = whisperx.load_audio(audio_path) + + print(f"{ULTRASINGER_HEAD} Transcribing {audio_path}") + + result = loaded_whisper_model.transcribe( + audio, batch_size=batch_size, language=language + ) + + detected_language = result["language"] + if language is None: + language = detected_language + + # load alignment model and metadata + try: + model_a, metadata = whisperx.load_align_model( + language_code=language, device=device, model_name=model_name + ) + except ValueError as ve: + print( + f"{red_highlighted(f'{ve}')}" + f"\n" + f"{ULTRASINGER_HEAD} {red_highlighted('Error:')} Unknown language. " + f"Try add it with --align_model [hugingface]." + ) + sys.exit(1) + + # align whisper output + result_aligned = whisperx.align( + result["segments"], + model_a, + metadata, + audio, + device, + return_char_alignments=False, + ) + + transcribed_data = convert_to_transcribed_data(result_aligned) + + return TranscriptionResult(transcribed_data, detected_language) except ValueError as value_error: if ( "Requested float16 compute type, but the target device or backend do not support efficient float16 computation." @@ -48,50 +90,14 @@ def transcribe_with_whisper( raise value_error except OutOfMemoryError as oom_exception: print(oom_exception) - print( - f"{ULTRASINGER_HEAD} {blue_highlighted('whisper')} ran out of GPU memory; reduce --whisper_batch_size or force usage of cpu with --force_cpu" - ) + print(MEMORY_ERROR_MESSAGE) sys.exit(1) - - audio = whisperx.load_audio(audio_path) - - print(f"{ULTRASINGER_HEAD} Transcribing {audio_path}") - - result = loaded_whisper_model.transcribe( - audio, batch_size=batch_size, language=language - ) - - detected_language = result["language"] - if language is None: - language = detected_language - - # load alignment model and metadata - try: - model_a, metadata = whisperx.load_align_model( - language_code=language, device=device, model_name=model_name - ) - except ValueError as ve: - print( - f"{red_highlighted(f'{ve}')}" - f"\n" - f"{ULTRASINGER_HEAD} {red_highlighted('Error:')} Unknown language. " - f"Try add it with --align_model [hugingface]." 
- ) - sys.exit(1) - - # align whisper output - result_aligned = whisperx.align( - result["segments"], - model_a, - metadata, - audio, - device, - return_char_alignments=False, - ) - - transcribed_data = convert_to_transcribed_data(result_aligned) - - return TranscriptionResult(transcribed_data, detected_language) + except Exception as exception: + if "CUDA failed with error out of memory" in str(exception.args[0]): + print(exception) + print(MEMORY_ERROR_MESSAGE) + sys.exit(1) + raise exception def convert_to_transcribed_data(result_aligned): diff --git a/src/modules/Ultrastar/ultrastar_converter.py b/src/modules/Ultrastar/ultrastar_converter.py index 8ef002c..6e1e83c 100644 --- a/src/modules/Ultrastar/ultrastar_converter.py +++ b/src/modules/Ultrastar/ultrastar_converter.py @@ -111,13 +111,13 @@ def map_to_datapoints( data += [NO_PITCH] * gap_steps_count pitch_steps_count = duration // step_size - data += [pitch] * pitch_steps_count + data += [int(pitch)] * pitch_steps_count previous_step = end_nearest_step return data -def compare_pitches(input_ultrastar_class, output_ultrastar_class) -> tuple[float, float, float, float]: +def compare_pitches(input_ultrastar_class, output_ultrastar_class) -> tuple[float, float, float, float, float, float]: step_size = 10 input_datapoints = map_to_datapoints(input_ultrastar_class, step_size) @@ -141,7 +141,7 @@ def compare_pitches(input_ultrastar_class, output_ultrastar_class) -> tuple[floa for index, _ in enumerate(input_datapoints): input_pitch = input_datapoints[index] output_pitch = output_datapoints[index] - if input_pitch != NO_PITCH and output_pitch != NO_PITCH: + if input_pitch == NO_PITCH and output_pitch == NO_PITCH: continue if input_pitch == output_pitch: @@ -165,14 +165,13 @@ def compare_pitches(input_ultrastar_class, output_ultrastar_class) -> tuple[floa output_pitch_where_should_be_no_pitch_ratio = pitch_where_should_be_no_pitch / output_pitched_datapoints output_no_pitch_where_should_be_pitch_ratio = no_pitch_where_should_be_pitch / input_pitched_datapoints - return (input_match_ratio, output_match_ratio, cross_octave_input_match_ratio, cross_octave_output_match_ratio, output_pitch_where_should_be_no_pitch_ratio, output_no_pitch_where_should_be_pitch_ratio - ) + ) def determine_nearest_end_step(input_ultrastar_class, step_size) -> int: diff --git a/src/modules/Ultrastar/ultrastar_parser.py b/src/modules/Ultrastar/ultrastar_parser.py index f1aaca2..63f19de 100644 --- a/src/modules/Ultrastar/ultrastar_parser.py +++ b/src/modules/Ultrastar/ultrastar_parser.py @@ -12,6 +12,8 @@ FILE_ENCODING, ) +CHARACTERS_TO_REMOVE = ["\ufeff"] + def parse_ultrastar_txt(input_file: str) -> UltrastarTxtValue: """Parse ultrastar txt file to UltrastarTxt class""" @@ -19,33 +21,35 @@ def parse_ultrastar_txt(input_file: str) -> UltrastarTxtValue: with open(input_file, "r", encoding=FILE_ENCODING) as file: txt = file.readlines() - ultrastar_class = UltrastarTxtValue() count = 0 # Strips the newline character for line in txt: + filtered_line = line + for character_to_remove in CHARACTERS_TO_REMOVE: + filtered_line = filtered_line.replace(character_to_remove, "") count += 1 - if line.startswith("#"): - if line.startswith(f"#{UltrastarTxtTag.ARTIST}"): - ultrastar_class.artist = line.split(":")[1].replace("\n", "") - elif line.startswith(f"#{UltrastarTxtTag.TITLE}"): - ultrastar_class.title = line.split(":")[1].replace("\n", "") - elif line.startswith(f"#{UltrastarTxtTag.MP3}"): - ultrastar_class.mp3 = line.split(":")[1].replace("\n", "") - elif 
line.startswith(f"#{UltrastarTxtTag.GAP}"): - ultrastar_class.gap = line.split(":")[1].replace("\n", "") - elif line.startswith(f"#{UltrastarTxtTag.BPM}"): - ultrastar_class.bpm = line.split(":")[1].replace("\n", "") - elif line.startswith(f"#{UltrastarTxtTag.VIDEO}"): - ultrastar_class.video = line.split(":")[1].replace("\n", "") - elif line.startswith(f"#{UltrastarTxtTag.VIDEOGAP}"): - ultrastar_class.videoGap = line.split(":")[1].replace("\n", "") - elif line.startswith(f"#{UltrastarTxtTag.COVER}"): - ultrastar_class.cover = line.split(":")[1].replace("\n", "") - elif line.startswith(f"#{UltrastarTxtTag.BACKGROUND}"): - ultrastar_class.background = line.split(":")[1].replace("\n", "") - elif line.startswith( + if filtered_line.startswith("#"): + if filtered_line.startswith(f"#{UltrastarTxtTag.ARTIST}"): + ultrastar_class.artist = filtered_line.split(":")[1].replace("\n", "") + elif filtered_line.startswith(f"#{UltrastarTxtTag.TITLE}"): + ultrastar_class.title = filtered_line.split(":")[1].replace("\n", "") + elif filtered_line.startswith(f"#{UltrastarTxtTag.MP3}"): + ultrastar_class.mp3 = filtered_line.split(":")[1].replace("\n", "") + elif filtered_line.startswith(f"#{UltrastarTxtTag.GAP}"): + ultrastar_class.gap = filtered_line.split(":")[1].replace("\n", "") + elif filtered_line.startswith(f"#{UltrastarTxtTag.BPM}"): + ultrastar_class.bpm = filtered_line.split(":")[1].replace("\n", "") + elif filtered_line.startswith(f"#{UltrastarTxtTag.VIDEO}"): + ultrastar_class.video = filtered_line.split(":")[1].replace("\n", "") + elif filtered_line.startswith(f"#{UltrastarTxtTag.VIDEOGAP}"): + ultrastar_class.videoGap = filtered_line.split(":")[1].replace("\n", "") + elif filtered_line.startswith(f"#{UltrastarTxtTag.COVER}"): + ultrastar_class.cover = filtered_line.split(":")[1].replace("\n", "") + elif filtered_line.startswith(f"#{UltrastarTxtTag.BACKGROUND}"): + ultrastar_class.background = filtered_line.split(":")[1].replace("\n", "") + elif filtered_line.startswith( ( f"{UltrastarTxtNoteTypeTag.FREESTYLE} ", f"{UltrastarTxtNoteTypeTag.NORMAL} ", @@ -54,7 +58,7 @@ def parse_ultrastar_txt(input_file: str) -> UltrastarTxtValue: f"{UltrastarTxtNoteTypeTag.RAP_GOLDEN} ", ) ): - parts = line.split() + parts = filtered_line.split() # [0] F : * R G # [1] start beat # [2] duration From 4339c3594ada94b1e4d63891d07fa3c349654fa7 Mon Sep 17 00:00:00 2001 From: Benedikt Wagener Date: Sun, 8 Oct 2023 19:43:35 +0200 Subject: [PATCH 10/19] fix some bugs --- src/modules/Speech_Recognition/Whisper.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/modules/Speech_Recognition/Whisper.py b/src/modules/Speech_Recognition/Whisper.py index daa0123..c8bbc5a 100644 --- a/src/modules/Speech_Recognition/Whisper.py +++ b/src/modules/Speech_Recognition/Whisper.py @@ -107,9 +107,7 @@ def convert_to_transcribed_data(result_aligned): vtd = from_whisper(obj) # create custom Word object vtd.word = vtd.word + " " # add space to end of word if len(obj) < 4: - previous = transcribed_data[-1] - if not previous: - previous.end = 0 + previous = transcribed_data[-1] if len(transcribed_data) != 0 else {"end": 0, "word": ""} vtd.start = previous.end + 0.1 vtd.end = previous.end + 0.2 msg = f'Error: There is no timestamp for word: {obj["word"]}. 
' \ From e2c22095fd82d8e7709e8fc78468b0e0ec8646d4 Mon Sep 17 00:00:00 2001 From: Benedikt Wagener Date: Sun, 8 Oct 2023 20:36:10 +0200 Subject: [PATCH 11/19] fix some bugs --- src/UltraSingerEvaluation.py | 8 ++++---- src/modules/Research/TestRun.py | 5 +++-- src/modules/Ultrastar/ultrastar_converter.py | 19 +++++++++++-------- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/src/UltraSingerEvaluation.py b/src/UltraSingerEvaluation.py index 03230b3..a884e13 100644 --- a/src/UltraSingerEvaluation.py +++ b/src/UltraSingerEvaluation.py @@ -102,8 +102,8 @@ def main() -> None: ( input_match_ratio, output_match_ratio, - cross_octave_input_match_ratio, - cross_octave_output_match_ratio, + input_pitch_shift_match_ratios, + output_pitch_shift_match_ratios, pitch_where_should_be_no_pitch_ratio, no_pitch_where_should_be_pitch_ratio, ) = compare_pitches(test_song.input_ultrastar_class, ultrastar_class) @@ -112,8 +112,8 @@ def main() -> None: tested_song.success = True tested_song.input_match_ratio = input_match_ratio tested_song.output_match_ratio = output_match_ratio - tested_song.cross_octave_input_match_ratio = cross_octave_input_match_ratio - tested_song.cross_octave_output_match_ratio = cross_octave_output_match_ratio + tested_song.input_pitch_shift_match_ratios = input_pitch_shift_match_ratios + tested_song.output_pitch_shift_match_ratios = output_pitch_shift_match_ratios tested_song.pitch_where_should_be_no_pitch_ratio = pitch_where_should_be_no_pitch_ratio tested_song.no_pitch_where_should_be_pitch_ratio = no_pitch_where_should_be_pitch_ratio tested_song.output_score_simple = simple_score diff --git a/src/modules/Research/TestRun.py b/src/modules/Research/TestRun.py index 7070c28..f381a01 100644 --- a/src/modules/Research/TestRun.py +++ b/src/modules/Research/TestRun.py @@ -15,8 +15,8 @@ class TestedSong: success: bool = False input_match_ratio: float = 0.0 output_match_ratio: float = 0.0 - cross_octave_input_match_ratio: float = 0.0 - cross_octave_output_match_ratio: float = 0.0 + input_pitch_shift_match_ratios: dict[int, float] = field(default_factory=lambda: {}) + output_pitch_shift_match_ratios: dict[int, float] = field(default_factory=lambda: {}) no_pitch_where_should_be_pitch_ratio: float = 0.0 pitch_where_should_be_no_pitch_ratio: float = 0.0 output_score_simple: int = 0 @@ -27,5 +27,6 @@ class TestedSong: @dataclass class TestRun: """Test run""" + settings: Settings tested_songs: list[TestedSong] = field(default_factory=lambda: []) diff --git a/src/modules/Ultrastar/ultrastar_converter.py b/src/modules/Ultrastar/ultrastar_converter.py index 6e1e83c..28a7a9b 100644 --- a/src/modules/Ultrastar/ultrastar_converter.py +++ b/src/modules/Ultrastar/ultrastar_converter.py @@ -117,7 +117,7 @@ def map_to_datapoints( return data -def compare_pitches(input_ultrastar_class, output_ultrastar_class) -> tuple[float, float, float, float, float, float]: +def compare_pitches(input_ultrastar_class, output_ultrastar_class) -> tuple[float, float, dict[int, float], dict[int, float], float, float]: step_size = 10 input_datapoints = map_to_datapoints(input_ultrastar_class, step_size) @@ -135,7 +135,7 @@ def compare_pitches(input_ultrastar_class, output_ultrastar_class) -> tuple[floa output_pitched_datapoints = len([x for x in output_datapoints if x != NO_PITCH]) matches = 0 - cross_octave_matches = 0 + pitch_shift_matches = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] pitch_where_should_be_no_pitch = 0 no_pitch_where_should_be_pitch = 0 for index, _ in enumerate(input_datapoints): @@ -153,22 +153,25 @@ 
def compare_pitches(input_ultrastar_class, output_ultrastar_class) -> tuple[floa else: _, input_pitch_remainder = divmod(input_pitch, 12) _, output_pitch_remainder = divmod(output_pitch, 12) - if input_pitch_remainder == output_pitch_remainder: - cross_octave_matches += 1 + pitch_difference = abs(input_pitch_remainder - output_pitch_remainder) + pitch_shift_matches[pitch_difference] += 1 input_match_ratio = matches / input_pitched_datapoints output_match_ratio = matches / output_pitched_datapoints - cross_octave_input_match_ratio = (matches + cross_octave_matches) / input_pitched_datapoints - cross_octave_output_match_ratio = (matches + cross_octave_matches) / output_pitched_datapoints + input_pitch_shift_match_ratios = {} + output_pitch_shift_match_ratios = {} + for index, pitch_shift_matches_item in enumerate(pitch_shift_matches): + input_pitch_shift_match_ratios[index] = (matches + pitch_shift_matches_item) / input_pitched_datapoints + output_pitch_shift_match_ratios[index] = (matches + pitch_shift_matches_item) / output_pitched_datapoints output_pitch_where_should_be_no_pitch_ratio = pitch_where_should_be_no_pitch / output_pitched_datapoints output_no_pitch_where_should_be_pitch_ratio = no_pitch_where_should_be_pitch / input_pitched_datapoints return (input_match_ratio, output_match_ratio, - cross_octave_input_match_ratio, - cross_octave_output_match_ratio, + input_pitch_shift_match_ratios, + output_pitch_shift_match_ratios, output_pitch_where_should_be_no_pitch_ratio, output_no_pitch_where_should_be_pitch_ratio ) From c3bd2d0b76f2acfc1f09dc28e0b087ccbe027457 Mon Sep 17 00:00:00 2001 From: Benedikt Wagener Date: Sun, 8 Oct 2023 21:10:05 +0200 Subject: [PATCH 12/19] fix some bugs --- src/UltraSinger.py | 1 + src/modules/Speech_Recognition/Whisper.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/UltraSinger.py b/src/UltraSinger.py index d7bd992..2fadec7 100644 --- a/src/UltraSinger.py +++ b/src/UltraSinger.py @@ -570,6 +570,7 @@ def separate_vocal_from_audio( input_path = ultrastar_audio_input_path convert_audio_to_mono_wav(input_path, settings.mono_audio_path) + return audio_separation_path diff --git a/src/modules/Speech_Recognition/Whisper.py b/src/modules/Speech_Recognition/Whisper.py index c8bbc5a..599a7f3 100644 --- a/src/modules/Speech_Recognition/Whisper.py +++ b/src/modules/Speech_Recognition/Whisper.py @@ -107,7 +107,7 @@ def convert_to_transcribed_data(result_aligned): vtd = from_whisper(obj) # create custom Word object vtd.word = vtd.word + " " # add space to end of word if len(obj) < 4: - previous = transcribed_data[-1] if len(transcribed_data) != 0 else {"end": 0, "word": ""} + previous = transcribed_data[-1] if len(transcribed_data) != 0 else TranscribedData() vtd.start = previous.end + 0.1 vtd.end = previous.end + 0.2 msg = f'Error: There is no timestamp for word: {obj["word"]}. 
' \ From 4729daf2dd6e0a339f6b5979ab93d9240090e9b6 Mon Sep 17 00:00:00 2001 From: Benedikt Wagener Date: Sat, 14 Oct 2023 10:43:06 +0200 Subject: [PATCH 13/19] remove hard sys.exit calls for batch processing --- src/UltraSinger.py | 2 +- src/modules/Speech_Recognition/Whisper.py | 9 +++++---- src/modules/Ultrastar/ultrastar_converter.py | 7 +++++-- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/UltraSinger.py b/src/UltraSinger.py index 2fadec7..baffdcf 100644 --- a/src/UltraSinger.py +++ b/src/UltraSinger.py @@ -503,7 +503,7 @@ def get_unused_song_output_dir(path: str) -> str: print( f"{ULTRASINGER_HEAD} {red_highlighted('Error: Could not create output folder! (999) is the maximum number of tries.')}" ) - sys.exit(1) + raise ValueError("Could not create output folder! (999) is the maximum number of tries.") return path diff --git a/src/modules/Speech_Recognition/Whisper.py b/src/modules/Speech_Recognition/Whisper.py index 599a7f3..46cc8dd 100644 --- a/src/modules/Speech_Recognition/Whisper.py +++ b/src/modules/Speech_Recognition/Whisper.py @@ -2,6 +2,7 @@ import sys +import torch import whisperx from torch.cuda import OutOfMemoryError @@ -12,6 +13,7 @@ MEMORY_ERROR_MESSAGE = f"{ULTRASINGER_HEAD} {blue_highlighted('whisper')} ran out of GPU memory; reduce --whisper_batch_size or force usage of cpu with --force_cpu" + def transcribe_with_whisper( audio_path: str, model: str, @@ -33,6 +35,7 @@ def transcribe_with_whisper( compute_type = "float16" if device == "cuda" else "int8" try: + torch.cuda.empty_cache() loaded_whisper_model = whisperx.load_model( model, language=language, device=device, compute_type=compute_type ) @@ -61,7 +64,7 @@ def transcribe_with_whisper( f"{ULTRASINGER_HEAD} {red_highlighted('Error:')} Unknown language. " f"Try add it with --align_model [hugingface]." 
) - sys.exit(1) + raise ve # align whisper output result_aligned = whisperx.align( @@ -85,18 +88,16 @@ def transcribe_with_whisper( print( f"{ULTRASINGER_HEAD} Your GPU does not support efficient float16 computation; run UltraSinger with '--whisper_compute_type int8'" ) - sys.exit(1) raise value_error except OutOfMemoryError as oom_exception: print(oom_exception) print(MEMORY_ERROR_MESSAGE) - sys.exit(1) + raise oom_exception except Exception as exception: if "CUDA failed with error out of memory" in str(exception.args[0]): print(exception) print(MEMORY_ERROR_MESSAGE) - sys.exit(1) raise exception diff --git a/src/modules/Ultrastar/ultrastar_converter.py b/src/modules/Ultrastar/ultrastar_converter.py index 28a7a9b..7a4aee6 100644 --- a/src/modules/Ultrastar/ultrastar_converter.py +++ b/src/modules/Ultrastar/ultrastar_converter.py @@ -162,8 +162,11 @@ def compare_pitches(input_ultrastar_class, output_ultrastar_class) -> tuple[floa input_pitch_shift_match_ratios = {} output_pitch_shift_match_ratios = {} for index, pitch_shift_matches_item in enumerate(pitch_shift_matches): - input_pitch_shift_match_ratios[index] = (matches + pitch_shift_matches_item) / input_pitched_datapoints - output_pitch_shift_match_ratios[index] = (matches + pitch_shift_matches_item) / output_pitched_datapoints + pitch_shift_matches_count = pitch_shift_matches_item + if index == 0: + pitch_shift_matches_count += matches + input_pitch_shift_match_ratios[index] = pitch_shift_matches_item / input_pitched_datapoints + output_pitch_shift_match_ratios[index] = pitch_shift_matches_item / output_pitched_datapoints output_pitch_where_should_be_no_pitch_ratio = pitch_where_should_be_no_pitch / output_pitched_datapoints output_no_pitch_where_should_be_pitch_ratio = no_pitch_where_should_be_pitch / input_pitched_datapoints From 945581dbd8528bf2e681b076c29f4ca737d4f555 Mon Sep 17 00:00:00 2001 From: Benedikt Wagener Date: Sat, 14 Oct 2023 14:30:17 +0200 Subject: [PATCH 14/19] add test run evaluation --- src/UltraSingerEvaluation.py | 18 +++-- src/UltraSingerMetaEvaluation.py | 109 +++++++++++++++++++++++++++++++ src/modules/Research/TestRun.py | 5 +- 3 files changed, 123 insertions(+), 9 deletions(-) create mode 100644 src/UltraSingerMetaEvaluation.py diff --git a/src/UltraSingerEvaluation.py b/src/UltraSingerEvaluation.py index a884e13..1d06292 100644 --- a/src/UltraSingerEvaluation.py +++ b/src/UltraSingerEvaluation.py @@ -6,6 +6,8 @@ from typing import List import importlib.util +import pandas + import UltraSinger from Settings import Settings from modules.DeviceDetection.device_detection import check_gpu_support @@ -19,9 +21,11 @@ test_input_folder = os.path.normpath(os.path.abspath(__file__ + "/../../test_input")) test_output_folder = os.path.normpath(os.path.abspath(__file__ + "/../../test_output")) +test_start_time = datetime.now() test_run_folder = os.path.join( - test_output_folder, datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + test_output_folder, test_start_time.strftime("%Y-%m-%d_%H-%M-%S") ) +test_run_songs_folder = os.path.join(test_run_folder, "songs") def main() -> None: @@ -29,9 +33,10 @@ def main() -> None: Path(test_input_folder).mkdir(parents=True, exist_ok=True) Path(test_output_folder).mkdir(parents=True, exist_ok=True) Path(test_run_folder).mkdir(parents=True) + Path(test_run_songs_folder).mkdir(parents=True) base_settings = initialize_settings() - base_settings.output_file_path = test_run_folder + base_settings.output_file_path = test_run_songs_folder base_settings.test_songs_input_folder = os.path.normpath( 
base_settings.test_songs_input_folder @@ -60,7 +65,7 @@ def main() -> None: print(f"{ULTRASINGER_HEAD} Running evaluation for {len(test_songs)} songs") - test_run = TestRun(base_settings) + test_run = TestRun(base_settings, test_start_time) for index, test_song in enumerate(test_songs): print(f"{ULTRASINGER_HEAD} ========================") print( @@ -79,7 +84,7 @@ def main() -> None: tested_song = TestedSong(test_song.input_txt) test_run.tested_songs.append(tested_song) try: - output_txt, simple_score, accurate_score = UltraSinger.run() + output_txt, _, _ = UltraSinger.run() except Exception as error: print( f"{ULTRASINGER_HEAD} {red_highlighted('Error!')} Failed to process {test_song.input_txt}\n{error}." @@ -89,7 +94,7 @@ def main() -> None: output_folder_name = f"{test_song.input_ultrastar_class.artist} - {test_song.input_ultrastar_class.title}" - output_folder = os.path.join(test_run_folder, output_folder_name) + output_folder = os.path.join(test_run_songs_folder, output_folder_name) if not os.path.isfile(output_txt): print( @@ -116,9 +121,8 @@ def main() -> None: tested_song.output_pitch_shift_match_ratios = output_pitch_shift_match_ratios tested_song.pitch_where_should_be_no_pitch_ratio = pitch_where_should_be_no_pitch_ratio tested_song.no_pitch_where_should_be_pitch_ratio = no_pitch_where_should_be_pitch_ratio - tested_song.output_score_simple = simple_score - tested_song.output_score_accurate = accurate_score + test_run.end_time = datetime.now() test_run_result_file = os.path.join(test_run_folder, "run.json") test_run_json = test_run.to_json() with open(test_run_result_file, "w", encoding=FILE_ENCODING) as file: diff --git a/src/UltraSingerMetaEvaluation.py b/src/UltraSingerMetaEvaluation.py new file mode 100644 index 0000000..d26da4b --- /dev/null +++ b/src/UltraSingerMetaEvaluation.py @@ -0,0 +1,109 @@ +import os +from pathlib import Path +from typing import List + +import pandas + +from modules.Research.TestRun import TestRun +from modules.console_colors import ULTRASINGER_HEAD, red_highlighted + +test_input_folder = os.path.normpath(os.path.abspath(__file__ + "/../../test_input")) +test_output_folder = os.path.normpath(os.path.abspath(__file__ + "/../../test_output")) + + +def main() -> None: + """Main function""" + Path(test_output_folder).mkdir(parents=True, exist_ok=True) + + test_runs: List[TestRun] = [] + for dir_entry in os.listdir(test_output_folder): + test_run_folder = os.path.join(test_output_folder, dir_entry) + test_run = find_test_run_result(test_run_folder) + if test_run is None: + continue + + test_runs.append(test_run) + + if len(test_runs) == 0: + print( + f"{ULTRASINGER_HEAD} {red_highlighted('Error!')} No test runs found in {test_output_folder}." 
+ ) + exit(1) + + print(f"{ULTRASINGER_HEAD} Running meta evaluation for {len(test_runs)} test runs") + + for test_run in test_runs: + tested_songs_dicts = [] + for tested_song in [s for s in test_run.tested_songs if s.success]: + tested_song_dict = tested_song.to_dict() + + best_input_pitch_shift_match_ratio = max( + tested_song.input_pitch_shift_match_ratios.values() + ) + + # based on the pitch shift of the highest input_pitch_shift_match_ratio picked previously + # we pick the corresponding value of output_pitch_shift_match_ratios + matching_input_best_output_pitch_shift_match_ratio = ( + tested_song.output_pitch_shift_match_ratios[ + list(tested_song.input_pitch_shift_match_ratios.values()).index( + best_input_pitch_shift_match_ratio + ) + ] + ) + + best_output_pitch_shift_match_ratio = max( + tested_song.output_pitch_shift_match_ratios.values() + ) + + # based on the pitch shift of the highest output_pitch_shift_match_ratio picked previously + # we pick the corresponding value of input_pitch_shift_match_ratios + matching_output_best_input_pitch_shift_match_ratio = ( + tested_song.input_pitch_shift_match_ratios[ + list(tested_song.output_pitch_shift_match_ratios.values()).index( + best_output_pitch_shift_match_ratio + ) + ] + ) + + tested_song_dict[ + "best_input_pitch_shift_match_ratio" + ] = best_input_pitch_shift_match_ratio + tested_song_dict[ + "matching_input_best_output_pitch_shift_match_ratio" + ] = matching_input_best_output_pitch_shift_match_ratio + tested_song_dict[ + "best_output_pitch_shift_match_ratio" + ] = best_output_pitch_shift_match_ratio + tested_song_dict[ + "matching_output_best_input_pitch_shift_match_ratio" + ] = matching_output_best_input_pitch_shift_match_ratio + + tested_songs_dicts.append(tested_song_dict) + + records = pandas.DataFrame.from_records(tested_songs_dicts) + pandas.options.display.max_columns = records.shape[1] + describe_result = records.describe(percentiles=[0.25, 0.5, 0.75, 0.95, 0.99]) + print(describe_result) + + print("Done") + + +def find_test_run_result(test_run_folder) -> TestRun: + if os.path.isdir(test_run_folder): + for test_run_folder_item in os.listdir(test_run_folder): + test_run_folder_item_path = os.path.join( + test_run_folder, test_run_folder_item + ) + if ( + os.path.isfile(test_run_folder_item_path) + and test_run_folder_item == "run.json" + ): + test_run = None + with open(test_run_folder_item_path) as file: + json = file.read() + test_run = TestRun.from_json(json) + return test_run + + +if __name__ == "__main__": + main() diff --git a/src/modules/Research/TestRun.py b/src/modules/Research/TestRun.py index f381a01..ed573a8 100644 --- a/src/modules/Research/TestRun.py +++ b/src/modules/Research/TestRun.py @@ -1,3 +1,4 @@ +import datetime from dataclasses import dataclass, field from dataclasses_json import dataclass_json @@ -19,8 +20,6 @@ class TestedSong: output_pitch_shift_match_ratios: dict[int, float] = field(default_factory=lambda: {}) no_pitch_where_should_be_pitch_ratio: float = 0.0 pitch_where_should_be_no_pitch_ratio: float = 0.0 - output_score_simple: int = 0 - output_score_accurate: int = 0 @dataclass_json @@ -29,4 +28,6 @@ class TestRun: """Test run""" settings: Settings + start_time: datetime.datetime = None + end_time: datetime.datetime = None tested_songs: list[TestedSong] = field(default_factory=lambda: []) From 582b6440f27ce75c5a68e18bc701ae8632966672 Mon Sep 17 00:00:00 2001 From: Vadim Date: Wed, 19 Jun 2024 01:18:06 +0200 Subject: [PATCH 15/19] Fix merge conflicts --- src/UltraSinger.py | 4 +- 
src/modules/Audio/separation.py | 14 +++-- src/modules/Audio/silence_processing.py | 2 +- src/modules/Speech_Recognition/Whisper.py | 77 ++++++++++++----------- 4 files changed, 50 insertions(+), 47 deletions(-) diff --git a/src/UltraSinger.py b/src/UltraSinger.py index db49042..9d6205d 100644 --- a/src/UltraSinger.py +++ b/src/UltraSinger.py @@ -489,9 +489,9 @@ def run() -> tuple[str, Score, Score]: remove_unecessary_punctuations(transcription_result.transcribed_data) if settings.hyphenation: - hyphen_words = hyphenate_each_word(language, transcribed_data) + hyphen_words = hyphenate_each_word(language, transcription_result.transcribed_data) if hyphen_words is not None: - transcribed_data = add_hyphen_to_data(transcribed_data, hyphen_words) + transcribed_data = add_hyphen_to_data(transcription_result.transcribed_data, hyphen_words) transcribed_data = remove_silence_from_transcription_data( settings.processing_audio_path, transcribed_data diff --git a/src/modules/Audio/separation.py b/src/modules/Audio/separation.py index 479966d..55335ac 100644 --- a/src/modules/Audio/separation.py +++ b/src/modules/Audio/separation.py @@ -20,11 +20,13 @@ def separate_audio(input_file_path: str, output_folder: str, device="cpu") -> No f"{ULTRASINGER_HEAD} Separating vocals from audio with {blue_highlighted('demucs')} and {red_highlighted(device)} as worker." ) - demucs.separate.main(shlex.split(f'--two-stems vocals -d {device} --out "{os.path.join(output_folder, "separated")}" "{input_file_path}"')) # Model selection? # -n htdemucs_ft - subprocess.run( - ["demucs", "-d", device, "--two-stems=vocals", "--float32", input_file_path] - ) - separated_folder = path_join(current_executor_path(), "separated") - move(separated_folder, output_file) \ No newline at end of file + # subprocess.run( + # ["demucs", "-d", device, "--two-stems=vocals", "--float32", input_file_path] + # ) + # separated_folder = path_join(current_executor_path(), "separated") + # move(separated_folder, output_file) + + # fixme "--float32" is missing + demucs.separate.main(shlex.split(f'--two-stems vocals -d {device} --out "{os.path.join(output_folder, "separated")}" "{input_file_path}"')) diff --git a/src/modules/Audio/silence_processing.py b/src/modules/Audio/silence_processing.py index da11172..46f9fea 100644 --- a/src/modules/Audio/silence_processing.py +++ b/src/modules/Audio/silence_processing.py @@ -63,7 +63,7 @@ def remove_silence(silence_parts_list: list[tuple[float, float]], transcribed_da split_word = "~ " is_word_end = True - split_data = TranscribedData({"conf": data.conf, "word": split_word, "end": split_end, "start": silence_end, "is_word_end": is_word_end}) + split_data = TranscribedData({"conf": data.confidence, "word": split_word, "end": split_end, "start": silence_end, "is_word_end": is_word_end}) if not was_split: data.end = silence_start diff --git a/src/modules/Speech_Recognition/Whisper.py b/src/modules/Speech_Recognition/Whisper.py index db176f9..a0fec68 100644 --- a/src/modules/Speech_Recognition/Whisper.py +++ b/src/modules/Speech_Recognition/Whisper.py @@ -41,18 +41,38 @@ def transcribe_with_whisper( loaded_whisper_model = whisperx.load_model( model, language=language, device=device, compute_type=compute_type ) + except ValueError as value_error: + if ( + "Requested float16 compute type, but the target device or backend do not support efficient float16 computation." 
+ in str(value_error.args[0]) + ): + print(value_error) + print( + f"{ULTRASINGER_HEAD} Your GPU does not support efficient float16 computation; run UltraSinger with '--whisper_compute_type int8'" + ) + + raise value_error + except OutOfMemoryError as oom_exception: + print(oom_exception) + print(MEMORY_ERROR_MESSAGE) + raise oom_exception + except Exception as exception: + if "CUDA failed with error out of memory" in str(exception.args[0]): + print(exception) + print(MEMORY_ERROR_MESSAGE) + raise exception - audio = whisperx.load_audio(audio_path) + audio = whisperx.load_audio(audio_path) - print(f"{ULTRASINGER_HEAD} Transcribing {audio_path}") + print(f"{ULTRASINGER_HEAD} Transcribing {audio_path}") - result = loaded_whisper_model.transcribe( - audio, batch_size=batch_size, language=language - ) + result = loaded_whisper_model.transcribe( + audio, batch_size=batch_size, language=language + ) - detected_language = result["language"] - if language is None: - language = detected_language + detected_language = result["language"] + if language is None: + language = detected_language # load alignment model and metadata try: @@ -68,39 +88,20 @@ def transcribe_with_whisper( ) raise ve - # align whisper output - result_aligned = whisperx.align( - result["segments"], - model_a, - metadata, - audio, - device, - return_char_alignments=False, - ) + # align whisper output + result_aligned = whisperx.align( + result["segments"], + model_a, + metadata, + audio, + device, + return_char_alignments=False, + ) - transcribed_data = convert_to_transcribed_data(result_aligned) + transcribed_data = convert_to_transcribed_data(result_aligned) - return TranscriptionResult(transcribed_data, detected_language) - except ValueError as value_error: - if ( - "Requested float16 compute type, but the target device or backend do not support efficient float16 computation." 
- in str(value_error.args[0]) - ): - print(value_error) - print( - f"{ULTRASINGER_HEAD} Your GPU does not support efficient float16 computation; run UltraSinger with '--whisper_compute_type int8'" - ) + return TranscriptionResult(transcribed_data, detected_language) - raise value_error - except OutOfMemoryError as oom_exception: - print(oom_exception) - print(MEMORY_ERROR_MESSAGE) - raise oom_exception - except Exception as exception: - if "CUDA failed with error out of memory" in str(exception.args[0]): - print(exception) - print(MEMORY_ERROR_MESSAGE) - raise exception def convert_to_transcribed_data(result_aligned): From 10e063aae2706546e842da1cf664b90ea7acbda1 Mon Sep 17 00:00:00 2001 From: Vadim Date: Sat, 22 Jun 2024 18:26:54 +0200 Subject: [PATCH 16/19] Fix merge conflicts --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index d83b1e1..1f39d8d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -crepe~=0.0.13 -demucs~=4.0.0 +crepe~=0.0.15 +demucs~=4.0.1 ffmpeg_python~=0.2.0 git+https://github.com/m-bain/whisperx.git langcodes~=3.4.0 From 8f597719ba7081f02479a1c78b6971c1ef0a5f72 Mon Sep 17 00:00:00 2001 From: Vadim Date: Thu, 4 Jul 2024 18:34:53 +0200 Subject: [PATCH 17/19] Merge fix --- src/UltraSinger.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/UltraSinger.py b/src/UltraSinger.py index 3c77c7e..bbb9580 100644 --- a/src/UltraSinger.py +++ b/src/UltraSinger.py @@ -345,7 +345,7 @@ def split_syllables_into_segments( def merge_syllable_segments( - transcribed_data: list[TranscribedData], midi_notes: list[str], us_notes=list[int] + transcribed_data: list[TranscribedData], midi_segments: list[MidiSegment], us_notes=list[int] ) -> tuple[list[TranscribedData], list[str], list[int]]: """Merge sub-segments of a syllable where the pitch is the same""" new_data = [] @@ -358,13 +358,13 @@ def merge_syllable_segments( if ( str(data.word).startswith("~") and previous_data is not None - and midi_notes[i] == midi_notes[i - 1] + and midi_segments[i].note == midi_segments[i - 1].note and data.start - previous_data.end <= SYLLABLE_SEGMENT_MAX_GAP_FOR_MERGE ): new_data[-1].end = data.end else: new_data.append(data) - new_midi_notes.append(midi_notes[i]) + new_midi_notes.append(midi_segments[i].note) new_us_notes.append(us_notes[i]) previous_data = data return new_data, new_midi_notes, new_us_notes @@ -500,10 +500,10 @@ def run() -> tuple[str, Score, Score]: # Pitch the audio midi_segments, pitched_data, ultrastar_note_numbers, transcribed_data = pitch_audio( - is_audio, transcribed_data, ultrastar_class + transcribed_data, ultrastar_class, cache_path) transcribed_data, midi_notes, ultrastar_note_numbers = merge_syllable_segments( - transcribed_data, midi_notes, ultrastar_note_numbers + transcribed_data, midi_segments, ultrastar_note_numbers ) # Create plot From 76a495653bc4c7ccd21b5a448bc4362de39efa96 Mon Sep 17 00:00:00 2001 From: Benedikt Wagener Date: Sat, 20 Jul 2024 10:36:05 +0200 Subject: [PATCH 18/19] fix score calculation --- src/UltraSinger.py | 10 +++++----- src/modules/Speech_Recognition/Whisper.py | 8 +++++++- src/modules/Ultrastar/ultrastar_converter.py | 10 +++++----- src/modules/console_colors.py | 2 +- 4 files changed, 18 insertions(+), 12 deletions(-) diff --git a/src/UltraSinger.py b/src/UltraSinger.py index bbb9580..50db73f 100644 --- a/src/UltraSinger.py +++ b/src/UltraSinger.py @@ -540,10 +540,10 @@ def run() -> tuple[str, Score, Score]: pitched_data, 
ultrastar_class, ultrastar_file_output ) - # Add calculated score to Ultrastar txt #Todo: Missing Karaoke - ultrastar_writer.add_score_to_ultrastar_txt( - ultrastar_file_output, simple_score - ) + # Add calculated score to Ultrastar txt #Todo: Missing Karaoke + ultrastar_writer.add_score_to_ultrastar_txt( + ultrastar_file_output, simple_score + ) # Midi if settings.create_midi: @@ -984,7 +984,7 @@ def pitch_audio( new_transcribed_data = [] for i, midi_segment in enumerate(midi_segments): - new_transcribed_data.append(TranscribedData({"word": midi_segment.word, "start": midi_segment.start, "end": midi_segment.end, "is_hyphen": None, "confidence": 1})) + new_transcribed_data.append(TranscribedData(word=midi_segment.word, start=midi_segment.start, end=midi_segment.end, is_hyphen=None, confidence=1)) return midi_segments, pitched_data, ultrastar_note_numbers, new_transcribed_data diff --git a/src/modules/Speech_Recognition/Whisper.py b/src/modules/Speech_Recognition/Whisper.py index a0fec68..9cb228d 100644 --- a/src/modules/Speech_Recognition/Whisper.py +++ b/src/modules/Speech_Recognition/Whisper.py @@ -38,8 +38,14 @@ def transcribe_with_whisper( try: torch.cuda.empty_cache() + asr_options = { + "max_new_tokens": None, + "clip_timestamps": None, + "hallucination_silence_threshold": None + } + loaded_whisper_model = whisperx.load_model( - model, language=language, device=device, compute_type=compute_type + model, asr_options=asr_options, language=language, device=device, compute_type=compute_type ) except ValueError as value_error: if ( diff --git a/src/modules/Ultrastar/ultrastar_converter.py b/src/modules/Ultrastar/ultrastar_converter.py index 7a4aee6..795c20c 100644 --- a/src/modules/Ultrastar/ultrastar_converter.py +++ b/src/modules/Ultrastar/ultrastar_converter.py @@ -59,7 +59,7 @@ def get_start_time_from_ultrastar( ) -> float: """Calculates the start time from the Ultrastar txt""" - gap = int(float(ultrastar_class.gap.replace(",", ".")) / 1000) + gap = float(ultrastar_class.gap.replace(",", ".")) / 1000 real_bpm = ultrastar_bpm_to_real_bpm(float(ultrastar_class.bpm.replace(",", "."))) start_time = beat_to_second(int(ultrastar_class.startBeat[pos]), real_bpm) + gap return start_time @@ -68,7 +68,7 @@ def get_start_time_from_ultrastar( def get_end_time_from_ultrastar(ultrastar_class: UltrastarTxtValue, pos: int) -> float: """Calculates the end time from the Ultrastar txt""" - gap = int(float(ultrastar_class.gap.replace(",", ".")) / 1000) + gap = float(ultrastar_class.gap.replace(",", ".")) / 1000 real_bpm = ultrastar_bpm_to_real_bpm(float(ultrastar_class.bpm.replace(",", "."))) end_time = ( beat_to_second( @@ -83,7 +83,7 @@ def get_end_time_from_ultrastar(ultrastar_class: UltrastarTxtValue, pos: int) -> def map_to_datapoints( ultrastar_class: UltrastarTxtValue, step_size: int = 10 ) -> list[int]: - gap = int(float(ultrastar_class.gap.replace(",", "."))) + gap = float(ultrastar_class.gap.replace(",", ".")) data = [] @@ -92,8 +92,8 @@ def map_to_datapoints( if ultrastar_class.noteType[pos] == "F": continue - start_time = int(get_start_time_from_ultrastar(ultrastar_class, pos) * 1000) + gap - end_time = int(get_end_time_from_ultrastar(ultrastar_class, pos) * 1000) + gap + start_time = int(get_start_time_from_ultrastar(ultrastar_class, pos) * 1000 + gap) + end_time = int(get_end_time_from_ultrastar(ultrastar_class, pos) * 1000 + gap) start_nearest_step = (start_time + step_size - 1) // step_size * step_size end_nearest_step = (end_time + step_size - 1) // step_size * step_size diff --git 
a/src/modules/console_colors.py b/src/modules/console_colors.py
index 59328ff..e5d9375 100644
--- a/src/modules/console_colors.py
+++ b/src/modules/console_colors.py
@@ -9,7 +9,7 @@ def blue_highlighted(text: str) -> str:
 
 
 def green_highlighted(text: str) -> str:
-    """Returns a blue highlighted text"""
+    """Returns a green highlighted text"""
     return f"{Bcolors.dark_green}{text}{Bcolors.endc}"
 

From 051290c7682aa0795b1f74a2a93f282a1c1057c4 Mon Sep 17 00:00:00 2001
From: Benedikt Wagener
Date: Sat, 20 Jul 2024 11:02:46 +0200
Subject: [PATCH 19/19] delete obsolete testfile

---
 pytest/modules/UltraSinger.py | 30 ------------------------------
 1 file changed, 30 deletions(-)
 delete mode 100644 pytest/modules/UltraSinger.py

diff --git a/pytest/modules/UltraSinger.py b/pytest/modules/UltraSinger.py
deleted file mode 100644
index 2aa2d27..0000000
--- a/pytest/modules/UltraSinger.py
+++ /dev/null
@@ -1,30 +0,0 @@
-"""Tests for UltraSinger.py"""
-
-import os
-import unittest
-import src.modules.Pitcher.pitcher as test_subject
-
-import pytest
-from src.modules.plot import plot
-
-
-class PitcherTest(unittest.TestCase):
-    # @pytest.mark.skip(reason="Skipping this FUNCTION level test, can be used for manual tests")
-    def test_get_pitch_with_crepe_file(self):
-        # Arrange
-        test_dir = os.path.dirname(os.path.abspath(__file__))
-        root_dir = os.path.abspath(test_dir + "/../../..")
-        # test_file_abs_path = os.path.abspath(root_dir + "/test_input/audio_denoised.wav")
-        test_file_abs_path = os.path.abspath(root_dir + "/test_input/test_denoised.wav")
-        test_output = root_dir + "/test_output"
-
-        # Act
-        # pitched_data = test_subject.get_pitch_with_crepe_file(test_file_abs_path, 'full', device="cuda")
-        # test_subject.get_pitch_with_crepe_file(test_file_abs_path, 'full', 'cpu', batch_size=1024)
-        # plot(pitched_data, test_output, title="pitching test")
-
-        print("done")
-
-
-if __name__ == "__main__":
-    unittest.main()
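
The pitch comparison built up in PATCH 07 through PATCH 13 can be summarized independently of the UltraStar classes: both txt files are rasterized onto a 10 ms grid and the grids are compared datapoint by datapoint. The sketch below is a simplified re-implementation for illustration only; the note triples are made up, and the GAP offset, freestyle notes and the ceil-to-step rounding of map_to_datapoints() are deliberately left out.

# Simplified sketch of the 10 ms grid comparison (illustration, not the
# shipped implementation). Notes are (start_ms, end_ms, pitch) triples.
NO_PITCH = -1000
STEP_MS = 10

def to_datapoints(notes: list[tuple[int, int, int]]) -> list[int]:
    """Rasterize notes onto the grid; un-pitched steps stay NO_PITCH."""
    end = max(note_end for _, note_end, _ in notes)
    grid = [NO_PITCH] * (end // STEP_MS)
    for start, stop, pitch in notes:
        for step in range(start // STEP_MS, stop // STEP_MS):
            grid[step] = pitch
    return grid

input_grid = to_datapoints([(0, 50, 5), (50, 100, 7)])    # ground truth
output_grid = to_datapoints([(0, 50, 5), (50, 100, 8)])   # UltraSinger output
matches = sum(1 for a, b in zip(input_grid, output_grid) if a == b != NO_PITCH)
input_pitched = sum(1 for a in input_grid if a != NO_PITCH)
print(matches / input_pitched)  # 0.5 -> half of the input datapoints match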
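
The cross-octave refinement from PATCH 08, generalized to per-shift buckets in PATCH 11, only looks at pitch classes (pitch modulo 12) for datapoints that are pitched on both sides but disagree. A sketch of the bucketing, reusing the grids from the sketch above:

def pitch_shift_histogram(input_grid: list[int], output_grid: list[int]) -> list[int]:
    """Bucket mismatching pitched datapoints by pitch-class distance.

    Bucket 0 collects pitch-class matches, i.e. pure octave errors.
    """
    buckets = [0] * 12
    for a, b in zip(input_grid, output_grid):
        if NO_PITCH in (a, b) or a == b:
            continue
        buckets[abs(a % 12 - b % 12)] += 1
    return buckets

print(pitch_shift_histogram(input_grid, output_grid))  # bucket 1 counts the 7-vs-8 steps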
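
PATCH 13's switch from sys.exit(1) to raise only pays off together with the per-song try/except in UltraSingerEvaluation.py: an exception now skips one song instead of aborting the whole batch. A reduced illustration with placeholder names (process() merely stands in for UltraSinger.run()):

def process(song: str) -> None:
    # Placeholder for UltraSinger.run(); raises instead of calling sys.exit().
    raise RuntimeError("CUDA failed with error out of memory")

for song in ["song_a", "song_b"]:
    try:
        process(song)
    except Exception as error:
        print(f"Failed to process {song}: {error}")
        continue  # next song; a sys.exit() here would have ended the whole run
print("batch finished")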
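
PATCH 14 persists each run as run.json, which UltraSingerMetaEvaluation.py then aggregates with pandas. A minimal sketch of that flow, assuming a finished run; the timestamped folder name below is made up for the example:

import pandas

from modules.Research.TestRun import TestRun

# Load one run.json written by UltraSingerEvaluation (path is an example).
with open("test_output/2023-10-14_14-30-17/run.json", encoding="utf-8") as file:
    test_run = TestRun.from_json(file.read())

# One row per successfully processed song, summarized like the
# describe() call in UltraSingerMetaEvaluation.py.
records = pandas.DataFrame.from_records(
    [song.to_dict() for song in test_run.tested_songs if song.success]
)
print(records.describe(percentiles=[0.25, 0.5, 0.75, 0.95, 0.99]))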