From da434684844c0757373a3ed11a211b8ea3d456e9 Mon Sep 17 00:00:00 2001 From: Benedikt Wagener Date: Thu, 14 Sep 2023 17:51:49 +0200 Subject: [PATCH 01/19] add loudness threshold to filter silent regions from pitched data --- LICENSE | 1 + pytest/modules/Pitcher/test_pitcher.py | 2 +- src/UltraSinger.py | 76 ++++++++++++++++++- src/modules/Pitcher/core.py | 1 + src/modules/Pitcher/loudness.py | 69 +++++++++++++++++ src/modules/Pitcher/pitched_data.py | 1 + src/modules/Pitcher/pitcher.py | 34 ++++++--- .../Speech_Recognition/TranscribedData.py | 26 ++++--- src/modules/plot.py | 2 +- 9 files changed, 186 insertions(+), 26 deletions(-) create mode 100644 src/modules/Pitcher/core.py create mode 100644 src/modules/Pitcher/loudness.py diff --git a/LICENSE b/LICENSE index dfe8101..a346333 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,7 @@ MIT License Copyright (c) 2023 Vadim Rangnau +Copyright (c) 2020 Max Morrison (torchcrepe code adapted for crepe output filtering abd thresholding) Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/pytest/modules/Pitcher/test_pitcher.py b/pytest/modules/Pitcher/test_pitcher.py index e623986..ad5b253 100644 --- a/pytest/modules/Pitcher/test_pitcher.py +++ b/pytest/modules/Pitcher/test_pitcher.py @@ -8,7 +8,7 @@ class PitcherTest(unittest.TestCase): - @pytest.mark.skip(reason="Skipping this FUNCTION level test, can be used for manual tests") + # @pytest.mark.skip(reason="Skipping this FUNCTION level test, can be used for manual tests") def test_get_pitch_with_crepe_file(self): # Arrange test_dir = os.path.dirname(os.path.abspath(__file__)) diff --git a/src/UltraSinger.py b/src/UltraSinger.py index b1cb8a4..47610c2 100644 --- a/src/UltraSinger.py +++ b/src/UltraSinger.py @@ -55,7 +55,8 @@ from modules.musicbrainz_client import get_music_infos settings = Settings() - +SYLLABLE_SEGMENT_SIZE = 0.1 +SYLLABLE_SEGMENT_MAX_GAP_FOR_MERGE = 0.1 def convert_midi_notes_to_ultrastar_notes(midi_notes: list[str]) -> list[int]: """Convert midi notes to ultrastar notes""" @@ -255,6 +256,73 @@ def print_support() -> None: ) +def split_syllables_into_segments(transcribed_data: list[TranscribedData]) -> list[TranscribedData]: + """Split every syllable into sub-segments""" + segment_size_decimal_points = len(str(SYLLABLE_SEGMENT_SIZE).split(".")[1]) + new_data = [] + + for i, data in enumerate(transcribed_data): + + duration = data.end - data.start + if duration <= SYLLABLE_SEGMENT_SIZE: + new_data.append(data) + continue + + has_space = str(data.word).endswith(" ") + first_segment = copy.deepcopy(data) + filler_words_start = data.start + SYLLABLE_SEGMENT_SIZE + remainder = data.end - (filler_words_start) + first_segment.end = filler_words_start + if has_space: + first_segment.word = first_segment.word[:-1] + + new_data.append(first_segment) + + full_segments, partial_segment = divmod(remainder, SYLLABLE_SEGMENT_SIZE) + + if full_segments >= 1: + for i in range(int(full_segments)): + segment = TranscribedData() + segment.word = "~" + segment.start = filler_words_start + round(i * SYLLABLE_SEGMENT_SIZE, segment_size_decimal_points) + segment.end = segment.start + SYLLABLE_SEGMENT_SIZE + new_data.append(segment) + + if partial_segment >= 0.01: + segment = TranscribedData() + segment.word = "~" + segment.start = filler_words_start + round(full_segments * SYLLABLE_SEGMENT_SIZE, segment_size_decimal_points) + segment.end = segment.start + partial_segment + 
new_data.append(segment) + + if has_space: + new_data[-1].word += " " + return new_data + + +def merge_syllable_segments( + transcribed_data: list[TranscribedData], + midi_notes: list[str], + us_notes = list[int] +) -> tuple[list[TranscribedData], list[str], list[int]]: + """Merge sub-segments of a syllable where the pitch is the same""" + new_data = [] + new_midi_notes = [] + new_us_notes = [] + + previous_data = None + + for i, data in enumerate(transcribed_data): + if str(data.word).startswith("~") and previous_data is not None and midi_notes[i] == midi_notes[i-1] and data.start - previous_data.end <= SYLLABLE_SEGMENT_MAX_GAP_FOR_MERGE: + new_data[-1].end = data.end + else: + new_data.append(data) + new_midi_notes.append(midi_notes[i]) + new_us_notes.append(us_notes[i]) + previous_data = data + return new_data, new_midi_notes, new_us_notes + + def run() -> None: """The processing function of this program""" is_audio = ".txt" not in settings.input_file_path @@ -330,6 +398,8 @@ def run() -> None: # lyric = 'input/faber_lyric.txt' # --corrected_words = correct_words(vosk_speech, lyric) + transcribed_data = split_syllables_into_segments(transcribed_data) + # Create audio chunks if settings.create_audio_chunks: create_audio_chunks( @@ -345,6 +415,8 @@ def run() -> None: is_audio, transcribed_data, ultrastar_class ) + transcribed_data, midi_notes, ultrastar_note_numbers = merge_syllable_segments(transcribed_data, midi_notes, ultrastar_note_numbers) + # Create plot if settings.create_plot: plot(pitched_data, song_output, transcribed_data, midi_notes) @@ -706,7 +778,7 @@ def pitch_audio(is_audio: bool, transcribed_data: list[TranscribedData], ultrast settings.mono_audio_path, settings.crepe_model_capacity, settings.crepe_step_size, - settings.tensorflow_device, + settings.tensorflow_device ) if is_audio: start_times = [] diff --git a/src/modules/Pitcher/core.py b/src/modules/Pitcher/core.py new file mode 100644 index 0000000..7b252a6 --- /dev/null +++ b/src/modules/Pitcher/core.py @@ -0,0 +1 @@ +CREPE_MODEL_SAMPLE_RATE = 16000 \ No newline at end of file diff --git a/src/modules/Pitcher/loudness.py b/src/modules/Pitcher/loudness.py new file mode 100644 index 0000000..f72d8a5 --- /dev/null +++ b/src/modules/Pitcher/loudness.py @@ -0,0 +1,69 @@ +import warnings + +import librosa +import numpy as np +from modules.Pitcher.core import CREPE_MODEL_SAMPLE_RATE + +############################################################################### +# Constants +############################################################################### + +WINDOW_SIZE = 1024 +TIMES_DECIMAL_PLACES: int = 3 +# Minimum decibel level +MIN_DB = -100. + +# Reference decibel level +REF_DB = 20. + +def set_confidence_to_zero_in_silent_regions(confidence, audio, threshold=-60, step_size=10, pad=True): + # Don't modify in-place + confidence = confidence[:] + + # Compute loudness + loudness = a_weighted(audio, step_size, pad) + + # Threshold silence + confidence[loudness < threshold] = 0. 
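+    # Caveat: if `confidence` is a numpy array, the slice `confidence[:]`
+    # above creates a view rather than a copy, so this masked assignment
+    # still writes into the caller's array; numpy.copy(confidence) would be
+    # needed to truly avoid modifying it in place.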
+ + return confidence, loudness + +def a_weighted(audio, step_size=10, pad=True): + """Retrieve the per-frame loudness""" + step_size_seconds = round(step_size / 1000, TIMES_DECIMAL_PLACES) + steps_per_second = 1 / step_size_seconds + hop_length = int(CREPE_MODEL_SAMPLE_RATE // steps_per_second) + + a_perceptual_weights = perceptual_weights() + + # Take stft + stft = librosa.stft(audio, + n_fft=WINDOW_SIZE, + hop_length=hop_length, + win_length=WINDOW_SIZE, + center=pad, + pad_mode='constant') + + # Compute magnitude on db scale + db = librosa.amplitude_to_db(np.abs(stft)) + + # Apply A-weighting + weighted = db + a_perceptual_weights + + # Threshold + weighted[weighted < MIN_DB] = MIN_DB + + # Average over weighted frequencies + return weighted.mean(axis=0) + + +def perceptual_weights(): + """A-weighted frequency-dependent perceptual loudness weights""" + frequencies = librosa.fft_frequencies(sr=CREPE_MODEL_SAMPLE_RATE, + n_fft=WINDOW_SIZE) + + # A warning is raised for nearly inaudible frequencies, but it ends up + # defaulting to -100 db. That default is fine for our purposes. + with warnings.catch_warnings(): + warnings.simplefilter('ignore', RuntimeWarning) + return librosa.A_weighting(frequencies)[:, None] - REF_DB \ No newline at end of file diff --git a/src/modules/Pitcher/pitched_data.py b/src/modules/Pitcher/pitched_data.py index 13d828c..f2d32df 100644 --- a/src/modules/Pitcher/pitched_data.py +++ b/src/modules/Pitcher/pitched_data.py @@ -9,3 +9,4 @@ class PitchedData: times: list[float] frequencies: list[float] confidence: list[float] + perceived_loudness_db: list[float] diff --git a/src/modules/Pitcher/pitcher.py b/src/modules/Pitcher/pitcher.py index 5bf9f77..c3fc81b 100644 --- a/src/modules/Pitcher/pitcher.py +++ b/src/modules/Pitcher/pitcher.py @@ -1,10 +1,13 @@ """Pitcher module""" import crepe -from scipy.io import wavfile +import librosa from modules.console_colors import ULTRASINGER_HEAD, blue_highlighted, red_highlighted +from modules.Pitcher.core import CREPE_MODEL_SAMPLE_RATE +from modules.Pitcher.loudness import set_confidence_to_zero_in_silent_regions from modules.Pitcher.pitched_data import PitchedData +import modules.timer as timer def get_pitch_with_crepe_file( @@ -15,26 +18,37 @@ def get_pitch_with_crepe_file( print( f"{ULTRASINGER_HEAD} Pitching with {blue_highlighted('crepe')} and model {blue_highlighted(model_capacity)} and {red_highlighted(device)} as worker" ) - sample_rate, audio = wavfile.read(filename) + timer.log('Load file for pitch detection start') + audio, sample_rate = librosa.load(filename) + timer.log('Load file for pitch detection end') return get_pitch_with_crepe(audio, sample_rate, model_capacity, step_size) -def get_pitch_with_crepe( - audio, sample_rate: int, model_capacity: str, step_size: int = 10 -) -> PitchedData: +def get_pitch_with_crepe(audio, sample_rate: int, model_capacity: str, step_size: int = 10) -> PitchedData: """Pitch with crepe""" - times, frequencies, confidence, activation = crepe.predict( - audio, sample_rate, model_capacity, step_size=step_size, viterbi=True - ) - return PitchedData(times, frequencies, confidence) + + if sample_rate != CREPE_MODEL_SAMPLE_RATE: + from resampy import resample + audio = resample(audio, sample_rate, CREPE_MODEL_SAMPLE_RATE) + sample_rate = CREPE_MODEL_SAMPLE_RATE + + timer.log('Crepe pitch detection start') + times, frequencies, confidence, activation = crepe.predict(audio, sample_rate, model_capacity, step_size=step_size, viterbi=True) + timer.log('Crepe pitch detection end') + + 
timer.log('Computing loudness start') + confidence, perceived_loudness = set_confidence_to_zero_in_silent_regions(confidence, audio, step_size=step_size) + timer.log('Computing loudness end') + + return PitchedData(times, frequencies, confidence, perceived_loudness) def get_pitched_data_with_high_confidence( pitched_data: PitchedData, threshold=0.4 ) -> PitchedData: """Get frequency with high confidence""" - new_pitched_data = PitchedData([], [], []) + new_pitched_data = PitchedData([], [], [], []) for i, conf in enumerate(pitched_data.confidence): if conf > threshold: new_pitched_data.times.append(pitched_data.times[i]) diff --git a/src/modules/Speech_Recognition/TranscribedData.py b/src/modules/Speech_Recognition/TranscribedData.py index 8ae2f4a..5962d9a 100644 --- a/src/modules/Speech_Recognition/TranscribedData.py +++ b/src/modules/Speech_Recognition/TranscribedData.py @@ -4,15 +4,17 @@ class TranscribedData: """Transcribed data from json file""" - def __init__(self, transcribed_json): - # Vosk = conf, Whisper = confidence - self.conf = transcribed_json.get( - "conf", transcribed_json.get("confidence", None) - ) - # Vosk = word, Whisper = text - self.word = transcribed_json.get( - "word", transcribed_json.get("text", None) - ) - self.end = transcribed_json.get("end", None) - self.start = transcribed_json.get("start", None) - self.is_hyphen = None + def __init__(self, transcribed_json = None): + + if transcribed_json: + # Vosk = conf, Whisper = confidence + self.conf = transcribed_json.get( + "conf", transcribed_json.get("confidence", None) + ) + # Vosk = word, Whisper = text + self.word = transcribed_json.get( + "word", transcribed_json.get("text", None) + ) + self.end = transcribed_json.get("end", None) + self.start = transcribed_json.get("start", None) + self.is_hyphen = None diff --git a/src/modules/plot.py b/src/modules/plot.py index 881a453..01b121e 100644 --- a/src/modules/plot.py +++ b/src/modules/plot.py @@ -187,7 +187,7 @@ def create_gaps(pitched_data: PitchedData, step_size: float) -> PitchedData: This way the graph is only continuous where it should be. 
""" - pitched_data_with_gaps = PitchedData([], [], []) + pitched_data_with_gaps = PitchedData([], [], [], []) previous_time = 0 for i, time in enumerate(pitched_data.times): From b52945be200d649d992678f12ede9ff4a490ddf3 Mon Sep 17 00:00:00 2001 From: Benedikt Wagener Date: Tue, 3 Oct 2023 07:52:01 +0200 Subject: [PATCH 02/19] wip --- pytest/modules/Pitcher/test_pitcher.py | 64 ++++++++++++++++++++++++++ src/modules/plot.py | 4 +- 2 files changed, 66 insertions(+), 2 deletions(-) diff --git a/pytest/modules/Pitcher/test_pitcher.py b/pytest/modules/Pitcher/test_pitcher.py index ad5b253..a028843 100644 --- a/pytest/modules/Pitcher/test_pitcher.py +++ b/pytest/modules/Pitcher/test_pitcher.py @@ -3,6 +3,14 @@ import os import unittest import src.modules.Pitcher.pitcher as test_subject + +import numpy as np +import pandas as pd +from matplotlib import pyplot as plt +from sklearn.cluster import KMeans +from sklearn import preprocessing as p +from sklearn.decomposition import PCA + import pytest from src.modules.plot import plot @@ -21,7 +29,63 @@ def test_get_pitch_with_crepe_file(self): pitched_data = test_subject.get_pitch_with_crepe_file(test_file_abs_path, 'full', device="cuda") # test_subject.get_pitch_with_crepe_file(test_file_abs_path, 'full', 'cpu', batch_size=1024) plot(pitched_data, test_output, title="pitching test") + print("done") + + # @pytest.mark.skip(reason="Skipping this FUNCTION level test, can be used for manual tests") + def test_pitch_clustering(self): + # Arrange + times = [0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.7, 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8, 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1, 1.01, 1.02, 1.03, 1.04, 1.05, 1.06, 1.07, 1.08, 1.09, 1.1, 1.11, 1.12, 1.13, 1.14, 1.15, 1.16, 1.17, 1.18, 1.19, 1.2, 1.21, 1.22, 1.23, 1.24, 1.25, 1.26, 1.27, 1.28, 1.29, 1.3, 1.31, 1.32, 1.33, 1.34, 1.35, 1.36, 1.37, 1.38, 1.39, 1.4, 1.41, 1.42, 1.43, 1.44, 1.45, 1.46, 1.47, 1.48, 1.49, 1.5, 1.51, 1.52, 1.53, 1.54, 1.55, 1.56, 1.57, 1.58, 1.59, 1.6, 1.61, 1.62, 1.63, 1.64, 1.65, 1.66, 1.67, 1.68, 1.69, 1.7, 1.71, 1.72, 1.73, 1.74, 1.75, 1.76, 1.77, 1.78, 1.79, 1.8, 1.81, 1.82, 1.83, 1.84, 1.85, 1.86, 1.87, 1.88, 1.89, 1.9, 1.91, 1.92, 1.93, 1.94, 1.95, 1.96, 1.97, 1.98, 1.99, 2, 2.01, 2.02, 2.03, 2.04, 2.05, 2.06, 2.07, 2.08, 2.09, 2.1, 2.11, 2.12, 2.13, 2.14, 2.15, 2.16, 2.17, 2.18, 2.19, 2.2, 2.21, 2.22, 2.23, 2.24, 2.25, 2.26, 2.27, 2.28, 2.29, 2.3, 2.31, 2.32, 2.33, 2.34, 2.35, 2.36, 2.37, 2.38, 2.39, 2.4, 2.41, 2.42, 2.43, 2.44, 2.45, 2.46, 2.47, 2.48, 2.49, 2.5, 2.51, 2.52, 2.53, 2.54, 2.55, 2.56, 2.57, 2.58, 2.59, 2.6, 2.61, 2.62, 2.63, 2.64, 2.65, 2.66, 2.67, 2.68, 2.69, 2.7, 2.71, 2.72, 2.73, 2.74, 2.75, 2.76, 2.77, 2.78, 2.79, 2.8, 2.81, 2.82, 2.83, 2.84, 2.85, 2.86, 2.87, 2.88, 2.89, 2.9, 2.91, 2.92, 2.93, 2.94, 2.95, 2.96, 2.97, 2.98, 2.99, 3, 3.01, 3.02, 3.03, 3.04, 3.05, 3.06, 3.07, 3.08, 3.09, 3.1, 3.11, 3.12, 3.13, 3.14, 3.15, 3.16, 3.17, 3.18, 3.19, 3.2, 3.21, 3.22, 3.23, 3.24, 3.25, 3.26, 3.27, 3.28, 3.29, 3.3, 3.31, 3.32, 3.33, 3.34, 3.35, 3.36, 3.37, 3.38, 3.39, 3.4, 3.41, 3.42, 3.43, 3.44, 3.45, 3.46, 
3.47, 3.48, 3.49, 3.5, 3.51, 3.52, 3.53, 3.54, 3.55, 3.56, 3.57, 3.58, 3.59, 3.6, 3.61, 3.62, 3.63, 3.64, 3.65, 3.66, 3.67, 3.68, 3.69, 3.7, 3.71, 3.72, 3.73, 3.74, 3.75, 3.76, 3.77, 3.78, 3.79, 3.8, 3.81, 3.82, 3.83, 3.84, 3.85, 3.86, 3.87, 3.88, 3.89, 3.9, 3.91, 3.92, 3.93, 3.94, 3.95, 3.96, 3.97, 3.98, 3.99, 4, 4.01, 4.02, 4.03, 4.04, 4.05, 4.06, 4.07, 4.08, 4.09, 4.1, 4.11, 4.12, 4.13, 4.14, 4.15, 4.16, 4.17, 4.18, 4.19, 4.2, 4.21, 4.22, 4.23, 4.24, 4.25, 4.26, 4.27, 4.28, 4.29, 4.3, 4.31, 4.32, 4.33, 4.34, 4.35, 4.36, 4.37, 4.38, 4.39, 4.4, 4.41, 4.42, 4.43, 4.44, 4.45, 4.46, 4.47, 4.48, 4.49, 4.5, 4.51, 4.52, 4.53, 4.54, 4.55, 4.56, 4.57, 4.58, 4.59, 4.6, 4.61, 4.62, 4.63, 4.64, 4.65, 4.66, 4.67, 4.68, 4.69, 4.7, 4.71, 4.72, 4.73, 4.74, 4.75, 4.76, 4.77, 4.78, 4.79, 4.8, 4.81, 4.82, 4.83, 4.84, 4.85, 4.86, 4.87, 4.88, 4.89, 4.9, 4.91, 4.92, 4.93, 4.94, 4.95, 4.96, 4.97, 4.98, 4.99, 5, 5.01, 5.02, 5.03, 5.04, 5.05, 5.06, 5.07, 5.08, 5.09, 5.1, 5.11, 5.12, 5.13, 5.14, 5.15, 5.16, 5.17, 5.18, 5.19, 5.2, 5.21, 5.22, 5.23, 5.24, 5.25, 5.26, 5.27, 5.28, 5.29, 5.3, 5.31, 5.32, 5.33, 5.34, 5.35, 5.36, 5.37, 5.38, 5.39, 5.4, 5.41, 5.42, 5.43, 5.44, 5.45, 5.46, 5.47, 5.48, 5.49, 5.5, 5.51, 5.52, 5.53, 5.54, 5.55, 5.56, 5.57, 5.58, 5.59, 5.6, 5.61, 5.62, 5.63, 5.64, 5.65, 5.66, 5.67, 5.68, 5.69, 5.7, 5.71, 5.72, 5.73, 5.74, 5.75, 5.76, 5.77, 5.78, 5.79, 5.8, 5.81, 5.82, 5.83, 5.84, 5.85, 5.86, 5.87, 5.88, 5.89, 5.9, 5.91, 5.92, 5.93, 5.94, 5.95, 5.96, 5.97, 5.98, 5.99, 6, 6.01, 6.02, 6.03, 6.04, 6.05, 6.06, 6.07, 6.08, 6.09, 6.1, 6.11, 6.12, 6.13, 6.14, 6.15, 6.16, 6.17, 6.18, 6.19, 6.2, 6.21, 6.22, 6.23, 6.24, 6.25, 6.26, 6.27, 6.28, 6.29, 6.3, 6.31, 6.32, 6.33, 6.34, 6.35, 6.36, 6.37, 6.38, 6.39, 6.4, 6.41, 6.42, 6.43, 6.44, 6.45, 6.46, 6.47, 6.48, 6.49, 6.5, 6.51, 6.52, 6.53, 6.54, 6.55, 6.56, 6.57, 6.58, 6.59, 6.6, 6.61, 6.62, 6.63, 6.64, 6.65, 6.66, 6.67, 6.68, 6.69, 6.7, 6.71, 6.72, 6.73, 6.74, 6.75, 6.76, 6.77, 6.78, 6.79, 6.8, 6.81, 6.82, 6.83, 6.84, 6.85, 6.86, 6.87, 6.88, 6.89, 6.9, 6.91, 6.92, 6.93, 6.94, 6.95, 6.96, 6.97, 6.98, 6.99, 7, 7.01, 7.02, 7.03, 7.04, 7.05, 7.06, 7.07, 7.08, 7.09, 7.1, 7.11, 7.12, 7.13, 7.14, 7.15, 7.16, 7.17, 7.18, 7.19, 7.2, 7.21, 7.22, 7.23, 7.24, 7.25, 7.26, 7.27, 7.28, 7.29, 7.3, 7.31, 7.32, 7.33, 7.34, 7.35, 7.36, 7.37, 7.38, 7.39, 7.4, 7.41, 7.42, 7.43, 7.44, 7.45, 7.46, 7.47, 7.48, 7.49, 7.5, 7.51, 7.52, 7.53, 7.54, 7.55, 7.56, 7.57, 7.58, 7.59, 7.6, 7.61, 7.62, 7.63, 7.64, 7.65, 7.66, 7.67, 7.68, 7.69, 7.7, 7.71, 7.72, 7.73, 7.74, 7.75, 7.76, 7.77, 7.78, 7.79, 7.8, 7.81, 7.82, 7.83, 7.84, 7.85, 7.86, 7.87, 7.88, 7.89, 7.9, 7.91, 7.92, 7.93, 7.94, 7.95, 7.96, 7.97, 7.98, 7.99, 8, 8.01, 8.02, 8.03, 8.04, 8.05, 8.06, 8.07, 8.08, 8.09, 8.1, 8.11, 8.12, 8.13, 8.14, 8.15, 8.16, 8.17, 8.18, 8.19, 8.2, 8.21, 8.22, 8.23, 8.24, 8.25, 8.26, 8.27, 8.28, 8.29, 8.3, 8.31, 8.32, 8.33, 8.34, 8.35, 8.36, 8.37, 8.38, 8.39, 8.4, 8.41, 8.42, 8.43] + frequencies = [665.03, 659.52, 646.07, 572.62, 590.38, 649.3, 600.02, 624.6, 646.16, 650.34, 646.06, 651.35, 650.49, 589.08, 603.26, 625.12, 627.36, 636.09, 660.45, 659.91, 648.32, 657.78, 597.3, 595.76, 594.63, 659.18, 625.98, 645.65, 645.76, 650.01, 652.28, 653.14, 664.93, 662.59, 660.76, 642.67, 644.79, 649.94, 625.44, 627.31, 645.4, 645.22, 652.12, 598.35, 623.99, 644.58, 645.09, 650.61, 650.01, 598.85, 656.88, 636.74, 652.16, 650.66, 649.87, 657.12, 625.48, 634.35, 661.24, 651.33, 655.77, 658.05, 661.75, 660.07, 661.37, 662.0, 662.29, 662.33, 664.63, 661.54, 661.31, 630.51, 590.5, 658.83, 625.16, 635.46, 661.08, 
659.04, 659.97, 666.0, 670.29, 666.15, 658.62, 662.12, 662.78, 656.71, 662.49, 661.36, 641.94, 647.05, 652.86, 598.11, 599.27, 656.18, 625.69, 659.14, 657.94, 659.05, 657.8, 656.69, 653.63, 638.82, 631.24, 625.38, 635.76, 658.93, 657.88, 660.1, 660.92, 613.83, 593.21, 608.63, 624.15, 633.75, 659.97, 658.17, 657.64, 590.49, 651.47, 656.66, 657.47, 637.08, 658.92, 659.45, 590.42, 592.88, 592.52, 596.55, 657.24, 660.3, 634.03, 635.35, 647.72, 648.55, 648.52, 647.42, 645.35, 648.98, 653.16, 650.04, 656.79, 644.61, 646.96, 648.74, 646.2, 642.01, 632.67, 698.0, 660.9, 635.71, 637.32, 646.88, 645.85, 644.92, 647.01, 646.71, 645.62, 645.52, 653.7, 660.06, 626.52, 646.6, 652.7, 653.21, 597.68, 658.95, 660.67, 636.1, 657.82, 659.44, 653.16, 652.2, 657.95, 660.01, 627.14, 636.14, 644.73, 649.03, 651.96, 662.32, 675.07, 679.81, 683.4, 692.69, 699.38, 695.89, 697.21, 698.53, 700.52, 704.75, 707.07, 691.78, 682.75, 677.22, 669.76, 660.38, 656.93, 645.32, 643.54, 647.62, 592.21, 658.68, 658.62, 660.63, 634.49, 642.43, 654.18, 659.81, 671.21, 678.8, 684.99, 690.6, 648.73, 598.17, 658.13, 659.56, 690.23, 642.85, 647.82, 648.53, 650.73, 647.39, 645.03, 647.35, 650.27, 649.18, 647.71, 646.93, 646.93, 640.58, 636.02, 632.72, 626.74, 619.02, 612.84, 597.52, 632.17, 663.0, 690.65, 643.34, 653.35, 647.88, 653.15, 652.6, 658.93, 658.99, 659.11, 658.06, 659.11, 660.6, 641.79, 610.65, 598.81, 659.95, 635.14, 660.15, 662.85, 651.13, 635.97, 658.73, 658.91, 659.11, 690.8, 695.01, 683.24, 681.22, 674.65, 663.59, 659.06, 649.47, 640.65, 632.87, 624.67, 645.13, 661.67, 646.04, 647.7, 651.95, 652.31, 659.4, 692.03, 635.43, 628.9, 624.13, 604.23, 597.85, 624.24, 624.9, 645.21, 645.66, 646.12, 649.43, 598.26, 657.7, 635.79, 652.29, 657.12, 659.24, 654.06, 646.01, 645.04, 584.87, 532.52, 537.38, 470.4, 419.16, 376.77, 331.49, 300.82, 266.99, 235.5, 208.51, 187.03, 166.8, 145.7, 128.59, 114.6, 111.88, 111.34, 111.4, 112.1, 112.58, 112.77, 112.31, 111.91, 111.61, 111.35, 111.24, 111.36, 111.71, 112.28, 112.58, 112.6, 112.72, 112.64, 112.59, 112.58, 112.61, 112.7, 112.64, 112.4, 112.35, 112.19, 112.12, 112.28, 112.71, 113.53, 114.07, 114.72, 115.76, 116.9, 118.35, 120.27, 122.07, 123.86, 125.5, 127.17, 128.75, 130.15, 130.75, 131.19, 131.61, 132.12, 132.39, 132.65, 133.09, 133.35, 133.27, 133.47, 133.67, 133.72, 134.03, 134.05, 133.88, 133.98, 133.74, 133.51, 133.39, 133.18, 132.97, 132.75, 132.42, 132.22, 132.33, 132.47, 132.95, 133.73, 134.7, 136.04, 137.57, 138.72, 140.38, 142.22, 144.34, 146.13, 147.59, 149.4, 151.91, 154.03, 155.77, 157.2, 158.19, 158.3, 157.86, 157.38, 156.9, 156.33, 155.93, 155.5, 155.25, 155.1, 154.86, 154.75, 154.79, 154.81, 154.89, 154.93, 154.99, 155.17, 155.21, 155.27, 155.36, 155.26, 155.27, 155.32, 155.59, 155.95, 157.1, 159.24, 161.65, 163.48, 165.11, 166.87, 169.52, 172.43, 174.91, 177.59, 180.13, 182.91, 185.0, 186.43, 187.73, 188.14, 188.07, 187.49, 186.78, 186.54, 186.58, 186.74, 186.89, 186.78, 186.7, 186.94, 187.54, 188.14, 188.32, 188.13, 187.78, 187.68, 187.78, 187.92, 188.01, 187.96, 188.35, 189.13, 190.21, 190.87, 191.13, 190.8, 190.31, 189.41, 188.83, 188.26, 187.96, 187.35, 186.75, 186.32, 185.88, 185.66, 185.5, 185.51, 185.99, 186.39, 187.09, 187.52, 187.75, 187.83, 188.01, 188.66, 189.73, 190.58, 190.67, 190.03, 189.38, 188.8, 188.28, 188.16, 188.06, 187.99, 187.94, 187.91, 188.05, 188.52, 189.14, 189.76, 190.26, 190.42, 190.45, 190.43, 190.29, 190.09, 189.81, 189.93, 189.84, 189.58, 189.03, 188.63, 188.51, 188.85, 189.62, 190.32, 190.56, 190.57, 190.23, 189.8, 189.29, 189.1, 
188.58, 187.76, 185.73, 183.73, 179.79, 174.77, 167.02, 164.36, 163.86, 164.43, 165.23, 166.1, 166.97, 167.8, 168.58, 169.3, 169.57, 169.86, 169.71, 169.03, 168.81, 168.58, 168.81, 169.13, 169.9, 170.26, 170.77, 171.23, 171.33, 171.52, 171.68, 171.48, 170.94, 170.4, 169.77, 169.56, 169.39, 169.32, 169.49, 169.51, 169.93, 170.61, 171.28, 171.92, 172.58, 172.45, 172.17, 172.0, 171.76, 171.48, 171.1, 170.78, 170.49, 170.16, 170.08, 170.66, 171.31, 172.1, 172.42, 172.75, 173.04, 173.32, 173.54, 173.76, 173.83, 173.66, 173.12, 172.18, 170.25, 166.95, 164.22, 159.63, 153.98, 149.11, 147.08, 147.28, 147.78, 148.69, 149.42, 149.85, 150.08, 150.11, 150.21, 150.23, 150.26, 150.09, 149.83, 149.6, 149.44, 149.46, 149.45, 149.46, 149.54, 149.83, 150.41, 151.16, 152.16, 152.9, 153.67, 154.13, 154.6, 154.88, 155.06, 155.09, 154.94, 154.86, 154.6, 154.1, 153.51, 152.86, 152.64, 152.4, 152.49, 152.71, 152.89, 153.26, 153.54, 154.2, 154.68, 155.2, 155.77, 156.31, 156.93, 157.24, 157.01, 157.61, 156.31, 170.0, 193.15, 209.91, 234.64, 255.53, 284.74, 306.29, 333.92, 377.15, 401.52, 448.67, 492.14, 535.32, 591.44, 589.15, 595.15, 607.96, 625.84, 636.75, 646.5, 644.68, 652.56, 658.79, 649.39, 631.42, 633.55, 645.17, 644.46, 649.97, 581.36, 581.38, 590.79, 601.73, 592.23, 591.62, 594.09, 633.63, 633.25, 627.58, 658.55, 663.0, 662.13, 648.45, 640.45, 634.05, 645.37, 659.85, 662.4, 667.63, 668.55, 662.2, 662.92, 661.72, 657.75, 653.06, 640.97, 628.55, 626.07, 629.33, 628.84, 635.41, 635.19, 645.97, 650.58, 655.78, 657.31, 657.48, 646.94, 645.51, 651.18, 655.65, 627.13, 647.02, 652.56, 651.73, 643.98, 649.71, 659.13, 689.11, 672.49, 653.14, 646.99, 647.11, 599.3, 624.89, 626.45, 634.17, 646.55, 653.63, 658.17, 662.79, 683.29, 691.02, 635.07, 615.7, 598.63, 616.46, 633.13, 658.27, 643.39, 647.15, 650.84, 656.71, 625.19, 646.25, 657.19, 647.88, 634.7, 636.86, 646.4, 649.72, 596.76, 597.36, 656.17, 626.22, 644.55, 644.42, 651.81, 625.18, 626.33, 634.01, 644.88, 651.92, 595.9, 652.5, 623.37, 656.65, 646.16, 645.91, 651.86, 596.94, 656.35, 658.52, 635.42, 652.11, 660.49, 590.83, 603.91, 613.4, 632.95, 646.0, 658.46, 646.69, 647.74, 590.35, 591.16, 650.7, 596.67, 657.69, 660.8, 689.3, 636.49, 658.31, 647.31, 645.17, 645.15, 649.0, 657.39, 636.03, 647.63, 657.81, 591.34, 596.66, 656.91, 658.68, 641.36, 648.52, 659.13, 590.37, 591.02, 650.28, 656.05, 624.6, 657.67, 652.16, 650.86, 650.79, 657.52, 634.04, 641.58, 645.91, 658.51, 625.29, 634.09, 645.1, 642.77, 634.01, 626.52, 645.07, 650.76101509] + frequencies_log_10 = [freq * 10 for freq in np.log10(frequencies)] + confidence = [0.04, 0.044, 0.109, 0.033, 0.094, 0.078, 0.085, 0.093, 0.099, 0.125, 0.156, 0.168, 0.094, 0.153, 0.063, 0.06, 0.095, 0.119, 0.121, 0.04, 0.098, 0.102, 0.076, 0.089, 0.076, 0.058, 0.075, 0.089, 0.139, 0.157, 0.144, 0.095, 0.032, 0.041, 0.094, 0.124, 0.112, 0.103, 0.104, 0.113, 0.096, 0.177, 0.149, 0.086, 0.079, 0.088, 0.134, 0.111, 0.071, 0.082, 0.097, 0.109, 0.149, 0.142, 0.154, 0.132, 0.117, 0.071, 0.071, 0.10, 0.098, 0.106, 0.087, 0.103, 0.067, 0.069, 0.078, 0.094, 0.303, 0.365, 0.056, 0.014, 0.037, 0.068, 0.106, 0.097, 0.09, 0.092, 0.034, 0.078, 0.028, 0.037, 0.016, 0.009, 0.042, 0.042, 0.041, 0.06, 0.115, 0.151, 0.132, 0.103, 0.092, 0.094, 0.08, 0.106, 0.138, 0.083, 0.077, 0.233, 0.273, 0.074, 0.073, 0.106, 0.103, 0.117, 0.081, 0.084, 0.051, 0.08, 0.036, 0.027, 0.047, 0.108, 0.085, 0.117, 0.099, 0.085, 0.084, 0.092, 0.105, 0.104, 0.086, 0.14, 0.083, 0.04, 0.083, 0.057, 0.08, 0.083, 0.058, 0.106, 0.089, 0.095, 0.046, 0.034, 0.039, 0.138, 0.23, 
0.628, 0.397, 0.106, 0.036, 0.034, 0.039, 0.048, 0.056, 0.067, 0.066, 0.047, 0.022, 0.078, 0.04, 0.026, 0.028, 0.069, 0.061, 0.105, 0.056, 0.056, 0.085, 0.097, 0.093, 0.093, 0.075, 0.061, 0.066, 0.10, 0.102, 0.147, 0.093, 0.083, 0.088, 0.083, 0.069, 0.051, 0.04, 0.05, 0.133, 0.075, 0.051, 0.051, 0.108, 0.229, 0.038, 0.03, 0.052, 0.043, 0.068, 0.056, 0.081, 0.131, 0.104, 0.072, 0.056, 0.098, 0.025, 0.047, 0.074, 0.063, 0.068, 0.067, 0.072, 0.084, 0.085, 0.105, 0.051, 0.047, 0.04, 0.116, 0.038, 0.073, 0.037, 0.072, 0.087, 0.083, 0.08, 0.081, 0.075, 0.057, 0.076, 0.046, 0.038, 0.015, 0.03, 0.259, 0.571, 0.455, 0.101, 0.131, 0.033, 0.08, 0.069, 0.094, 0.111, 0.046, 0.037, 0.042, 0.046, 0.083, 0.108, 0.129, 0.101, 0.083, 0.062, 0.091, 0.095, 0.105, 0.123, 0.046, 0.046, 0.045, 0.048, 0.075, 0.07, 0.064, 0.032, 0.045, 0.069, 0.058, 0.091, 0.096, 0.051, 0.028, 0.04, 0.021, 0.048, 0.067, 0.102, 0.071, 0.019, 0.041, 0.048, 0.084, 0.075, 0.108, 0.122, 0.108, 0.087, 0.088, 0.051, 0.079, 0.042, 0.016, 0.028, 0.09, 0.105, 0.084, 0.073, 0.122, 0.125, 0.073, 0.07, 0.099, 0.094, 0.137, 0.112, 0.068, 0.023, 0.069, 0.087, 0.04, 0.079, 0.026, 0.092, 0.027, 0.05, 0.12, 0.112, 0.094, 0.063, 0.041, 0.129, 0.758, 0.529, 0.106, 0.117, 0.432, 0.798, 0.908, 0.898, 0.879, 0.889, 0.89, 0.919, 0.926, 0.923, 0.918, 0.921, 0.916, 0.893, 0.891, 0.893, 0.899, 0.901, 0.904, 0.897, 0.891, 0.895, 0.892, 0.887, 0.893, 0.909, 0.916, 0.902, 0.889, 0.889, 0.918, 0.911, 0.937, 0.936, 0.912, 0.895, 0.915, 0.926, 0.891, 0.884, 0.893, 0.90, 0.934, 0.943, 0.942, 0.925, 0.924, 0.936, 0.945, 0.931, 0.937, 0.931, 0.938, 0.94, 0.952, 0.949, 0.942, 0.95, 0.941, 0.929, 0.936, 0.937, 0.945, 0.95, 0.932, 0.927, 0.938, 0.935, 0.945, 0.945, 0.94, 0.902, 0.915, 0.912, 0.88, 0.912, 0.915, 0.953, 0.959, 0.933, 0.922, 0.939, 0.955, 0.937, 0.959, 0.956, 0.961, 0.953, 0.938, 0.961, 0.967, 0.959, 0.95, 0.95, 0.951, 0.965, 0.958, 0.958, 0.96, 0.955, 0.955, 0.948, 0.95, 0.951, 0.957, 0.948, 0.956, 0.952, 0.962, 0.966, 0.927, 0.928, 0.936, 0.953, 0.966, 0.942, 0.897, 0.911, 0.923, 0.931, 0.921, 0.935, 0.953, 0.923, 0.932, 0.924, 0.927, 0.94, 0.924, 0.935, 0.929, 0.921, 0.93, 0.922, 0.918, 0.931, 0.94, 0.928, 0.92, 0.919, 0.938, 0.938, 0.935, 0.932, 0.933, 0.932, 0.92, 0.915, 0.928, 0.911, 0.901, 0.916, 0.931, 0.924, 0.903, 0.919, 0.933, 0.939, 0.918, 0.94, 0.953, 0.958, 0.963, 0.964, 0.955, 0.928, 0.93, 0.936, 0.933, 0.935, 0.932, 0.895, 0.929, 0.916, 0.91, 0.93, 0.917, 0.894, 0.918, 0.926, 0.928, 0.933, 0.935, 0.927, 0.929, 0.906, 0.916, 0.924, 0.926, 0.925, 0.917, 0.92, 0.924, 0.928, 0.931, 0.928, 0.93, 0.921, 0.905, 0.90, 0.905, 0.896, 0.921, 0.917, 0.912, 0.909, 0.924, 0.92, 0.916, 0.912, 0.898, 0.928, 0.944, 0.934, 0.815, 0.639, 0.781, 0.924, 0.956, 0.952, 0.976, 0.961, 0.955, 0.949, 0.909, 0.927, 0.935, 0.92, 0.906, 0.915, 0.916, 0.912, 0.927, 0.932, 0.915, 0.90, 0.918, 0.932, 0.941, 0.932, 0.926, 0.934, 0.939, 0.899, 0.906, 0.924, 0.929, 0.927, 0.928, 0.918, 0.901, 0.90, 0.937, 0.931, 0.95, 0.951, 0.934, 0.917, 0.924, 0.942, 0.937, 0.923, 0.918, 0.912, 0.909, 0.912, 0.926, 0.925, 0.943, 0.954, 0.958, 0.958, 0.956, 0.953, 0.953, 0.956, 0.96, 0.925, 0.861, 0.893, 0.864, 0.779, 0.709, 0.846, 0.944, 0.963, 0.957, 0.931, 0.939, 0.93, 0.926, 0.929, 0.929, 0.932, 0.931, 0.927, 0.925, 0.937, 0.936, 0.941, 0.935, 0.94, 0.932, 0.925, 0.931, 0.931, 0.938, 0.944, 0.939, 0.955, 0.958, 0.951, 0.951, 0.946, 0.953, 0.957, 0.96, 0.951, 0.931, 0.941, 0.944, 0.941, 0.942, 0.946, 0.94, 0.936, 0.93, 0.954, 0.954, 0.943, 0.954, 0.938, 0.876, 0.728, 0.592, 0.365, 0.058, 
0.025, 0.043, 0.031, 0.041, 0.029, 0.025, 0.012, 0.009, 0.018, 0.029, 0.033, 0.026, 0.018, 0.04, 0.069, 0.045, 0.078, 0.01, 0.064, 0.327, 0.034, 0.012, 0.091, 0.08, 0.082, 0.09, 0.117, 0.148, 0.123, 0.315, 0.07, 0.005, 0.008, 0.039, 0.13, 0.084, 0.081, 0.14, 0.102, 0.053, 0.043, 0.163, 0.067, 0.037, 0.04, 0.058, 0.029, 0.034, 0.03, 0.025, 0.261, 0.122, 0.047, 0.081, 0.047, 0.184, 0.10, 0.177, 0.092, 0.052, 0.032, 0.039, 0.127, 0.084, 0.072, 0.09, 0.125, 0.127, 0.128, 0.118, 0.097, 0.124, 0.114, 0.148, 0.124, 0.081, 0.095, 0.097, 0.11, 0.074, 0.14, 0.117, 0.089, 0.09, 0.089, 0.101, 0.122, 0.128, 0.123, 0.094, 0.062, 0.038, 0.035, 0.064, 0.032, 0.033, 0.045, 0.074, 0.109, 0.126, 0.112, 0.127, 0.093, 0.093, 0.059, 0.091, 0.026, 0.053, 0.10, 0.132, 0.115, 0.065, 0.071, 0.063, 0.081, 0.126, 0.143, 0.125, 0.093, 0.112, 0.102, 0.17, 0.128, 0.077, 0.087, 0.069, 0.061, 0.102, 0.139, 0.143, 0.07, 0.076, 0.124, 0.114, 0.14, 0.099, 0.033, 0.062, 0.056, 0.091, 0.041, 0.112, 0.116, 0.145, 0.077, 0.08, 0.073, 0.074, 0.09, 0.091, 0.105, 0.045, 0.034, 0.164, 0.131, 0.102, 0.082, 0.115, 0.085, 0.149, 0.081, 0.04, 0.064, 0.103, 0.135, 0.09, 0.13, 0.109, 0.071, 0.082, 0.08, 0.088, 0.099, 0.105, 0.133, 0.136, 0.133, 0.117, 0.098, 0.023, 0.105, 0.089, 0.102, 0.029, 0.097, 0.034, 0.076, 0.095, 0.103, 0.15038174] + + matrix = [[times[i], frequencies_log_10[i], confidence[i]] for i, _ in enumerate(times)] + # Act + df = pd.DataFrame(matrix) + df.columns = ['time', 'log 10 frequency', 'confidence'] + df_ss = pd.DataFrame(p.minmax_scale(df)) + df_ss.columns = ['time', 'log 10 frequency', 'confidence'] + + # apply custom weight to frequency + df_ss['log 10 frequency'] = df_ss['log 10 frequency'] / 2 + + clusters = 20 + labels = fit_kmeans(df_ss, clusters) + figure, axis = plt.subplots(2, 2) + axis[0][0].scatter(df['time'], df['log 10 frequency'], c=labels, cmap='Set1', s=5) + axis[0][0].set_title("Ratio 1:1:1") + + # apply custom weight to frequency + df_ss['log 10 frequency'] = df['log 10 frequency'] / 5 + labels = fit_kmeans(df_ss, clusters) + axis[1][0].scatter(df['time'], df['log 10 frequency'], c=labels, cmap='Set1', s=5) + axis[1][0].set_title("Ratio 1:5:1") + + # apply custom weight to frequency + df_ss['confidence'] = df['confidence'] / 100 + labels = fit_kmeans(df_ss, clusters) + axis[0][1].scatter(df['time'], df['log 10 frequency'], c=labels, cmap='Set1', s=5) + axis[0][1].set_title("Ratio 1:1:100") + + # apply custom weight to frequency + df_ss['time'] = df['time'] / 100 + labels = fit_kmeans(df_ss, clusters) + axis[1][1].scatter(df['time'], df['log 10 frequency'], c=labels, cmap='Set1', s=5) + axis[1][1].set_title("Ratio 100:1:1") + + figure.set_figwidth(12.8) + plt.show() + print("done") + + +def fit_kmeans(data, centers): + kmeans = KMeans(centers) + labels = kmeans.fit_predict(data) + return labels + + + if __name__ == "__main__": unittest.main() diff --git a/src/modules/plot.py b/src/modules/plot.py index 01b121e..01a3e7f 100644 --- a/src/modules/plot.py +++ b/src/modules/plot.py @@ -175,7 +175,7 @@ def determine_bounds(frequency_log_10: list[float]) -> tuple[float, float]: def set_figure_dimensions(time_range, frequency_log_10_range): """Dynamically scale the figure dimensions based on the duration/frequency amplitude of the song""" height = frequency_log_10_range / 0.06 - width = time_range / 2 + width = time_range / 4 plt.figure(1).set_figwidth(max(6.4, width)) plt.figure(1).set_figheight(max(4, height)) @@ -212,7 +212,7 @@ def create_gaps(pitched_data: PitchedData, step_size: float) -> 
PitchedData: def draw_words(transcribed_data, midi_notes): """Draw rectangles for each word""" - if transcribed_data is not None: + if transcribed_data is not None and len(transcribed_data) > 0: for i, data in enumerate(transcribed_data): note_frequency = librosa.note_to_hz(midi_notes[i]) frequency_range = get_frequency_range(midi_notes[i]) From 442c5b1e56183716fc980968771b3241959fc5a0 Mon Sep 17 00:00:00 2001 From: Benedikt Wagener Date: Tue, 3 Oct 2023 13:39:51 +0200 Subject: [PATCH 03/19] UltraSinger evaluation wip --- README.md | 1 + pytest/modules/UltraSinger.py | 30 +++++ src/Settings.py | 5 + src/UltraSinger.py | 45 +++++--- src/modules/Audio/separation.py | 17 +-- src/modules/Research/TestSong.py | 12 ++ src/modules/Research/UltraSingerEvaluation.py | 109 ++++++++++++++++++ src/modules/Ultrastar/ultrastar_converter.py | 4 +- src/modules/Ultrastar/ultrastar_parser.py | 23 +++- src/modules/Ultrastar/ultrastar_txt.py | 6 +- src/modules/Ultrastar/ultrastar_writer.py | 9 +- 11 files changed, 226 insertions(+), 35 deletions(-) create mode 100644 pytest/modules/UltraSinger.py create mode 100644 src/modules/Research/TestSong.py create mode 100644 src/modules/Research/UltraSingerEvaluation.py diff --git a/README.md b/README.md index 63dbea5..1895469 100644 --- a/README.md +++ b/README.md @@ -158,6 +158,7 @@ _Not all options working now!_ --hyphenation True|False >> ((default) is True) --disable_separation True|False >> ((default) is False) --disable_karaoke True|False >> ((default) is False) + --ignore_audio True|False >> ((default) is False) --create_audio_chunks True|False >> ((default) is False) --plot True|False >> ((default) is False) --force_cpu True|False >> ((default) is False) diff --git a/pytest/modules/UltraSinger.py b/pytest/modules/UltraSinger.py new file mode 100644 index 0000000..2aa2d27 --- /dev/null +++ b/pytest/modules/UltraSinger.py @@ -0,0 +1,30 @@ +"""Tests for UltraSinger.py""" + +import os +import unittest +import src.modules.Pitcher.pitcher as test_subject + +import pytest +from src.modules.plot import plot + + +class PitcherTest(unittest.TestCase): + # @pytest.mark.skip(reason="Skipping this FUNCTION level test, can be used for manual tests") + def test_get_pitch_with_crepe_file(self): + # Arrange + test_dir = os.path.dirname(os.path.abspath(__file__)) + root_dir = os.path.abspath(test_dir + "/../../..") + # test_file_abs_path = os.path.abspath(root_dir + "/test_input/audio_denoised.wav") + test_file_abs_path = os.path.abspath(root_dir + "/test_input/test_denoised.wav") + test_output = root_dir + "/test_output" + + # Act + # pitched_data = test_subject.get_pitch_with_crepe_file(test_file_abs_path, 'full', device="cuda") + # test_subject.get_pitch_with_crepe_file(test_file_abs_path, 'full', 'cpu', batch_size=1024) + # plot(pitched_data, test_output, title="pitching test") + + print("done") + + +if __name__ == "__main__": + unittest.main() diff --git a/src/Settings.py b/src/Settings.py index 6884d5d..b698681 100644 --- a/src/Settings.py +++ b/src/Settings.py @@ -5,6 +5,8 @@ class Settings: hyphenation = True use_separated_vocal = True create_karaoke = True + ignore_audio = False + input_file_is_ultrastar_txt = False input_file_path = "" output_file_path = "" @@ -31,3 +33,6 @@ class Settings: pytorch_device = 'cpu' # cpu|cuda tensorflow_device = 'cpu' # cpu|cuda force_cpu = False + + # UltraSinger Evaluation Configuration + test_songs_input_folder = None diff --git a/src/UltraSinger.py b/src/UltraSinger.py index 628b446..ded0f85 100644 --- a/src/UltraSinger.py +++ 
b/src/UltraSinger.py @@ -330,15 +330,13 @@ def merge_syllable_segments( def run() -> None: """The processing function of this program""" - is_audio = ".txt" not in settings.input_file_path + settings.input_file_is_ultrastar_txt = settings.input_file_path.endswith(".txt") + ultrastar_class = None real_bpm = None (title, artist, year, genre) = (None, None, None, None) - if not is_audio: # Parse Ultrastar txt - print( - f"{ULTRASINGER_HEAD} {gold_highlighted('re-pitch mode')}" - ) + if settings.input_file_is_ultrastar_txt: # Parse Ultrastar txt ( basename_without_ext, real_bpm, @@ -346,6 +344,13 @@ def run() -> None: ultrastar_audio_input_path, ultrastar_class, ) = parse_ultrastar_txt() + + if not ultrastar_class.mp3: + print( + f"{ULTRASINGER_HEAD} {red_highlighted('Error!')} The provided text file does not have a reference to " + f"an audio file." + ) + exit(1) elif settings.input_file_path.startswith("https:"): # Youtube print( f"{ULTRASINGER_HEAD} {gold_highlighted('full automatic mode')}" @@ -384,7 +389,7 @@ def run() -> None: # Audio transcription transcribed_data = None language = settings.language - if is_audio: + if not settings.ignore_audio: detected_language, transcribed_data = transcribe_audio() if language is None: language = detected_language @@ -409,7 +414,6 @@ def run() -> None: if settings.create_audio_chunks: create_audio_chunks( cache_path, - is_audio, transcribed_data, ultrastar_audio_input_path, ultrastar_class, @@ -417,7 +421,7 @@ def run() -> None: # Pitch the audio midi_notes, pitched_data, ultrastar_note_numbers = pitch_audio( - is_audio, transcribed_data, ultrastar_class + transcribed_data, ultrastar_class ) transcribed_data, midi_notes, ultrastar_note_numbers = merge_syllable_segments(transcribed_data, midi_notes, ultrastar_note_numbers) @@ -427,7 +431,7 @@ def run() -> None: plot(pitched_data, song_output, transcribed_data, midi_notes) # Write Ultrastar txt - if is_audio: + if not settings.ignore_audio: real_bpm, ultrastar_file_output = create_ultrastar_txt_from_automation( audio_separation_path, basename_without_ext, @@ -448,7 +452,7 @@ def run() -> None: # Calc Points ultrastar_class, simple_score, accurate_score = calculate_score_points( - is_audio, pitched_data, ultrastar_class, ultrastar_file_output + pitched_data, ultrastar_class, ultrastar_file_output ) # Add calculated score to Ultrastar txt @@ -506,7 +510,7 @@ def separate_vocal_from_audio( ) -> str: """Separate vocal from audio""" audio_separation_path = os.path.join( - cache_path, "separated", "htdemucs", basename_without_ext + cache_path, "separated", "htdemucs", os.path.splitext(os.path.basename(ultrastar_audio_input_path))[0] ) if settings.use_separated_vocal or settings.create_karaoke: @@ -522,10 +526,10 @@ def separate_vocal_from_audio( def calculate_score_points( - is_audio: bool, pitched_data: PitchedData, ultrastar_class: UltrastarTxtValue, ultrastar_file_output: str + pitched_data: PitchedData, ultrastar_class: UltrastarTxtValue, ultrastar_file_output: str ): """Calculate score points""" - if is_audio: + if not settings.ignore_audio: ultrastar_class = ultrastar_parser.parse_ultrastar_txt( ultrastar_file_output ) @@ -735,12 +739,13 @@ def parse_ultrastar_txt() -> tuple[str, float, str, str, UltrastarTxtValue]: float(ultrastar_class.bpm.replace(",", ".")) ) ultrastar_mp3_name = ultrastar_class.mp3 - basename_without_ext = os.path.splitext(ultrastar_mp3_name)[0] + + basename_without_ext = f"{ultrastar_class.artist} - {ultrastar_class.title}" dirname = os.path.dirname(settings.input_file_path) 
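+    # The song output folder created below is now named after the artist and
+    # title tags instead of the mp3 file name.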
ultrastar_audio_input_path = os.path.join(dirname, ultrastar_mp3_name) song_output = os.path.join( settings.output_file_path, - ultrastar_class.artist + " - " + ultrastar_class.title, + basename_without_ext, ) song_output = get_unused_song_output_dir(song_output) os_helper.create_folder(song_output) @@ -771,7 +776,7 @@ def create_midi_file(real_bpm: float, ) -def pitch_audio(is_audio: bool, transcribed_data: list[TranscribedData], ultrastar_class: UltrastarTxtValue) -> tuple[ +def pitch_audio(transcribed_data: list[TranscribedData], ultrastar_class: UltrastarTxtValue) -> tuple[ list[str], PitchedData, list[int]]: """Pitch audio""" # todo: chunk pitching as option? @@ -782,7 +787,7 @@ def pitch_audio(is_audio: bool, transcribed_data: list[TranscribedData], ultrast settings.crepe_step_size, settings.tensorflow_device ) - if is_audio: + if not settings.ignore_audio: start_times = [] end_times = [] for i, data in enumerate(transcribed_data): @@ -802,7 +807,6 @@ def pitch_audio(is_audio: bool, transcribed_data: list[TranscribedData], ultrast def create_audio_chunks( cache_path: str, - is_audio: bool, transcribed_data: list[TranscribedData], ultrastar_audio_input_path: str, ultrastar_class: UltrastarTxtValue @@ -812,7 +816,7 @@ def create_audio_chunks( cache_path, settings.audio_chunk_folder_name ) os_helper.create_folder(audio_chunks_path) - if is_audio: # and csv + if not settings.ignore_audio: # and csv csv_filename = os.path.join(audio_chunks_path, "_chunks.csv") export_chunks_from_transcribed_data( settings.mono_audio_path, transcribed_data, audio_chunks_path @@ -883,6 +887,8 @@ def init_settings(argv: list[str]) -> None: settings.create_karaoke = not arg elif opt in ("--create_audio_chunks"): settings.create_audio_chunks = arg + elif opt in ("--ignore_audio"): + settings.ignore_audio = arg in ["True", "true"] elif opt in ("--force_cpu"): settings.force_cpu = arg if settings.force_cpu: @@ -917,6 +923,7 @@ def arg_options(): "disable_separation=", "disable_karaoke=", "create_audio_chunks=", + "ignore_audio=", "force_cpu=", ] return long, short diff --git a/src/modules/Audio/separation.py b/src/modules/Audio/separation.py index 42885c4..bc2dd37 100644 --- a/src/modules/Audio/separation.py +++ b/src/modules/Audio/separation.py @@ -1,7 +1,10 @@ """Separate vocals from audio""" - +import os +import shlex import subprocess +import demucs.separate + from modules.console_colors import ( ULTRASINGER_HEAD, blue_highlighted, @@ -10,17 +13,17 @@ from modules.os_helper import current_executor_path, move, path_join -def separate_audio(input_file_path: str, output_file: str, device="cpu") -> None: +def separate_audio(input_file_path: str, output_folder: str, device="cpu") -> None: """Separate vocals from audio with demucs.""" print( f"{ULTRASINGER_HEAD} Separating vocals from audio with {blue_highlighted('demucs')} and {red_highlighted(device)} as worker." ) + + demucs.separate.main(shlex.split(f'--two-stems vocals -d {device} --out "{os.path.join(output_folder, "separated")}" "{input_file_path}"')) # Model selection? 
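+    # demucs.separate.main is the programmatic entry point documented by
+    # demucs v4; it accepts the same argument list as the CLI, so another
+    # model could be selected by appending e.g.: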
# -n mdx_q # -n htdemucs_ft - subprocess.run( - ["demucs", "-d", device, "--two-stems=vocals", input_file_path] - ) - separated_folder = path_join(current_executor_path(), "separated") - move(separated_folder, output_file) \ No newline at end of file + # subprocess.run( + # ["demucs", "-d", device, "--two-stems=vocals", input_file_path.replace("\\", "/")] + # ) \ No newline at end of file diff --git a/src/modules/Research/TestSong.py b/src/modules/Research/TestSong.py new file mode 100644 index 0000000..3aa262d --- /dev/null +++ b/src/modules/Research/TestSong.py @@ -0,0 +1,12 @@ +from dataclasses import dataclass + +from modules.Ultrastar.ultrastar_txt import UltrastarTxtValue + + +@dataclass +class TestSong: + """Test song""" + + txt: str + audio: float + ultrastar_class: UltrastarTxtValue \ No newline at end of file diff --git a/src/modules/Research/UltraSingerEvaluation.py b/src/modules/Research/UltraSingerEvaluation.py new file mode 100644 index 0000000..17cfcf6 --- /dev/null +++ b/src/modules/Research/UltraSingerEvaluation.py @@ -0,0 +1,109 @@ +import copy +import os +import sys +from datetime import datetime +from pathlib import Path +from typing import List + +import UltraSinger +from Settings import Settings +from modules.DeviceDetection.device_detection import check_gpu_support +from modules.Research.TestSong import TestSong +from modules.Ultrastar import ultrastar_parser +from modules.console_colors import ULTRASINGER_HEAD, red_highlighted + +test_input_folder = os.path.normpath( + os.path.abspath(__file__ + "../../../../../test_input") +) +test_output_folder = os.path.normpath( + os.path.abspath(__file__ + "../../../../../test_output") +) +test_run_folder = os.path.join( + test_output_folder, datetime.now().strftime("%Y-%m-%d_%H-%M-%S") +) + + +def main() -> None: + """Main function""" + test_input_folder_path = Path(test_input_folder) + test_input_folder_path.mkdir(parents=True, exist_ok=True) + + test_output_folder_path = Path(test_output_folder) + test_output_folder_path.mkdir(parents=True, exist_ok=True) + + test_run_folder_path = Path(test_run_folder) + test_run_folder_path.mkdir(parents=True) + + base_settings = initialize_settings() + base_settings.output_file_path = test_run_folder + + base_settings.test_songs_input_folder = os.path.normpath( + base_settings.test_songs_input_folder + ) + if not os.path.isdir(base_settings.test_songs_input_folder): + print( + f"{ULTRASINGER_HEAD} {red_highlighted('Error!')} No test songs input folder configured (refer to " + f"evaluation section in readme)." + ) + exit(1) + + test_songs: List[TestSong] = [] + for dir_entry in os.listdir(base_settings.test_songs_input_folder): + dir_entry_path = os.path.join(base_settings.test_songs_input_folder, dir_entry) + if os.path.isdir(dir_entry_path): + for sub_dir_entry in os.listdir(dir_entry_path): + if sub_dir_entry.endswith(".txt") and sub_dir_entry != "license.txt": + txt_file = os.path.join( + base_settings.test_songs_input_folder, dir_entry, sub_dir_entry + ) + ultrastar_class = ultrastar_parser.parse_ultrastar_txt(txt_file) + + if ultrastar_class.mp3: + test_song = TestSong( + txt_file, ultrastar_class.mp3, ultrastar_class + ) + test_songs.append(test_song) + break + else: + print( + f"{ULTRASINGER_HEAD} {red_highlighted('Warning.')} {base_settings.test_songs_input_folder} contains an UltraStar text file but has no audio referenced in it. Skipping." 
+ ) + + if len(test_songs) == 0: + print( + f"{ULTRASINGER_HEAD} {red_highlighted('Error!')} No test songs found in {base_settings.test_songs_input_folder}." + ) + exit(1) + + print(f"{ULTRASINGER_HEAD} Running evaluation for {len(test_songs)} songs") + + for index, test_song in enumerate(test_songs): + print(f"{ULTRASINGER_HEAD} ========================") + print( + f"{ULTRASINGER_HEAD} {index+1}/{len(test_songs)}: {os.path.basename(test_song.txt)}" + ) + + test_song_settings = copy.deepcopy(base_settings) + test_song_settings.input_file_path = test_song.txt + UltraSinger.settings = test_song_settings + UltraSinger.run() + + +def initialize_settings(): + s = Settings() + user_config_file = os.path.normpath( + os.path.join(test_input_folder, "config/local.py") + ) + if os.path.isfile(user_config_file): + sys.path.append(os.path.join(user_config_file, "..")) + import local + + s = local.user_settings + + if not s.force_cpu: + s.tensorflow_device, s.pytorch_device = check_gpu_support() + return s + + +if __name__ == "__main__": + main() diff --git a/src/modules/Ultrastar/ultrastar_converter.py b/src/modules/Ultrastar/ultrastar_converter.py index d9978eb..97f79ee 100644 --- a/src/modules/Ultrastar/ultrastar_converter.py +++ b/src/modules/Ultrastar/ultrastar_converter.py @@ -51,7 +51,7 @@ def ultrastar_note_to_midi_note(ultrastar_note: int) -> int: def get_start_time_from_ultrastar(ultrastar_class: UltrastarTxtValue, pos: int) -> float: """Calculates the start time from the Ultrastar txt""" - gap = int(ultrastar_class.gap) / 1000 + gap = int(float(ultrastar_class.gap) / 1000) real_bpm = ultrastar_bpm_to_real_bpm( float(ultrastar_class.bpm.replace(",", ".")) ) @@ -64,7 +64,7 @@ def get_start_time_from_ultrastar(ultrastar_class: UltrastarTxtValue, pos: int) def get_end_time_from_ultrastar(ultrastar_class: UltrastarTxtValue, pos: int) -> float: """Calculates the end time from the Ultrastar txt""" - gap = int(ultrastar_class.gap) / 1000 + gap = int(float(ultrastar_class.gap) / 1000) real_bpm = ultrastar_bpm_to_real_bpm( float(ultrastar_class.bpm.replace(",", ".")) ) diff --git a/src/modules/Ultrastar/ultrastar_parser.py b/src/modules/Ultrastar/ultrastar_parser.py index 9bba162..f1aaca2 100644 --- a/src/modules/Ultrastar/ultrastar_parser.py +++ b/src/modules/Ultrastar/ultrastar_parser.py @@ -5,7 +5,13 @@ get_end_time_from_ultrastar, get_start_time_from_ultrastar, ) -from modules.Ultrastar.ultrastar_txt import UltrastarTxtValue, UltrastarTxtTag, UltrastarTxtNoteTypeTag, FILE_ENCODING +from modules.Ultrastar.ultrastar_txt import ( + UltrastarTxtValue, + UltrastarTxtTag, + UltrastarTxtNoteTypeTag, + FILE_ENCODING, +) + def parse_ultrastar_txt(input_file: str) -> UltrastarTxtValue: """Parse ultrastar txt file to UltrastarTxt class""" @@ -31,12 +37,23 @@ def parse_ultrastar_txt(input_file: str) -> UltrastarTxtValue: ultrastar_class.gap = line.split(":")[1].replace("\n", "") elif line.startswith(f"#{UltrastarTxtTag.BPM}"): ultrastar_class.bpm = line.split(":")[1].replace("\n", "") - elif line.startswith(( + elif line.startswith(f"#{UltrastarTxtTag.VIDEO}"): + ultrastar_class.video = line.split(":")[1].replace("\n", "") + elif line.startswith(f"#{UltrastarTxtTag.VIDEOGAP}"): + ultrastar_class.videoGap = line.split(":")[1].replace("\n", "") + elif line.startswith(f"#{UltrastarTxtTag.COVER}"): + ultrastar_class.cover = line.split(":")[1].replace("\n", "") + elif line.startswith(f"#{UltrastarTxtTag.BACKGROUND}"): + ultrastar_class.background = line.split(":")[1].replace("\n", "") + elif line.startswith( + ( 
f"{UltrastarTxtNoteTypeTag.FREESTYLE} ", f"{UltrastarTxtNoteTypeTag.NORMAL} ", f"{UltrastarTxtNoteTypeTag.GOLDEN} ", f"{UltrastarTxtNoteTypeTag.RAP} ", - f"{UltrastarTxtNoteTypeTag.RAP_GOLDEN} ")): + f"{UltrastarTxtNoteTypeTag.RAP_GOLDEN} ", + ) + ): parts = line.split() # [0] F : * R G # [1] start beat diff --git a/src/modules/Ultrastar/ultrastar_txt.py b/src/modules/Ultrastar/ultrastar_txt.py index d22f91d..3e21273 100644 --- a/src/modules/Ultrastar/ultrastar_txt.py +++ b/src/modules/Ultrastar/ultrastar_txt.py @@ -14,9 +14,11 @@ class UltrastarTxtTag(str, Enum): BPM = 'BPM' LANGUAGE = 'LANGUAGE' COVER = 'COVER' # Path to cover. Should end with `*[CO].jpg` + BACKGROUND = 'BACKGROUND' # Path to background. Is shown when there is no video. Should end with `*[BG].jpg` CREATOR = 'CREATOR' COMMENT = 'COMMENT' VIDEO = 'VIDEO' + VIDEOGAP = 'VIDEOGAP' FILE_END = 'E' LINEBREAK = '-' @@ -24,8 +26,6 @@ class UltrastarTxtTag(str, Enum): FIXER = 'FIXER' # Unused - BACKGROUND = 'BACKGROUND' # Path to background. Is shown when there is no video. Should end with `*[BG].jpg` - VIDEOGAP = 'VIDEOGAP' GENRE = 'GENRE' EDITION = 'EDITION' YEAR = 'YEAR' @@ -63,10 +63,12 @@ class UltrastarTxtValue: genre = "" mp3 = "" video = None + videoGap = None gap = "" bpm = "" language = None cover = None + background = None creator = "UltraSinger [GitHub]" comment = "UltraSinger [GitHub]" startBeat = [] diff --git a/src/modules/Ultrastar/ultrastar_writer.py b/src/modules/Ultrastar/ultrastar_writer.py index ecd616b..9dee2e3 100644 --- a/src/modules/Ultrastar/ultrastar_writer.py +++ b/src/modules/Ultrastar/ultrastar_writer.py @@ -65,10 +65,15 @@ def create_ultrastar_txt_from_automation( file.write(f"#{UltrastarTxtTag.GENRE}:{ultrastar_class.genre}\n") if ultrastar_class.cover is not None: file.write(f"#{UltrastarTxtTag.COVER}:{ultrastar_class.cover}\n") + if ultrastar_class.background is not None: + file.write(f"#{UltrastarTxtTag.BACKGROUND}:{ultrastar_class.background}\n") file.write(f"#{UltrastarTxtTag.MP3}:{ultrastar_class.mp3}\n") - file.write(f"#{UltrastarTxtTag.VIDEO}:{ultrastar_class.video}\n") - file.write(f"#{UltrastarTxtTag.BPM}:{round(ultrastar_bpm, 2)}\n") # not the real BPM! file.write(f"#{UltrastarTxtTag.GAP}:{int(gap * 1000)}\n") + if ultrastar_class.video is not None: + file.write(f"#{UltrastarTxtTag.VIDEO}:{ultrastar_class.video}\n") + if ultrastar_class.videoGap is not None: + file.write(f"#{UltrastarTxtTag.VIDEOGAP}:{ultrastar_class.videoGap}\n") + file.write(f"#{UltrastarTxtTag.BPM}:{round(ultrastar_bpm, 2)}\n") # not the real BPM! 
file.write(f"#{UltrastarTxtTag.CREATOR}:{ultrastar_class.creator}\n") file.write(f"#{UltrastarTxtTag.FIXER}:{ultrastar_class.fixer}\n") file.write(f"#{UltrastarTxtTag.COMMENT}:{ultrastar_class.comment}\n") From 63ad200990fbd1ccb98858431064f232ff31b2d3 Mon Sep 17 00:00:00 2001 From: Benedikt Wagener Date: Fri, 6 Oct 2023 17:36:32 +0200 Subject: [PATCH 04/19] add caching for separation, denoise, transcription and pitching --- .../Speech_Recognition/test_Whisper.py | 15 +- .../UltraStar/test_ultrastar_writer.py | 23 +- src/Settings.py | 5 + src/UltraSinger.py | 358 ++++++++++-------- src/modules/Pitcher/PitchingResult.py | 15 + src/modules/Pitcher/pitched_data.py | 3 + src/modules/Pitcher/pitcher.py | 3 + src/modules/Research/TestSong.py | 1 + src/modules/Research/UltraSingerEvaluation.py | 31 +- .../Speech_Recognition/TranscribedData.py | 36 +- .../Speech_Recognition/TranscriptionResult.py | 14 + src/modules/Speech_Recognition/Whisper.py | 10 +- src/modules/Ultrastar/ultrastar_writer.py | 2 +- src/modules/console_colors.py | 6 + src/modules/csv_handler.py | 2 +- 15 files changed, 314 insertions(+), 210 deletions(-) create mode 100644 src/modules/Pitcher/PitchingResult.py create mode 100644 src/modules/Speech_Recognition/TranscriptionResult.py diff --git a/pytest/modules/Speech_Recognition/test_Whisper.py b/pytest/modules/Speech_Recognition/test_Whisper.py index 8f78701..cbb0d07 100644 --- a/pytest/modules/Speech_Recognition/test_Whisper.py +++ b/pytest/modules/Speech_Recognition/test_Whisper.py @@ -30,14 +30,13 @@ def test_convert_to_transcribed_data(self): # Words should have space at the end expected_output = [ - TranscribedData( - {"word": "UltraSinger ", "start": 1.23, "end": 2.34, "is_hyphen": None, "confidence": 0.95}), - TranscribedData({"word": "is ", "start": 2.34, "end": 3.45, "is_hyphen": None, "confidence": 0.9}), - TranscribedData({"word": "cool! ", "start": 3.45, "end": 4.56, "is_hyphen": None, "confidence": 0.85}), - TranscribedData({"word": "And ", "start": 4.56, "end": 5.67, "is_hyphen": None, "confidence": 0.95}), - TranscribedData({"word": "will ", "start": 5.67, "end": 6.78, "is_hyphen": None, "confidence": 0.9}), - TranscribedData({"word": "be ", "start": 6.78, "end": 7.89, "is_hyphen": None, "confidence": 0.85}), - TranscribedData({"word": "better! ", "start": 7.89, "end": 9.01, "is_hyphen": None, "confidence": 0.8}), + TranscribedData.from_dict({"word": "UltraSinger ", "start": 1.23, "end": 2.34, "is_hyphen": None, "confidence": 0.95}), + TranscribedData.from_dict({"word": "is ", "start": 2.34, "end": 3.45, "is_hyphen": None, "confidence": 0.9}), + TranscribedData.from_dict({"word": "cool! ", "start": 3.45, "end": 4.56, "is_hyphen": None, "confidence": 0.85}), + TranscribedData.from_dict({"word": "And ", "start": 4.56, "end": 5.67, "is_hyphen": None, "confidence": 0.95}), + TranscribedData.from_dict({"word": "will ", "start": 5.67, "end": 6.78, "is_hyphen": None, "confidence": 0.9}), + TranscribedData.from_dict({"word": "be ", "start": 6.78, "end": 7.89, "is_hyphen": None, "confidence": 0.85}), + TranscribedData.from_dict({"word": "better! 
", "start": 7.89, "end": 9.01, "is_hyphen": None, "confidence": 0.8}), ] # Act diff --git a/pytest/modules/UltraStar/test_ultrastar_writer.py b/pytest/modules/UltraStar/test_ultrastar_writer.py index 4e82f4f..e614cb8 100644 --- a/pytest/modules/UltraStar/test_ultrastar_writer.py +++ b/pytest/modules/UltraStar/test_ultrastar_writer.py @@ -42,20 +42,20 @@ def test_create_ultrastar_txt_from_automation_full_values(self): def arrange(self): # Arrange transcribed_data = [ - TranscribedData({ - "conf": 0.95, + TranscribedData.from_dict({ + "confidence": 0.95, "word": "UltraSinger ", "end": 2.5, "start": 0.5 }), - TranscribedData({ - "conf": 0.9, + TranscribedData.from_dict({ + "confidence": 0.9, "word": "is ", "end": 4.5, "start": 3.0 }), - TranscribedData({ - "conf": 0.85, + TranscribedData.from_dict({ + "confidence": 0.85, "word": "cool! ", "end": 7.5, "start": 5.5 @@ -85,7 +85,14 @@ def default_values(default_ultrastar_class): f"#{UltrastarTxtTag.ARTIST}:{default_ultrastar_class.artist}\n", f"#{UltrastarTxtTag.TITLE}:{default_ultrastar_class.title}\n", f"#{UltrastarTxtTag.MP3}:{default_ultrastar_class.mp3}\n", - f"#{UltrastarTxtTag.VIDEO}:{default_ultrastar_class.video}\n", # todo: video is optional + ] + + if default_ultrastar_class.video is not None: + expected_calls += [ + f"#{UltrastarTxtTag.VIDEO}:{default_ultrastar_class.video}\n", + ] + + expected_calls += [ f"#{UltrastarTxtTag.BPM}:390.0\n", f"#{UltrastarTxtTag.GAP}:500\n", f"#{UltrastarTxtTag.CREATOR}:{default_ultrastar_class.creator}\n", @@ -96,7 +103,7 @@ def default_values(default_ultrastar_class): ": 65 39 2 is \n", "- 104\n", ": 130 52 3 cool! \n", - "E" + "E", ] return expected_calls diff --git a/src/Settings.py b/src/Settings.py index b698681..efcbf9e 100644 --- a/src/Settings.py +++ b/src/Settings.py @@ -36,3 +36,8 @@ class Settings: # UltraSinger Evaluation Configuration test_songs_input_folder = None + cache_override_path = None + skip_cache_vocal_separation = False + skip_cache_denoise_vocal_audio = False + skip_cache_transcription = False + skip_cache_pitch_detection = False diff --git a/src/UltraSinger.py b/src/UltraSinger.py index ded0f85..06b5d6f 100644 --- a/src/UltraSinger.py +++ b/src/UltraSinger.py @@ -2,6 +2,7 @@ import copy import getopt +import json import os import sys @@ -17,6 +18,8 @@ export_chunks_from_ultrastar_data, ) from modules.Audio.silence_processing import remove_silence_from_transcription_data +from modules.Pitcher.PitchingResult import PitchingResult +from modules.Speech_Recognition.TranscriptionResult import TranscriptionResult from modules.csv_handler import export_transcribed_data_to_csv from modules.Audio.convert_audio import convert_audio_to_mono_wav, convert_wav_to_mp3 from modules.Audio.youtube import ( @@ -32,6 +35,7 @@ gold_highlighted, light_blue_highlighted, red_highlighted, + green_highlighted, ) from modules.Midi import midi_creator from modules.Midi.midi_creator import ( @@ -44,12 +48,22 @@ get_pitch_with_crepe_file, ) from modules.Pitcher.pitched_data import PitchedData -from modules.Speech_Recognition.hyphenation import hyphenation, language_check, create_hyphenator +from modules.Speech_Recognition.hyphenation import ( + hyphenation, + language_check, + create_hyphenator, +) from modules.Speech_Recognition.Whisper import transcribe_with_whisper -from modules.Ultrastar import ultrastar_score_calculator, ultrastar_writer, ultrastar_converter, ultrastar_parser -from modules.Ultrastar.ultrastar_txt import UltrastarTxtValue +from modules.Ultrastar import ( + ultrastar_score_calculator, + 
ultrastar_writer, + ultrastar_converter, + ultrastar_parser, +) +from modules.Ultrastar.ultrastar_txt import UltrastarTxtValue, FILE_ENCODING from Settings import Settings from modules.Speech_Recognition.TranscribedData import TranscribedData +from modules.os_helper import check_file_exists from modules.plot import plot from modules.musicbrainz_client import get_music_infos @@ -57,6 +71,7 @@ SYLLABLE_SEGMENT_SIZE = 0.1 SYLLABLE_SEGMENT_MAX_GAP_FOR_MERGE = 0.1 + def convert_midi_notes_to_ultrastar_notes(midi_notes: list[str]) -> list[int]: """Convert midi notes to ultrastar notes""" print(f"{ULTRASINGER_HEAD} Creating Ultrastar notes from midi data") @@ -65,9 +80,7 @@ def convert_midi_notes_to_ultrastar_notes(midi_notes: list[str]) -> list[int]: for i in enumerate(midi_notes): pos = i[0] note_number_librosa = librosa.note_to_midi(midi_notes[pos]) - pitch = ultrastar_converter.midi_note_to_ultrastar_note( - note_number_librosa - ) + pitch = ultrastar_converter.midi_note_to_ultrastar_note(note_number_librosa) ultrastar_note_numbers.append(pitch) # todo: Progress? # print( @@ -78,9 +91,7 @@ def convert_midi_notes_to_ultrastar_notes(midi_notes: list[str]) -> list[int]: def pitch_each_chunk_with_crepe(directory: str) -> list[str]: """Pitch each chunk with crepe and return midi notes""" - print( - f"{ULTRASINGER_HEAD} Pitching each chunk with {blue_highlighted('crepe')}" - ) + print(f"{ULTRASINGER_HEAD} Pitching each chunk with {blue_highlighted('crepe')}") midi_notes = [] for filename in sorted( @@ -109,7 +120,9 @@ def pitch_each_chunk_with_crepe(directory: str) -> list[str]: return midi_notes -def add_hyphen_to_data(transcribed_data: list[TranscribedData], hyphen_words: list[list[str]]): +def add_hyphen_to_data( + transcribed_data: list[TranscribedData], hyphen_words: list[list[str]] +): """Add hyphen to transcribed data return new data list""" new_data = [] @@ -141,9 +154,7 @@ def get_bpm_from_data(data, sampling_rate): onset_env = librosa.onset.onset_strength(y=data, sr=sampling_rate) wav_tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sampling_rate) - print( - f"{ULTRASINGER_HEAD} BPM is {blue_highlighted(str(round(wav_tempo[0], 2)))}" - ) + print(f"{ULTRASINGER_HEAD} BPM is {blue_highlighted(str(round(wav_tempo[0], 2)))}") return wav_tempo[0] @@ -223,12 +234,12 @@ def remove_unecessary_punctuations(transcribed_data: list[TranscribedData]) -> N """Remove unecessary punctuations from transcribed data""" punctuation = ".," for i, data in enumerate(transcribed_data): - data.word = data.word.translate( - {ord(i): None for i in punctuation} - ) + data.word = data.word.translate({ord(i): None for i in punctuation}) -def hyphenate_each_word(language: str, transcribed_data: list[TranscribedData]) -> list[list[str]] | None: +def hyphenate_each_word( + language: str, transcribed_data: list[TranscribedData] +) -> list[list[str]] | None: """Hyphenate each word in the transcribed data.""" hyphenated_word = [] lang_region = language_check(language) @@ -241,9 +252,7 @@ def hyphenate_each_word(language: str, transcribed_data: list[TranscribedData]) hyphenator = create_hyphenator(lang_region) for i in tqdm(enumerate(transcribed_data)): pos = i[0] - hyphenated_word.append( - hyphenation(transcribed_data[pos].word, hyphenator) - ) + hyphenated_word.append(hyphenation(transcribed_data[pos].word, hyphenator)) return hyphenated_word @@ -261,18 +270,19 @@ def print_support() -> None: ) -def split_syllables_into_segments(transcribed_data: list[TranscribedData]) -> list[TranscribedData]: +def 
split_syllables_into_segments(
+    transcribed_data: list[TranscribedData],
+) -> list[TranscribedData]:
     """Split every syllable into sub-segments"""
     segment_size_decimal_points = len(str(SYLLABLE_SEGMENT_SIZE).split(".")[1])
     new_data = []

     for i, data in enumerate(transcribed_data):
-
         duration = data.end - data.start
         if duration <= SYLLABLE_SEGMENT_SIZE:
             new_data.append(data)
             continue
-
+
         has_space = str(data.word).endswith(" ")
         first_segment = copy.deepcopy(data)
         filler_words_start = data.start + SYLLABLE_SEGMENT_SIZE
@@ -289,26 +299,28 @@ def split_syllables_into_segments(transcribed_data: list[TranscribedData]) -> li
         for i in range(int(full_segments)):
             segment = TranscribedData()
             segment.word = "~"
-            segment.start = filler_words_start + round(i * SYLLABLE_SEGMENT_SIZE, segment_size_decimal_points)
+            segment.start = filler_words_start + round(
+                i * SYLLABLE_SEGMENT_SIZE, segment_size_decimal_points
+            )
             segment.end = segment.start + SYLLABLE_SEGMENT_SIZE
             new_data.append(segment)
-
+
         if partial_segment >= 0.01:
             segment = TranscribedData()
             segment.word = "~"
-            segment.start = filler_words_start + round(full_segments * SYLLABLE_SEGMENT_SIZE, segment_size_decimal_points)
+            segment.start = filler_words_start + round(
+                full_segments * SYLLABLE_SEGMENT_SIZE, segment_size_decimal_points
+            )
             segment.end = segment.start + partial_segment
             new_data.append(segment)
-
+
         if has_space:
             new_data[-1].word += " "
     return new_data


 def merge_syllable_segments(
-    transcribed_data: list[TranscribedData],
-    midi_notes: list[str],
-    us_notes = list[int]
+    transcribed_data: list[TranscribedData], midi_notes: list[str], us_notes: list[int]
 ) -> tuple[list[TranscribedData], list[str], list[int]]:
     """Merge sub-segments of a syllable where the pitch is the same"""
     new_data = []
@@ -318,7 +330,12 @@ def merge_syllable_segments(
     previous_data = None

     for i, data in enumerate(transcribed_data):
-        if str(data.word).startswith("~") and previous_data is not None and midi_notes[i] == midi_notes[i-1] and data.start - previous_data.end <= SYLLABLE_SEGMENT_MAX_GAP_FOR_MERGE:
+        if (
+            str(data.word).startswith("~")
+            and previous_data is not None
+            and midi_notes[i] == midi_notes[i - 1]
+            and data.start - previous_data.end <= SYLLABLE_SEGMENT_MAX_GAP_FOR_MERGE
+        ):
             new_data[-1].end = data.end
         else:
             new_data.append(data)
@@ -352,36 +369,31 @@ def run() -> None:
         )
         exit(1)
     elif settings.input_file_path.startswith("https:"):  # Youtube
-        print(
-            f"{ULTRASINGER_HEAD} {gold_highlighted('full automatic mode')}"
-        )
+        print(f"{ULTRASINGER_HEAD} {gold_highlighted('full automatic mode')}")
         (
             basename_without_ext,
             song_output,
             ultrastar_audio_input_path,
-            (title, artist, year, genre)
+            (title, artist, year, genre),
         ) = download_from_youtube()
     else:  # Audio File
-        print(
-            f"{ULTRASINGER_HEAD} {gold_highlighted('full automatic mode')}"
-        )
+        print(f"{ULTRASINGER_HEAD} {gold_highlighted('full automatic mode')}")
         (
             basename_without_ext,
             song_output,
             ultrastar_audio_input_path,
-            (title, artist, year, genre)
+            (title, artist, year, genre),
         ) = infos_from_audio_input_file()

-    cache_path = os.path.join(song_output, "cache")
-    settings.mono_audio_path = os.path.join(
-        cache_path, basename_without_ext + ".wav"
+    cache_path = (
+        os.path.join(song_output, "cache")
+        if settings.cache_override_path is None
+        else settings.cache_override_path
     )
-    os_helper.create_folder(cache_path)
+    settings.mono_audio_path = os.path.join(cache_path, basename_without_ext + ".wav")

     # Separate vocal from audio
-    audio_separation_path = separate_vocal_from_audio(
-
basename_without_ext, cache_path, ultrastar_audio_input_path
-    )
+    audio_separation_path = separate_vocal_from_audio(basename_without_ext, cache_path, ultrastar_audio_input_path)

     # Denoise vocal audio
     denoise_vocal_audio(basename_without_ext, cache_path)
@@ -390,13 +402,13 @@ def run() -> None:
     transcribed_data = None
     language = settings.language
     if not settings.ignore_audio:
-        detected_language, transcribed_data = transcribe_audio()
+        transcription_result = transcribe_audio(cache_path)

         if language is None:
-            language = detected_language
+            language = transcription_result.detected_language

-        remove_unecessary_punctuations(transcribed_data)
+        remove_unecessary_punctuations(transcription_result.transcribed_data)
         transcribed_data = remove_silence_from_transcription_data(
-            settings.mono_audio_path, transcribed_data
+            settings.mono_audio_path, transcription_result.transcribed_data
         )

         if settings.hyphenation:
@@ -421,10 +433,12 @@ def run() -> None:

     # Pitch the audio
     midi_notes, pitched_data, ultrastar_note_numbers = pitch_audio(
-        transcribed_data, ultrastar_class
+        transcribed_data, ultrastar_class, cache_path
     )

-    transcribed_data, midi_notes, ultrastar_note_numbers = merge_syllable_segments(transcribed_data, midi_notes, ultrastar_note_numbers)
+    transcribed_data, midi_notes, ultrastar_note_numbers = merge_syllable_segments(
+        transcribed_data, midi_notes, ultrastar_note_numbers
+    )

     # Create plot
     if settings.create_plot:
         plot(pitched_data, song_output, transcribed_data, midi_notes)
@@ -443,7 +457,7 @@ def run() -> None:
             title,
             artist,
             year,
-            genre
+            genre,
         )
     else:
         ultrastar_file_output = create_ultrastar_txt_from_ultrastar_data(
@@ -456,9 +470,7 @@ def run() -> None:
     )

     # Add calculated score to Ultrastar txt
-    ultrastar_writer.add_score_to_ultrastar_txt(
-        ultrastar_file_output, simple_score
-    )
+    ultrastar_writer.add_score_to_ultrastar_txt(ultrastar_file_output, simple_score)

     # Midi
     if settings.create_midi:
@@ -488,36 +500,60 @@ def get_unused_song_output_dir(path: str) -> str:
     return path


-def transcribe_audio() -> (str, list[TranscribedData]):
+def transcribe_audio(cache_path: str) -> TranscriptionResult:
     """Transcribe audio with AI"""
+    transcription_result = None
     if settings.transcriber == "whisper":
-        transcribed_data, detected_language = transcribe_with_whisper(
-            settings.mono_audio_path,
-            settings.whisper_model,
-            settings.pytorch_device,
-            settings.whisper_align_model,
-            settings.whisper_batch_size,
-            settings.whisper_compute_type,
-            settings.language,
-        )
+        transcription_config = f"{settings.transcriber}_{settings.whisper_model}_{settings.pytorch_device}_{settings.whisper_align_model}_{settings.whisper_batch_size}_{settings.whisper_compute_type}_{settings.language}"
+        transcription_path = os.path.join(cache_path, f"{transcription_config}.json")
+        cached_transcription_available = check_file_exists(transcription_path)
+        if settings.skip_cache_transcription or not cached_transcription_available:
+            transcription_result = transcribe_with_whisper(
+                settings.mono_audio_path,
+                settings.whisper_model,
+                settings.pytorch_device,
+                settings.whisper_align_model,
+                settings.whisper_batch_size,
+                settings.whisper_compute_type,
+                settings.language,
+            )
+            with open(transcription_path, "w", encoding=FILE_ENCODING) as file:
+                file.write(transcription_result.to_json())
+        else:
+            print(f"{ULTRASINGER_HEAD} {green_highlighted('cache')} reusing cached transcribed data")
+            with open(transcription_path, encoding=FILE_ENCODING) as file:
+                json = file.read()
+                transcription_result = TranscriptionResult.from_json(json)
     else:
         raise NotImplementedError
-    return
detected_language, transcribed_data + return transcription_result def separate_vocal_from_audio( - basename_without_ext: str, cache_path: str, ultrastar_audio_input_path: str + basename_without_ext: str, cache_path: str, ultrastar_audio_input_path: str ) -> str: """Separate vocal from audio""" + demcus_output_folder = os.path.splitext( + os.path.basename(ultrastar_audio_input_path) + )[0] audio_separation_path = os.path.join( - cache_path, "separated", "htdemucs", os.path.splitext(os.path.basename(ultrastar_audio_input_path))[0] + cache_path, "separated", "htdemucs", demcus_output_folder ) + vocals_path = os.path.join(audio_separation_path, "vocals.wav") + instrumental_path = os.path.join(audio_separation_path, "no_vocals.wav") if settings.use_separated_vocal or settings.create_karaoke: - separate_audio(ultrastar_audio_input_path, cache_path, settings.pytorch_device) + cache_available = (check_file_exists(vocals_path) + and check_file_exists(instrumental_path)) + if settings.skip_cache_vocal_separation or not cache_available: + separate_audio( + ultrastar_audio_input_path, cache_path, settings.pytorch_device + ) + else: + print(f"{ULTRASINGER_HEAD} {green_highlighted('cache')} reusing cached separated vocals") if settings.use_separated_vocal: - input_path = os.path.join(audio_separation_path, "vocals.wav") + input_path = vocals_path else: input_path = ultrastar_audio_input_path @@ -526,22 +562,18 @@ def separate_vocal_from_audio( def calculate_score_points( - pitched_data: PitchedData, ultrastar_class: UltrastarTxtValue, ultrastar_file_output: str + pitched_data: PitchedData, + ultrastar_class: UltrastarTxtValue, + ultrastar_file_output: str, ): """Calculate score points""" if not settings.ignore_audio: - ultrastar_class = ultrastar_parser.parse_ultrastar_txt( - ultrastar_file_output - ) + ultrastar_class = ultrastar_parser.parse_ultrastar_txt(ultrastar_file_output) ( simple_score, accurate_score, - ) = ultrastar_score_calculator.calculate_score( - pitched_data, ultrastar_class - ) - ultrastar_score_calculator.print_score_calculation( - simple_score, accurate_score - ) + ) = ultrastar_score_calculator.calculate_score(pitched_data, ultrastar_class) + ultrastar_score_calculator.print_score_calculation(simple_score, accurate_score) else: print( f"{ULTRASINGER_HEAD} {blue_highlighted('Score of original Ultrastar txt')}" @@ -549,32 +581,24 @@ def calculate_score_points( ( simple_score, accurate_score, - ) = ultrastar_score_calculator.calculate_score( - pitched_data, ultrastar_class - ) - ultrastar_score_calculator.print_score_calculation( - simple_score, accurate_score - ) + ) = ultrastar_score_calculator.calculate_score(pitched_data, ultrastar_class) + ultrastar_score_calculator.print_score_calculation(simple_score, accurate_score) print( f"{ULTRASINGER_HEAD} {blue_highlighted('Score of re-pitched Ultrastar txt')}" ) - ultrastar_class = ultrastar_parser.parse_ultrastar_txt( - ultrastar_file_output - ) + ultrastar_class = ultrastar_parser.parse_ultrastar_txt(ultrastar_file_output) ( simple_score, accurate_score, - ) = ultrastar_score_calculator.calculate_score( - pitched_data, ultrastar_class - ) - ultrastar_score_calculator.print_score_calculation( - simple_score, accurate_score - ) + ) = ultrastar_score_calculator.calculate_score(pitched_data, ultrastar_class) + ultrastar_score_calculator.print_score_calculation(simple_score, accurate_score) return ultrastar_class, simple_score, accurate_score def create_ultrastar_txt_from_ultrastar_data( - song_output: str, ultrastar_class: 
UltrastarTxtValue, ultrastar_note_numbers: list[int] + song_output: str, + ultrastar_class: UltrastarTxtValue, + ultrastar_note_numbers: list[int], ) -> str: """Create Ultrastar txt from Ultrastar data""" output_repitched_ultrastar = os.path.join( @@ -599,7 +623,7 @@ def create_ultrastar_txt_from_automation( title: str, artist: str, year: str, - genre: str + genre: str, ): """Create Ultrastar txt from automation""" ultrastar_header = UltrastarTxtValue() @@ -610,9 +634,7 @@ def create_ultrastar_txt_from_automation( ultrastar_header.language = language cover = basename_without_ext + " [CO].jpg" ultrastar_header.cover = ( - cover - if os_helper.check_file_exists(os.path.join(song_output, cover)) - else None + cover if os_helper.check_file_exists(os.path.join(song_output, cover)) else None ) # Additional data @@ -626,9 +648,7 @@ def create_ultrastar_txt_from_automation( ultrastar_header.genre = genre real_bpm = get_bpm_from_file(ultrastar_audio_input_path) - ultrastar_file_output = os.path.join( - song_output, basename_without_ext + ".txt" - ) + ultrastar_file_output = os.path.join(song_output, basename_without_ext + ".txt") ultrastar_writer.create_ultrastar_txt_from_automation( transcribed_data, ultrastar_note_numbers, @@ -683,9 +703,17 @@ def infos_from_audio_input_file() -> tuple[str, str, str, tuple[str, str, str, s song_output = get_unused_song_output_dir(song_output) os_helper.create_folder(song_output) os_helper.copy(settings.input_file_path, song_output) - os_helper.rename(os.path.join(song_output, os.path.basename(settings.input_file_path)), os.path.join(song_output, basename)) + os_helper.rename( + os.path.join(song_output, os.path.basename(settings.input_file_path)), + os.path.join(song_output, basename), + ) ultrastar_audio_input_path = os.path.join(song_output, basename) - return basename_without_ext, song_output, ultrastar_audio_input_path, (title, artist, year_info, genre_info) + return ( + basename_without_ext, + song_output, + ultrastar_audio_input_path, + (title, artist, year_info, genre_info), + ) FILENAME_REPLACEMENTS = (('?:"', ""), ("<", "("), (">", ")"), ("/\\|*", "-")) @@ -706,7 +734,9 @@ def download_from_youtube() -> tuple[str, str, str, tuple[str, str, str, str]]: (artist, title) = get_youtube_title(settings.input_file_path) # Get additional data for song - (title_info, artist_info, year_info, genre_info) = get_music_infos(f"{artist} - {title}") + (title_info, artist_info, year_info, genre_info) = get_music_infos( + f"{artist} - {title}" + ) if title_info is not None: title = title_info @@ -717,24 +747,23 @@ def download_from_youtube() -> tuple[str, str, str, tuple[str, str, str, str]]: song_output = os.path.join(settings.output_file_path, basename_without_ext) song_output = get_unused_song_output_dir(song_output) os_helper.create_folder(song_output) - download_youtube_audio( - settings.input_file_path, basename_without_ext, song_output - ) - download_youtube_video( - settings.input_file_path, basename_without_ext, song_output - ) + download_youtube_audio(settings.input_file_path, basename_without_ext, song_output) + download_youtube_video(settings.input_file_path, basename_without_ext, song_output) download_youtube_thumbnail( settings.input_file_path, basename_without_ext, song_output ) ultrastar_audio_input_path = os.path.join(song_output, basename) - return basename_without_ext, song_output, ultrastar_audio_input_path, (title, artist, year_info, genre_info) + return ( + basename_without_ext, + song_output, + ultrastar_audio_input_path, + (title, artist, year_info, 
genre_info),
+    )


 def parse_ultrastar_txt() -> tuple[str, float, str, str, UltrastarTxtValue]:
     """Parse Ultrastar txt"""
-    ultrastar_class = ultrastar_parser.parse_ultrastar_txt(
-        settings.input_file_path
-    )
+    ultrastar_class = ultrastar_parser.parse_ultrastar_txt(settings.input_file_path)
     real_bpm = ultrastar_converter.ultrastar_bpm_to_real_bpm(
         float(ultrastar_class.bpm.replace(",", "."))
     )
@@ -758,63 +787,78 @@ def parse_ultrastar_txt() -> tuple[str, float, str, str, UltrastarTxtValue]:
     )


-def create_midi_file(real_bpm: float,
-                     song_output: str,
-                     ultrastar_class: UltrastarTxtValue,
-                     basename_without_ext: str) -> None:
+def create_midi_file(
+    real_bpm: float,
+    song_output: str,
+    ultrastar_class: UltrastarTxtValue,
+    basename_without_ext: str,
+) -> None:
     """Create midi file"""
-    print(
-        f"{ULTRASINGER_HEAD} Creating Midi with {blue_highlighted('pretty_midi')}"
-    )
+    print(f"{ULTRASINGER_HEAD} Creating Midi with {blue_highlighted('pretty_midi')}")

     voice_instrument = [
         midi_creator.convert_ultrastar_to_midi_instrument(ultrastar_class)
     ]
     midi_output = os.path.join(song_output, f"{basename_without_ext}.mid")
-    midi_creator.instruments_to_midi(
-        voice_instrument, real_bpm, midi_output
-    )
+    midi_creator.instruments_to_midi(voice_instrument, real_bpm, midi_output)


-def pitch_audio(transcribed_data: list[TranscribedData], ultrastar_class: UltrastarTxtValue) -> tuple[
-    list[str], PitchedData, list[int]]:
+def pitch_audio(
+    transcribed_data: list[TranscribedData], ultrastar_class: UltrastarTxtValue, cache_path: str) -> tuple[
+    list[str], PitchedData, list[int]]:
     """Pitch audio"""
     # todo: chunk pitching as option?
     # midi_notes = pitch_each_chunk_with_crepe(chunk_folder_name)
-    pitched_data = get_pitch_with_crepe_file(
-        settings.mono_audio_path,
-        settings.crepe_model_capacity,
-        settings.crepe_step_size,
-        settings.tensorflow_device
-    )
-    if not settings.ignore_audio:
-        start_times = []
-        end_times = []
-        for i, data in enumerate(transcribed_data):
-            start_times.append(data.start)
-            end_times.append(data.end)
-        midi_notes = create_midi_notes_from_pitched_data(
-            start_times, end_times, pitched_data
+
+    pitching_config = f"crepe_{settings.ignore_audio}_{settings.crepe_model_capacity}_{settings.crepe_step_size}_{settings.tensorflow_device}"
+    pitching_path = os.path.join(cache_path, f"{pitching_config}.json")
+    cache_available = check_file_exists(pitching_path)
+
+    if settings.skip_cache_pitch_detection or not cache_available:
+        pitched_data = get_pitch_with_crepe_file(
+            settings.mono_audio_path,
+            settings.crepe_model_capacity,
+            settings.crepe_step_size,
+            settings.tensorflow_device,
+        )
+        if not settings.ignore_audio:
+            start_times = []
+            end_times = []
+            for i, data in enumerate(transcribed_data):
+                start_times.append(data.start)
+                end_times.append(data.end)
+            midi_notes = create_midi_notes_from_pitched_data(
+                start_times, end_times, pitched_data
+            )
+        else:
+            midi_notes = create_midi_notes_from_pitched_data(
+                ultrastar_class.startTimes, ultrastar_class.endTimes, pitched_data
+            )
+        ultrastar_note_numbers = convert_midi_notes_to_ultrastar_notes(midi_notes)
+
+        pitching_result = PitchingResult(midi_notes, pitched_data, ultrastar_note_numbers)
+
+        pitching_result_json = pitching_result.to_json()
+        with open(pitching_path, "w", encoding=FILE_ENCODING) as file:
+            file.write(pitching_result_json)
     else:
-        midi_notes = create_midi_notes_from_pitched_data(
-            ultrastar_class.startTimes, ultrastar_class.endTimes, pitched_data
-        )
-    ultrastar_note_numbers = convert_midi_notes_to_ultrastar_notes(midi_notes)
-    return midi_notes,
pitched_data, ultrastar_note_numbers + print(f"{ULTRASINGER_HEAD} {green_highlighted('cache')} reusing cached pitch data") + with open(pitching_path) as file: + json = file.read() + pitching_result = PitchingResult.from_json(json) + + return pitching_result.midi_notes, pitching_result.pitched_data, pitching_result.ultrastar_note_numbers def create_audio_chunks( cache_path: str, transcribed_data: list[TranscribedData], ultrastar_audio_input_path: str, - ultrastar_class: UltrastarTxtValue + ultrastar_class: UltrastarTxtValue, ) -> None: """Create audio chunks""" - audio_chunks_path = os.path.join( - cache_path, settings.audio_chunk_folder_name - ) + audio_chunks_path = os.path.join(cache_path, settings.audio_chunk_folder_name) os_helper.create_folder(audio_chunks_path) if not settings.ignore_audio: # and csv csv_filename = os.path.join(audio_chunks_path, "_chunks.csv") @@ -830,10 +874,14 @@ def create_audio_chunks( def denoise_vocal_audio(basename_without_ext: str, cache_path: str) -> None: """Denoise vocal audio""" - denoised_path = os.path.join( - cache_path, basename_without_ext + "_denoised.wav" - ) - ffmpeg_reduce_noise(settings.mono_audio_path, denoised_path) + denoised_path = os.path.join(cache_path, basename_without_ext + "_denoised.wav") + cache_available = check_file_exists(denoised_path) + + if settings.skip_cache_denoise_vocal_audio or not cache_available: + ffmpeg_reduce_noise(settings.mono_audio_path, denoised_path) + else: + print(f"{ULTRASINGER_HEAD} {green_highlighted('cache')} reusing cached denoised audio") + settings.mono_audio_path = denoised_path diff --git a/src/modules/Pitcher/PitchingResult.py b/src/modules/Pitcher/PitchingResult.py new file mode 100644 index 0000000..47a74c3 --- /dev/null +++ b/src/modules/Pitcher/PitchingResult.py @@ -0,0 +1,15 @@ +from dataclasses import dataclass + +from dataclasses_json import dataclass_json + +from modules.Pitcher.pitched_data import PitchedData + + +@dataclass_json +@dataclass +class PitchingResult: + """Pitching result""" + + midi_notes: list[str] + pitched_data: PitchedData + ultrastar_note_numbers: list[int] diff --git a/src/modules/Pitcher/pitched_data.py b/src/modules/Pitcher/pitched_data.py index f2d32df..3edb088 100644 --- a/src/modules/Pitcher/pitched_data.py +++ b/src/modules/Pitcher/pitched_data.py @@ -1,7 +1,10 @@ """Pitched data""" from dataclasses import dataclass +from dataclasses_json import dataclass_json + +@dataclass_json @dataclass class PitchedData: """Pitched data from crepe""" diff --git a/src/modules/Pitcher/pitcher.py b/src/modules/Pitcher/pitcher.py index c3fc81b..80506c7 100644 --- a/src/modules/Pitcher/pitcher.py +++ b/src/modules/Pitcher/pitcher.py @@ -41,6 +41,9 @@ def get_pitch_with_crepe(audio, sample_rate: int, model_capacity: str, step_size confidence, perceived_loudness = set_confidence_to_zero_in_silent_regions(confidence, audio, step_size=step_size) timer.log('Computing loudness end') + # convert to native float for serialization + confidence = [float(x) for x in confidence] + return PitchedData(times, frequencies, confidence, perceived_loudness) diff --git a/src/modules/Research/TestSong.py b/src/modules/Research/TestSong.py index 3aa262d..fc0d72e 100644 --- a/src/modules/Research/TestSong.py +++ b/src/modules/Research/TestSong.py @@ -8,5 +8,6 @@ class TestSong: """Test song""" txt: str + folder: str audio: float ultrastar_class: UltrastarTxtValue \ No newline at end of file diff --git a/src/modules/Research/UltraSingerEvaluation.py b/src/modules/Research/UltraSingerEvaluation.py index 
17cfcf6..d0108c0 100644 --- a/src/modules/Research/UltraSingerEvaluation.py +++ b/src/modules/Research/UltraSingerEvaluation.py @@ -25,14 +25,9 @@ def main() -> None: """Main function""" - test_input_folder_path = Path(test_input_folder) - test_input_folder_path.mkdir(parents=True, exist_ok=True) - - test_output_folder_path = Path(test_output_folder) - test_output_folder_path.mkdir(parents=True, exist_ok=True) - - test_run_folder_path = Path(test_run_folder) - test_run_folder_path.mkdir(parents=True) + Path(test_input_folder).mkdir(parents=True, exist_ok=True) + Path(test_output_folder).mkdir(parents=True, exist_ok=True) + Path(test_run_folder).mkdir(parents=True) base_settings = initialize_settings() base_settings.output_file_path = test_run_folder @@ -49,19 +44,15 @@ def main() -> None: test_songs: List[TestSong] = [] for dir_entry in os.listdir(base_settings.test_songs_input_folder): - dir_entry_path = os.path.join(base_settings.test_songs_input_folder, dir_entry) - if os.path.isdir(dir_entry_path): - for sub_dir_entry in os.listdir(dir_entry_path): - if sub_dir_entry.endswith(".txt") and sub_dir_entry != "license.txt": - txt_file = os.path.join( - base_settings.test_songs_input_folder, dir_entry, sub_dir_entry - ) + song_folder = os.path.join(base_settings.test_songs_input_folder, dir_entry) + if os.path.isdir(song_folder): + for song_folder_item in os.listdir(song_folder): + if song_folder_item.endswith(".txt") and song_folder_item != "license.txt": + txt_file = os.path.join(song_folder, song_folder_item) ultrastar_class = ultrastar_parser.parse_ultrastar_txt(txt_file) if ultrastar_class.mp3: - test_song = TestSong( - txt_file, ultrastar_class.mp3, ultrastar_class - ) + test_song = TestSong(txt_file, song_folder, ultrastar_class.mp3, ultrastar_class) test_songs.append(test_song) break else: @@ -83,8 +74,12 @@ def main() -> None: f"{ULTRASINGER_HEAD} {index+1}/{len(test_songs)}: {os.path.basename(test_song.txt)}" ) + # prepare cache directory + song_cache_path = os.path.join(test_song.folder, "cache") + Path(song_cache_path).mkdir(parents=True, exist_ok=True) test_song_settings = copy.deepcopy(base_settings) test_song_settings.input_file_path = test_song.txt + test_song_settings.cache_override_path = song_cache_path UltraSinger.settings = test_song_settings UltraSinger.run() diff --git a/src/modules/Speech_Recognition/TranscribedData.py b/src/modules/Speech_Recognition/TranscribedData.py index 5962d9a..d006204 100644 --- a/src/modules/Speech_Recognition/TranscribedData.py +++ b/src/modules/Speech_Recognition/TranscribedData.py @@ -1,20 +1,28 @@ -"""Docstring""" +from dataclasses import dataclass +from dataclasses_json import dataclass_json + +@dataclass_json +@dataclass class TranscribedData: """Transcribed data from json file""" - def __init__(self, transcribed_json = None): + confidence: float = 0 + word: str = "" + start: float = 0 + end: float = 0 + is_hyphen: bool = False + - if transcribed_json: - # Vosk = conf, Whisper = confidence - self.conf = transcribed_json.get( - "conf", transcribed_json.get("confidence", None) - ) - # Vosk = word, Whisper = text - self.word = transcribed_json.get( - "word", transcribed_json.get("text", None) - ) - self.end = transcribed_json.get("end", None) - self.start = transcribed_json.get("start", None) - self.is_hyphen = None +def from_whisper(whisper_dict) -> TranscribedData: + transcribed_data = TranscribedData() + if "score" in whisper_dict: + transcribed_data.confidence = whisper_dict["score"] + if "word" in whisper_dict: + transcribed_data.word 
= whisper_dict["word"] + if "start" in whisper_dict: + transcribed_data.start = whisper_dict["start"] + if "end" in whisper_dict: + transcribed_data.end = whisper_dict["end"] + return transcribed_data diff --git a/src/modules/Speech_Recognition/TranscriptionResult.py b/src/modules/Speech_Recognition/TranscriptionResult.py new file mode 100644 index 0000000..1fa055f --- /dev/null +++ b/src/modules/Speech_Recognition/TranscriptionResult.py @@ -0,0 +1,14 @@ +from dataclasses import dataclass + +from dataclasses_json import dataclass_json + +from modules.Speech_Recognition.TranscribedData import TranscribedData + + +@dataclass_json +@dataclass +class TranscriptionResult: + """Transcription result""" + + transcribed_data: list[TranscribedData] + detected_language: str diff --git a/src/modules/Speech_Recognition/Whisper.py b/src/modules/Speech_Recognition/Whisper.py index 0545ff7..565b123 100644 --- a/src/modules/Speech_Recognition/Whisper.py +++ b/src/modules/Speech_Recognition/Whisper.py @@ -5,8 +5,9 @@ import whisperx from torch.cuda import OutOfMemoryError +from modules.Speech_Recognition.TranscriptionResult import TranscriptionResult from modules.console_colors import ULTRASINGER_HEAD, blue_highlighted, red_highlighted -from modules.Speech_Recognition.TranscribedData import TranscribedData +from modules.Speech_Recognition.TranscribedData import TranscribedData, from_whisper def transcribe_with_whisper( @@ -17,7 +18,7 @@ def transcribe_with_whisper( batch_size: int = 16, compute_type: str = None, language: str = None, -) -> (list[TranscribedData], str): +) -> TranscriptionResult: """Transcribe with whisper""" print( @@ -90,20 +91,19 @@ def transcribe_with_whisper( transcribed_data = convert_to_transcribed_data(result_aligned) - return transcribed_data, detected_language + return TranscriptionResult(transcribed_data, detected_language) def convert_to_transcribed_data(result_aligned): transcribed_data = [] for segment in result_aligned["segments"]: for obj in segment["words"]: - vtd = TranscribedData(obj) # create custom Word object + vtd = from_whisper(obj) # create custom Word object vtd.word = vtd.word + " " # add space to end of word if len(obj) < 4: previous = transcribed_data[-1] if not previous: previous.end = 0 - previous.end = "" vtd.start = previous.end + 0.1 vtd.end = previous.end + 0.2 msg = f'Error: There is no timestamp for word: {obj["word"]}. ' \ diff --git a/src/modules/Ultrastar/ultrastar_writer.py b/src/modules/Ultrastar/ultrastar_writer.py index 9dee2e3..f6497c9 100644 --- a/src/modules/Ultrastar/ultrastar_writer.py +++ b/src/modules/Ultrastar/ultrastar_writer.py @@ -68,12 +68,12 @@ def create_ultrastar_txt_from_automation( if ultrastar_class.background is not None: file.write(f"#{UltrastarTxtTag.BACKGROUND}:{ultrastar_class.background}\n") file.write(f"#{UltrastarTxtTag.MP3}:{ultrastar_class.mp3}\n") - file.write(f"#{UltrastarTxtTag.GAP}:{int(gap * 1000)}\n") if ultrastar_class.video is not None: file.write(f"#{UltrastarTxtTag.VIDEO}:{ultrastar_class.video}\n") if ultrastar_class.videoGap is not None: file.write(f"#{UltrastarTxtTag.VIDEOGAP}:{ultrastar_class.videoGap}\n") file.write(f"#{UltrastarTxtTag.BPM}:{round(ultrastar_bpm, 2)}\n") # not the real BPM! 
+    file.write(f"#{UltrastarTxtTag.GAP}:{int(gap * 1000)}\n")
     file.write(f"#{UltrastarTxtTag.CREATOR}:{ultrastar_class.creator}\n")
     file.write(f"#{UltrastarTxtTag.FIXER}:{ultrastar_class.fixer}\n")
     file.write(f"#{UltrastarTxtTag.COMMENT}:{ultrastar_class.comment}\n")
diff --git a/src/modules/console_colors.py b/src/modules/console_colors.py
index c4cc6f8..59328ff 100644
--- a/src/modules/console_colors.py
+++ b/src/modules/console_colors.py
@@ -8,6 +8,11 @@ def blue_highlighted(text: str) -> str:
     return f"{Bcolors.blue}{text}{Bcolors.endc}"


+def green_highlighted(text: str) -> str:
+    """Returns a green highlighted text"""
+    return f"{Bcolors.dark_green}{text}{Bcolors.endc}"
+
+
 def gold_highlighted(text: str) -> str:
     """Returns a gold highlighted text"""
     return f"{Bcolors.gold}{text}{Bcolors.endc}"
@@ -37,6 +42,7 @@ class Bcolors:
     """Colors for the console"""

     blue = "\033[94m"
+    dark_green = "\033[32m"
     red = "\033[91m"
     light_blue = "\033[96m"
     cyan = "\033[36m"
diff --git a/src/modules/csv_handler.py b/src/modules/csv_handler.py
index 58cceeb..3bb01f4 100644
--- a/src/modules/csv_handler.py
+++ b/src/modules/csv_handler.py
@@ -20,7 +20,7 @@ def export_transcribed_data_to_csv(transcribed_data: list[TranscribedData], file
                 data.word,
                 data.start,
                 data.end,
-                data.conf,
+                data.confidence,
             ]
         )

From cb4aa4e5ead28234f307eccdc2c940b5e823fdf2 Mon Sep 17 00:00:00 2001
From: Benedikt Wagener
Date: Fri, 6 Oct 2023 22:57:14 +0200
Subject: [PATCH 05/19] move UltraSingerEvaluation to top level

---
 .../Research => }/UltraSingerEvaluation.py    | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)
 rename src/{modules/Research => }/UltraSingerEvaluation.py (87%)

diff --git a/src/modules/Research/UltraSingerEvaluation.py b/src/UltraSingerEvaluation.py
similarity index 87%
rename from src/modules/Research/UltraSingerEvaluation.py
rename to src/UltraSingerEvaluation.py
index d0108c0..4e159ff 100644
--- a/src/modules/Research/UltraSingerEvaluation.py
+++ b/src/UltraSingerEvaluation.py
@@ -1,9 +1,9 @@
 import copy
 import os
-import sys
 from datetime import datetime
 from pathlib import Path
 from typing import List
+import importlib.util

 import UltraSinger
 from Settings import Settings
@@ -13,10 +13,10 @@
 from modules.console_colors import ULTRASINGER_HEAD, red_highlighted

 test_input_folder = os.path.normpath(
-    os.path.abspath(__file__ + "../../../../../test_input")
+    os.path.abspath(__file__ + "/../../test_input")
 )
 test_output_folder = os.path.normpath(
-    os.path.abspath(__file__ + "../../../../../test_output")
+    os.path.abspath(__file__ + "/../../test_output")
 )
 test_run_folder = os.path.join(
     test_output_folder, datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
 )
@@ -90,10 +90,15 @@ def initialize_settings():
         os.path.join(test_input_folder, "config/local.py")
     )
     if os.path.isfile(user_config_file):
-        sys.path.append(os.path.join(user_config_file, ".."))
-        import local
+        print(f"{ULTRASINGER_HEAD} Using custom settings found under {user_config_file}")

-        s = local.user_settings
+        spec = importlib.util.spec_from_file_location("custom_settings", user_config_file)
+        module = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(module)
+
+        s = module.user_settings
+    else:
+        print(f"{ULTRASINGER_HEAD} No custom settings found under {user_config_file}")

     if not s.force_cpu:
         s.tensorflow_device, s.pytorch_device = check_gpu_support()

From 7d57225a517c8e05f41c01c055435a590116301b Mon Sep 17 00:00:00 2001
From: Benedikt Wagener
Date: Sat, 7 Oct 2023 10:37:25 +0200
Subject: [PATCH 06/19] make loudness
threshold configurable --- requirements.txt | 8 ++-- src/Settings.py | 1 + src/UltraSinger.py | 60 +++++++++++++-------------- src/modules/Pitcher/PitchingResult.py | 15 ------- src/modules/Pitcher/pitcher.py | 8 ++-- 5 files changed, 39 insertions(+), 53 deletions(-) delete mode 100644 src/modules/Pitcher/PitchingResult.py diff --git a/requirements.txt b/requirements.txt index b939eb4..d5763a7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -crepe~=0.0.13 +crepe~=0.0.14 demucs~=4.0.0 ffmpeg_python~=0.2.0 git+https://github.com/m-bain/whisperx.git @@ -14,13 +14,13 @@ pydub~=0.25.1 PyHyphen~=4.0.3 python_Levenshtein~=0.21.1 scipy~=1.11.2 -tensorflow<2.11 +tensorflow==2.10.1 tqdm~=4.65.2 whisperx~=3.1.1 -yt_dlp~=2023.7.6 +yt_dlp~=2023.9.24 isort~=5.12 black~=23.3 pylint~=2.17 pytest~=7.3.1 -protobuf==3.20.* \ No newline at end of file +protobuf==3.19.6 \ No newline at end of file diff --git a/src/Settings.py b/src/Settings.py index efcbf9e..c46aaf5 100644 --- a/src/Settings.py +++ b/src/Settings.py @@ -28,6 +28,7 @@ class Settings: # Pitch crepe_model_capacity = "full" # tiny|small|medium|large|full crepe_step_size = 10 # in miliseconds + pitch_loudness_threshold = -60 # Device pytorch_device = 'cpu' # cpu|cuda diff --git a/src/UltraSinger.py b/src/UltraSinger.py index 06b5d6f..d280dcb 100644 --- a/src/UltraSinger.py +++ b/src/UltraSinger.py @@ -2,7 +2,6 @@ import copy import getopt -import json import os import sys @@ -18,7 +17,6 @@ export_chunks_from_ultrastar_data, ) from modules.Audio.silence_processing import remove_silence_from_transcription_data -from modules.Pitcher.PitchingResult import PitchingResult from modules.Speech_Recognition.TranscriptionResult import TranscriptionResult from modules.csv_handler import export_transcribed_data_to_csv from modules.Audio.convert_audio import convert_audio_to_mono_wav, convert_wav_to_mp3 @@ -392,6 +390,7 @@ def run() -> None: ) settings.mono_audio_path = os.path.join(cache_path, basename_without_ext + ".wav") + os_helper.create_folder(cache_path) # Separate vocal from audio audio_separation_path = separate_vocal_from_audio(basename_without_ext, cache_path, ultrastar_audio_input_path) @@ -533,11 +532,11 @@ def separate_vocal_from_audio( basename_without_ext: str, cache_path: str, ultrastar_audio_input_path: str ) -> str: """Separate vocal from audio""" - demcus_output_folder = os.path.splitext( + demucs_output_folder = os.path.splitext( os.path.basename(ultrastar_audio_input_path) )[0] audio_separation_path = os.path.join( - cache_path, "separated", "htdemucs", demcus_output_folder + cache_path, "separated", "htdemucs", demucs_output_folder ) vocals_path = os.path.join(audio_separation_path, "vocals.wav") @@ -810,9 +809,10 @@ def pitch_audio( # todo: chunk pitching as option? 
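+    # the cache file name encodes every pitch detection setting, so a cached result is only reused when all settings match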
# midi_notes = pitch_each_chunk_with_crepe(chunk_folder_name)

-    pitching_config = f"crepe_{settings.ignore_audio}_{settings.crepe_model_capacity}_{settings.crepe_step_size}_{settings.tensorflow_device}"
-    pitching_path = os.path.join(cache_path, f"{pitching_config}.json")
-    cache_available = check_file_exists(pitching_path)
+    pitching_config = f"crepe_{settings.ignore_audio}_{settings.crepe_model_capacity}_{settings.crepe_step_size}_{settings.tensorflow_device}_{settings.pitch_loudness_threshold}"
+    pitched_data_path = os.path.join(cache_path, f"{pitching_config}.json")
+    cache_available = check_file_exists(pitched_data_path)
+    pitched_data = None

     if settings.skip_cache_pitch_detection or not cache_available:
         pitched_data = get_pitch_with_crepe_file(
@@ -820,35 +820,35 @@ def pitch_audio(
             settings.crepe_model_capacity,
             settings.crepe_step_size,
             settings.tensorflow_device,
+            settings.pitch_loudness_threshold
         )
-        if not settings.ignore_audio:
-            start_times = []
-            end_times = []
-            for i, data in enumerate(transcribed_data):
-                start_times.append(data.start)
-                end_times.append(data.end)
-            midi_notes = create_midi_notes_from_pitched_data(
-                start_times, end_times, pitched_data
-            )
-
-        else:
-            midi_notes = create_midi_notes_from_pitched_data(
-                ultrastar_class.startTimes, ultrastar_class.endTimes, pitched_data
-            )
-        ultrastar_note_numbers = convert_midi_notes_to_ultrastar_notes(midi_notes)
-
-        pitching_result = PitchingResult(midi_notes, pitched_data, ultrastar_note_numbers)

-        pitching_result_json = pitching_result.to_json()
-        with open(pitching_path, "w", encoding=FILE_ENCODING) as file:
-            file.write(pitching_result_json)
+        pitched_data_json = pitched_data.to_json()
+        with open(pitched_data_path, "w", encoding=FILE_ENCODING) as file:
+            file.write(pitched_data_json)
     else:
         print(f"{ULTRASINGER_HEAD} {green_highlighted('cache')} reusing cached pitch data")
-        with open(pitching_path) as file:
+        with open(pitched_data_path, encoding=FILE_ENCODING) as file:
             json = file.read()
-            pitching_result = PitchingResult.from_json(json)
+            pitched_data = PitchedData.from_json(json)
+
+    if not settings.ignore_audio:
+        start_times = []
+        end_times = []
+        for i, data in enumerate(transcribed_data):
+            start_times.append(data.start)
+            end_times.append(data.end)
+        midi_notes = create_midi_notes_from_pitched_data(
+            start_times, end_times, pitched_data
+        )
+    else:
+        midi_notes = create_midi_notes_from_pitched_data(
+            ultrastar_class.startTimes, ultrastar_class.endTimes, pitched_data
+        )
+
+    ultrastar_note_numbers = convert_midi_notes_to_ultrastar_notes(midi_notes)

-    return pitching_result.midi_notes, pitching_result.pitched_data, pitching_result.ultrastar_note_numbers
+    return midi_notes, pitched_data, ultrastar_note_numbers


 def create_audio_chunks(
diff --git a/src/modules/Pitcher/PitchingResult.py b/src/modules/Pitcher/PitchingResult.py
deleted file mode 100644
index 47a74c3..0000000
--- a/src/modules/Pitcher/PitchingResult.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from dataclasses import dataclass
-
-from dataclasses_json import dataclass_json
-
-from modules.Pitcher.pitched_data import PitchedData
-
-
-@dataclass_json
-@dataclass
-class PitchingResult:
-    """Pitching result"""
-
-    midi_notes: list[str]
-    pitched_data: PitchedData
-    ultrastar_note_numbers: list[int]
diff --git a/src/modules/Pitcher/pitcher.py b/src/modules/Pitcher/pitcher.py
index 80506c7..d979c12 100644
--- a/src/modules/Pitcher/pitcher.py
+++ b/src/modules/Pitcher/pitcher.py
@@ -11,7 +11,7 @@

 def get_pitch_with_crepe_file(
-    filename: str, model_capacity: str, step_size: int = 10,
device: str = "cpu" + filename: str, model_capacity: str, step_size: int = 10, device: str = "cpu", filter_silence_threshold: int = -60 ) -> PitchedData: """Pitch with crepe""" @@ -22,10 +22,10 @@ def get_pitch_with_crepe_file( audio, sample_rate = librosa.load(filename) timer.log('Load file for pitch detection end') - return get_pitch_with_crepe(audio, sample_rate, model_capacity, step_size) + return get_pitch_with_crepe(audio, sample_rate, model_capacity, step_size, filter_silence_threshold) -def get_pitch_with_crepe(audio, sample_rate: int, model_capacity: str, step_size: int = 10) -> PitchedData: +def get_pitch_with_crepe(audio, sample_rate: int, model_capacity: str, step_size: int = 10, filter_silence_threshold: int = -60) -> PitchedData: """Pitch with crepe""" if sample_rate != CREPE_MODEL_SAMPLE_RATE: @@ -38,7 +38,7 @@ def get_pitch_with_crepe(audio, sample_rate: int, model_capacity: str, step_size timer.log('Crepe pitch detection end') timer.log('Computing loudness start') - confidence, perceived_loudness = set_confidence_to_zero_in_silent_regions(confidence, audio, step_size=step_size) + confidence, perceived_loudness = set_confidence_to_zero_in_silent_regions(confidence, audio, threshold=filter_silence_threshold, step_size=step_size) timer.log('Computing loudness end') # convert to native float for serialization From 3bf7a06bb6b4cef34fd0eb7d16c4d54626cd4cd3 Mon Sep 17 00:00:00 2001 From: Benedikt Wagener Date: Sun, 8 Oct 2023 10:32:30 +0200 Subject: [PATCH 07/19] add pitch comparison --- src/Settings.py | 8 ++ src/UltraSinger.py | 53 +++++--- src/UltraSingerEvaluation.py | 115 +++++++++++++---- src/modules/Research/TestRun.py | 29 +++++ src/modules/Research/TestSong.py | 7 +- src/modules/Ultrastar/ultrastar_converter.py | 117 +++++++++++++++--- .../Ultrastar/ultrastar_score_calculator.py | 5 + 7 files changed, 273 insertions(+), 61 deletions(-) create mode 100644 src/modules/Research/TestRun.py diff --git a/src/Settings.py b/src/Settings.py index c46aaf5..709622a 100644 --- a/src/Settings.py +++ b/src/Settings.py @@ -1,3 +1,10 @@ +from dataclasses import dataclass + +from dataclasses_json import dataclass_json + + +@dataclass_json +@dataclass class Settings: create_midi = True create_plot = False @@ -42,3 +49,4 @@ class Settings: skip_cache_denoise_vocal_audio = False skip_cache_transcription = False skip_cache_pitch_detection = False + calculate_score = True \ No newline at end of file diff --git a/src/UltraSinger.py b/src/UltraSinger.py index d280dcb..d7bd992 100644 --- a/src/UltraSinger.py +++ b/src/UltraSinger.py @@ -4,6 +4,7 @@ import getopt import os import sys +from typing import Tuple, Any import Levenshtein import librosa @@ -18,6 +19,7 @@ ) from modules.Audio.silence_processing import remove_silence_from_transcription_data from modules.Speech_Recognition.TranscriptionResult import TranscriptionResult +from modules.Ultrastar.ultrastar_score_calculator import Score from modules.csv_handler import export_transcribed_data_to_csv from modules.Audio.convert_audio import convert_audio_to_mono_wav, convert_wav_to_mp3 from modules.Audio.youtube import ( @@ -343,7 +345,7 @@ def merge_syllable_segments( return new_data, new_midi_notes, new_us_notes -def run() -> None: +def run() -> tuple[str, Score, Score]: """The processing function of this program""" settings.input_file_is_ultrastar_txt = settings.input_file_path.endswith(".txt") @@ -392,7 +394,9 @@ def run() -> None: os_helper.create_folder(cache_path) # Separate vocal from audio - audio_separation_path = 
separate_vocal_from_audio(basename_without_ext, cache_path, ultrastar_audio_input_path) + audio_separation_path = separate_vocal_from_audio( + basename_without_ext, cache_path, ultrastar_audio_input_path + ) # Denoise vocal audio denoise_vocal_audio(basename_without_ext, cache_path) @@ -463,13 +467,16 @@ def run() -> None: song_output, ultrastar_class, ultrastar_note_numbers ) - # Calc Points - ultrastar_class, simple_score, accurate_score = calculate_score_points( - pitched_data, ultrastar_class, ultrastar_file_output - ) + simple_score = None + accurate_score = None + if settings.calculate_score: + # Calc Points + ultrastar_class, simple_score, accurate_score = calculate_score_points( + pitched_data, ultrastar_class, ultrastar_file_output + ) - # Add calculated score to Ultrastar txt - ultrastar_writer.add_score_to_ultrastar_txt(ultrastar_file_output, simple_score) + # Add calculated score to Ultrastar txt + ultrastar_writer.add_score_to_ultrastar_txt(ultrastar_file_output, simple_score) # Midi if settings.create_midi: @@ -477,6 +484,7 @@ def run() -> None: # Print Support print_support() + return ultrastar_file_output, simple_score, accurate_score def get_unused_song_output_dir(path: str) -> str: @@ -519,7 +527,9 @@ def transcribe_audio(cache_path: str) -> TranscriptionResult: with open(transcription_path, "w", encoding=FILE_ENCODING) as file: file.write(transcription_result.to_json()) else: - print(f"{ULTRASINGER_HEAD} {green_highlighted('cache')} reusing cached transcribed data") + print( + f"{ULTRASINGER_HEAD} {green_highlighted('cache')} reusing cached transcribed data" + ) with open(transcription_path) as file: json = file.read() transcription_result = TranscriptionResult.from_json(json) @@ -542,14 +552,17 @@ def separate_vocal_from_audio( vocals_path = os.path.join(audio_separation_path, "vocals.wav") instrumental_path = os.path.join(audio_separation_path, "no_vocals.wav") if settings.use_separated_vocal or settings.create_karaoke: - cache_available = (check_file_exists(vocals_path) - and check_file_exists(instrumental_path)) + cache_available = check_file_exists(vocals_path) and check_file_exists( + instrumental_path + ) if settings.skip_cache_vocal_separation or not cache_available: separate_audio( ultrastar_audio_input_path, cache_path, settings.pytorch_device ) else: - print(f"{ULTRASINGER_HEAD} {green_highlighted('cache')} reusing cached separated vocals") + print( + f"{ULTRASINGER_HEAD} {green_highlighted('cache')} reusing cached separated vocals" + ) if settings.use_separated_vocal: input_path = vocals_path @@ -803,8 +816,10 @@ def create_midi_file( def pitch_audio( - transcribed_data: list[TranscribedData], ultrastar_class: UltrastarTxtValue, cache_path: str) -> tuple[ - list[str], PitchedData, list[int]]: + transcribed_data: list[TranscribedData], + ultrastar_class: UltrastarTxtValue, + cache_path: str, +) -> tuple[list[str], PitchedData, list[int]]: """Pitch audio""" # todo: chunk pitching as option? 
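+    # returns the midi note names, the raw pitched data and the UltraStar note numbers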
# midi_notes = pitch_each_chunk_with_crepe(chunk_folder_name) @@ -820,14 +835,16 @@ def pitch_audio( settings.crepe_model_capacity, settings.crepe_step_size, settings.tensorflow_device, - settings.pitch_loudness_threshold + settings.pitch_loudness_threshold, ) pitched_data_json = pitched_data.to_json() with open(pitched_data_path, "w", encoding=FILE_ENCODING) as file: file.write(pitched_data_json) else: - print(f"{ULTRASINGER_HEAD} {green_highlighted('cache')} reusing cached pitch data") + print( + f"{ULTRASINGER_HEAD} {green_highlighted('cache')} reusing cached pitch data" + ) with open(pitched_data_path) as file: json = file.read() pitched_data = PitchedData.from_json(json) @@ -880,7 +897,9 @@ def denoise_vocal_audio(basename_without_ext: str, cache_path: str) -> None: if settings.skip_cache_denoise_vocal_audio or not cache_available: ffmpeg_reduce_noise(settings.mono_audio_path, denoised_path) else: - print(f"{ULTRASINGER_HEAD} {green_highlighted('cache')} reusing cached denoised audio") + print( + f"{ULTRASINGER_HEAD} {green_highlighted('cache')} reusing cached denoised audio" + ) settings.mono_audio_path = denoised_path diff --git a/src/UltraSingerEvaluation.py b/src/UltraSingerEvaluation.py index 4e159ff..7b9eebe 100644 --- a/src/UltraSingerEvaluation.py +++ b/src/UltraSingerEvaluation.py @@ -8,16 +8,16 @@ import UltraSinger from Settings import Settings from modules.DeviceDetection.device_detection import check_gpu_support +from modules.Research.TestRun import TestRun, TestedSong from modules.Research.TestSong import TestSong from modules.Ultrastar import ultrastar_parser +from modules.Ultrastar.ultrastar_converter import compare_pitches +from modules.Ultrastar.ultrastar_parser import parse_ultrastar_txt +from modules.Ultrastar.ultrastar_txt import UltrastarTxtValue, FILE_ENCODING from modules.console_colors import ULTRASINGER_HEAD, red_highlighted -test_input_folder = os.path.normpath( - os.path.abspath(__file__ + "/../../test_input") -) -test_output_folder = os.path.normpath( - os.path.abspath(__file__ + "/../../test_output") -) +test_input_folder = os.path.normpath(os.path.abspath(__file__ + "/../../test_input")) +test_output_folder = os.path.normpath(os.path.abspath(__file__ + "/../../test_output")) test_run_folder = os.path.join( test_output_folder, datetime.now().strftime("%Y-%m-%d_%H-%M-%S") ) @@ -45,20 +45,11 @@ def main() -> None: test_songs: List[TestSong] = [] for dir_entry in os.listdir(base_settings.test_songs_input_folder): song_folder = os.path.join(base_settings.test_songs_input_folder, dir_entry) - if os.path.isdir(song_folder): - for song_folder_item in os.listdir(song_folder): - if song_folder_item.endswith(".txt") and song_folder_item != "license.txt": - txt_file = os.path.join(song_folder, song_folder_item) - ultrastar_class = ultrastar_parser.parse_ultrastar_txt(txt_file) - - if ultrastar_class.mp3: - test_song = TestSong(txt_file, song_folder, ultrastar_class.mp3, ultrastar_class) - test_songs.append(test_song) - break - else: - print( - f"{ULTRASINGER_HEAD} {red_highlighted('Warning.')} {base_settings.test_songs_input_folder} contains an UltraStar text file but has no audio referenced in it. Skipping." 
-                        )
+        found_song = find_ultrastar_song(song_folder)
+        if found_song is None:
+            continue
+
+        test_songs.append(TestSong(found_song[0], song_folder, found_song[1]))

     if len(test_songs) == 0:
         print(
@@ -68,20 +59,88 @@ def main() -> None:

     print(f"{ULTRASINGER_HEAD} Running evaluation for {len(test_songs)} songs")

+    test_run = TestRun(base_settings)
     for index, test_song in enumerate(test_songs):
         print(f"{ULTRASINGER_HEAD} ========================")
         print(
-            f"{ULTRASINGER_HEAD} {index+1}/{len(test_songs)}: {os.path.basename(test_song.txt)}"
+            f"{ULTRASINGER_HEAD} {index+1}/{len(test_songs)}: {os.path.basename(test_song.input_txt)}"
         )

         # prepare cache directory
-        song_cache_path = os.path.join(test_song.folder, "cache")
+        song_cache_path = os.path.join(test_song.input_folder, "cache")
         Path(song_cache_path).mkdir(parents=True, exist_ok=True)
+
         test_song_settings = copy.deepcopy(base_settings)
-        test_song_settings.input_file_path = test_song.txt
+        test_song_settings.input_file_path = test_song.input_txt
         test_song_settings.cache_override_path = song_cache_path
         UltraSinger.settings = test_song_settings
-        UltraSinger.run()
+
+        output_txt = None
+        simple_score = None
+        accurate_score = None
+        tested_song = TestedSong(test_song.input_txt)
+        test_run.tested_songs.append(tested_song)
+        try:
+            output_txt, simple_score, accurate_score = UltraSinger.run()
+        except Exception as error:
+            print(
+                f"{ULTRASINGER_HEAD} {red_highlighted('Error!')} Failed to process {test_song.input_txt}\n{error}."
+            )
+            continue
+
+        output_folder_name = f"{test_song.input_ultrastar_class.artist} - {test_song.input_ultrastar_class.title}"
+        output_folder = os.path.join(test_run_folder, output_folder_name)
+
+        if not os.path.isfile(output_txt):
+            print(
+                f"{ULTRASINGER_HEAD} {red_highlighted('Error!')} Could not find song txt in '{output_folder}'."
+            )
+            continue
+
+        ultrastar_class = parse_ultrastar_txt(output_txt)
+        (
+            input_match_ratio,
+            output_match_ratio,
+            pitch_where_should_be_no_pitch_ratio,
+            no_pitch_where_should_be_pitch_ratio,
+        ) = compare_pitches(test_song.input_ultrastar_class, ultrastar_class)
+
+        tested_song.output_path = output_txt
+        tested_song.success = True
+        tested_song.input_match_ratio = input_match_ratio
+        tested_song.output_match_ratio = output_match_ratio
+        tested_song.pitch_where_should_be_no_pitch_ratio = pitch_where_should_be_no_pitch_ratio
+        tested_song.no_pitch_where_should_be_pitch_ratio = no_pitch_where_should_be_pitch_ratio
+        tested_song.output_score_simple = simple_score
+        tested_song.output_score_accurate = accurate_score
+
+    test_run_result_file = os.path.join(test_run_folder, "run.json")
+    test_run_json = test_run.to_json()
+    with open(test_run_result_file, "w", encoding=FILE_ENCODING) as file:
+        file.write(test_run_json)
+
+
+def find_ultrastar_song(
+    song_folder, require_audio: bool = True
+) -> tuple[str, UltrastarTxtValue] | None:
+    if os.path.isdir(song_folder):
+        for song_folder_item in os.listdir(song_folder):
+            if (
+                song_folder_item.endswith(".txt")
+                and song_folder_item != "license.txt"
+                and not song_folder_item.endswith("[Karaoke].txt")
+            ):
+                txt_file = os.path.join(song_folder, song_folder_item)
+                ultrastar_class = ultrastar_parser.parse_ultrastar_txt(txt_file)
+
+                if ultrastar_class.mp3 != "" or not require_audio:
+                    return txt_file, ultrastar_class
+                else:
+                    print(
+                        f"{ULTRASINGER_HEAD} {red_highlighted('Warning.')} {song_folder} contains an UltraStar text file but has no audio referenced in it. Skipping."
+ ) def initialize_settings(): @@ -90,9 +149,13 @@ def initialize_settings(): os.path.join(test_input_folder, "config/local.py") ) if os.path.isfile(user_config_file): - print(f"{ULTRASINGER_HEAD} Using custom settings found under {user_config_file}") + print( + f"{ULTRASINGER_HEAD} Using custom settings found under {user_config_file}" + ) - spec = importlib.util.spec_from_file_location("custom_settings", user_config_file) + spec = importlib.util.spec_from_file_location( + "custom_settings", user_config_file + ) module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module) diff --git a/src/modules/Research/TestRun.py b/src/modules/Research/TestRun.py new file mode 100644 index 0000000..be60a78 --- /dev/null +++ b/src/modules/Research/TestRun.py @@ -0,0 +1,29 @@ +from dataclasses import dataclass, field + +from dataclasses_json import dataclass_json + +from Settings import Settings + + +@dataclass_json +@dataclass +class TestedSong: + """Tested song""" + + input_path: str + output_path: str = "" + success: bool = False + input_match_ratio: float = 0.0 + output_match_ratio: float = 0.0 + no_pitch_where_should_be_pitch_ratio: float = 0.0 + pitch_where_should_be_no_pitch_ratio: float = 0.0 + output_score_simple: int = 0 + output_score_accurate: int = 0 + + +@dataclass_json +@dataclass +class TestRun: + """Test run""" + settings: Settings + tested_songs: list[TestedSong] = field(default_factory=lambda: []) diff --git a/src/modules/Research/TestSong.py b/src/modules/Research/TestSong.py index fc0d72e..be3a1b4 100644 --- a/src/modules/Research/TestSong.py +++ b/src/modules/Research/TestSong.py @@ -7,7 +7,6 @@ class TestSong: """Test song""" - txt: str - folder: str - audio: float - ultrastar_class: UltrastarTxtValue \ No newline at end of file + input_txt: str + input_folder: str + input_ultrastar_class: UltrastarTxtValue diff --git a/src/modules/Ultrastar/ultrastar_converter.py b/src/modules/Ultrastar/ultrastar_converter.py index 97f79ee..01a143f 100644 --- a/src/modules/Ultrastar/ultrastar_converter.py +++ b/src/modules/Ultrastar/ultrastar_converter.py @@ -1,7 +1,13 @@ """Ultrastar Converter""" +from typing import Tuple + +import numpy from modules.Ultrastar.ultrastar_txt import UltrastarTxtValue +NO_PITCH = -1000 + + def real_bpm_to_ultrastar_bpm(real_bpm: float) -> float: """Converts real BPM to UltraStar BPM""" # The UltraStar BPM info is a fourth beat of the real BPM @@ -48,32 +54,115 @@ def ultrastar_note_to_midi_note(ultrastar_note: int) -> int: return midi_note -def get_start_time_from_ultrastar(ultrastar_class: UltrastarTxtValue, pos: int) -> float: +def get_start_time_from_ultrastar( + ultrastar_class: UltrastarTxtValue, pos: int +) -> float: """Calculates the start time from the Ultrastar txt""" - gap = int(float(ultrastar_class.gap) / 1000) - real_bpm = ultrastar_bpm_to_real_bpm( - float(ultrastar_class.bpm.replace(",", ".")) - ) - start_time = ( - beat_to_second(int(ultrastar_class.startBeat[pos]), real_bpm) + gap - ) + gap = int(float(ultrastar_class.gap.replace(",", ".")) / 1000) + real_bpm = ultrastar_bpm_to_real_bpm(float(ultrastar_class.bpm.replace(",", "."))) + start_time = beat_to_second(int(ultrastar_class.startBeat[pos]), real_bpm) + gap return start_time def get_end_time_from_ultrastar(ultrastar_class: UltrastarTxtValue, pos: int) -> float: """Calculates the end time from the Ultrastar txt""" - gap = int(float(ultrastar_class.gap) / 1000) - real_bpm = ultrastar_bpm_to_real_bpm( - float(ultrastar_class.bpm.replace(",", ".")) - ) + gap = 
int(float(ultrastar_class.gap.replace(",", ".")) / 1000) + real_bpm = ultrastar_bpm_to_real_bpm(float(ultrastar_class.bpm.replace(",", "."))) end_time = ( beat_to_second( - int(ultrastar_class.startBeat[pos]) - + int(ultrastar_class.durations[pos]), + int(ultrastar_class.startBeat[pos]) + int(ultrastar_class.durations[pos]), real_bpm, ) + gap ) return end_time + + +def map_to_datapoints( + ultrastar_class: UltrastarTxtValue, step_size: int = 10 +) -> list[int]: + gap = int(float(ultrastar_class.gap.replace(",", "."))) + + data = [] + + previous_step = -step_size + for pos, pitch in enumerate(ultrastar_class.pitches): + if ultrastar_class.noteType[pos] == "F": + continue + + start_time = int(get_start_time_from_ultrastar(ultrastar_class, pos) * 1000) + gap + end_time = int(get_end_time_from_ultrastar(ultrastar_class, pos) * 1000) + gap + + start_nearest_step = (start_time + step_size - 1) // step_size * step_size + end_nearest_step = (end_time + step_size - 1) // step_size * step_size + + if previous_step == start_nearest_step: + start_nearest_step += step_size + + duration = end_nearest_step - start_nearest_step + + if duration < 10: + continue + + # pad gaps between pitches with empty datapoints + gap_steps_count = (start_nearest_step - previous_step - step_size) // step_size + data += [NO_PITCH] * gap_steps_count + + pitch_steps_count = duration // step_size + data += [pitch] * pitch_steps_count + previous_step = end_nearest_step + + return data + + +def compare_pitches(input_ultrastar_class, output_ultrastar_class) -> tuple[float, float, float, float]: + step_size = 10 + + input_datapoints = map_to_datapoints(input_ultrastar_class, step_size) + output_datapoints = map_to_datapoints(output_ultrastar_class, step_size) + + longest = max(len(input_datapoints), len(output_datapoints)) + for datapoints in [input_datapoints, output_datapoints]: + length = len(datapoints) + if length < longest: + gap_steps_count = longest - length + # pad gaps between pitches with empty datapoints + datapoints += [NO_PITCH] * gap_steps_count + + input_pitched_datapoints = len([x for x in input_datapoints if x != NO_PITCH]) + output_pitched_datapoints = len([x for x in output_datapoints if x != NO_PITCH]) + + matches = 0 + pitch_where_should_be_no_pitch = 0 + no_pitch_where_should_be_pitch = 0 + for index, _ in enumerate(input_datapoints): + input_pitch = input_datapoints[index] + output_pitch = output_datapoints[index] + if input_pitch != NO_PITCH and output_pitch != NO_PITCH: + continue + + if input_pitch == output_pitch: + matches += 1 + elif input_pitch == NO_PITCH: + pitch_where_should_be_no_pitch += 1 + else: + no_pitch_where_should_be_pitch += 1 + + input_match_ratio = matches / input_pitched_datapoints + output_match_ratio = matches / output_pitched_datapoints + + output_pitch_where_should_be_no_pitch_ratio = pitch_where_should_be_no_pitch / output_pitched_datapoints + output_no_pitch_where_should_be_pitch_ratio = no_pitch_where_should_be_pitch / input_pitched_datapoints + + + return input_match_ratio, output_match_ratio, output_pitch_where_should_be_no_pitch_ratio, output_no_pitch_where_should_be_pitch_ratio + + +def determine_nearest_end_step(input_ultrastar_class, step_size) -> int: + pitches_count = len(input_ultrastar_class.pitches) - 1 + end_time = int( + get_end_time_from_ultrastar(input_ultrastar_class, pitches_count) * 1000 + ) + int(input_ultrastar_class.gap) + return (end_time + step_size - 1) // step_size * step_size diff --git a/src/modules/Ultrastar/ultrastar_score_calculator.py 
b/src/modules/Ultrastar/ultrastar_score_calculator.py index a02194c..60fdd06 100644 --- a/src/modules/Ultrastar/ultrastar_score_calculator.py +++ b/src/modules/Ultrastar/ultrastar_score_calculator.py @@ -1,4 +1,7 @@ """Ultrastar score calculator.""" +from dataclasses import dataclass + +from dataclasses_json import dataclass_json import librosa @@ -48,6 +51,8 @@ def add_point(note_type: str, points: Points) -> Points: return points +@dataclass_json +@dataclass class Score: """Docstring""" From eae51e7598c4eb902679c0c19fc7d89b5a2e022f Mon Sep 17 00:00:00 2001 From: Benedikt Wagener Date: Sun, 8 Oct 2023 12:34:03 +0200 Subject: [PATCH 08/19] add cross octave pitch comparison --- src/UltraSingerEvaluation.py | 6 ++++++ src/modules/Research/TestRun.py | 2 ++ src/modules/Ultrastar/ultrastar_converter.py | 19 +++++++++++++++++-- src/modules/Ultrastar/ultrastar_txt.py | 2 +- 4 files changed, 26 insertions(+), 3 deletions(-) diff --git a/src/UltraSingerEvaluation.py b/src/UltraSingerEvaluation.py index 7b9eebe..1393722 100644 --- a/src/UltraSingerEvaluation.py +++ b/src/UltraSingerEvaluation.py @@ -103,6 +103,8 @@ def main() -> None: ( input_match_ratio, output_match_ratio, + cross_octave_input_match_ratio, + cross_octave_output_match_ratio, pitch_where_should_be_no_pitch_ratio, no_pitch_where_should_be_pitch_ratio, ) = compare_pitches(test_song.input_ultrastar_class, ultrastar_class) @@ -111,6 +113,8 @@ def main() -> None: tested_song.success = True tested_song.input_match_ratio = input_match_ratio tested_song.output_match_ratio = output_match_ratio + tested_song.cross_octave_input_match_ratio = cross_octave_input_match_ratio + tested_song.cross_octave_output_match_ratio = cross_octave_output_match_ratio tested_song.pitch_where_should_be_no_pitch_ratio = pitch_where_should_be_no_pitch_ratio tested_song.no_pitch_where_should_be_pitch_ratio = no_pitch_where_should_be_pitch_ratio tested_song.output_score_simple = simple_score @@ -131,6 +135,8 @@ def find_ultrastar_song( song_folder_item.endswith(".txt") and song_folder_item != "license.txt" and not song_folder_item.endswith("[Karaoke].txt") + and not song_folder_item.endswith("[MULTI].txt") + and not song_folder_item.endswith("[DUET].txt") ): txt_file = os.path.join(song_folder, song_folder_item) ultrastar_class = ultrastar_parser.parse_ultrastar_txt(txt_file) diff --git a/src/modules/Research/TestRun.py b/src/modules/Research/TestRun.py index be60a78..7070c28 100644 --- a/src/modules/Research/TestRun.py +++ b/src/modules/Research/TestRun.py @@ -15,6 +15,8 @@ class TestedSong: success: bool = False input_match_ratio: float = 0.0 output_match_ratio: float = 0.0 + cross_octave_input_match_ratio: float = 0.0 + cross_octave_output_match_ratio: float = 0.0 no_pitch_where_should_be_pitch_ratio: float = 0.0 pitch_where_should_be_no_pitch_ratio: float = 0.0 output_score_simple: int = 0 diff --git a/src/modules/Ultrastar/ultrastar_converter.py b/src/modules/Ultrastar/ultrastar_converter.py index 01a143f..8ef002c 100644 --- a/src/modules/Ultrastar/ultrastar_converter.py +++ b/src/modules/Ultrastar/ultrastar_converter.py @@ -135,6 +135,7 @@ def compare_pitches(input_ultrastar_class, output_ultrastar_class) -> tuple[floa output_pitched_datapoints = len([x for x in output_datapoints if x != NO_PITCH]) matches = 0 + cross_octave_matches = 0 pitch_where_should_be_no_pitch = 0 no_pitch_where_should_be_pitch = 0 for index, _ in enumerate(input_datapoints): @@ -147,17 +148,31 @@ def compare_pitches(input_ultrastar_class, output_ultrastar_class) -> tuple[floa matches 
+= 1 elif input_pitch == NO_PITCH: pitch_where_should_be_no_pitch += 1 - else: + elif output_pitch == NO_PITCH: no_pitch_where_should_be_pitch += 1 + else: + _, input_pitch_remainder = divmod(input_pitch, 12) + _, output_pitch_remainder = divmod(output_pitch, 12) + if input_pitch_remainder == output_pitch_remainder: + cross_octave_matches += 1 input_match_ratio = matches / input_pitched_datapoints output_match_ratio = matches / output_pitched_datapoints + cross_octave_input_match_ratio = (matches + cross_octave_matches) / input_pitched_datapoints + cross_octave_output_match_ratio = (matches + cross_octave_matches) / output_pitched_datapoints + output_pitch_where_should_be_no_pitch_ratio = pitch_where_should_be_no_pitch / output_pitched_datapoints output_no_pitch_where_should_be_pitch_ratio = no_pitch_where_should_be_pitch / input_pitched_datapoints - return input_match_ratio, output_match_ratio, output_pitch_where_should_be_no_pitch_ratio, output_no_pitch_where_should_be_pitch_ratio + return (input_match_ratio, + output_match_ratio, + cross_octave_input_match_ratio, + cross_octave_output_match_ratio, + output_pitch_where_should_be_no_pitch_ratio, + output_no_pitch_where_should_be_pitch_ratio + ) def determine_nearest_end_step(input_ultrastar_class, step_size) -> int: diff --git a/src/modules/Ultrastar/ultrastar_txt.py b/src/modules/Ultrastar/ultrastar_txt.py index 3e21273..cdddca0 100644 --- a/src/modules/Ultrastar/ultrastar_txt.py +++ b/src/modules/Ultrastar/ultrastar_txt.py @@ -64,7 +64,7 @@ class UltrastarTxtValue: mp3 = "" video = None videoGap = None - gap = "" + gap = "0" bpm = "" language = None cover = None From c139c74775f36c80f387d47fd98109464ee99fe3 Mon Sep 17 00:00:00 2001 From: Benedikt Wagener Date: Sun, 8 Oct 2023 19:25:31 +0200 Subject: [PATCH 09/19] fix some bugs --- src/UltraSingerEvaluation.py | 6 +- src/modules/Speech_Recognition/Whisper.py | 92 +++++++++++--------- src/modules/Ultrastar/ultrastar_converter.py | 9 +- src/modules/Ultrastar/ultrastar_parser.py | 48 +++++----- 4 files changed, 82 insertions(+), 73 deletions(-) diff --git a/src/UltraSingerEvaluation.py b/src/UltraSingerEvaluation.py index 1393722..03230b3 100644 --- a/src/UltraSingerEvaluation.py +++ b/src/UltraSingerEvaluation.py @@ -1,5 +1,6 @@ import copy import os +import traceback from datetime import datetime from pathlib import Path from typing import List @@ -75,9 +76,6 @@ def main() -> None: test_song_settings.cache_override_path = song_cache_path UltraSinger.settings = test_song_settings - output_txt = None - simple_score = None - accurate_score = None tested_song = TestedSong(test_song.input_txt) test_run.tested_songs.append(tested_song) try: @@ -86,6 +84,7 @@ def main() -> None: print( f"{ULTRASINGER_HEAD} {red_highlighted('Error!')} Failed to process {test_song.input_txt}\n{error}." 
) + traceback.print_exc() continue @@ -137,6 +136,7 @@ def find_ultrastar_song( and not song_folder_item.endswith("[Karaoke].txt") and not song_folder_item.endswith("[MULTI].txt") and not song_folder_item.endswith("[DUET].txt") + and not song_folder_item.endswith("instrumental.txt") ): txt_file = os.path.join(song_folder, song_folder_item) ultrastar_class = ultrastar_parser.parse_ultrastar_txt(txt_file) diff --git a/src/modules/Speech_Recognition/Whisper.py b/src/modules/Speech_Recognition/Whisper.py index 565b123..daa0123 100644 --- a/src/modules/Speech_Recognition/Whisper.py +++ b/src/modules/Speech_Recognition/Whisper.py @@ -10,6 +10,8 @@ from modules.Speech_Recognition.TranscribedData import TranscribedData, from_whisper +MEMORY_ERROR_MESSAGE = f"{ULTRASINGER_HEAD} {blue_highlighted('whisper')} ran out of GPU memory; reduce --whisper_batch_size or force usage of cpu with --force_cpu" + def transcribe_with_whisper( audio_path: str, model: str, @@ -34,6 +36,46 @@ def transcribe_with_whisper( loaded_whisper_model = whisperx.load_model( model, language=language, device=device, compute_type=compute_type ) + + audio = whisperx.load_audio(audio_path) + + print(f"{ULTRASINGER_HEAD} Transcribing {audio_path}") + + result = loaded_whisper_model.transcribe( + audio, batch_size=batch_size, language=language + ) + + detected_language = result["language"] + if language is None: + language = detected_language + + # load alignment model and metadata + try: + model_a, metadata = whisperx.load_align_model( + language_code=language, device=device, model_name=model_name + ) + except ValueError as ve: + print( + f"{red_highlighted(f'{ve}')}" + f"\n" + f"{ULTRASINGER_HEAD} {red_highlighted('Error:')} Unknown language. " + f"Try add it with --align_model [hugingface]." + ) + sys.exit(1) + + # align whisper output + result_aligned = whisperx.align( + result["segments"], + model_a, + metadata, + audio, + device, + return_char_alignments=False, + ) + + transcribed_data = convert_to_transcribed_data(result_aligned) + + return TranscriptionResult(transcribed_data, detected_language) except ValueError as value_error: if ( "Requested float16 compute type, but the target device or backend do not support efficient float16 computation." @@ -48,50 +90,14 @@ def transcribe_with_whisper( raise value_error except OutOfMemoryError as oom_exception: print(oom_exception) - print( - f"{ULTRASINGER_HEAD} {blue_highlighted('whisper')} ran out of GPU memory; reduce --whisper_batch_size or force usage of cpu with --force_cpu" - ) + print(MEMORY_ERROR_MESSAGE) sys.exit(1) - - audio = whisperx.load_audio(audio_path) - - print(f"{ULTRASINGER_HEAD} Transcribing {audio_path}") - - result = loaded_whisper_model.transcribe( - audio, batch_size=batch_size, language=language - ) - - detected_language = result["language"] - if language is None: - language = detected_language - - # load alignment model and metadata - try: - model_a, metadata = whisperx.load_align_model( - language_code=language, device=device, model_name=model_name - ) - except ValueError as ve: - print( - f"{red_highlighted(f'{ve}')}" - f"\n" - f"{ULTRASINGER_HEAD} {red_highlighted('Error:')} Unknown language. " - f"Try add it with --align_model [hugingface]." 
- ) - sys.exit(1) - - # align whisper output - result_aligned = whisperx.align( - result["segments"], - model_a, - metadata, - audio, - device, - return_char_alignments=False, - ) - - transcribed_data = convert_to_transcribed_data(result_aligned) - - return TranscriptionResult(transcribed_data, detected_language) + except Exception as exception: + if "CUDA failed with error out of memory" in str(exception.args[0]): + print(exception) + print(MEMORY_ERROR_MESSAGE) + sys.exit(1) + raise exception def convert_to_transcribed_data(result_aligned): diff --git a/src/modules/Ultrastar/ultrastar_converter.py b/src/modules/Ultrastar/ultrastar_converter.py index 8ef002c..6e1e83c 100644 --- a/src/modules/Ultrastar/ultrastar_converter.py +++ b/src/modules/Ultrastar/ultrastar_converter.py @@ -111,13 +111,13 @@ def map_to_datapoints( data += [NO_PITCH] * gap_steps_count pitch_steps_count = duration // step_size - data += [pitch] * pitch_steps_count + data += [int(pitch)] * pitch_steps_count previous_step = end_nearest_step return data -def compare_pitches(input_ultrastar_class, output_ultrastar_class) -> tuple[float, float, float, float]: +def compare_pitches(input_ultrastar_class, output_ultrastar_class) -> tuple[float, float, float, float, float, float]: step_size = 10 input_datapoints = map_to_datapoints(input_ultrastar_class, step_size) @@ -141,7 +141,7 @@ def compare_pitches(input_ultrastar_class, output_ultrastar_class) -> tuple[floa for index, _ in enumerate(input_datapoints): input_pitch = input_datapoints[index] output_pitch = output_datapoints[index] - if input_pitch != NO_PITCH and output_pitch != NO_PITCH: + if input_pitch == NO_PITCH and output_pitch == NO_PITCH: continue if input_pitch == output_pitch: @@ -165,14 +165,13 @@ def compare_pitches(input_ultrastar_class, output_ultrastar_class) -> tuple[floa output_pitch_where_should_be_no_pitch_ratio = pitch_where_should_be_no_pitch / output_pitched_datapoints output_no_pitch_where_should_be_pitch_ratio = no_pitch_where_should_be_pitch / input_pitched_datapoints - return (input_match_ratio, output_match_ratio, cross_octave_input_match_ratio, cross_octave_output_match_ratio, output_pitch_where_should_be_no_pitch_ratio, output_no_pitch_where_should_be_pitch_ratio - ) + ) def determine_nearest_end_step(input_ultrastar_class, step_size) -> int: diff --git a/src/modules/Ultrastar/ultrastar_parser.py b/src/modules/Ultrastar/ultrastar_parser.py index f1aaca2..63f19de 100644 --- a/src/modules/Ultrastar/ultrastar_parser.py +++ b/src/modules/Ultrastar/ultrastar_parser.py @@ -12,6 +12,8 @@ FILE_ENCODING, ) +CHARACTERS_TO_REMOVE = ["\ufeff"] + def parse_ultrastar_txt(input_file: str) -> UltrastarTxtValue: """Parse ultrastar txt file to UltrastarTxt class""" @@ -19,33 +21,35 @@ def parse_ultrastar_txt(input_file: str) -> UltrastarTxtValue: with open(input_file, "r", encoding=FILE_ENCODING) as file: txt = file.readlines() - ultrastar_class = UltrastarTxtValue() count = 0 # Strips the newline character for line in txt: + filtered_line = line + for character_to_remove in CHARACTERS_TO_REMOVE: + filtered_line = filtered_line.replace(character_to_remove, "") count += 1 - if line.startswith("#"): - if line.startswith(f"#{UltrastarTxtTag.ARTIST}"): - ultrastar_class.artist = line.split(":")[1].replace("\n", "") - elif line.startswith(f"#{UltrastarTxtTag.TITLE}"): - ultrastar_class.title = line.split(":")[1].replace("\n", "") - elif line.startswith(f"#{UltrastarTxtTag.MP3}"): - ultrastar_class.mp3 = line.split(":")[1].replace("\n", "") - elif 
line.startswith(f"#{UltrastarTxtTag.GAP}"): - ultrastar_class.gap = line.split(":")[1].replace("\n", "") - elif line.startswith(f"#{UltrastarTxtTag.BPM}"): - ultrastar_class.bpm = line.split(":")[1].replace("\n", "") - elif line.startswith(f"#{UltrastarTxtTag.VIDEO}"): - ultrastar_class.video = line.split(":")[1].replace("\n", "") - elif line.startswith(f"#{UltrastarTxtTag.VIDEOGAP}"): - ultrastar_class.videoGap = line.split(":")[1].replace("\n", "") - elif line.startswith(f"#{UltrastarTxtTag.COVER}"): - ultrastar_class.cover = line.split(":")[1].replace("\n", "") - elif line.startswith(f"#{UltrastarTxtTag.BACKGROUND}"): - ultrastar_class.background = line.split(":")[1].replace("\n", "") - elif line.startswith( + if filtered_line.startswith("#"): + if filtered_line.startswith(f"#{UltrastarTxtTag.ARTIST}"): + ultrastar_class.artist = filtered_line.split(":")[1].replace("\n", "") + elif filtered_line.startswith(f"#{UltrastarTxtTag.TITLE}"): + ultrastar_class.title = filtered_line.split(":")[1].replace("\n", "") + elif filtered_line.startswith(f"#{UltrastarTxtTag.MP3}"): + ultrastar_class.mp3 = filtered_line.split(":")[1].replace("\n", "") + elif filtered_line.startswith(f"#{UltrastarTxtTag.GAP}"): + ultrastar_class.gap = filtered_line.split(":")[1].replace("\n", "") + elif filtered_line.startswith(f"#{UltrastarTxtTag.BPM}"): + ultrastar_class.bpm = filtered_line.split(":")[1].replace("\n", "") + elif filtered_line.startswith(f"#{UltrastarTxtTag.VIDEO}"): + ultrastar_class.video = filtered_line.split(":")[1].replace("\n", "") + elif filtered_line.startswith(f"#{UltrastarTxtTag.VIDEOGAP}"): + ultrastar_class.videoGap = filtered_line.split(":")[1].replace("\n", "") + elif filtered_line.startswith(f"#{UltrastarTxtTag.COVER}"): + ultrastar_class.cover = filtered_line.split(":")[1].replace("\n", "") + elif filtered_line.startswith(f"#{UltrastarTxtTag.BACKGROUND}"): + ultrastar_class.background = filtered_line.split(":")[1].replace("\n", "") + elif filtered_line.startswith( ( f"{UltrastarTxtNoteTypeTag.FREESTYLE} ", f"{UltrastarTxtNoteTypeTag.NORMAL} ", @@ -54,7 +58,7 @@ def parse_ultrastar_txt(input_file: str) -> UltrastarTxtValue: f"{UltrastarTxtNoteTypeTag.RAP_GOLDEN} ", ) ): - parts = line.split() + parts = filtered_line.split() # [0] F : * R G # [1] start beat # [2] duration From 4339c3594ada94b1e4d63891d07fa3c349654fa7 Mon Sep 17 00:00:00 2001 From: Benedikt Wagener Date: Sun, 8 Oct 2023 19:43:35 +0200 Subject: [PATCH 10/19] fix some bugs --- src/modules/Speech_Recognition/Whisper.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/modules/Speech_Recognition/Whisper.py b/src/modules/Speech_Recognition/Whisper.py index daa0123..c8bbc5a 100644 --- a/src/modules/Speech_Recognition/Whisper.py +++ b/src/modules/Speech_Recognition/Whisper.py @@ -107,9 +107,7 @@ def convert_to_transcribed_data(result_aligned): vtd = from_whisper(obj) # create custom Word object vtd.word = vtd.word + " " # add space to end of word if len(obj) < 4: - previous = transcribed_data[-1] - if not previous: - previous.end = 0 + previous = transcribed_data[-1] if len(transcribed_data) != 0 else {"end": 0, "word": ""} vtd.start = previous.end + 0.1 vtd.end = previous.end + 0.2 msg = f'Error: There is no timestamp for word: {obj["word"]}. 
' \ From e2c22095fd82d8e7709e8fc78468b0e0ec8646d4 Mon Sep 17 00:00:00 2001 From: Benedikt Wagener Date: Sun, 8 Oct 2023 20:36:10 +0200 Subject: [PATCH 11/19] fix some bugs --- src/UltraSingerEvaluation.py | 8 ++++---- src/modules/Research/TestRun.py | 5 +++-- src/modules/Ultrastar/ultrastar_converter.py | 19 +++++++++++-------- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/src/UltraSingerEvaluation.py b/src/UltraSingerEvaluation.py index 03230b3..a884e13 100644 --- a/src/UltraSingerEvaluation.py +++ b/src/UltraSingerEvaluation.py @@ -102,8 +102,8 @@ def main() -> None: ( input_match_ratio, output_match_ratio, - cross_octave_input_match_ratio, - cross_octave_output_match_ratio, + input_pitch_shift_match_ratios, + output_pitch_shift_match_ratios, pitch_where_should_be_no_pitch_ratio, no_pitch_where_should_be_pitch_ratio, ) = compare_pitches(test_song.input_ultrastar_class, ultrastar_class) @@ -112,8 +112,8 @@ def main() -> None: tested_song.success = True tested_song.input_match_ratio = input_match_ratio tested_song.output_match_ratio = output_match_ratio - tested_song.cross_octave_input_match_ratio = cross_octave_input_match_ratio - tested_song.cross_octave_output_match_ratio = cross_octave_output_match_ratio + tested_song.input_pitch_shift_match_ratios = input_pitch_shift_match_ratios + tested_song.output_pitch_shift_match_ratios = output_pitch_shift_match_ratios tested_song.pitch_where_should_be_no_pitch_ratio = pitch_where_should_be_no_pitch_ratio tested_song.no_pitch_where_should_be_pitch_ratio = no_pitch_where_should_be_pitch_ratio tested_song.output_score_simple = simple_score diff --git a/src/modules/Research/TestRun.py b/src/modules/Research/TestRun.py index 7070c28..f381a01 100644 --- a/src/modules/Research/TestRun.py +++ b/src/modules/Research/TestRun.py @@ -15,8 +15,8 @@ class TestedSong: success: bool = False input_match_ratio: float = 0.0 output_match_ratio: float = 0.0 - cross_octave_input_match_ratio: float = 0.0 - cross_octave_output_match_ratio: float = 0.0 + input_pitch_shift_match_ratios: dict[int, float] = field(default_factory=lambda: {}) + output_pitch_shift_match_ratios: dict[int, float] = field(default_factory=lambda: {}) no_pitch_where_should_be_pitch_ratio: float = 0.0 pitch_where_should_be_no_pitch_ratio: float = 0.0 output_score_simple: int = 0 @@ -27,5 +27,6 @@ class TestedSong: @dataclass class TestRun: """Test run""" + settings: Settings tested_songs: list[TestedSong] = field(default_factory=lambda: []) diff --git a/src/modules/Ultrastar/ultrastar_converter.py b/src/modules/Ultrastar/ultrastar_converter.py index 6e1e83c..28a7a9b 100644 --- a/src/modules/Ultrastar/ultrastar_converter.py +++ b/src/modules/Ultrastar/ultrastar_converter.py @@ -117,7 +117,7 @@ def map_to_datapoints( return data -def compare_pitches(input_ultrastar_class, output_ultrastar_class) -> tuple[float, float, float, float, float, float]: +def compare_pitches(input_ultrastar_class, output_ultrastar_class) -> tuple[float, float, dict[int, float], dict[int, float], float, float]: step_size = 10 input_datapoints = map_to_datapoints(input_ultrastar_class, step_size) @@ -135,7 +135,7 @@ def compare_pitches(input_ultrastar_class, output_ultrastar_class) -> tuple[floa output_pitched_datapoints = len([x for x in output_datapoints if x != NO_PITCH]) matches = 0 - cross_octave_matches = 0 + pitch_shift_matches = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] pitch_where_should_be_no_pitch = 0 no_pitch_where_should_be_pitch = 0 for index, _ in enumerate(input_datapoints): @@ -153,22 +153,25 @@ 
def compare_pitches(input_ultrastar_class, output_ultrastar_class) -> tuple[floa else: _, input_pitch_remainder = divmod(input_pitch, 12) _, output_pitch_remainder = divmod(output_pitch, 12) - if input_pitch_remainder == output_pitch_remainder: - cross_octave_matches += 1 + pitch_difference = abs(input_pitch_remainder - output_pitch_remainder) + pitch_shift_matches[pitch_difference] += 1 input_match_ratio = matches / input_pitched_datapoints output_match_ratio = matches / output_pitched_datapoints - cross_octave_input_match_ratio = (matches + cross_octave_matches) / input_pitched_datapoints - cross_octave_output_match_ratio = (matches + cross_octave_matches) / output_pitched_datapoints + input_pitch_shift_match_ratios = {} + output_pitch_shift_match_ratios = {} + for index, pitch_shift_matches_item in enumerate(pitch_shift_matches): + input_pitch_shift_match_ratios[index] = (matches + pitch_shift_matches_item) / input_pitched_datapoints + output_pitch_shift_match_ratios[index] = (matches + pitch_shift_matches_item) / output_pitched_datapoints output_pitch_where_should_be_no_pitch_ratio = pitch_where_should_be_no_pitch / output_pitched_datapoints output_no_pitch_where_should_be_pitch_ratio = no_pitch_where_should_be_pitch / input_pitched_datapoints return (input_match_ratio, output_match_ratio, - cross_octave_input_match_ratio, - cross_octave_output_match_ratio, + input_pitch_shift_match_ratios, + output_pitch_shift_match_ratios, output_pitch_where_should_be_no_pitch_ratio, output_no_pitch_where_should_be_pitch_ratio ) From c3bd2d0b76f2acfc1f09dc28e0b087ccbe027457 Mon Sep 17 00:00:00 2001 From: Benedikt Wagener Date: Sun, 8 Oct 2023 21:10:05 +0200 Subject: [PATCH 12/19] fix some bugs --- src/UltraSinger.py | 1 + src/modules/Speech_Recognition/Whisper.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/UltraSinger.py b/src/UltraSinger.py index d7bd992..2fadec7 100644 --- a/src/UltraSinger.py +++ b/src/UltraSinger.py @@ -570,6 +570,7 @@ def separate_vocal_from_audio( input_path = ultrastar_audio_input_path convert_audio_to_mono_wav(input_path, settings.mono_audio_path) + return audio_separation_path diff --git a/src/modules/Speech_Recognition/Whisper.py b/src/modules/Speech_Recognition/Whisper.py index c8bbc5a..599a7f3 100644 --- a/src/modules/Speech_Recognition/Whisper.py +++ b/src/modules/Speech_Recognition/Whisper.py @@ -107,7 +107,7 @@ def convert_to_transcribed_data(result_aligned): vtd = from_whisper(obj) # create custom Word object vtd.word = vtd.word + " " # add space to end of word if len(obj) < 4: - previous = transcribed_data[-1] if len(transcribed_data) != 0 else {"end": 0, "word": ""} + previous = transcribed_data[-1] if len(transcribed_data) != 0 else TranscribedData() vtd.start = previous.end + 0.1 vtd.end = previous.end + 0.2 msg = f'Error: There is no timestamp for word: {obj["word"]}. 
' \ From 4729daf2dd6e0a339f6b5979ab93d9240090e9b6 Mon Sep 17 00:00:00 2001 From: Benedikt Wagener Date: Sat, 14 Oct 2023 10:43:06 +0200 Subject: [PATCH 13/19] remove hard sys.exit calls for batch processing --- src/UltraSinger.py | 2 +- src/modules/Speech_Recognition/Whisper.py | 9 +++++---- src/modules/Ultrastar/ultrastar_converter.py | 7 +++++-- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/UltraSinger.py b/src/UltraSinger.py index 2fadec7..baffdcf 100644 --- a/src/UltraSinger.py +++ b/src/UltraSinger.py @@ -503,7 +503,7 @@ def get_unused_song_output_dir(path: str) -> str: print( f"{ULTRASINGER_HEAD} {red_highlighted('Error: Could not create output folder! (999) is the maximum number of tries.')}" ) - sys.exit(1) + raise ValueError("Could not create output folder! (999) is the maximum number of tries.") return path diff --git a/src/modules/Speech_Recognition/Whisper.py b/src/modules/Speech_Recognition/Whisper.py index 599a7f3..46cc8dd 100644 --- a/src/modules/Speech_Recognition/Whisper.py +++ b/src/modules/Speech_Recognition/Whisper.py @@ -2,6 +2,7 @@ import sys +import torch import whisperx from torch.cuda import OutOfMemoryError @@ -12,6 +13,7 @@ MEMORY_ERROR_MESSAGE = f"{ULTRASINGER_HEAD} {blue_highlighted('whisper')} ran out of GPU memory; reduce --whisper_batch_size or force usage of cpu with --force_cpu" + def transcribe_with_whisper( audio_path: str, model: str, @@ -33,6 +35,7 @@ def transcribe_with_whisper( compute_type = "float16" if device == "cuda" else "int8" try: + torch.cuda.empty_cache() loaded_whisper_model = whisperx.load_model( model, language=language, device=device, compute_type=compute_type ) @@ -61,7 +64,7 @@ def transcribe_with_whisper( f"{ULTRASINGER_HEAD} {red_highlighted('Error:')} Unknown language. " f"Try add it with --align_model [hugingface]." 
) - sys.exit(1) + raise ve # align whisper output result_aligned = whisperx.align( @@ -85,18 +88,16 @@ def transcribe_with_whisper( print( f"{ULTRASINGER_HEAD} Your GPU does not support efficient float16 computation; run UltraSinger with '--whisper_compute_type int8'" ) - sys.exit(1) raise value_error except OutOfMemoryError as oom_exception: print(oom_exception) print(MEMORY_ERROR_MESSAGE) - sys.exit(1) + raise oom_exception except Exception as exception: if "CUDA failed with error out of memory" in str(exception.args[0]): print(exception) print(MEMORY_ERROR_MESSAGE) - sys.exit(1) raise exception diff --git a/src/modules/Ultrastar/ultrastar_converter.py b/src/modules/Ultrastar/ultrastar_converter.py index 28a7a9b..7a4aee6 100644 --- a/src/modules/Ultrastar/ultrastar_converter.py +++ b/src/modules/Ultrastar/ultrastar_converter.py @@ -162,8 +162,11 @@ def compare_pitches(input_ultrastar_class, output_ultrastar_class) -> tuple[floa input_pitch_shift_match_ratios = {} output_pitch_shift_match_ratios = {} for index, pitch_shift_matches_item in enumerate(pitch_shift_matches): - input_pitch_shift_match_ratios[index] = (matches + pitch_shift_matches_item) / input_pitched_datapoints - output_pitch_shift_match_ratios[index] = (matches + pitch_shift_matches_item) / output_pitched_datapoints + pitch_shift_matches_count = pitch_shift_matches_item + if index == 0: + pitch_shift_matches_count += matches + input_pitch_shift_match_ratios[index] = pitch_shift_matches_item / input_pitched_datapoints + output_pitch_shift_match_ratios[index] = pitch_shift_matches_item / output_pitched_datapoints output_pitch_where_should_be_no_pitch_ratio = pitch_where_should_be_no_pitch / output_pitched_datapoints output_no_pitch_where_should_be_pitch_ratio = no_pitch_where_should_be_pitch / input_pitched_datapoints From 945581dbd8528bf2e681b076c29f4ca737d4f555 Mon Sep 17 00:00:00 2001 From: Benedikt Wagener Date: Sat, 14 Oct 2023 14:30:17 +0200 Subject: [PATCH 14/19] add test run evaluation --- src/UltraSingerEvaluation.py | 18 +++-- src/UltraSingerMetaEvaluation.py | 109 +++++++++++++++++++++++++++++++ src/modules/Research/TestRun.py | 5 +- 3 files changed, 123 insertions(+), 9 deletions(-) create mode 100644 src/UltraSingerMetaEvaluation.py diff --git a/src/UltraSingerEvaluation.py b/src/UltraSingerEvaluation.py index a884e13..1d06292 100644 --- a/src/UltraSingerEvaluation.py +++ b/src/UltraSingerEvaluation.py @@ -6,6 +6,8 @@ from typing import List import importlib.util +import pandas + import UltraSinger from Settings import Settings from modules.DeviceDetection.device_detection import check_gpu_support @@ -19,9 +21,11 @@ test_input_folder = os.path.normpath(os.path.abspath(__file__ + "/../../test_input")) test_output_folder = os.path.normpath(os.path.abspath(__file__ + "/../../test_output")) +test_start_time = datetime.now() test_run_folder = os.path.join( - test_output_folder, datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + test_output_folder, test_start_time.strftime("%Y-%m-%d_%H-%M-%S") ) +test_run_songs_folder = os.path.join(test_run_folder, "songs") def main() -> None: @@ -29,9 +33,10 @@ def main() -> None: Path(test_input_folder).mkdir(parents=True, exist_ok=True) Path(test_output_folder).mkdir(parents=True, exist_ok=True) Path(test_run_folder).mkdir(parents=True) + Path(test_run_songs_folder).mkdir(parents=True) base_settings = initialize_settings() - base_settings.output_file_path = test_run_folder + base_settings.output_file_path = test_run_songs_folder base_settings.test_songs_input_folder = os.path.normpath( 
base_settings.test_songs_input_folder @@ -60,7 +65,7 @@ def main() -> None: print(f"{ULTRASINGER_HEAD} Running evaluation for {len(test_songs)} songs") - test_run = TestRun(base_settings) + test_run = TestRun(base_settings, test_start_time) for index, test_song in enumerate(test_songs): print(f"{ULTRASINGER_HEAD} ========================") print( @@ -79,7 +84,7 @@ def main() -> None: tested_song = TestedSong(test_song.input_txt) test_run.tested_songs.append(tested_song) try: - output_txt, simple_score, accurate_score = UltraSinger.run() + output_txt, _, _ = UltraSinger.run() except Exception as error: print( f"{ULTRASINGER_HEAD} {red_highlighted('Error!')} Failed to process {test_song.input_txt}\n{error}." @@ -89,7 +94,7 @@ def main() -> None: output_folder_name = f"{test_song.input_ultrastar_class.artist} - {test_song.input_ultrastar_class.title}" - output_folder = os.path.join(test_run_folder, output_folder_name) + output_folder = os.path.join(test_run_songs_folder, output_folder_name) if not os.path.isfile(output_txt): print( @@ -116,9 +121,8 @@ def main() -> None: tested_song.output_pitch_shift_match_ratios = output_pitch_shift_match_ratios tested_song.pitch_where_should_be_no_pitch_ratio = pitch_where_should_be_no_pitch_ratio tested_song.no_pitch_where_should_be_pitch_ratio = no_pitch_where_should_be_pitch_ratio - tested_song.output_score_simple = simple_score - tested_song.output_score_accurate = accurate_score + test_run.end_time = datetime.now() test_run_result_file = os.path.join(test_run_folder, "run.json") test_run_json = test_run.to_json() with open(test_run_result_file, "w", encoding=FILE_ENCODING) as file: diff --git a/src/UltraSingerMetaEvaluation.py b/src/UltraSingerMetaEvaluation.py new file mode 100644 index 0000000..d26da4b --- /dev/null +++ b/src/UltraSingerMetaEvaluation.py @@ -0,0 +1,109 @@ +import os +from pathlib import Path +from typing import List + +import pandas + +from modules.Research.TestRun import TestRun +from modules.console_colors import ULTRASINGER_HEAD, red_highlighted + +test_input_folder = os.path.normpath(os.path.abspath(__file__ + "/../../test_input")) +test_output_folder = os.path.normpath(os.path.abspath(__file__ + "/../../test_output")) + + +def main() -> None: + """Main function""" + Path(test_output_folder).mkdir(parents=True, exist_ok=True) + + test_runs: List[TestRun] = [] + for dir_entry in os.listdir(test_output_folder): + test_run_folder = os.path.join(test_output_folder, dir_entry) + test_run = find_test_run_result(test_run_folder) + if test_run is None: + continue + + test_runs.append(test_run) + + if len(test_runs) == 0: + print( + f"{ULTRASINGER_HEAD} {red_highlighted('Error!')} No test runs found in {test_output_folder}." 
+ ) + exit(1) + + print(f"{ULTRASINGER_HEAD} Running meta evaluation for {len(test_runs)} test runs") + + for test_run in test_runs: + tested_songs_dicts = [] + for tested_song in [s for s in test_run.tested_songs if s.success]: + tested_song_dict = tested_song.to_dict() + + best_input_pitch_shift_match_ratio = max( + tested_song.input_pitch_shift_match_ratios.values() + ) + + # based on the pitch shift of the highest input_pitch_shift_match_ratio picked previously + # we pick the corresponding value of output_pitch_shift_match_ratios + matching_input_best_output_pitch_shift_match_ratio = ( + tested_song.output_pitch_shift_match_ratios[ + list(tested_song.input_pitch_shift_match_ratios.values()).index( + best_input_pitch_shift_match_ratio + ) + ] + ) + + best_output_pitch_shift_match_ratio = max( + tested_song.output_pitch_shift_match_ratios.values() + ) + + # based on the pitch shift of the highest output_pitch_shift_match_ratio picked previously + # we pick the corresponding value of input_pitch_shift_match_ratios + matching_output_best_input_pitch_shift_match_ratio = ( + tested_song.input_pitch_shift_match_ratios[ + list(tested_song.output_pitch_shift_match_ratios.values()).index( + best_output_pitch_shift_match_ratio + ) + ] + ) + + tested_song_dict[ + "best_input_pitch_shift_match_ratio" + ] = best_input_pitch_shift_match_ratio + tested_song_dict[ + "matching_input_best_output_pitch_shift_match_ratio" + ] = matching_input_best_output_pitch_shift_match_ratio + tested_song_dict[ + "best_output_pitch_shift_match_ratio" + ] = best_output_pitch_shift_match_ratio + tested_song_dict[ + "matching_output_best_input_pitch_shift_match_ratio" + ] = matching_output_best_input_pitch_shift_match_ratio + + tested_songs_dicts.append(tested_song_dict) + + records = pandas.DataFrame.from_records(tested_songs_dicts) + pandas.options.display.max_columns = records.shape[1] + describe_result = records.describe(percentiles=[0.25, 0.5, 0.75, 0.95, 0.99]) + print(describe_result) + + print("Done") + + +def find_test_run_result(test_run_folder) -> TestRun: + if os.path.isdir(test_run_folder): + for test_run_folder_item in os.listdir(test_run_folder): + test_run_folder_item_path = os.path.join( + test_run_folder, test_run_folder_item + ) + if ( + os.path.isfile(test_run_folder_item_path) + and test_run_folder_item == "run.json" + ): + test_run = None + with open(test_run_folder_item_path) as file: + json = file.read() + test_run = TestRun.from_json(json) + return test_run + + +if __name__ == "__main__": + main() diff --git a/src/modules/Research/TestRun.py b/src/modules/Research/TestRun.py index f381a01..ed573a8 100644 --- a/src/modules/Research/TestRun.py +++ b/src/modules/Research/TestRun.py @@ -1,3 +1,4 @@ +import datetime from dataclasses import dataclass, field from dataclasses_json import dataclass_json @@ -19,8 +20,6 @@ class TestedSong: output_pitch_shift_match_ratios: dict[int, float] = field(default_factory=lambda: {}) no_pitch_where_should_be_pitch_ratio: float = 0.0 pitch_where_should_be_no_pitch_ratio: float = 0.0 - output_score_simple: int = 0 - output_score_accurate: int = 0 @dataclass_json @@ -29,4 +28,6 @@ class TestRun: """Test run""" settings: Settings + start_time: datetime.datetime = None + end_time: datetime.datetime = None tested_songs: list[TestedSong] = field(default_factory=lambda: []) From 582b6440f27ce75c5a68e18bc701ae8632966672 Mon Sep 17 00:00:00 2001 From: Vadim Date: Wed, 19 Jun 2024 01:18:06 +0200 Subject: [PATCH 15/19] Fix merge conflicts --- src/UltraSinger.py | 4 +- 
src/modules/Audio/separation.py | 14 +++-- src/modules/Audio/silence_processing.py | 2 +- src/modules/Speech_Recognition/Whisper.py | 77 ++++++++++++----------- 4 files changed, 50 insertions(+), 47 deletions(-) diff --git a/src/UltraSinger.py b/src/UltraSinger.py index db49042..9d6205d 100644 --- a/src/UltraSinger.py +++ b/src/UltraSinger.py @@ -489,9 +489,9 @@ def run() -> tuple[str, Score, Score]: remove_unecessary_punctuations(transcription_result.transcribed_data) if settings.hyphenation: - hyphen_words = hyphenate_each_word(language, transcribed_data) + hyphen_words = hyphenate_each_word(language, transcription_result.transcribed_data) if hyphen_words is not None: - transcribed_data = add_hyphen_to_data(transcribed_data, hyphen_words) + transcribed_data = add_hyphen_to_data(transcription_result.transcribed_data, hyphen_words) transcribed_data = remove_silence_from_transcription_data( settings.processing_audio_path, transcribed_data diff --git a/src/modules/Audio/separation.py b/src/modules/Audio/separation.py index 479966d..55335ac 100644 --- a/src/modules/Audio/separation.py +++ b/src/modules/Audio/separation.py @@ -20,11 +20,13 @@ def separate_audio(input_file_path: str, output_folder: str, device="cpu") -> No f"{ULTRASINGER_HEAD} Separating vocals from audio with {blue_highlighted('demucs')} and {red_highlighted(device)} as worker." ) - demucs.separate.main(shlex.split(f'--two-stems vocals -d {device} --out "{os.path.join(output_folder, "separated")}" "{input_file_path}"')) # Model selection? # -n htdemucs_ft - subprocess.run( - ["demucs", "-d", device, "--two-stems=vocals", "--float32", input_file_path] - ) - separated_folder = path_join(current_executor_path(), "separated") - move(separated_folder, output_file) \ No newline at end of file + # subprocess.run( + # ["demucs", "-d", device, "--two-stems=vocals", "--float32", input_file_path] + # ) + # separated_folder = path_join(current_executor_path(), "separated") + # move(separated_folder, output_file) + + # fixme "--float32" is missing + demucs.separate.main(shlex.split(f'--two-stems vocals -d {device} --out "{os.path.join(output_folder, "separated")}" "{input_file_path}"')) diff --git a/src/modules/Audio/silence_processing.py b/src/modules/Audio/silence_processing.py index da11172..46f9fea 100644 --- a/src/modules/Audio/silence_processing.py +++ b/src/modules/Audio/silence_processing.py @@ -63,7 +63,7 @@ def remove_silence(silence_parts_list: list[tuple[float, float]], transcribed_da split_word = "~ " is_word_end = True - split_data = TranscribedData({"conf": data.conf, "word": split_word, "end": split_end, "start": silence_end, "is_word_end": is_word_end}) + split_data = TranscribedData({"conf": data.confidence, "word": split_word, "end": split_end, "start": silence_end, "is_word_end": is_word_end}) if not was_split: data.end = silence_start diff --git a/src/modules/Speech_Recognition/Whisper.py b/src/modules/Speech_Recognition/Whisper.py index db176f9..a0fec68 100644 --- a/src/modules/Speech_Recognition/Whisper.py +++ b/src/modules/Speech_Recognition/Whisper.py @@ -41,18 +41,38 @@ def transcribe_with_whisper( loaded_whisper_model = whisperx.load_model( model, language=language, device=device, compute_type=compute_type ) + except ValueError as value_error: + if ( + "Requested float16 compute type, but the target device or backend do not support efficient float16 computation." 
+ in str(value_error.args[0]) + ): + print(value_error) + print( + f"{ULTRASINGER_HEAD} Your GPU does not support efficient float16 computation; run UltraSinger with '--whisper_compute_type int8'" + ) + + raise value_error + except OutOfMemoryError as oom_exception: + print(oom_exception) + print(MEMORY_ERROR_MESSAGE) + raise oom_exception + except Exception as exception: + if "CUDA failed with error out of memory" in str(exception.args[0]): + print(exception) + print(MEMORY_ERROR_MESSAGE) + raise exception - audio = whisperx.load_audio(audio_path) + audio = whisperx.load_audio(audio_path) - print(f"{ULTRASINGER_HEAD} Transcribing {audio_path}") + print(f"{ULTRASINGER_HEAD} Transcribing {audio_path}") - result = loaded_whisper_model.transcribe( - audio, batch_size=batch_size, language=language - ) + result = loaded_whisper_model.transcribe( + audio, batch_size=batch_size, language=language + ) - detected_language = result["language"] - if language is None: - language = detected_language + detected_language = result["language"] + if language is None: + language = detected_language # load alignment model and metadata try: @@ -68,39 +88,20 @@ def transcribe_with_whisper( ) raise ve - # align whisper output - result_aligned = whisperx.align( - result["segments"], - model_a, - metadata, - audio, - device, - return_char_alignments=False, - ) + # align whisper output + result_aligned = whisperx.align( + result["segments"], + model_a, + metadata, + audio, + device, + return_char_alignments=False, + ) - transcribed_data = convert_to_transcribed_data(result_aligned) + transcribed_data = convert_to_transcribed_data(result_aligned) - return TranscriptionResult(transcribed_data, detected_language) - except ValueError as value_error: - if ( - "Requested float16 compute type, but the target device or backend do not support efficient float16 computation." 
- in str(value_error.args[0]) - ): - print(value_error) - print( - f"{ULTRASINGER_HEAD} Your GPU does not support efficient float16 computation; run UltraSinger with '--whisper_compute_type int8'" - ) + return TranscriptionResult(transcribed_data, detected_language) - raise value_error - except OutOfMemoryError as oom_exception: - print(oom_exception) - print(MEMORY_ERROR_MESSAGE) - raise oom_exception - except Exception as exception: - if "CUDA failed with error out of memory" in str(exception.args[0]): - print(exception) - print(MEMORY_ERROR_MESSAGE) - raise exception def convert_to_transcribed_data(result_aligned): From 10e063aae2706546e842da1cf664b90ea7acbda1 Mon Sep 17 00:00:00 2001 From: Vadim Date: Sat, 22 Jun 2024 18:26:54 +0200 Subject: [PATCH 16/19] Fix merge conflicts --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index d83b1e1..1f39d8d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -crepe~=0.0.13 -demucs~=4.0.0 +crepe~=0.0.15 +demucs~=4.0.1 ffmpeg_python~=0.2.0 git+https://github.com/m-bain/whisperx.git langcodes~=3.4.0 From 8f597719ba7081f02479a1c78b6971c1ef0a5f72 Mon Sep 17 00:00:00 2001 From: Vadim Date: Thu, 4 Jul 2024 18:34:53 +0200 Subject: [PATCH 17/19] Merge fix --- src/UltraSinger.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/UltraSinger.py b/src/UltraSinger.py index 3c77c7e..bbb9580 100644 --- a/src/UltraSinger.py +++ b/src/UltraSinger.py @@ -345,7 +345,7 @@ def split_syllables_into_segments( def merge_syllable_segments( - transcribed_data: list[TranscribedData], midi_notes: list[str], us_notes=list[int] + transcribed_data: list[TranscribedData], midi_segments: list[MidiSegment], us_notes=list[int] ) -> tuple[list[TranscribedData], list[str], list[int]]: """Merge sub-segments of a syllable where the pitch is the same""" new_data = [] @@ -358,13 +358,13 @@ def merge_syllable_segments( if ( str(data.word).startswith("~") and previous_data is not None - and midi_notes[i] == midi_notes[i - 1] + and midi_segments[i].note == midi_segments[i - 1].note and data.start - previous_data.end <= SYLLABLE_SEGMENT_MAX_GAP_FOR_MERGE ): new_data[-1].end = data.end else: new_data.append(data) - new_midi_notes.append(midi_notes[i]) + new_midi_notes.append(midi_segments[i].note) new_us_notes.append(us_notes[i]) previous_data = data return new_data, new_midi_notes, new_us_notes @@ -500,10 +500,10 @@ def run() -> tuple[str, Score, Score]: # Pitch the audio midi_segments, pitched_data, ultrastar_note_numbers, transcribed_data = pitch_audio( - is_audio, transcribed_data, ultrastar_class + transcribed_data, ultrastar_class, cache_path) transcribed_data, midi_notes, ultrastar_note_numbers = merge_syllable_segments( - transcribed_data, midi_notes, ultrastar_note_numbers + transcribed_data, midi_segments, ultrastar_note_numbers ) # Create plot From 76a495653bc4c7ccd21b5a448bc4362de39efa96 Mon Sep 17 00:00:00 2001 From: Benedikt Wagener Date: Sat, 20 Jul 2024 10:36:05 +0200 Subject: [PATCH 18/19] fix score calculation --- src/UltraSinger.py | 10 +++++----- src/modules/Speech_Recognition/Whisper.py | 8 +++++++- src/modules/Ultrastar/ultrastar_converter.py | 10 +++++----- src/modules/console_colors.py | 2 +- 4 files changed, 18 insertions(+), 12 deletions(-) diff --git a/src/UltraSinger.py b/src/UltraSinger.py index bbb9580..50db73f 100644 --- a/src/UltraSinger.py +++ b/src/UltraSinger.py @@ -540,10 +540,10 @@ def run() -> tuple[str, Score, Score]: pitched_data, 
ultrastar_class, ultrastar_file_output ) - # Add calculated score to Ultrastar txt #Todo: Missing Karaoke - ultrastar_writer.add_score_to_ultrastar_txt( - ultrastar_file_output, simple_score - ) + # Add calculated score to Ultrastar txt #Todo: Missing Karaoke + ultrastar_writer.add_score_to_ultrastar_txt( + ultrastar_file_output, simple_score + ) # Midi if settings.create_midi: @@ -984,7 +984,7 @@ def pitch_audio( new_transcribed_data = [] for i, midi_segment in enumerate(midi_segments): - new_transcribed_data.append(TranscribedData({"word": midi_segment.word, "start": midi_segment.start, "end": midi_segment.end, "is_hyphen": None, "confidence": 1})) + new_transcribed_data.append(TranscribedData(word=midi_segment.word, start=midi_segment.start, end=midi_segment.end, is_hyphen=None, confidence=1)) return midi_segments, pitched_data, ultrastar_note_numbers, new_transcribed_data diff --git a/src/modules/Speech_Recognition/Whisper.py b/src/modules/Speech_Recognition/Whisper.py index a0fec68..9cb228d 100644 --- a/src/modules/Speech_Recognition/Whisper.py +++ b/src/modules/Speech_Recognition/Whisper.py @@ -38,8 +38,14 @@ def transcribe_with_whisper( try: torch.cuda.empty_cache() + asr_options = { + "max_new_tokens": None, + "clip_timestamps": None, + "hallucination_silence_threshold": None + } + loaded_whisper_model = whisperx.load_model( - model, language=language, device=device, compute_type=compute_type + model, asr_options=asr_options, language=language, device=device, compute_type=compute_type ) except ValueError as value_error: if ( diff --git a/src/modules/Ultrastar/ultrastar_converter.py b/src/modules/Ultrastar/ultrastar_converter.py index 7a4aee6..795c20c 100644 --- a/src/modules/Ultrastar/ultrastar_converter.py +++ b/src/modules/Ultrastar/ultrastar_converter.py @@ -59,7 +59,7 @@ def get_start_time_from_ultrastar( ) -> float: """Calculates the start time from the Ultrastar txt""" - gap = int(float(ultrastar_class.gap.replace(",", ".")) / 1000) + gap = float(ultrastar_class.gap.replace(",", ".")) / 1000 real_bpm = ultrastar_bpm_to_real_bpm(float(ultrastar_class.bpm.replace(",", "."))) start_time = beat_to_second(int(ultrastar_class.startBeat[pos]), real_bpm) + gap return start_time @@ -68,7 +68,7 @@ def get_start_time_from_ultrastar( def get_end_time_from_ultrastar(ultrastar_class: UltrastarTxtValue, pos: int) -> float: """Calculates the end time from the Ultrastar txt""" - gap = int(float(ultrastar_class.gap.replace(",", ".")) / 1000) + gap = float(ultrastar_class.gap.replace(",", ".")) / 1000 real_bpm = ultrastar_bpm_to_real_bpm(float(ultrastar_class.bpm.replace(",", "."))) end_time = ( beat_to_second( @@ -83,7 +83,7 @@ def get_end_time_from_ultrastar(ultrastar_class: UltrastarTxtValue, pos: int) -> def map_to_datapoints( ultrastar_class: UltrastarTxtValue, step_size: int = 10 ) -> list[int]: - gap = int(float(ultrastar_class.gap.replace(",", "."))) + gap = float(ultrastar_class.gap.replace(",", ".")) data = [] @@ -92,8 +92,8 @@ def map_to_datapoints( if ultrastar_class.noteType[pos] == "F": continue - start_time = int(get_start_time_from_ultrastar(ultrastar_class, pos) * 1000) + gap - end_time = int(get_end_time_from_ultrastar(ultrastar_class, pos) * 1000) + gap + start_time = int(get_start_time_from_ultrastar(ultrastar_class, pos) * 1000 + gap) + end_time = int(get_end_time_from_ultrastar(ultrastar_class, pos) * 1000 + gap) start_nearest_step = (start_time + step_size - 1) // step_size * step_size end_nearest_step = (end_time + step_size - 1) // step_size * step_size diff --git 
a/src/modules/console_colors.py b/src/modules/console_colors.py
index 59328ff..e5d9375 100644
--- a/src/modules/console_colors.py
+++ b/src/modules/console_colors.py
@@ -9,7 +9,7 @@ def blue_highlighted(text: str) -> str:
 
 
 def green_highlighted(text: str) -> str:
-    """Returns a blue highlighted text"""
+    """Returns a green highlighted text"""
     return f"{Bcolors.dark_green}{text}{Bcolors.endc}"
 

From 051290c7682aa0795b1f74a2a93f282a1c1057c4 Mon Sep 17 00:00:00 2001
From: Benedikt Wagener
Date: Sat, 20 Jul 2024 11:02:46 +0200
Subject: [PATCH 19/19] delete obsolete testfile

---
 pytest/modules/UltraSinger.py | 30 ------------------------------
 1 file changed, 30 deletions(-)
 delete mode 100644 pytest/modules/UltraSinger.py

diff --git a/pytest/modules/UltraSinger.py b/pytest/modules/UltraSinger.py
deleted file mode 100644
index 2aa2d27..0000000
--- a/pytest/modules/UltraSinger.py
+++ /dev/null
@@ -1,30 +0,0 @@
-"""Tests for UltraSinger.py"""
-
-import os
-import unittest
-import src.modules.Pitcher.pitcher as test_subject
-
-import pytest
-from src.modules.plot import plot
-
-
-class PitcherTest(unittest.TestCase):
-    # @pytest.mark.skip(reason="Skipping this FUNCTION level test, can be used for manual tests")
-    def test_get_pitch_with_crepe_file(self):
-        # Arrange
-        test_dir = os.path.dirname(os.path.abspath(__file__))
-        root_dir = os.path.abspath(test_dir + "/../../..")
-        # test_file_abs_path = os.path.abspath(root_dir + "/test_input/audio_denoised.wav")
-        test_file_abs_path = os.path.abspath(root_dir + "/test_input/test_denoised.wav")
-        test_output = root_dir + "/test_output"
-
-        # Act
-        # pitched_data = test_subject.get_pitch_with_crepe_file(test_file_abs_path, 'full', device="cuda")
-        # test_subject.get_pitch_with_crepe_file(test_file_abs_path, 'full', 'cpu', batch_size=1024)
-        # plot(pitched_data, test_output, title="pitching test")
-
-        print("done")
-
-
-if __name__ == "__main__":
-    unittest.main()
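
The pitch comparison built up in PATCH 07 through PATCH 13 can be summarized independently of the UltraStar classes: both txt files are rasterized onto a 10 ms grid and the grids are compared datapoint by datapoint. The sketch below is a simplified re-implementation for illustration only; the note triples are made up, and the GAP offset, freestyle notes and the ceil-to-step rounding of map_to_datapoints() are deliberately left out.

# Simplified sketch of the 10 ms grid comparison (illustration, not the
# shipped implementation). Notes are (start_ms, end_ms, pitch) triples.
NO_PITCH = -1000
STEP_MS = 10

def to_datapoints(notes: list[tuple[int, int, int]]) -> list[int]:
    """Rasterize notes onto the grid; un-pitched steps stay NO_PITCH."""
    end = max(note_end for _, note_end, _ in notes)
    grid = [NO_PITCH] * (end // STEP_MS)
    for start, stop, pitch in notes:
        for step in range(start // STEP_MS, stop // STEP_MS):
            grid[step] = pitch
    return grid

input_grid = to_datapoints([(0, 50, 5), (50, 100, 7)])    # ground truth
output_grid = to_datapoints([(0, 50, 5), (50, 100, 8)])   # UltraSinger output
matches = sum(1 for a, b in zip(input_grid, output_grid) if a == b != NO_PITCH)
input_pitched = sum(1 for a in input_grid if a != NO_PITCH)
print(matches / input_pitched)  # 0.5 -> half of the input datapoints match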
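
The cross-octave refinement from PATCH 08, generalized to per-shift buckets in PATCH 11, only looks at pitch classes (pitch modulo 12) for datapoints that are pitched on both sides but disagree. A sketch of the bucketing, reusing the grids from the sketch above:

def pitch_shift_histogram(input_grid: list[int], output_grid: list[int]) -> list[int]:
    """Bucket mismatching pitched datapoints by pitch-class distance.

    Bucket 0 collects pitch-class matches, i.e. pure octave errors.
    """
    buckets = [0] * 12
    for a, b in zip(input_grid, output_grid):
        if NO_PITCH in (a, b) or a == b:
            continue
        buckets[abs(a % 12 - b % 12)] += 1
    return buckets

print(pitch_shift_histogram(input_grid, output_grid))  # bucket 1 counts the 7-vs-8 steps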
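
PATCH 13's switch from sys.exit(1) to raise only pays off together with the per-song try/except in UltraSingerEvaluation.py: an exception now skips one song instead of aborting the whole batch. A reduced illustration with placeholder names (process() merely stands in for UltraSinger.run()):

def process(song: str) -> None:
    # Placeholder for UltraSinger.run(); raises instead of calling sys.exit().
    raise RuntimeError("CUDA failed with error out of memory")

for song in ["song_a", "song_b"]:
    try:
        process(song)
    except Exception as error:
        print(f"Failed to process {song}: {error}")
        continue  # next song; a sys.exit() here would have ended the whole run
print("batch finished")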
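
PATCH 14 persists each run as run.json, which UltraSingerMetaEvaluation.py then aggregates with pandas. A minimal sketch of that flow, assuming a finished run; the timestamped folder name below is made up for the example:

import pandas

from modules.Research.TestRun import TestRun

# Load one run.json written by UltraSingerEvaluation (path is an example).
with open("test_output/2023-10-14_14-30-17/run.json", encoding="utf-8") as file:
    test_run = TestRun.from_json(file.read())

# One row per successfully processed song, summarized like the
# describe() call in UltraSingerMetaEvaluation.py.
records = pandas.DataFrame.from_records(
    [song.to_dict() for song in test_run.tested_songs if song.success]
)
print(records.describe(percentiles=[0.25, 0.5, 0.75, 0.95, 0.99]))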