diff --git a/dataset_configs/multilingual/granary/config.yaml b/dataset_configs/multilingual/granary/config.yaml index 78e778b0..2a476b88 100644 --- a/dataset_configs/multilingual/granary/config.yaml +++ b/dataset_configs/multilingual/granary/config.yaml @@ -71,7 +71,7 @@ documentation: | ``ConvertToTarredAudioDataset`` *(optional, only if tar-sharding is enabled)*:: - pip install lhotse "nemo-toolkit[common]==2.2.1" + pip install lhotse "nemo-toolkit[common]==2.3.2" Quick start ----------- diff --git a/docker/Dockerfile.tts_sdp b/docker/Dockerfile.tts_sdp index f174c7b1..f3a009b7 100644 --- a/docker/Dockerfile.tts_sdp +++ b/docker/Dockerfile.tts_sdp @@ -40,7 +40,7 @@ RUN pip install -r requirements/main.txt RUN pip install -r requirements/tts.txt RUN pip install flash-attn --no-build-isolation RUN pip install https://github.com/LahiLuk/YouTokenToMe/archive/master.zip -RUN pip install megatron-core transformer_engine[pytorch]==2.4.0 -RUN pip install nemo_toolkit['all']==2.1.0 +RUN pip install --no-build-isolation megatron-core transformer_engine[pytorch]==2.4.0 +RUN pip install nemo_toolkit['all']==2.3.2 WORKDIR /src/NeMo-speech-data-processor \ No newline at end of file diff --git a/sdp/processors/tts/nemo_asr_align.py b/sdp/processors/tts/nemo_asr_align.py index 9a71c476..fd6a077f 100644 --- a/sdp/processors/tts/nemo_asr_align.py +++ b/sdp/processors/tts/nemo_asr_align.py @@ -119,7 +119,7 @@ def get_alignments_text(self, hypotheses): - list: List of dictionaries with word alignments (word, start, end) - str: The transcribed text """ - timestamp_dict = hypotheses.timestep # extract timesteps from hypothesis of first (and only) audio file + timestamp_dict = hypotheses.timestamp # extract timesteps from hypothesis of first (and only) audio file # For a FastConformer model, you can display the word timestamps as follows: # 80ms is duration of a timestep at output of the Conformer