Documentation for ASR-TTS models #6594
@@ -878,6 +878,113 @@ FastEmit Regularization is supported for the default Numba based WarpRNNT loss.

Refer to the above paper for results and recommendations of ``fastemit_lambda``.

.. _Hybrid-ASR-TTS_model__Config:

Hybrid ASR-TTS Model Configuration
----------------------------------

:ref:`Hybrid ASR-TTS model <Hybrid-ASR-TTS_model>` consists of three parts:

* ASR model (``EncDecCTCModelBPE`` or ``EncDecRNNTBPEModel``)
* TTS Mel Spectrogram Generator (currently, only the :ref:`FastPitch <FastPitch_model>` model is supported)
* Enhancer model (optional)

The config also allows specifying a :ref:`text-only dataset <Hybrid-ASR-TTS_model__Text-Only-Data>`.
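For reference, a text-only manifest is a JSON-lines file. A minimal sketch is below; only the ``text`` field is shown here, and the exact schema (which may require additional fields) is described in the :ref:`text-only dataset <Hybrid-ASR-TTS_model__Text-Only-Data>` section:

```json
{"text": "this is the first training utterance"}
{"text": "textual data needs no paired audio"}
```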
Main parts of the config:

* ASR model

  * ``asr_model_path``: path to the ASR model checkpoint (``.nemo``) file, loaded only once; the config of the ASR model is then stored in the ``asr_model`` field
  * ``asr_model_type``: needed only when training from scratch; ``rnnt_bpe`` corresponds to ``EncDecRNNTBPEModel``, ``ctc_bpe`` to ``EncDecCTCModelBPE``
  * ``asr_model_fuse_bn``: fuses BatchNorm in the pretrained ASR model, which can improve quality in the finetuning scenario

* TTS model

  * ``tts_model_path``: path to the pretrained TTS model checkpoint (``.nemo``) file, loaded only once; the config of the model is then stored in the ``tts_model`` field

* Enhancer model

  * ``enhancer_model_path``: optional path to the enhancer model, loaded only once; the config is stored in the ``enhancer_model`` field

* ``train_ds``

  * ``text_data``: properties related to text-only data

    * ``manifest_filepath``: path (or paths) to :ref:`text-only dataset <Hybrid-ASR-TTS_model__Text-Only-Data>` manifests
    * ``speakers_filepath``: path (or paths) to the text file containing speaker ids for the multi-speaker TTS model (speakers are sampled randomly during training)
    * ``min_words`` and ``max_words``: parameters to filter text-only manifests by the number of words
    * ``tokenizer_workers``: number of workers for initial tokenization (when loading the data); ``num_CPUs / num_GPUs`` is a recommended value

  * ``asr_tts_sampling_technique``, ``asr_tts_sampling_temperature``, ``asr_tts_sampling_probabilities``: sampling parameters for text-only and audio-text data (when both are specified); see the parameters of ``nemo.collections.common.data.ConcatDataset``
  * all other components are similar to conventional ASR models

* ``validation_ds`` and ``test_ds`` correspond to the underlying ASR model
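The ``min_words``/``max_words`` filtering can be sketched as follows. This is a conceptual illustration, not the NeMo implementation; the helper name and the assumption that each manifest line carries a ``text`` field are ours:

```python
import json

def filter_by_word_count(manifest_lines, min_words=1, max_words=45):
    """Keep only manifest entries whose 'text' field contains
    between min_words and max_words words (inclusive)."""
    kept = []
    for line in manifest_lines:
        entry = json.loads(line)
        num_words = len(entry["text"].split())
        if min_words <= num_words <= max_words:
            kept.append(entry)
    return kept

lines = [
    '{"text": "one two three"}',
    '{"text": "one"}',
]
# with min_words=2 only the first entry survives
print(filter_by_word_count(lines, min_words=2))
```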
.. code-block:: yaml

    model:
      sample_rate: 16000

      # asr model
      asr_model_path: ???
      asr_model: null
      asr_model_type: null  # rnnt_bpe or ctc_bpe, needed only if instantiating from config, otherwise the type is inferred automatically
      asr_model_fuse_bn: false  # only ConformerEncoder is supported now, use false for other models

      # tts model
      tts_model_path: ???
      tts_model: null

      # enhancer model
      enhancer_model_path: null
      enhancer_model: null

      train_ds:
        text_data:
          manifest_filepath: ???
          speakers_filepath: ???
          min_words: 1
          max_words: 45  # 45 is the recommended value, ~16.7 sec for LibriSpeech
          tokenizer_workers: 1
        asr_tts_sampling_technique: round-robin  # random, round-robin, temperature
        asr_tts_sampling_temperature: null
        asr_tts_sampling_probabilities: null  # [0.5,0.5] - ASR,TTS
        manifest_filepath: ???
        batch_size: 16  # you may increase batch_size if your memory allows
        # other params
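The ``asr_tts_sampling_technique`` options can be illustrated with a minimal sketch. This mirrors ``ConcatDataset``-style source selection conceptually and is not the NeMo implementation; in particular, the temperature semantics shown (size-proportional weights flattened by ``1/temperature``) are an assumption:

```python
import random

def make_source_picker(technique, num_sources, probabilities=None,
                       temperature=None, sizes=None):
    """Return a zero-argument function yielding the index of the
    dataset (e.g. 0 = audio-text, 1 = text-only) to draw from next."""
    if technique == "round-robin":
        state = {"i": -1}
        def pick():
            # cycle deterministically through the sources
            state["i"] = (state["i"] + 1) % num_sources
            return state["i"]
    elif technique == "random":
        def pick():
            # sample a source with the given fixed probabilities
            return random.choices(range(num_sources), weights=probabilities)[0]
    elif technique == "temperature":
        # higher temperature flattens the size-proportional distribution
        weights = [size ** (1.0 / temperature) for size in sizes]
        def pick():
            return random.choices(range(num_sources), weights=weights)[0]
    else:
        raise ValueError(f"unknown technique: {technique}")
    return pick

pick = make_source_picker("round-robin", num_sources=2)
print([pick() for _ in range(4)])  # -> [0, 1, 0, 1]
```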
Finetuning
~~~~~~~~~~

To finetune an existing ASR model using text-only data, use the ``<NeMo_git_root>/examples/asr/asr_with_tts/speech_to_text_bpe_with_text_finetune.py`` script with the corresponding config ``<NeMo_git_root>/examples/asr/conf/asr_tts/hybrid_asr_tts.yaml``.

Specify the paths to all required models (ASR, TTS, and Enhancer checkpoints), along with ``train_ds.text_data.manifest_filepath`` and ``train_ds.text_data.speakers_filepath``.

.. code-block:: shell

    python speech_to_text_bpe_with_text_finetune.py \
        model.asr_model_path=<path to ASR model> \
        model.tts_model_path=<path to compatible TTS model> \
        model.enhancer_model_path=<optional path to enhancer model> \
        model.asr_model_fuse_bn=<true recommended if ConformerEncoder with BatchNorm, false otherwise> \
        model.train_ds.manifest_filepath=<path to manifest with audio-text pairs or null> \
        model.train_ds.text_data.manifest_filepath=<path(s) to manifest with train text> \
        model.train_ds.text_data.speakers_filepath=<path(s) to speakers list> \
        model.train_ds.text_data.tokenizer_workers=4 \
        model.validation_ds.manifest_filepath=<path to validation manifest> \
        model.train_ds.batch_size=<batch size>

Training from Scratch
~~~~~~~~~~~~~~~~~~~~~

To train an ASR model from scratch using text-only data, use the ``<NeMo_git_root>/examples/asr/asr_with_tts/speech_to_text_bpe_with_text.py`` script with a conventional ASR model config, e.g. ``<NeMo_git_root>/examples/asr/conf/conformer/conformer_ctc_bpe.yaml`` or ``<NeMo_git_root>/examples/asr/conf/conformer/conformer_transducer_bpe.yaml``.

Specify the ASR model type, the path to the TTS model, and (optionally) the enhancer, along with the text-only data-related fields.

.. code-block:: shell

    python speech_to_text_bpe_with_text.py \
        ++asr_model_type=<rnnt_bpe or ctc_bpe> \
        ++tts_model_path=<path to compatible tts model> \
        ++enhancer_model_path=<optional path to enhancer model> \
        ++model.train_ds.text_data.manifest_filepath=<path(s) to manifests with train text> \
        ++model.train_ds.text_data.speakers_filepath=<path(s) to speakers list> \
        ++model.train_ds.text_data.min_words=1 \
        ++model.train_ds.text_data.max_words=45 \
        ++model.train_ds.text_data.tokenizer_workers=4

Fine-tuning Configurations
--------------------------

@@ -26,6 +26,17 @@ If there is a local ``.nemo`` checkpoint that you'd like to load, use the :code:

Where the model base class is the ASR model class of the original checkpoint, or the general ``ASRModel`` class.

Hybrid ASR-TTS Models Checkpoints
---------------------------------

:ref:`Hybrid ASR-TTS model <Hybrid-ASR-TTS_model>` is a transparent wrapper for the ASR model, the text-to-mel-spectrogram generator, and the optional enhancer.
The model is saved as a single ``.nemo`` checkpoint containing all of these parts.
Due to this transparency, the ASR model can be extracted separately after training/finetuning by using the ``asr_model`` attribute (NeMo submodel) with
:code:`hybrid_model.asr_model.save_to(<asr_checkpoint_path>.nemo)`, or by using the convenience wrapper
:code:`hybrid_model.save_asr_model_to(<asr_checkpoint_path>.nemo)`.


NGC Pretrained Checkpoints
--------------------------

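The transparent-wrapper idea can be sketched with plain Python. This is a conceptual mock with hypothetical class names, not the actual NeMo classes:

```python
class MockASRModel:
    """Stand-in for the inner ASR model; real NeMo models provide save_to."""
    def __init__(self):
        self.saved_to = None

    def save_to(self, path):
        # a real model would serialize itself to a .nemo archive here
        self.saved_to = path

class MockHybridASRTTSModel:
    """Transparent wrapper: the ASR model stays accessible as a submodel."""
    def __init__(self, asr_model, tts_model=None, enhancer=None):
        self.asr_model = asr_model
        self.tts_model = tts_model
        self.enhancer = enhancer

    def save_asr_model_to(self, path):
        # convenience method delegating to the submodel's own save_to
        self.asr_model.save_to(path)

hybrid = MockHybridASRTTSModel(MockASRModel())
hybrid.save_asr_model_to("asr_only.nemo")
print(hybrid.asr_model.saved_to)  # -> asr_only.nemo
```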
@@ -26,7 +26,6 @@
 import numpy as np
 import torch
 import torch.utils.data
-from nemo_text_processing.text_normalization.normalize import Normalizer
 from torch.nn.utils.rnn import pad_sequence
 from tqdm.auto import tqdm

@@ -35,6 +34,12 @@
 from nemo.core.classes import Dataset, IterableDataset
 from nemo.utils import logging

+try:
+    from nemo_text_processing.text_normalization.normalize import Normalizer
+except Exception as e:
+    logging.warning(e)
+    logging.warning("nemo_text_processing is not installed")
+
 AnyPath = Union[Path, str]

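The guarded import together with the string type annotation is a general pattern for making a dependency optional while keeping the module importable. A self-contained sketch, using a deliberately nonexistent module name to exercise the fallback path:

```python
import logging

try:
    # hypothetical optional dependency; replace with the real module
    from some_optional_pkg import Normalizer
except Exception:
    Normalizer = None
    # debug rather than warning: absence is expected, not an error
    logging.getLogger(__name__).debug("some_optional_pkg is not installed")

def normalize(text: str, normalizer: "Normalizer" = None) -> str:
    # the string annotation keeps this function definable even when
    # the dependency (and thus the Normalizer class) is missing
    if normalizer is None:
        return text
    return normalizer.normalize(text)

print(normalize("Hello"))  # -> Hello
```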
@@ -176,7 +181,7 @@ def __init__(
         asr_use_start_end_token: bool,
         tts_parser: Callable,
         tts_text_pad_id: int,
-        tts_text_normalizer: Normalizer,
+        tts_text_normalizer: "Normalizer",
         tts_text_normalizer_call_kwargs: Dict,
         min_words: int = 1,
         max_words: int = 1_000_000,

@@ -379,7 +384,7 @@ def __init__(
         asr_use_start_end_token: bool,
         tts_parser: Callable,
         tts_text_pad_id: int,
-        tts_text_normalizer: Normalizer,
+        tts_text_normalizer: "Normalizer",
         tts_text_normalizer_call_kwargs: Dict,
         min_words: int = 1,
         max_words: int = 1_000_000,

@@ -426,7 +431,7 @@ def __init__(
         asr_use_start_end_token: bool,
         tts_parser: Callable,
         tts_text_pad_id: int,
-        tts_text_normalizer: Normalizer,
+        tts_text_normalizer: "Normalizer",
         tts_text_normalizer_call_kwargs: Dict,
         min_words: int = 1,
         max_words: int = 1_000_000,