
Commit eed4326

Merge configs to one file
Adibian committed May 10, 2023
1 parent 6075dab commit eed4326
Showing 13 changed files with 203 additions and 174 deletions.
144 changes: 144 additions & 0 deletions config/Persian/config.yaml
@@ -0,0 +1,144 @@
############################################################################################
####################################### Main Config ########################################
############################################################################################
dataset: &Dataset "Persian"
multi_speaker: False



############################################################################################
####################################### Synthesizer ########################################
############################################################################################
synthesizer:
  ############### Main Parameters ###################
  main:
    device: "cuda" ## cpu or cuda

  ##################### Model #######################
  model:
    transformer:
      encoder_layer: 4
      encoder_head: 2
      encoder_hidden: 256
      decoder_layer: 6
      decoder_head: 2
      decoder_hidden: 256
      conv_filter_size: 1024
      conv_kernel_size: [9, 1]
      encoder_dropout: 0.2
      decoder_dropout: 0.2

    variance_predictor:
      filter_size: 256
      kernel_size: 3
      dropout: 0.5

    variance_embedding:
      pitch_quantization: "linear" # supports 'linear' or 'log'; 'log' is allowed only if the pitch values are not normalized during preprocessing
      energy_quantization: "linear" # supports 'linear' or 'log'; 'log' is allowed only if the energy values are not normalized during preprocessing
      n_bins: 256

    max_seq_len: 1000

  #################### Preprocess #####################
  preprocess:
    path:
      corpus_path: !join ["dataset/", *Dataset, "synthesizer_data/train_data"]
      raw_path: !join ["synthesizer/raw_data/", *Dataset]
      preprocessed_path: !join ["synthesizer/preprocessed_data/", *Dataset]
    preprocessing:
      val_size: 100
      text:
        text_cleaners: ["persian_cleaners"] ## ljspeech_cleaners or persian_cleaners
        language: "fa" ## fa or en
      audio:
        sampling_rate: 22050
        max_wav_value: 32768.0
      stft:
        filter_length: 1024
        hop_length: 256
        win_length: 1024
      mel:
        n_mel_channels: 80
        mel_fmin: 0
        mel_fmax: 8000
      pitch:
        feature: "phoneme_level" # supports 'phoneme_level' or 'frame_level'
        normalization: True
      energy:
        feature: "phoneme_level" # supports 'phoneme_level' or 'frame_level'
        normalization: True

  #################### Training #####################
  train:
    path:
      ckpt_path: !join ["output/", *Dataset, "synthesizer/ckpt"]
      log_path: !join ["output/", *Dataset, "synthesizer/log"]
      result_path: !join ["output/", *Dataset, "synthesizer/result"]
    optimizer:
      batch_size: 16
      betas: [0.9, 0.98]
      eps: 0.000000001
      weight_decay: 0.0
      grad_clip_thresh: 1.0
      grad_acc_step: 1
      warm_up_step: 4000
      anneal_steps: [300000, 400000, 500000]
      anneal_rate: 0.3
    step:
      total_step: 1010000
      log_step: 500
      synth_step: 1000
      val_step: 1000
      save_step: 100000



############################################################################################
########################################### ResGrad ########################################
############################################################################################
resgrad:

  #################### Data #####################
  data:
    batch_size: 32
    target_data_dir: !join ["dataset/", *Dataset, "resgrad_data/mel_target"]
    input_data_dir: !join ["dataset/", *Dataset, "resgrad_data/mel_prediction"]
    durations_dir: !join ["dataset/", *Dataset, "resgrad_data/durations"]
    val_size: 16
    preprocessed_path: "processed_data"
    normalized_method: "min-max"

    shuffle_data: True
    normallize_spectrum: True
    min_spec_value: -13
    max_spec_value: 3
    normallize_residual: True
    min_residual_value: -0.25
    max_residual_value: 0.25
    max_win_length: 100 ## maximum window size in the spectrogram

  ################## Training ###################
  train:
    lr: 1e-4
    epochs: 70
    save_model_path: !join ["output/", *Dataset, "resgrad/ckpt"]
    validate_every_n_step: 20
    log_dir: !join ["output/", *Dataset, "resgrad/log"]
    save_path: 'checkpoint'

  ############ Model Parameters #################
  model:
    model_type1: "spec2residual" ## "spec2spec" or "spec2residual"
    model_type2: "segment-based" ## "segment-based" or "sentence-based"
    n_feats: 80
    dim: 64
    n_spks: 1
    spk_emb_dim: 64
    beta_min: 0.05
    beta_max: 20.0
    pe_scale: 1000

  ############ Main Parameters #################
  main:
    device: "cuda"
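
A note on the tags used throughout this file: &Dataset / *Dataset is a standard YAML anchor, but !join is a custom tag that PyYAML only resolves once a constructor has been registered, presumably inside the repository's load_yaml_file helper (whose body is not shown in this commit). A minimal sketch of such a loader, assuming !join combines its items with os.path.join semantics:

import os
import yaml


def join_constructor(loader, node):
    # Build one path from the items of a !join sequence, e.g.
    # !join ["output/", *Dataset, "synthesizer/ckpt"] -> "output/Persian/synthesizer/ckpt"
    # (under the os.path.join assumption above).
    parts = [str(part) for part in loader.construct_sequence(node)]
    return os.path.join(*parts)


def load_yaml_file(path):
    # Register the custom tag before parsing; anchors need no extra handling.
    yaml.add_constructor("!join", join_constructor, Loader=yaml.FullLoader)
    with open(path, "r") as f:
        return yaml.load(f, Loader=yaml.FullLoader)


config = load_yaml_file("config/Persian/config.yaml")
print(config["synthesizer"]["train"]["path"]["ckpt_path"])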

32 changes: 15 additions & 17 deletions resgrad_data.py
@@ -1,11 +1,12 @@
-from utils import load_models, get_synthesizer_configs
+from utils import load_models, load_yaml_file
 from synthesizer.synthesize import infer as synthesizer_infer
 
 import argparse
 import os
 from tqdm import tqdm
 import numpy as np
 import torch
+import yaml
 
 
 def read_input_data(raw_data_path):
@@ -24,37 +25,34 @@ def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--synthesizer_restore_step", type=int, required=True)
     parser.add_argument("--raw_data_path", type=str, default="synthesizer/raw_data", required=False)
-    parser.add_argument("--synthesizer_preprocess_config", type=str, default="synthesizer/config/persian/preprocess.yaml", required=False)
-    parser.add_argument("--synthesizer_model_config", type=str, default="synthesizer/config/persian/model.yaml", required=False)
-    parser.add_argument("--synthesizer_train_config", type=str, default="synthesizer/config/persian/train.yaml", required=False)
+    parser.add_argument("-c", "--config", type=str, default='config/Persian/config.yaml', required=False, help="path to config.yaml")
     args = parser.parse_args()
 
-    synthesizer_configs = get_synthesizer_configs(args.synthesizer_preprocess_config, args.synthesizer_model_config, args.synthesizer_train_config)
+    # Read Config
+    config = load_yaml_file(args.config)
 
     restore_steps = {"synthesizer":args.synthesizer_restore_step, "resgrad":None, "vocoder":None}
-    synthesizer_model, _, _ = load_models(restore_steps, synthesizer_configs)
-    text_data = read_input_data(os.path.join(args.raw_data_path, synthesizer_configs['preprocess_config']['dataset']))
+    synthesizer_model, _, _ = load_models(restore_steps, config['synthesizer'])
+    text_data = read_input_data(args.raw_data_path)
 
     current_path = os.getcwd()
-    save_path = os.path.join(current_path, "dataset", synthesizer_configs['preprocess_config']['dataset'], "resgrad_data")
-    duration_dir = os.path.join(save_path, 'durations')
-    mel_pred_dir = os.path.join(save_path, 'mel_prediction')
-    mel_target_dir = os.path.join(save_path, 'mel_target')
+    duration_dir = os.path.join(current_path, config['resgrad']['data']['durations_dir'])
+    mel_pred_dir = os.path.join(current_path, config['resgrad']['data']['input_data_dir'])
+    mel_target_dir = os.path.join(current_path, config['resgrad']['data']['target_data_dir'])
 
-    os.makedirs(save_path, exist_ok=True)
     os.makedirs(mel_pred_dir, exist_ok=True)
     if not os.path.islink(mel_target_dir):
-        os.symlink(os.path.join(current_path, synthesizer_configs['preprocess_config']['path']['preprocessed_path'], 'mel'), mel_target_dir, target_is_directory=True)
+        os.symlink(os.path.join(current_path, config['synthesizer']['preprocess']['path']['preprocessed_path'], 'mel'), mel_target_dir, target_is_directory=True)
     if not os.path.islink(duration_dir):
-        os.symlink(os.path.join(current_path, synthesizer_configs['preprocess_config']['path']['preprocessed_path'], 'duration'), duration_dir, target_is_directory=True)
+        os.symlink(os.path.join(current_path, config['synthesizer']['preprocess']['path']['preprocessed_path'], 'duration'), duration_dir, target_is_directory=True)
 
-    device = synthesizer_configs['model_config']['device']
+    device = config['synthesizer']['main']['device']
     for (speaker, file_name), text in tqdm(text_data.items()):
         dur_file_name = speaker + "-duration-" + file_name + ".npy"
         dur_target = torch.from_numpy(np.load(os.path.join(duration_dir, dur_file_name))).to(device).unsqueeze(0)
 
         control_values = 1.0,1.0,1.0
-        mel_prediction, _, _, _ = synthesizer_infer(synthesizer_model, text, control_values, synthesizer_configs['preprocess_config'],
+        mel_prediction, _, _, _ = synthesizer_infer(synthesizer_model, text, control_values, config,
                                                     device, d_target=dur_target)
         file_path = os.path.join(mel_pred_dir, file_name)
         np.save(file_path, mel_prediction[0].cpu())
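
The body of read_input_data is collapsed in this diff; from the call sites above it must return a mapping from (speaker, file_name) pairs to input text, now keyed directly off args.raw_data_path. A hypothetical sketch consistent with that usage (the per-speaker directory layout and the pipe-separated metadata format are assumptions, not the repository's actual code):

import os


def read_input_data(raw_data_path):
    # Assumed layout: raw_data_path/<speaker>/train.txt with "file_name|text" lines.
    text_data = {}
    for speaker in os.listdir(raw_data_path):
        metadata_path = os.path.join(raw_data_path, speaker, "train.txt")
        with open(metadata_path, encoding="utf-8") as f:
            for line in f:
                file_name, text = line.strip().split("|", 1)
                text_data[(speaker, file_name)] = text
    return text_data
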
28 changes: 0 additions & 28 deletions synthesizer/config/persian/model.yaml

This file was deleted.

30 changes: 0 additions & 30 deletions synthesizer/config/persian/preprocess.yaml

This file was deleted.

20 changes: 0 additions & 20 deletions synthesizer/config/persian/train.yaml

This file was deleted.

41 changes: 3 additions & 38 deletions synthesizer/evaluate.py
@@ -1,17 +1,14 @@
-import argparse
 import torch
-import yaml
 from torch.utils.data import DataLoader
 
 from .utils.model import get_model
 from .utils.tools import to_device, log, synth_one_sample
 from .model import FastSpeech2Loss
 from .dataset import Dataset
 
-
-def evaluate(model, step, configs, logger=None, vocoder=None):
-    preprocess_config, model_config, train_config = configs
-    device = model_config['device']
+def evaluate(model, step, config, logger=None, vocoder=None):
+    preprocess_config, model_config, train_config = config['synthesizer']['preprocess'], config['synthesizer']['model'], config['synthesizer']['train']
+    device = config['synthesizer']['main']['device']
     # Get dataset
     dataset = Dataset(
         "val.txt", preprocess_config, train_config, sort=False, drop_last=False
@@ -80,35 +77,3 @@ def evaluate(model, step, configs, logger=None, vocoder=None):
     return message
 
 
-if __name__ == "__main__":
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--restore_step", type=int, default=30000)
-    parser.add_argument(
-        "-p",
-        "--preprocess_config",
-        type=str,
-        required=True,
-        help="path to preprocess.yaml",
-    )
-    parser.add_argument(
-        "-m", "--model_config", type=str, required=True, help="path to model.yaml"
-    )
-    parser.add_argument(
-        "-t", "--train_config", type=str, required=True, help="path to train.yaml"
-    )
-    args = parser.parse_args()
-
-    # Read Config
-    preprocess_config = yaml.load(
-        open(args.preprocess_config, "r"), Loader=yaml.FullLoader
-    )
-    model_config = yaml.load(open(args.model_config, "r"), Loader=yaml.FullLoader)
-    train_config = yaml.load(open(args.train_config, "r"), Loader=yaml.FullLoader)
-    configs = (preprocess_config, model_config, train_config)
-
-    # Get model
-    model = get_model(args, configs, device, train=False).to(device)
-
-    message = evaluate(model, args.restore_step, configs)
-    print(message)
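
With the standalone __main__ block above deleted, evaluation is now driven through the merged config. A minimal caller sketch, assuming load_models behaves as in resgrad_data.py and using an illustrative restore step (both are assumptions, not code from this commit):

from synthesizer.evaluate import evaluate
from utils import load_models, load_yaml_file

config = load_yaml_file("config/Persian/config.yaml")
restore_steps = {"synthesizer": 100000, "resgrad": None, "vocoder": None}
synthesizer_model, _, _ = load_models(restore_steps, config["synthesizer"])

# evaluate() now receives the whole merged config and pulls the
# synthesizer sub-sections out itself, as the new signature above shows.
message = evaluate(synthesizer_model, step=100000, config=config)
print(message)
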
11 changes: 5 additions & 6 deletions synthesizer/prepare_align.py
@@ -1,8 +1,7 @@
 import argparse
 
-import yaml
-
-from preprocessor import persian
+from .preprocessor import persian
+from ..utils import load_yaml_file
 
 
 def main(config):
@@ -12,8 +11,8 @@

 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("config", type=str, help="path to preprocess.yaml")
+    parser.add_argument("config", type=str, help="path to config.yaml")
     args = parser.parse_args()
 
-    config = yaml.load(open(args.config, "r"), Loader=yaml.FullLoader)
-    main(config)
+    config = load_yaml_file(args.config)
+    main(config['synthesizer']['preprocess'])
11 changes: 5 additions & 6 deletions synthesizer/preprocess.py
@@ -1,15 +1,14 @@
 import argparse
 
-import yaml
-
-from preprocessor.preprocessor import Preprocessor
+from .preprocessor.preprocessor import Preprocessor
+from ..utils import load_yaml_file
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("config", type=str, help="path to preprocess.yaml")
+    parser.add_argument("config", type=str, help="path to config.yaml")
     args = parser.parse_args()
 
-    config = yaml.load(open(args.config, "r"), Loader=yaml.FullLoader)
-    preprocessor = Preprocessor(config)
+    config = load_yaml_file(args.config)
+    preprocessor = Preprocessor(config['synthesizer']['preprocess'])
     preprocessor.build_from_path()
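
Because both preprocessing entry points now use relative imports (from .preprocessor ..., from ..utils ...), they can no longer be executed as standalone scripts and must be imported or run as package modules from the repository root. A sketch of the equivalent programmatic call (module paths follow the file tree in this commit; anything beyond that is an assumption):

from utils import load_yaml_file
from synthesizer.preprocessor.preprocessor import Preprocessor

config = load_yaml_file("config/Persian/config.yaml")
preprocessor = Preprocessor(config["synthesizer"]["preprocess"])
preprocessor.build_from_path()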