first commit

Adibian · May 10, 2023 · 6075dab · 6075dab
commit 6075dab
Show file tree

Hide file tree

Showing 55 changed files with 4,587 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,18 @@
+## Synthesizer
+synthesizer/raw_data/*
+synthesizer/preprocessed_data/*
+synthesizer/*.npy
+
+## Vocoder
+vocoder/ckpt/g_2500000_persian
+
+
+## Others
+dataset/Persian/synthesizer_data/*
+dataset/Persian/resgrad/*
+output/persian/synthesizer/*
+output/persian/resgrad/*
+
+commands.txt
+*__pycache__/
+
diff --git a/inference.py b/inference.py
@@ -0,0 +1,43 @@
+from synthesizer.synthesize import infer as synthesizer_infer
+from resgrad.inference import infer as resgrad_infer
+from vocoder.inference import infer as vocoder_infer
+from utils import load_model, save_result, get_synthesizer_configs
+
+import argparse
+
+def infer():
+    """Run the whole text-to-speech pipeline from the command line.
+
+    Parses the CLI arguments, loads the synthesizer configs and the three
+    models, then chains the stages: synthesizer -> ResGrad -> vocoder.
+    The resulting spectrogram, waveform and pitch/energy predictions are
+    handed to ``save_result`` together with ``--result_dir``.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--text", type=str, required=True)
+    parser.add_argument("--synthesizer_restore_step", type=int, required=True)
+    # NOTE: the "regrad" spelling is part of the CLI / restore_steps interface
+    # and is kept as-is so existing commands and load_model keep working.
+    parser.add_argument("--regrad_restore_epoch", type=int, required=True)
+    parser.add_argument("--vocoder_restore_epoch", type=int, default=0, required=False)
+    parser.add_argument("--result_dir", type=str, default="results", required=False)
+    parser.add_argument("--pitch_control", type=float, default=1.0, required=False)
+    parser.add_argument("--energy_control", type=float, default=1.0, required=False)
+    parser.add_argument("--duration_control", type=float, default=1.0, required=False)
+    parser.add_argument("--synthesizer_preprocess_config", type=str, default="synthesizer/config/persian/preprocess.yaml", required=False)
+    parser.add_argument("--synthesizer_model_config", type=str, default="synthesizer/config/persian/model.yaml", required=False)
+    parser.add_argument("--synthesizer_train_config", type=str, default="synthesizer/config/persian/train.yaml", required=False)
+    args = parser.parse_args()
+
+    synthesizer_configs = get_synthesizer_configs(
+        args.synthesizer_preprocess_config,
+        args.synthesizer_model_config,
+        args.synthesizer_train_config,
+    )
+    preprocess_config = synthesizer_configs['preprocess_config']
+
+    print("load models...")
+    restore_steps = {
+        "synthesizer": args.synthesizer_restore_step,
+        "regrad": args.regrad_restore_epoch,
+        "vocoder": args.vocoder_restore_epoch,
+    }
+    synthesizer_model, resgrad_model, vocoder_model = load_model(restore_steps, synthesizer_configs)
+
+    ## Synthesizer
+    control_values = args.pitch_control, args.energy_control, args.duration_control
+    mel_prediction, duration_prediction, pitch_prediction, energy_prediction = synthesizer_infer(
+        synthesizer_model,
+        args.text,
+        control_values,
+        preprocess_config,
+        synthesizer_configs['model_config']['device'],
+    )
+
+    ## ResGrad
+    mel_prediction = resgrad_infer(resgrad_model, mel_prediction, duration_prediction)
+
+    ## Vocoder
+    wav = vocoder_infer(vocoder_model, mel_prediction, preprocess_config["preprocessing"]["audio"]["max_wav_value"])
+
+    ## Save result
+    save_result(mel_prediction, wav, pitch_prediction, energy_prediction, preprocess_config, args.result_dir)
+
+
+# Without this guard the script only defined infer() and exited immediately.
+if __name__ == "__main__":
+    infer()
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,17 @@
+g2p-en == 2.1.0
+inflect == 4.1.0
+librosa == 0.7.2
+matplotlib == 3.2.2
+numba == 0.48
+numpy == 1.19.0
+pypinyin==0.39.0
+pyworld == 0.2.10
+PyYAML==5.4.1
+scikit-learn==0.23.2
+scipy == 1.5.0
+soundfile==0.10.3.post1
+tensorboard == 2.2.2
+tgt == 1.4.4
+torch == 1.7.0
+tqdm==4.46.1
+unidecode == 1.1.1
diff --git a/resgrad/config.py b/resgrad/config.py
@@ -0,0 +1,42 @@
+
+###############################################
+#################### Data #####################
+# Batch size passed to both the training and the validation DataLoader.
+batch_size = 32
+# Directory of target spectrograms; the dataset looks files up as
+# 'single_speaker-mel-<input file name>'.
+target_data_dir = "dataset/Persian/resgrad_data/mel_target"
+# Directory of input (predicted) spectrograms; its listing defines the samples.
+input_data_dir = "dataset/Persian/resgrad_data/mel_prediction"
+# Directory of per-utterance durations; files are looked up as
+# 'single_speaker-duration-<input file name>'.
+durations_dir = "dataset/Persian/resgrad_data/durations"
+# Number of samples split off for validation by create_dataset().
+val_size = 16
+# NOTE(review): the two settings below are not read by resgrad/data.py or
+# resgrad/inference.py -- confirm where (or whether) they are used.
+preprocessed_path = "processed_data"
+normalized_method = "min-max"
+
+# Passed as `shuffle` to both DataLoaders.
+shuffle_data = True
+# When True, spectrograms go through normalize_data() (dataset and inference).
+normallize_spectrum = True
+# Presumably the bounds used by the min-max spectrogram normalization -- TODO confirm in utils.
+min_spec_value = -13
+max_spec_value = 3
+# When True, residuals go through normalize_residual() / denormalize_residual().
+normallize_residual = True
+# Presumably the bounds used by the residual normalization -- TODO confirm in utils.
+min_residual_value = -0.25
+max_residual_value = 0.25
+max_win_length = 100  ## maximum size of window in spectrum
+# NOTE(review): resgrad/data.py reads `config.spectrum_max_size` when
+# model_type2 is not "segment-based", but no such setting is defined here.
+
+###############################################
+################## Training ###################
+# Presumably the optimizer learning rate and number of training epochs -- TODO confirm in the training script.
+lr = 1e-4
+epochs = 70
+save_model_path = "output/persian/resgrad/ckpt"
+# device = "cuda" if torch.cuda.is_available() else "cpu"
+# Hard-coded: tensors are moved to this device at inference time, so a
+# CUDA-capable machine is required as configured.
+device = "cuda"
+validate_every_n_step = 20
+log_dir = 'output/persian/resgrad/log'
+save_path = 'checkpoint'
+
+###############################################
+############ Model Parameters #################
+# model_type1 selects whether the model output is a residual that is added
+# to the input spectrogram ("spec2residual") or the spectrogram itself.
+model_type1 = "spec2residual"  ## "spec2spec" or "spec2residual"
+# model_type2 selects fixed-size windows ("segment-based") or whole utterances.
+model_type2 = "segment-based"  ## "segment-based" or "sentence-based"
+# Presumably forwarded to the Diffusion model constructor -- TODO confirm.
+n_feats=80
+dim=64
+n_spks=1
+spk_emb_dim=64
+beta_min=0.05
+beta_max=20.0
+pe_scale=1000
diff --git a/resgrad/data.py b/resgrad/data.py
@@ -0,0 +1,101 @@
+from torch.utils.data import Dataset, DataLoader
+import numpy as np
+import torch
+import os
+
+from . import config
+from .utils import normalize_residual, normalize_data
+
+class SpectumDataset(Dataset):
+    """Pairs of predicted/target spectrograms used to train ResGrad.
+
+    Samples are discovered by listing ``config.input_data_dir``; the matching
+    target and duration files are looked up by file name in
+    ``config.target_data_dir`` and ``config.durations_dir``.
+
+    Each item is padded along the last axis to ``self.max_len`` and comes
+    with a ``(1, max_len)`` mask that is 1 on real frames and 0 on padding.
+    """
+
+    def __init__(self):
+        super(SpectumDataset, self).__init__()
+        self.input_data_path = []
+        self.target_data_path = []
+        self.duration_data_path = []
+        # os.listdir() returns entries in arbitrary order; sorting keeps the
+        # index -> file mapping reproducible across runs and machines.
+        for file_name in sorted(os.listdir(config.input_data_dir)):
+            self.input_data_path.append(os.path.join(config.input_data_dir, file_name))
+            self.target_data_path.append(
+                os.path.join(config.target_data_dir, 'single_speaker-mel-' + file_name))
+            self.duration_data_path.append(
+                os.path.join(config.durations_dir, 'single_speaker-duration-' + file_name))
+
+        if config.model_type2 == "segment-based":
+            self.max_len = config.max_win_length
+        else:
+            # NOTE(review): `spectrum_max_size` has to be defined in config for
+            # sentence-based training; confirm it exists before using this mode.
+            self.max_len = config.spectrum_max_size
+
+    def _random_window(self, durations):
+        """Pick a random phoneme-aligned window and return its frame bounds.
+
+        A start phoneme is drawn at random and the window is grown forward
+        phoneme by phoneme until adding one more would exceed
+        ``self.max_len`` frames. If the end of the utterance is reached
+        first, the window is anchored at the end and grown backwards instead.
+
+        Returns:
+            ``(win_start, win_end)`` frame indices into the spectrogram.
+        """
+        # max(..., 1): utterances with <= 4 phonemes used to crash
+        # np.random.choice with a non-positive population size.
+        start_phoneme_index = np.random.choice(max(len(durations) - 4, 1), 1)[0]
+        end_phoneme_index = 0  # 0 doubles as "no forward cut found"
+        for i in range(start_phoneme_index + 1, len(durations) + 1):
+            if sum(durations[start_phoneme_index:i]) > self.max_len:
+                end_phoneme_index = i - 1
+                break
+        if end_phoneme_index == 0:
+            end_phoneme_index = len(durations)
+            for _ in range(start_phoneme_index):
+                start_phoneme_index -= 1
+                if sum(durations[start_phoneme_index:end_phoneme_index]) > self.max_len:
+                    start_phoneme_index += 1
+                    break
+        win_start = sum(durations[:start_phoneme_index])
+        win_end = sum(durations[:end_phoneme_index])
+        return win_start, win_end
+
+    def __getitem__(self, index):
+        """Load, normalize, crop and pad one training sample.
+
+        Returns:
+            ``(input_spec, target_spec, residual_spec, mask)`` when
+            ``config.model_type1 == "spec2residual"``, otherwise
+            ``(input_spec, target_spec, mask)``.
+        """
+        input_spec = np.load(self.input_data_path[index])
+        target_spec = np.load(self.target_data_path[index])
+        durations = np.load(self.duration_data_path[index])
+        target_spec = torch.from_numpy(target_spec).T
+        input_spec = torch.from_numpy(input_spec).squeeze()
+        if config.normallize_spectrum:
+            input_spec = normalize_data(input_spec)
+            target_spec = normalize_data(target_spec)
+
+        if config.model_type2 == "segment-based":
+            win_start, win_end = self._random_window(durations)
+            input_spec = input_spec[:, win_start:win_end]
+            target_spec = target_spec[:, win_start:win_end]
+
+        # Right-pad both spectrograms to the common length.
+        spec_size = input_spec.shape[-1]
+        padding = (0, self.max_len - spec_size)
+        input_spec = torch.nn.functional.pad(input_spec, padding, mode="constant", value=0.0)
+        target_spec = torch.nn.functional.pad(target_spec, padding, mode="constant", value=0.0)
+
+        mask = torch.ones((1, input_spec.shape[-1]))
+        mask[:, spec_size:] = 0
+
+        if config.model_type1 == "spec2residual":
+            residual_spec = target_spec - input_spec
+            if config.normallize_residual:
+                residual_spec = normalize_residual(residual_spec)
+            # Zero the residual on padded frames.
+            residual_spec = residual_spec * mask
+            return input_spec, target_spec, residual_spec, mask
+        return input_spec, target_spec, mask
+
+    def __len__(self):
+        """Return the number of samples found in ``config.input_data_dir``."""
+        return len(self.input_data_path)
+
+
+def create_dataset():
+    """Build the training and validation DataLoaders.
+
+    The full ``SpectumDataset`` is randomly split into ``config.val_size``
+    validation samples and the remainder for training.
+
+    Returns:
+        ``(train_loader, val_loader)``.
+    """
+    full_set = SpectumDataset()
+    split_sizes = [config.val_size, len(full_set) - (config.val_size)]
+    val_subset, train_subset = torch.utils.data.random_split(full_set, split_sizes)
+
+    train_loader = DataLoader(
+        train_subset, batch_size=config.batch_size, shuffle=config.shuffle_data)
+    val_loader = DataLoader(
+        val_subset, batch_size=config.batch_size, shuffle=config.shuffle_data)
+    return train_loader, val_loader
diff --git a/resgrad/inference.py b/resgrad/inference.py
@@ -0,0 +1,86 @@
+from . import config 
+from .utils import denormalize_residual, denormalize_data, normalize_data
+
+import torch
+import numpy as np
+
+
+def _cut_segment(spec, start_point, end_point):
+    """Slice frames ``[start_point, end_point)`` from ``spec`` and pad them.
+
+    Returns:
+        The segment right-padded with zeros to ``config.max_win_length``
+        frames, a ``(1, 1, padded_length)`` mask with ones on the real
+        frames, and the number of real frames.
+    """
+    segment_spec = spec[:, :, start_point:end_point]
+    spec_size = segment_spec.shape[-1]
+    segment_spec = torch.nn.functional.pad(
+        segment_spec, (0, config.max_win_length - spec_size), mode="constant", value=0.0)
+    mask = torch.ones((1, segment_spec.shape[-1])).to(config.device)
+    mask[:, spec_size:] = 0
+    return segment_spec, mask.unsqueeze(0), spec_size
+
+
+def infer(model, mel_prediction, duration_prediction):
+    """Refine a synthesized mel-spectrogram with the ResGrad model.
+
+    Args:
+        model: callable invoked as
+            ``model(z, mask, spec, n_timesteps=..., stoc=False, spk=None)``.
+        mel_prediction: 3-D numpy array; axes 1 and 2 are swapped before use.
+        duration_prediction: predicted log-durations per phoneme; converted
+            to frame counts with ``round(exp(d) - 1)``. Only used in
+            segment-based mode.
+
+    Returns:
+        The refined spectrogram as a tensor on ``config.device`` with the
+        same shape as the transposed input.
+    """
+    synthesized_spec = mel_prediction.transpose(0, 2, 1)
+    synthesized_spec = torch.from_numpy(synthesized_spec).to(config.device)
+    if config.normallize_spectrum:
+        synthesized_spec = normalize_data(synthesized_spec)
+
+    if config.model_type2 == "segment-based":
+        # atleast_1d: squeeze() yields a 0-d array for a single phoneme.
+        durations = np.atleast_1d(np.round(np.exp(duration_prediction.squeeze()) - 1))
+        # A frame count cannot be negative (exp(d) - 1 rounds to -1 for small d).
+        durations = np.maximum(durations, 0)
+
+        ## Frame windows (start, end) of all segments except the last one:
+        ## cut as soon as adding the next phoneme would exceed the window size.
+        windows = []
+        start_phoneme_index = 0
+        for i in range(1, len(durations) + 1):
+            if int(sum(durations[start_phoneme_index:i])) > config.max_win_length:
+                end_phoneme_index = i - 1
+                windows.append((int(sum(durations[:start_phoneme_index])),
+                                int(sum(durations[:end_phoneme_index]))))
+                start_phoneme_index = end_phoneme_index
+
+        ## Last segment: anchored at the end of the utterance and grown
+        ## backwards, so it may overlap the previous segments.
+        start_phoneme_index = len(durations)
+        for _ in range(len(durations)):
+            start_phoneme_index -= 1
+            if int(sum(durations[start_phoneme_index:])) > config.max_win_length:
+                start_phoneme_index += 1
+                break
+        # Always emitted: when the whole utterance fits into one window the
+        # loop above never breaks and this is the only segment. Previously no
+        # segment was produced in that case and torch.cat() failed on an
+        # empty list.
+        windows.append((int(sum(durations[:start_phoneme_index])), int(sum(durations))))
+
+        all_mask, all_segment_spec, all_start_points, all_spec_size = [], [], [], []
+        for start_point, end_point in windows:
+            segment_spec, mask, spec_size = _cut_segment(synthesized_spec, start_point, end_point)
+            all_segment_spec.append(segment_spec)
+            all_mask.append(mask)
+            all_start_points.append(start_point)
+            all_spec_size.append(spec_size)
+
+        mask = torch.cat(all_mask).to(config.device)
+        segment_spec = torch.cat(all_segment_spec).to(config.device)
+        z = segment_spec + torch.randn_like(segment_spec, device=config.device) / 1.5
+        segments_pred = model(z, mask, segment_spec, n_timesteps=25, stoc=False, spk=None)
+
+        # Stitch the un-padded part of every segment back into place; later
+        # segments overwrite earlier ones where they overlap.
+        pred = torch.zeros_like(synthesized_spec)
+        for segment_pred, start_point, spec_size in zip(segments_pred, all_start_points, all_spec_size):
+            pred[:, :, start_point:start_point + spec_size] = segment_pred[:, :spec_size]
+    else:
+        mask = torch.ones(synthesized_spec.shape).to(config.device)
+        z = synthesized_spec + torch.randn_like(synthesized_spec, device=config.device) / 1.5
+        pred = model(z, mask, synthesized_spec, n_timesteps=50, stoc=False, spk=None)
+    pred = pred.to(config.device)
+
+    if config.model_type1 == "spec2residual":
+        # The model predicted a residual: add it back onto the input.
+        if config.normallize_residual:
+            spec_pred = denormalize_residual(pred) + synthesized_spec
+        else:
+            spec_pred = pred + synthesized_spec
+    else:
+        spec_pred = pred
+
+    if config.normallize_spectrum:
+        spec_pred = denormalize_data(spec_pred)
+
+    return spec_pred
diff --git a/resgrad/model/__init__.py b/resgrad/model/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved.
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the MIT License.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# MIT License for more details.
+
+from .diffusion import Diffusion
diff --git a/resgrad/model/base.py b/resgrad/model/base.py
@@ -0,0 +1,37 @@
+# Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved.
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the MIT License.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# MIT License for more details.
+
+import numpy as np
+import torch
+
+
+class BaseModule(torch.nn.Module):
+    """``torch.nn.Module`` with a parameter counter and an input relocation helper."""
+
+    def __init__(self):
+        super(BaseModule, self).__init__()
+
+    @property
+    def nparams(self):
+        """
+        Returns number of trainable parameters of the module.
+        """
+        # numel() reads the element count from tensor metadata, so parameters
+        # are not copied to the CPU, and the result is always a plain int
+        # (np.prod of an empty shape returned the float 1.0 for scalars).
+        return sum(param.numel() for param in self.parameters() if param.requires_grad)
+
+    def relocate_input(self, x: list):
+        """
+        Relocates provided tensors to the same device set for the module.
+
+        The list is modified in place and also returned; non-tensor items
+        are left untouched. The target device is taken from the module's
+        first parameter.
+        """
+        device = next(self.parameters()).device
+        for i, item in enumerate(x):
+            if isinstance(item, torch.Tensor) and item.device != device:
+                x[i] = item.to(device)
+        return x