Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
Adibian committed May 10, 2023
0 parents commit 6075dab
Show file tree
Hide file tree
Showing 55 changed files with 4,587 additions and 0 deletions.
18 changes: 18 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
## Synthesizer
synthesizer/raw_data/*
synthesizer/preprocessed_data/*
synthesizer/*.npy

## Vocoder
vocoder/ckpt/g_2500000_persian


## Others
dataset/Persian/synthesizer_data/*
dataset/Persian/resgrad/*
output/persian/synthesizer/*
output/persian/resgrad/*

commands.txt
*__pycache__/

43 changes: 43 additions & 0 deletions inference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from synthesizer.synthesize import infer as synthesizer_infer
from resgrad.inference import infer as resgrad_infer
from vocoder.inference import infer as vocoder_infer
from utils import load_model, save_result, get_synthesizer_configs

import argparse

def infer():
    """Run the full text-to-speech pipeline from the command line.

    Parses the command-line arguments, loads the synthesizer, ResGrad and
    vocoder models through ``load_model`` and then chains the three stages:
    the synthesizer predicts a mel-spectrogram from ``--text``, ResGrad
    refines it, and the vocoder turns it into a waveform. The outputs are
    handed to ``save_result`` together with ``--result_dir``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--text", type=str, required=True)
    parser.add_argument("--synthesizer_restore_step", type=int, required=True)
    # NOTE: the flag (and the matching dict key below) is spelled "regrad",
    # not "resgrad". It is kept as-is so existing command lines and the
    # ``load_model`` helper keep working.
    parser.add_argument("--regrad_restore_epoch", type=int, required=True)
    parser.add_argument("--vocoder_restore_epoch", type=int, default=0)
    parser.add_argument("--result_dir", type=str, default="results")
    parser.add_argument("--pitch_control", type=float, default=1.0)
    parser.add_argument("--energy_control", type=float, default=1.0)
    parser.add_argument("--duration_control", type=float, default=1.0)
    parser.add_argument(
        "--synthesizer_preprocess_config",
        type=str,
        default="synthesizer/config/persian/preprocess.yaml",
    )
    parser.add_argument(
        "--synthesizer_model_config",
        type=str,
        default="synthesizer/config/persian/model.yaml",
    )
    parser.add_argument(
        "--synthesizer_train_config",
        type=str,
        default="synthesizer/config/persian/train.yaml",
    )
    args = parser.parse_args()

    synthesizer_configs = get_synthesizer_configs(
        args.synthesizer_preprocess_config,
        args.synthesizer_model_config,
        args.synthesizer_train_config,
    )
    preprocess_config = synthesizer_configs['preprocess_config']

    print("load models...")
    restore_steps = {
        "synthesizer": args.synthesizer_restore_step,
        "regrad": args.regrad_restore_epoch,
        "vocoder": args.vocoder_restore_epoch,
    }
    synthesizer_model, resgrad_model, vocoder_model = load_model(restore_steps, synthesizer_configs)

    ## Synthesizer
    control_values = args.pitch_control, args.energy_control, args.duration_control
    mel_prediction, duration_prediction, pitch_prediction, energy_prediction = synthesizer_infer(
        synthesizer_model,
        args.text,
        control_values,
        preprocess_config,
        synthesizer_configs['model_config']['device'],
    )

    ## ResGrad
    mel_prediction = resgrad_infer(resgrad_model, mel_prediction, duration_prediction)

    ## Vocoder
    max_wav_value = preprocess_config["preprocessing"]["audio"]["max_wav_value"]
    wav = vocoder_infer(vocoder_model, mel_prediction, max_wav_value)

    ## Save result
    save_result(mel_prediction, wav, pitch_prediction, energy_prediction, preprocess_config, args.result_dir)


# Without this guard the script only defined ``infer`` and never ran it.
if __name__ == "__main__":
    infer()

17 changes: 17 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
g2p-en == 2.1.0
inflect == 4.1.0
librosa == 0.7.2
matplotlib == 3.2.2
numba == 0.48
numpy == 1.19.0
pypinyin==0.39.0
pyworld == 0.2.10
PyYAML==5.4.1
scikit-learn==0.23.2
scipy == 1.5.0
soundfile==0.10.3.post1
tensorboard == 2.2.2
tgt == 1.4.4
torch == 1.7.0
tqdm==4.46.1
unidecode == 1.1.1
42 changes: 42 additions & 0 deletions resgrad/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@

import torch

###############################################
#################### Data #####################
batch_size = 32
# Ground-truth mel-spectrograms. data.py looks up
# "single_speaker-mel-<file name>" here for every file of input_data_dir.
target_data_dir = "dataset/Persian/resgrad_data/mel_target"
# Mel-spectrograms predicted by the synthesizer; data.py lists this
# directory to enumerate the training samples.
input_data_dir = "dataset/Persian/resgrad_data/mel_prediction"
# Per-phoneme durations, stored as "single_speaker-duration-<file name>".
durations_dir = "dataset/Persian/resgrad_data/durations"
# Number of samples split off for validation in data.create_dataset().
val_size = 16
preprocessed_path = "processed_data"
normalized_method = "min-max"

shuffle_data = True
# NOTE: the "normallize_*" spelling is referenced by data.py and
# inference.py, so these names must not be renamed here alone.
normallize_spectrum = True
min_spec_value = -13
max_spec_value = 3
normallize_residual = True
min_residual_value = -0.25
max_residual_value = 0.25
max_win_length = 100  ## maximum size of window in spectrum

# NOTE(review): data.py reads `config.spectrum_max_size` when model_type2 is
# not "segment-based", but no such setting is defined in this file.

###############################################
################## Training ###################
lr = 1e-4
epochs = 70
save_model_path = "output/persian/resgrad/ckpt"
# Fall back to the CPU when no CUDA device is available instead of
# hard-coding "cuda", which fails on CPU-only machines.
device = "cuda" if torch.cuda.is_available() else "cpu"
validate_every_n_step = 20
log_dir = 'output/persian/resgrad/log'
save_path = 'checkpoint'

###############################################
############ Model Parameters #################
model_type1 = "spec2residual"  ## "spec2spec" or "spec2residual"
model_type2 = "segment-based"  ## "segment-based" or "sentence-based"
n_feats = 80
dim = 64
n_spks = 1
spk_emb_dim = 64
beta_min = 0.05
beta_max = 20.0
pe_scale = 1000
101 changes: 101 additions & 0 deletions resgrad/data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
from torch.utils.data import Dataset, DataLoader
import numpy as np
import torch
import os

from . import config
from .utils import normalize_residual, normalize_data

class SpectumDataset(Dataset):
    """Dataset of predicted/target mel-spectrogram pairs for ResGrad training.

    Every file found in ``config.input_data_dir`` is one sample. The matching
    target spectrogram and phoneme-duration files are looked up in
    ``config.target_data_dir`` and ``config.durations_dir`` under the same
    file name prefixed with ``single_speaker-mel-`` and
    ``single_speaker-duration-`` respectively.
    """

    def __init__(self):
        super().__init__()
        self.input_data_path = []
        self.target_data_path = []
        self.duration_data_path = []
        # os.listdir() returns entries in arbitrary order; sorting makes the
        # index -> file mapping reproducible across runs and machines.
        for file_name in sorted(os.listdir(config.input_data_dir)):
            self.input_data_path.append(os.path.join(config.input_data_dir, file_name))
            self.target_data_path.append(
                os.path.join(config.target_data_dir, 'single_speaker-mel-' + file_name)
            )
            self.duration_data_path.append(
                os.path.join(config.durations_dir, 'single_speaker-duration-' + file_name)
            )

        if config.model_type2 == "segment-based":
            self.max_len = config.max_win_length
        else:
            # NOTE(review): `spectrum_max_size` is not defined in the
            # resgrad/config.py of this commit, so this branch raises
            # AttributeError until the setting is added there.
            self.max_len = config.spectrum_max_size

    def _select_window(self, durations):
        """Pick a random phoneme-aligned window of at most ``self.max_len`` frames.

        A random start phoneme is drawn and the window is grown forward one
        phoneme at a time until adding another phoneme would exceed
        ``self.max_len``. If the end of the utterance is reached first, the
        window is anchored at the end and grown backward instead.

        Returns:
            ``(win_start, win_end)`` frame indices of the window.
        """
        n_phonemes = len(durations)
        # The original drew from ``len(durations) - 4`` unconditionally, which
        # raises for utterances with four or fewer phonemes; clamp to >= 1.
        start = np.random.choice(max(1, n_phonemes - 4), 1)[0]
        end = 0
        for i in range(start + 1, n_phonemes + 1):
            if sum(durations[start:i]) > self.max_len:
                end = i - 1
                break
        if end == 0:
            # Forward growth hit the end of the utterance without filling the
            # window: anchor at the end and extend backward.
            # NOTE(review): nesting of this backward pass under the `if` was
            # reconstructed from a source whose indentation was lost - confirm.
            end = n_phonemes
            for _ in range(start):
                start -= 1
                if sum(durations[start:end]) > self.max_len:
                    start += 1
                    break
        return sum(durations[:start]), sum(durations[:end])

    def __getitem__(self, index):
        """Load, normalise, crop and zero-pad the sample at ``index``.

        Returns:
            ``(input_spec, target_spec, residual_spec, mask)`` when
            ``config.model_type1 == "spec2residual"``, otherwise
            ``(input_spec, target_spec, mask)``. Spectrograms are padded along
            the last axis to ``self.max_len`` frames; ``mask`` has shape
            ``(1, padded_length)`` and is 1 on real frames and 0 on padding.
        """
        input_spec = torch.from_numpy(np.load(self.input_data_path[index])).squeeze()
        # The stored target is transposed to line up with the input layout
        # (presumably (frames, n_mels) on disk - TODO confirm).
        target_spec = torch.from_numpy(np.load(self.target_data_path[index])).T
        durations = np.load(self.duration_data_path[index])

        if config.normallize_spectrum:
            input_spec = normalize_data(input_spec)
            target_spec = normalize_data(target_spec)

        if config.model_type2 == "segment-based":
            win_start, win_end = self._select_window(durations)
            input_spec = input_spec[:, win_start:win_end]
            target_spec = target_spec[:, win_start:win_end]

        spec_size = input_spec.shape[-1]
        pad_amount = (0, self.max_len - spec_size)
        input_spec = torch.nn.functional.pad(input_spec, pad_amount, mode="constant", value=0.0)
        target_spec = torch.nn.functional.pad(target_spec, pad_amount, mode="constant", value=0.0)

        mask = torch.ones((1, input_spec.shape[-1]))
        mask[:, spec_size:] = 0

        if config.model_type1 == "spec2residual":
            residual_spec = target_spec - input_spec
            if config.normallize_residual:
                residual_spec = normalize_residual(residual_spec)
            # Zero the padded frames so they carry no residual.
            residual_spec = residual_spec * mask
            return input_spec, target_spec, residual_spec, mask
        return input_spec, target_spec, mask

    def __len__(self):
        """Return the number of samples (files in ``config.input_data_dir``)."""
        return len(self.input_data_path)


def create_dataset():
    """Build the training and validation data loaders.

    The full ``SpectumDataset`` is randomly split into ``config.val_size``
    validation samples and the remaining training samples. Both loaders use
    ``config.batch_size`` and ``config.shuffle_data``.

    Returns:
        ``(train_loader, val_loader)``.
    """
    full_dataset = SpectumDataset()
    split_sizes = [config.val_size, len(full_dataset) - (config.val_size)]
    val_subset, train_subset = torch.utils.data.random_split(full_dataset, split_sizes)

    train_loader = DataLoader(
        train_subset,
        batch_size=config.batch_size,
        shuffle=config.shuffle_data,
    )
    val_loader = DataLoader(
        val_subset,
        batch_size=config.batch_size,
        shuffle=config.shuffle_data,
    )
    return train_loader, val_loader
86 changes: 86 additions & 0 deletions resgrad/inference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
from . import config
from .utils import denormalize_residual, denormalize_data, normalize_data

import torch
import numpy as np


def _pad_segment(spec, start_point, end_point):
    """Cut ``spec[:, :, start_point:end_point]`` and zero-pad it to the window size.

    Returns:
        ``(segment, mask, size)`` where ``segment`` is padded along the last
        axis by ``config.max_win_length - size`` frames, ``mask`` has shape
        ``(1, 1, padded_length)`` with 1 on real frames and 0 on padding, and
        ``size`` is the number of real frames in the segment.
    """
    segment = spec[:, :, start_point:end_point]
    size = segment.shape[-1]
    segment = torch.nn.functional.pad(
        segment, (0, config.max_win_length - size), mode="constant", value=0.0
    )
    mask = torch.ones((1, 1, segment.shape[-1]), device=config.device)
    mask[:, :, size:] = 0
    return segment, mask, size


def infer(model, mel_prediction, duration_prediction):
    """Refine a synthesized mel-spectrogram with the ResGrad model.

    Args:
        model: callable invoked as
            ``model(z, mask, spec, n_timesteps=..., stoc=False, spk=None)``.
        mel_prediction: 3-D NumPy array; axes 1 and 2 are swapped before use
            (presumably (batch, frames, n_mels) - TODO confirm).
        duration_prediction: per-phoneme log-durations; frame counts are
            recovered as ``round(exp(d) - 1)``. Only used when
            ``config.model_type2 == "segment-based"``.

    Returns:
        The refined spectrogram as a tensor on ``config.device``,
        de-normalised when ``config.normallize_spectrum`` is set.
    """
    synthesized_spec = torch.from_numpy(mel_prediction.transpose(0, 2, 1)).to(config.device)
    if config.normallize_spectrum:
        synthesized_spec = normalize_data(synthesized_spec)

    if config.model_type2 == "segment-based":
        # atleast_1d: squeeze() turns a single-phoneme input into a 0-d array.
        durations = np.atleast_1d(np.round(np.exp(duration_prediction.squeeze()) - 1))
        n_phonemes = len(durations)
        # frame_offsets[k] is the number of frames before phoneme k, so a
        # window of phonemes [a, b) spans frame_offsets[b] - frame_offsets[a].
        frame_offsets = (
            np.concatenate(([0], np.cumsum(durations, dtype=np.float64))).astype(int).tolist()
        )

        ## Create segments of data except the last segment
        frame_ranges = []
        window_start = 0
        for i in range(1, n_phonemes + 1):
            if frame_offsets[i] - frame_offsets[window_start] > config.max_win_length:
                window_end = i - 1
                frame_ranges.append((frame_offsets[window_start], frame_offsets[window_end]))
                window_start = window_end

        ## Create the last segment, overlapping the previous segments
        tail_start = n_phonemes
        for _ in range(n_phonemes):
            tail_start -= 1
            if frame_offsets[n_phonemes] - frame_offsets[tail_start] > config.max_win_length:
                tail_start += 1
                break
        # Always emit the tail segment. Previously it was only emitted when
        # the backward scan overflowed the window, so an utterance that fits
        # in a single window produced no segments and torch.cat() failed.
        frame_ranges.append((frame_offsets[tail_start], frame_offsets[n_phonemes]))

        all_mask, all_segment_spec, all_start_points, all_spec_size = [], [], [], []
        for start_point, end_point in frame_ranges:
            segment_spec, mask, spec_size = _pad_segment(synthesized_spec, start_point, end_point)
            all_segment_spec.append(segment_spec)
            all_mask.append(mask)
            all_start_points.append(start_point)
            all_spec_size.append(spec_size)

        mask = torch.cat(all_mask).to(config.device)
        segment_spec = torch.cat(all_segment_spec).to(config.device)
        z = segment_spec + torch.randn_like(segment_spec, device=config.device) / 1.5
        segments_pred = model(z, mask, segment_spec, n_timesteps=25, stoc=False, spk=None)

        # Stitch the per-segment predictions back together. Later segments
        # overwrite earlier ones where they overlap.
        pred = torch.zeros(synthesized_spec.shape, device=config.device)
        for segment_pred, start_point, spec_size in zip(
            segments_pred, all_start_points, all_spec_size
        ):
            pred[:, :, start_point:start_point + spec_size] = segment_pred[:, :spec_size]
    else:
        mask = torch.ones(synthesized_spec.shape).to(config.device)
        z = synthesized_spec + torch.randn_like(synthesized_spec, device=config.device) / 1.5
        pred = model(z, mask, synthesized_spec, n_timesteps=50, stoc=False, spk=None)
    pred = pred.to(config.device)

    if config.model_type1 == "spec2residual":
        # The model predicted a residual: add it back onto the input.
        if config.normallize_residual:
            spec_pred = denormalize_residual(pred) + synthesized_spec
        else:
            spec_pred = pred + synthesized_spec
    else:
        spec_pred = pred

    if config.normallize_spectrum:
        spec_pred = denormalize_data(spec_pred)

    return spec_pred
9 changes: 9 additions & 0 deletions resgrad/model/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved.
# This program is free software; you can redistribute it and/or modify
# it under the terms of the MIT License.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# MIT License for more details.

from .diffusion import Diffusion
37 changes: 37 additions & 0 deletions resgrad/model/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved.
# This program is free software; you can redistribute it and/or modify
# it under the terms of the MIT License.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# MIT License for more details.

import numpy as np
import torch


class BaseModule(torch.nn.Module):
    """``torch.nn.Module`` base class with parameter-count and device helpers."""

    def __init__(self):
        super().__init__()

    @property
    def nparams(self):
        """
        Returns number of trainable parameters of the module.
        """
        # numel() reads the element count directly; the previous version
        # copied every parameter to the CPU and converted it to NumPy just
        # to look at its shape.
        return sum(param.numel() for param in self.parameters() if param.requires_grad)

    def relocate_input(self, x: list):
        """
        Relocates provided tensors to the same device set for the module.

        The list is updated in place and also returned. Non-tensor items are
        left untouched. A module without parameters has no device to move
        to, so the list is returned unchanged in that case.
        """
        first_param = next(self.parameters(), None)
        if first_param is None:
            return x
        device = first_param.device
        for i, item in enumerate(x):
            if isinstance(item, torch.Tensor) and item.device != device:
                x[i] = item.to(device)
        return x
Loading

0 comments on commit 6075dab

Please sign in to comment.