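# NOTE: the original first lines ("Spaces: Running Running") were page-header
# residue from the site this file was copied from; they are not part of the module.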
import os
import json
import logging

import torch
import numpy as np

import config
from utils.utils import check_is_none
from vits import VITS
from voice import TTS
# Torch device shared by every model built in this module: CUDA when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Maps a text-cleaner name (as returned by a model's ``get_cleaner()``) to the
# list of language tags reported for that model's speakers.  Cleaners that are
# not listed here are reported as ["unknown"] by ``merge_model``.
lang_dict = {
    "english_cleaners": ["en"],
    "english_cleaners2": ["en"],
    "japanese_cleaners": ["ja"],
    "japanese_cleaners2": ["ja"],
    "korean_cleaners": ["ko"],
    "chinese_cleaners": ["zh"],
    "zh_ja_mixture_cleaners": ["zh", "ja"],
    "sanskrit_cleaners": ["sa"],
    "cjks_cleaners": ["zh", "ja", "ko", "sa"],
    "cjke_cleaners": ["zh", "ja", "ko", "en"],
    "cjke_cleaners2": ["zh", "ja", "ko", "en"],
    "cje_cleaners": ["zh", "ja", "en"],
    "cje_cleaners2": ["zh", "ja", "en"],
    "thai_cleaners": ["th"],
    "shanghainese_cleaners": ["sh"],
    "chinese_dialect_cleaners": ["zh", "ja", "sh", "gd", "en", "SZ", "WX", "CZ", "HZ", "SX", "NB", "JJ", "YX", "JD",
                                 "ZR", "PH", "TX", "JS", "HN", "LP", "XS", "FY", "RA", "CX", "SM", "TT", "WZ", "SC",
                                 "YB"],
    "bert_chinese_cleaners": ["zh"],
}
def analysis(model_config_json):
    """Classify a model by inspecting its JSON configuration.

    Args:
        model_config_json: An open, readable file-like object whose content is
            the model's JSON configuration (it is passed to ``json.load``).

    Returns:
        ``"bert_vits2"`` when the ``"model"`` section contains the key
        ``"use_spk_conditioned_encoder"``; otherwise ``"hubert"`` when the
        config has no ``"symbols"``; otherwise ``"w2v2"`` when
        ``data.emotion_embedding`` is truthy, else ``"vits"``.
    """
    model_config = json.load(model_config_json)

    # A missing (or null) section is treated as empty so that an incomplete
    # config is still classified instead of failing on ``None``.
    model_section = model_config.get("model") or {}
    data_section = model_config.get("data") or {}

    if "use_spk_conditioned_encoder" in model_section:
        return "bert_vits2"

    if model_config.get("symbols") is None:
        return "hubert"

    if data_section.get("emotion_embedding", False):
        return "w2v2"
    return "vits"
def _stack_npy(paths):
    """Load each ``.npy`` file in ``paths``, reshape it to (-1, 1024) and stack the rows."""
    # The leading empty (0, 1024) array keeps the result well-formed (and
    # float-typed) even when ``paths`` is empty.
    chunks = [np.empty((0, 1024))]
    chunks.extend(np.load(path).reshape(-1, 1024) for path in paths)
    return np.concatenate(chunks, axis=0)


def load_npy(model_):
    """Load emotion reference data from one or more ``.npy`` files.

    Args:
        model_: One of
            * a list of ``.npy`` file paths,
            * a directory, which is walked recursively and every ``.npy``
              file found in it is loaded (other files are skipped), or
            * a single ``.npy`` file path.

    Returns:
        For a list or a directory: all arrays reshaped to ``(-1, 1024)`` and
        concatenated along axis 0.  For a single file: the array as stored.

    Raises:
        ValueError: If a listed file or the single file does not have the
            ``.npy`` extension, or if ``model_`` is neither a list, an
            existing directory nor an existing file.
    """
    if isinstance(model_, list):
        # Validate every extension up front so nothing is loaded on bad input.
        for path in model_:
            extension = os.path.splitext(path)[1]
            if extension != ".npy":
                raise ValueError(f"Unsupported model type: {extension}")
        emotion_reference = _stack_npy(model_)
    elif os.path.isdir(model_):
        npy_paths = [
            os.path.join(root, file_name)
            for root, _dirs, files in os.walk(model_)
            for file_name in files
            if os.path.splitext(file_name)[1] == ".npy"
        ]
        emotion_reference = _stack_npy(npy_paths)
    elif os.path.isfile(model_):
        extension = os.path.splitext(model_)[1]
        if extension != ".npy":
            raise ValueError(f"Unsupported model type: {extension}")
        # NOTE(review): unlike the list/directory branches, a single file is
        # returned without the (-1, 1024) reshape — confirm this is intended.
        emotion_reference = np.load(model_)
    else:
        # Previously this case fell through and failed with an unbound local.
        raise ValueError(f"Unsupported model path: {model_}")

    logging.info(f"Loaded emotional dimention npy range:{len(emotion_reference)}")
    return emotion_reference
def _collect_speakers(model_entries, build_model, resolve_lang):
    """Build every model in ``model_entries`` and flatten their speakers.

    Args:
        model_entries: Sequence of entries whose index 0 is the model path and
            index 1 is the config path.
        build_model: Callable ``(model_path, config_path)`` returning the
            loaded model object.
        resolve_lang: Callable taking the loaded model object and returning
            the language list stored with each of its speakers.

    Returns:
        A ``(objects, speakers)`` pair of equally long lists.  ``objects``
        holds ``[speaker index within its model, model object, model index]``
        and ``speakers`` holds ``{"id", "name", "lang"}`` dicts whose ``id``
        counts consecutively from 0 across all models in ``model_entries``.
    """
    objects = []
    speakers = []
    for obj_id, entry in enumerate(model_entries):
        obj = build_model(entry[0], entry[1])
        lang = resolve_lang(obj)
        for speaker_id, name in enumerate(obj.get_speakers()):
            objects.append([speaker_id, obj, obj_id])
            # The merged id is simply the speaker's position in the flat list.
            speakers.append({"id": len(speakers), "name": name, "lang": lang})
    return objects, speakers


def merge_model(merging_model):
    """Load all configured models and combine them into a single ``TTS`` object.

    Each entry is classified with ``analysis`` from its config file, grouped
    by model type, loaded, and its speakers are renumbered per model type.

    Args:
        merging_model: Iterable of entries whose index 0 is the model path
            and index 1 is the path of the model's JSON config file.

    Returns:
        The ``TTS`` instance built from the merged model objects and speaker
        lists of the ``VITS``, ``HUBERT-VITS``, ``W2V2-VITS`` and
        ``BERT-VITS2`` groups.

    Raises:
        ValueError: If hubert models are present but
            ``config.HUBERT_SOFT_MODEL`` is not configured or fails to load,
            or if w2v2 models are present but
            ``config.DIMENSIONAL_EMOTION_NPY`` is not configured or fails to
            load.
    """
    # Group the entries by detected model type; unknown types are ignored.
    grouped = {"vits": [], "hubert": [], "w2v2": [], "bert_vits2": []}
    for entry in merging_model:
        with open(entry[1], 'r', encoding='utf-8') as model_config:
            model_type = analysis(model_config)
        if model_type in grouped:
            grouped[model_type].append(entry)

    def cleaner_lang(obj):
        # Language list is derived from the model's text cleaner.
        return lang_dict.get(obj.get_cleaner(), ["unknown"])

    # merge vits
    vits_obj, vits_speakers = _collect_speakers(
        grouped["vits"],
        lambda model, cfg: VITS(model=model, config=cfg, model_type="vits", device=device),
        cleaner_lang,
    )

    # merge hubert-vits
    hubert_vits_obj, hubert_vits_speakers = [], []
    if grouped["hubert"]:
        hubert_path = getattr(config, "HUBERT_SOFT_MODEL", None)
        if hubert_path is None or check_is_none(hubert_path):
            raise ValueError("Please configure HUBERT_SOFT_MODEL path in config.py")
        try:
            from vits.hubert_model import hubert_soft
            hubert = hubert_soft(hubert_path)
        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise ValueError(f"Load HUBERT_SOFT_MODEL failed {e}") from e
        hubert_vits_obj, hubert_vits_speakers = _collect_speakers(
            grouped["hubert"],
            lambda model, cfg: VITS(model=model, config=cfg, model_=hubert, model_type="hubert", device=device),
            cleaner_lang,
        )

    # merge w2v2-vits
    emotion_reference = None
    w2v2_vits_obj, w2v2_vits_speakers = [], []
    if grouped["w2v2"]:
        emotion_path = getattr(config, "DIMENSIONAL_EMOTION_NPY", None)
        if emotion_path is None or check_is_none(emotion_path):
            raise ValueError("Please configure DIMENSIONAL_EMOTION_NPY path in config.py")
        try:
            emotion_reference = load_npy(emotion_path)
        except Exception as e:
            raise ValueError(f"Load DIMENSIONAL_EMOTION_NPY failed {e}") from e
        w2v2_vits_obj, w2v2_vits_speakers = _collect_speakers(
            grouped["w2v2"],
            lambda model, cfg: VITS(model=model, config=cfg, model_=emotion_reference, model_type="w2v2",
                                    device=device),
            cleaner_lang,
        )

    # merge Bert_VITS2
    bert_vits2_obj, bert_vits2_speakers = [], []
    if grouped["bert_vits2"]:
        # Imported lazily so the package is only required when such a model exists.
        from bert_vits2 import Bert_VITS2
        bert_vits2_obj, bert_vits2_speakers = _collect_speakers(
            grouped["bert_vits2"],
            lambda model, cfg: Bert_VITS2(model=model, config=cfg, device=device),
            lambda obj: ["ZH"],
        )

    voice_obj = {"VITS": vits_obj, "HUBERT-VITS": hubert_vits_obj, "W2V2-VITS": w2v2_vits_obj,
                 "BERT-VITS2": bert_vits2_obj}
    voice_speakers = {"VITS": vits_speakers, "HUBERT-VITS": hubert_vits_speakers, "W2V2-VITS": w2v2_vits_speakers,
                      "BERT-VITS2": bert_vits2_speakers}
    w2v2_emotion_count = len(emotion_reference) if emotion_reference is not None else 0

    tts = TTS(voice_obj, voice_speakers, w2v2_emotion_count=w2v2_emotion_count, device=device)
    return tts