Update app.py

app.py CHANGED
```diff
@@ -30,13 +30,10 @@ bert_path = f"./PretrainedModels/{model_id}/chinese-roberta-wwm-ext-large"
 
 import gradio as gr
 from transformers import AutoModelForMaskedLM, AutoTokenizer
-import sys,torch,numpy as np
+import sys, torch, numpy as np
 from pathlib import Path
-
-
-# torch.backends.cuda.enable_flash_sdp(True)
-# torch.backends.cuda.enable_mem_efficient_sdp(True) # Not avaliable if torch version is lower than 2.0
-# torch.backends.cuda.enable_math_sdp(True)
+from pydub import AudioSegment
+import librosa, math, traceback, requests, argparse, torch, multiprocessing, pandas as pd, torch.multiprocessing as mp, soundfile
 from random import shuffle
 from AR.utils import get_newest_ckpt
 from glob import glob
```
```diff
@@ -61,17 +58,15 @@ logging.getLogger('multipart').setLevel(logging.WARNING)
 
 device = "cpu"
 is_half = False
-
 tokenizer = AutoTokenizer.from_pretrained(bert_path)
 bert_model=AutoModelForMaskedLM.from_pretrained(bert_path)
 if(is_half==True):bert_model=bert_model.half().to(device)
 else:bert_model=bert_model.to(device)
-# bert_model=bert_model.to(device)
 def get_bert_feature(text, word2ph):
     with torch.no_grad():
         inputs = tokenizer(text, return_tensors="pt")
         for i in inputs:
-            inputs[i] = inputs[i].to(device)
+            inputs[i] = inputs[i].to(device)
         res = bert_model(**inputs, output_hidden_states=True)
         res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()[1:-1]
     assert len(word2ph) == len(text)
```
```diff
@@ -80,10 +75,8 @@ def get_bert_feature(text, word2ph):
         repeat_feature = res[i].repeat(word2ph[i], 1)
         phone_level_feature.append(repeat_feature)
     phone_level_feature = torch.cat(phone_level_feature, dim=0)
-    # if(is_half==True):phone_level_feature=phone_level_feature.half()
     return phone_level_feature.T
 
-
 def load_model(sovits_path, gpt_path):
     n_semantic = 1024
     dict_s2 = torch.load(sovits_path, map_location="cpu")
```
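For context, `get_bert_feature` expands per-character BERT features to phoneme level: each character's hidden state is repeated once for every phoneme it maps to in `word2ph`. A minimal standalone sketch of that expansion, with dummy tensors standing in for the tokenizer and model:

```python
import torch

# Dummy stand-ins: 3 characters, hidden size 1024,
# mapping to 2, 1, and 3 phonemes respectively.
res = torch.randn(3, 1024)
word2ph = [2, 1, 3]

phone_level_feature = []
for i in range(len(word2ph)):
    # Repeat character i's feature once per phoneme it covers.
    phone_level_feature.append(res[i].repeat(word2ph[i], 1))
phone_level_feature = torch.cat(phone_level_feature, dim=0)

print(phone_level_feature.T.shape)  # torch.Size([1024, 6]) -> (dim, sum(word2ph))
```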
```diff
@@ -224,6 +217,18 @@ def split(todo_text):
 def change_reference_audio(prompt_text, transcripts):
     return transcripts[prompt_text]
 
+def get_audio_duration(path):
+    audio = AudioSegment.from_wav(path)
+    return len(audio) / 1000
+def select_audio_file(wav_paths):
+    import random
+    eligible_files = [path for path in wav_paths if 3 <= get_audio_duration(path) <= 10]
+    if eligible_files:
+        selected_file = random.choice(eligible_files)
+    else:
+        selected_file = random.choice(wav_paths)
+    return selected_file
+
 models = []
 models_info = {}
 models_folder_path = "./Models/None1145"
```
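The new helpers prefer reference clips between 3 and 10 seconds (pydub's `AudioSegment` reports its length in milliseconds, hence the division by 1000) and fall back to any clip when none qualifies. A quick sketch of the same selection logic; the file paths are hypothetical:

```python
from pydub import AudioSegment
import random

def get_audio_duration(path):
    # len() of an AudioSegment is its duration in milliseconds.
    return len(AudioSegment.from_wav(path)) / 1000

# Hypothetical clips for illustration.
wav_paths = ["./Data/clip_a.wav", "./Data/clip_b.wav", "./Data/clip_c.wav"]
eligible = [p for p in wav_paths if 3 <= get_audio_duration(p) <= 10]
print(random.choice(eligible or wav_paths))  # same fallback as select_audio_file
```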
```diff
@@ -258,6 +263,8 @@ for folder_name in folder_names:
     models_info[speaker]["gpt_weight"] = f"{gpt_path}/{max_file}"
     data_path = f"{models_folder_path}/{folder_name}/Data"
     models_info[speaker]["transcript"] = {}
+    wavs = []
+    tmp = {}
     with open(f"{data_path}/{speaker}.list", "r", encoding="utf-8") as f:
         for line in f.read().split("\n"):
             try:
```
```diff
@@ -266,8 +273,10 @@ for folder_name in folder_names:
                 break
             text = line.split("|")[3]
             print(wav, text)
+            wavs.append(wav)
+            tmp[wav] = text
             models_info[speaker]["transcript"][text] = wav
-
+    models_info[speaker]["example_reference"] = tmp[select_audio_file(wavs)]
 print(models_info)
 for speaker in models_info:
     speaker_info = models_info[speaker]
```
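With this change each speaker entry also records the transcript text of one randomly selected 3-10 second clip under `example_reference`. Sketched with illustrative values (only the keys visible in this diff are shown):

```python
models_info = {
    "SomeSpeaker": {
        "gpt_weight": "./Models/None1145/SomeSpeaker/...",        # newest GPT checkpoint
        "transcript": {"a transcript line": "path/to/clip.wav"},  # text -> wav path
        "example_reference": "a transcript line",                 # text of the chosen clip
    },
}
```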
```diff
@@ -322,7 +331,7 @@ with gr.Blocks() as app:
         text_language = gr.Dropdown(
             label="Language",
             choices=["zh", "en", "ja"],
-            value="
+            value="zh"
         )
         inference_button = gr.Button("Generate", variant="primary")
         om = gr.Textbox(label="Output Message")
```
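The removed `value="` line appears truncated in this view; the commit now defaults the dropdown to "zh". In Gradio, `value` sets the initially selected choice. A minimal sketch:

```python
import gradio as gr

with gr.Blocks() as demo:
    text_language = gr.Dropdown(
        label="Language",
        choices=["zh", "en", "ja"],
        value="zh",  # pre-selected on page load
    )

demo.launch()
```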
|