None1145 committed
Commit 46b65aa · verified · 1 Parent(s): d86778e

Update app.py

Files changed (1)
  1. app.py +22 -13
app.py CHANGED
@@ -30,13 +30,10 @@ bert_path = f"./PretrainedModels/{model_id}/chinese-roberta-wwm-ext-large"
 
 import gradio as gr
 from transformers import AutoModelForMaskedLM, AutoTokenizer
-import sys,torch,numpy as np
+import sys, torch, numpy as np
 from pathlib import Path
-import os,pdb,utils,librosa,math,traceback,requests,argparse,torch,multiprocessing,pandas as pd,torch.multiprocessing as mp,soundfile
-# torch.backends.cuda.sdp_kernel("flash")
-# torch.backends.cuda.enable_flash_sdp(True)
-# torch.backends.cuda.enable_mem_efficient_sdp(True) # Not available if torch version is lower than 2.0
-# torch.backends.cuda.enable_math_sdp(True)
+from pydub import AudioSegment
+import librosa, math, traceback, requests, argparse, torch, multiprocessing, pandas as pd, torch.multiprocessing as mp, soundfile
 from random import shuffle
 from AR.utils import get_newest_ckpt
 from glob import glob
@@ -61,17 +58,15 @@ logging.getLogger('multipart').setLevel(logging.WARNING)
 
 device = "cpu"
 is_half = False
-
 tokenizer = AutoTokenizer.from_pretrained(bert_path)
 bert_model=AutoModelForMaskedLM.from_pretrained(bert_path)
 if(is_half==True):bert_model=bert_model.half().to(device)
 else:bert_model=bert_model.to(device)
-# bert_model=bert_model.to(device)
 def get_bert_feature(text, word2ph):
     with torch.no_grad():
         inputs = tokenizer(text, return_tensors="pt")
         for i in inputs:
-            inputs[i] = inputs[i].to(device)  ##### inputs are long tensors, so precision is not a concern; precision follows bert_model
+            inputs[i] = inputs[i].to(device)
         res = bert_model(**inputs, output_hidden_states=True)
         res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()[1:-1]
     assert len(word2ph) == len(text)
@@ -80,10 +75,8 @@ def get_bert_feature(text, word2ph):
         repeat_feature = res[i].repeat(word2ph[i], 1)
         phone_level_feature.append(repeat_feature)
     phone_level_feature = torch.cat(phone_level_feature, dim=0)
-    # if(is_half==True):phone_level_feature=phone_level_feature.half()
     return phone_level_feature.T
 
-
 def load_model(sovits_path, gpt_path):
     n_semantic = 1024
     dict_s2 = torch.load(sovits_path, map_location="cpu")
@@ -224,6 +217,18 @@ def split(todo_text):
 def change_reference_audio(prompt_text, transcripts):
     return transcripts[prompt_text]
 
+def get_audio_duration(path):
+    audio = AudioSegment.from_wav(path)
+    return len(audio) / 1000
+def select_audio_file(wav_paths):
+    import random
+    eligible_files = [path for path in wav_paths if 3 <= get_audio_duration(path) <= 10]
+    if eligible_files:
+        selected_file = random.choice(eligible_files)
+    else:
+        selected_file = random.choice(wav_paths)
+    return selected_file
+
 models = []
 models_info = {}
 models_folder_path = "./Models/None1145"
@@ -258,6 +263,8 @@ for folder_name in folder_names:
     models_info[speaker]["gpt_weight"] = f"{gpt_path}/{max_file}"
     data_path = f"{models_folder_path}/{folder_name}/Data"
     models_info[speaker]["transcript"] = {}
+    wavs = []
+    tmp = {}
     with open(f"{data_path}/{speaker}.list", "r", encoding="utf-8") as f:
         for line in f.read().split("\n"):
             try:
@@ -266,8 +273,10 @@ for folder_name in folder_names:
                 break
             text = line.split("|")[3]
             print(wav, text)
+            wavs.append(wav)
+            tmp[wav] = text
             models_info[speaker]["transcript"][text] = wav
-            models_info[speaker]["example_reference"] = text
+    models_info[speaker]["example_reference"] = tmp[select_audio_file(wavs)]
 print(models_info)
 for speaker in models_info:
     speaker_info = models_info[speaker]
@@ -322,7 +331,7 @@ with gr.Blocks() as app:
         text_language = gr.Dropdown(
            label="Language",
            choices=["zh", "en", "ja"],
-           value="ja"
+           value="zh"
        )
        inference_button = gr.Button("Generate", variant="primary")
        om = gr.Textbox(label="Output Message")
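
Context for the get_bert_feature hunk above: the function keeps one hidden-state vector per input character and repeats it word2ph[i] times so the feature matrix lines up with the phoneme sequence rather than the character sequence. A minimal sketch of that expansion, using toy tensors instead of the real chinese-roberta-wwm-ext-large model (the shapes and word2ph values below are illustrative assumptions, not taken from the repository):

import torch

def expand_to_phone_level(char_features, word2ph):
    # char_features: (num_chars, hidden_dim); word2ph[i] = number of phonemes for character i
    phone_level = [char_features[i].repeat(word2ph[i], 1) for i in range(len(word2ph))]
    return torch.cat(phone_level, dim=0).T  # (hidden_dim, total_phonemes)

# Illustrative values only: 3 characters, 1024-dim vectors, 2 phonemes per character.
features = torch.randn(3, 1024)
print(expand_to_phone_level(features, [2, 2, 2]).shape)  # torch.Size([1024, 6])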
 
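The new get_audio_duration helper relies on pydub, which needs a decodable WAV file (and ffmpeg for other formats). Since app.py already imports soundfile, the same duration check could be done without the extra dependency; this is only a hedged alternative sketch, not what the commit itself does:

import random
import soundfile

def get_audio_duration_sf(path):
    # soundfile.info only reads the header, so this stays cheap for long files.
    return soundfile.info(path).duration  # duration in seconds

def select_reference_clip(wav_paths):
    # Mirrors the commit's rule: prefer clips between 3 and 10 seconds,
    # otherwise fall back to any available clip.
    eligible = [p for p in wav_paths if 3 <= get_audio_duration_sf(p) <= 10]
    return random.choice(eligible or wav_paths)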
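The transcript loop in the diff reads a GPT-SoVITS-style .list annotation file, and the hunks only show part of the parsing. Assuming the usual field order wav_path|speaker|language|text (inferred from line.split("|")[3]; treat the layout as an assumption rather than something stated in this diff), a standalone sketch of the same parsing would look like this:

def parse_transcripts(list_path):
    # Returns text -> wav path, matching models_info[speaker]["transcript"] above.
    # Assumed line format: wav_path|speaker|language|text
    transcript = {}
    with open(list_path, "r", encoding="utf-8") as f:
        for line in f.read().split("\n"):
            fields = line.split("|")
            if len(fields) < 4:
                continue  # skip blank or malformed lines instead of breaking
            wav, text = fields[0], fields[3]
            transcript[text] = wav
    return transcript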