Files changed (4) hide show
  1. README.md +1 -3
  2. app.py +25 -39
  3. gitattributes +1 -0
  4. gitattributes (1) +36 -0
README.md CHANGED
@@ -3,9 +3,7 @@ title: XTTS-streaming
3
  emoji: 🐸
4
  colorFrom: green
5
  colorTo: red
6
- sdk: gradio
7
- sdk_version: 3.44.3
8
- app_file: app.py
9
  pinned: false
10
  models:
11
  - coqui/XTTS-v1
 
3
  emoji: 🐸
4
  colorFrom: green
5
  colorTo: red
6
+ sdk: static
 
 
7
  pinned: false
8
  models:
9
  - coqui/XTTS-v1
app.py CHANGED
@@ -29,20 +29,20 @@ from TTS.tts.configs.xtts_config import XttsConfig
29
  from TTS.tts.models.xtts import Xtts
30
  from TTS.utils.generic_utils import get_user_data_dir
31
 
32
- #HF_TOKEN = os.environ.get("HF_TOKEN")
33
 
34
  from huggingface_hub import HfApi
35
 
36
  # will use api to restart space on an unrecoverable error
37
- #api = HfApi(token=HF_TOKEN)
38
  repo_id = "coqui/xtts-streaming"
39
 
40
- ## Use never ffmpeg binary for Ubuntu20 to use denoising for microphone input
41
- #print("Export newer ffmpeg binary for denoise filter")
42
- #ZipFile("ffmpeg.zip").extractall()
43
- #print("Make ffmpeg binary executable")
44
- #st = os.stat('ffmpeg')
45
- #os.chmod('ffmpeg', st.st_mode | stat.S_IEXEC)
46
 
47
  # This will trigger downloading model
48
  print("Downloading if not downloaded Coqui XTTS V1.1")
@@ -73,7 +73,7 @@ DEVICE_ASSERT_DETECTED=0
73
  DEVICE_ASSERT_PROMPT=None
74
  DEVICE_ASSERT_LANG=None
75
 
76
- #supported_languages=["en","es","fr","de","it","pt","pl","tr","ru","nl","cs","ar","zh-cn"]
77
  supported_languages=config.languages
78
 
79
  def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_cleanup, no_lang_auto_detect, agree,):
@@ -91,11 +91,11 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_clea
91
 
92
  language_predicted=langid.classify(prompt)[0].strip() # strip need as there is space at end!
93
 
94
- # tts expects chinese as zh-cn
95
- if language_predicted == "zh":
96
  #we use zh-cn
97
- language_predicted = "zh-cn"
98
- print(f"Detected language:{language_predicted}, Chosen language:{language}")
99
 
100
  # After text character length 15 trigger language detection
101
  if len(prompt)>15:
@@ -151,8 +151,7 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_clea
151
  out_filename = speaker_wav + str(uuid.uuid4()) + ".wav" #ffmpeg to know output format
152
 
153
  # we will use newer ffmpeg as that has the afftdn denoise filter
154
- #shell_command = f"./ffmpeg -y -i {speaker_wav} -af {lowpass_highpass}{trim_silence} {out_filename}".split(" ")
155
- shell_command = f"ffmpeg -y -i {speaker_wav} -af {lowpass_highpass}{trim_silence} {out_filename}".split(" ")
156
 
157
  command_result = subprocess.run([item for item in shell_command], capture_output=False,text=True, check=True)
158
  speaker_wav=out_filename
@@ -163,15 +162,15 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_clea
163
  else:
164
  speaker_wav=speaker_wav
165
 
166
- if len(prompt)<2:
167
  gr.Warning("Please give a longer prompt text")
168
  return (
169
  None,
170
  None,
171
  None,
172
  )
173
- if len(prompt)>200:
174
- gr.Warning("Text length limited to 200 characters for this demo, please try shorter text. You can clone this space and edit code for your own usage")
175
  return (
176
  None,
177
  None,
@@ -253,7 +252,7 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_clea
253
  print(error_data)
254
  print(speaker_wav)
255
  write_io = StringIO()
256
- csv.writer(write_io).writerows(error_data)
257
  csv_upload= write_io.getvalue().encode()
258
 
259
  filename = error_time+"_xtts-stream_" + str(uuid.uuid4()) +".csv"
@@ -278,7 +277,7 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_clea
278
  )
279
 
280
  # HF Space specific: this error is unrecoverable, so the space needs to be restarted
281
- #api.restart_space(repo_id=repo_id)
282
  else:
283
  if "Failed to decode" in str(e):
284
  print("Speaker encoding error", str(e))
@@ -342,9 +341,7 @@ Leave a star on the Github <a href="https://github.com/coqui-ai/TTS">🐸TTS</a>
342
 
343
  </p>
344
  <p>Language Selectors:
345
- Arabic: ar, Brazilian Portuguese: pt , Chinese: zh-cn, Czech: cs,<br/>
346
- Dutch: nl, English: en, French: fr, Italian: it, Polish: pl,<br/>
347
- Russian: ru, Spanish: es, Turkish: tr, Japanese: ja <br/>
348
  </p>
349
  <p> Notice: Autoplay may not work on mobile, if you see black waveform image on mobile click it your Audio is there</p>
350
  <img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=8946ef36-c454-4a8e-a9c9-8a8dd735fabd" />
@@ -497,30 +494,19 @@ gr.Interface(
497
  inputs=[
498
  gr.Textbox(
499
  label="Text Prompt",
500
- info="One or two sentences at a time is better. Up to 200 text characters.",
501
  value="Hi there, I'm your new voice clone. Try your best to upload quality audio",
502
  ),
503
  gr.Dropdown(
504
  label="Language",
505
  info="Select an output language for the synthesised speech",
506
  choices=[
507
- "en",
508
- "es",
509
- "fr",
510
- "de",
511
- "it",
512
  "pt",
513
- "pl",
514
- "tr",
515
- "ru",
516
- "nl",
517
- "cs",
518
- "ar",
519
- "zh-cn",
520
- "ja"
521
  ],
522
  max_choices=1,
523
- value="en",
524
  ),
525
  gr.Audio(
526
  label="Reference Audio",
@@ -561,4 +547,4 @@ gr.Interface(
561
  article=article,
562
  examples=examples,
563
  cache_examples=False,
564
- ).queue().launch(debug=True,show_api=True)
 
29
  from TTS.tts.models.xtts import Xtts
30
  from TTS.utils.generic_utils import get_user_data_dir
31
 
32
+ HF_TOKEN = os.environ.get("HF_TOKEN")
33
 
34
  from huggingface_hub import HfApi
35
 
36
  # will use api to restart space on an unrecoverable error
37
+ api = HfApi(token=HF_TOKEN)
38
  repo_id = "coqui/xtts-streaming"
39
 
40
+ # Use newer ffmpeg binary for Ubuntu20 to use denoising for microphone input
41
+ print("Export newer ffmpeg binary for denoise filter")
42
+ ZipFile("ffmpeg.zip").extractall()
43
+ print("Make ffmpeg binary executable")
44
+ st = os.stat('ffmpeg')
45
+ os.chmod('ffmpeg', st.st_mode | stat.S_IEXEC)
46
 
47
  # This will trigger downloading model
48
  print("Downloading if not downloaded Coqui XTTS V1.1")
 
73
  DEVICE_ASSERT_PROMPT=None
74
  DEVICE_ASSERT_LANG=None
75
 
76
+ #supported_languages=["pt"]
77
  supported_languages=config.languages
78
 
79
  def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_cleanup, no_lang_auto_detect, agree,):
 
91
 
92
  language_predicted=langid.classify(prompt)[0].strip() # strip need as there is space at end!
93
 
94
+ # tts expects
95
+ if language_predicted
96
  #we use zh-cn
97
+ language_predicted
98
+ print(f"Detected language:{language_predicted}n language:{language}")
99
 
100
  # After text character length 15 trigger language detection
101
  if len(prompt)>15:
 
151
  out_filename = speaker_wav + str(uuid.uuid4()) + ".wav" #ffmpeg to know output format
152
 
153
  # we will use newer ffmpeg as that has the afftdn denoise filter
154
+ shell_command = f"./ffmpeg -y -i {speaker_wav} -af {lowpass_highpass}{trim_silence} {out_filename}".split(" ")
 
155
 
156
  command_result = subprocess.run([item for item in shell_command], capture_output=False,text=True, check=True)
157
  speaker_wav=out_filename
 
162
  else:
163
  speaker_wav=speaker_wav
164
 
165
+ if len(prompt)<1:
166
  gr.Warning("Please give a longer prompt text")
167
  return (
168
  None,
169
  None,
170
  None,
171
  )
172
+ if len(prompt)>3000:
173
+ gr.Warning("Text length limited to characters for this demo, please try shorter text. You can clone this space and edit code for your own usage")
174
  return (
175
  None,
176
  None,
 
252
  print(error_data)
253
  print(speaker_wav)
254
  write_io = StringIO()
255
+ csv.writer(write_io).writerows([error_data])
256
  csv_upload= write_io.getvalue().encode()
257
 
258
  filename = error_time+"_xtts-stream_" + str(uuid.uuid4()) +".csv"
 
277
  )
278
 
279
  # HF Space specific: this error is unrecoverable, so the space needs to be restarted
280
+ api.restart_space(repo_id=repo_id)
281
  else:
282
  if "Failed to decode" in str(e):
283
  print("Speaker encoding error", str(e))
 
341
 
342
  </p>
343
  <p>Language Selectors:
344
+ Arabic: ar, Brazilian Portuguese: pt
 
 
345
  </p>
346
  <p> Notice: Autoplay may not work on mobile, if you see black waveform image on mobile click it your Audio is there</p>
347
  <img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=8946ef36-c454-4a8e-a9c9-8a8dd735fabd" />
 
494
  inputs=[
495
  gr.Textbox(
496
  label="Text Prompt",
497
+ info="One or two sentences at a time is better. Up to text characters.",
498
  value="Hi there, I'm your new voice clone. Try your best to upload quality audio",
499
  ),
500
  gr.Dropdown(
501
  label="Language",
502
  info="Select an output language for the synthesised speech",
503
  choices=[
504
+
 
 
 
 
505
  "pt",
506
+
 
 
 
 
 
 
 
507
  ],
508
  max_choices=1,
509
+ value="pt",
510
  ),
511
  gr.Audio(
512
  label="Reference Audio",
 
547
  article=article,
548
  examples=examples,
549
  cache_examples=False,
550
+ ).queue().launch(debug=True,show_api=True)
gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ examples/female.wav filter=lfs diff=lfs merge=lfs -text
gitattributes (1) ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ examples/female.wav filter=lfs diff=lfs merge=lfs -text