r3gm committed on
Commit
62b1e34
·
1 Parent(s): 0b8b7c2

delete v0.1.0

Browse files
.github/workflows/main.yml DELETED
@@ -1,23 +0,0 @@
1
- name: Sync to Hugging Face hub
2
- on:
3
- push:
4
- branches: [main]
5
-
6
- # to run this workflow manually from the Actions tab
7
- workflow_dispatch:
8
-
9
- jobs:
10
- sync-to-hub:
11
- runs-on: ubuntu-latest
12
- steps:
13
- - uses: actions/checkout@v2
14
- with:
15
- fetch-depth: 0
16
- - name: Add remote
17
- env:
18
- HF: ${{ secrets.HF }}
19
- run: git remote add space https://r3gm:$HF@huggingface.co/spaces/r3gm/SoniTranslate_translate_audio_of_a_video_content
20
- - name: Push to hub
21
- env:
22
- HF: ${{ secrets.HF }}
23
- run: git push --force https://r3gm:$HF@huggingface.co/spaces/r3gm/SoniTranslate_translate_audio_of_a_video_content main
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py DELETED
@@ -1,1064 +0,0 @@
1
- #%cd SoniTranslate
2
- # vc infer pipe 161 np.int
3
- import os
4
-
5
- os.system("pip install -r requirements_colab.txt")
6
- os.system("pip install -r requirements_extra.txt")
7
-
8
- os.system('apt install git-lfs')
9
- os.system('git lfs install')
10
- os.system('apt -y install -qq aria2')
11
- os.system('aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt -d . -o hubert_base.pt')
12
- os.system('wget https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/rmvpe.pt')
13
-
14
- import numpy as np
15
- import gradio as gr
16
- import whisperx
17
- from whisperx.utils import LANGUAGES as LANG_TRANSCRIPT
18
- from whisperx.alignment import DEFAULT_ALIGN_MODELS_TORCH as DAMT, DEFAULT_ALIGN_MODELS_HF as DAMHF
19
- from IPython.utils import capture
20
- import torch
21
- from gtts import gTTS
22
- import librosa
23
- import edge_tts
24
- import asyncio
25
- import gc
26
- from pydub import AudioSegment
27
- from tqdm import tqdm
28
- from deep_translator import GoogleTranslator
29
- import os
30
- from soni_translate.audio_segments import create_translated_audio
31
- from soni_translate.text_to_speech import make_voice_gradio
32
- from soni_translate.translate_segments import translate_text
33
- import time
34
- import shutil
35
- from urllib.parse import unquote
36
- import zipfile
37
- import rarfile
38
- import logging
39
- logging.getLogger("numba").setLevel(logging.WARNING)
40
- logging.getLogger("httpx").setLevel(logging.WARNING)
41
- logging.getLogger("markdown_it").setLevel(logging.WARNING)
42
-
43
-
44
-
45
- title = "<center><strong><font size='7'>📽️ SoniTranslate 🈷️</font></strong></center>"
46
-
47
- news = """ ## 📖 News
48
- 🔥 2023/07/26: New UI and add mix options.
49
-
50
- 🔥 2023/07/27: Fix some bug processing the video and audio.
51
-
52
- 🔥 2023/08/01: Add options for use RVC models.
53
-
54
- 🔥 2023/08/02: Added support for Arabic, Czech, Danish, Finnish, Greek, Hebrew, Hungarian, Korean, Persian, Polish, Russian, Turkish, Urdu, Hindi, and Vietnamese languages. 🌐
55
-
56
- 🔥 2023/08/03: Changed default options and added directory view of downloads..
57
- """
58
-
59
- description = """
60
- ### πŸŽ₯ **Translate videos easily with SoniTranslate!** πŸ“½οΈ
61
-
62
- Upload a video or provide a video link. Limitation: 10 seconds for CPU, but no restrictions with a GPU.
63
-
64
- For faster results and no duration limits, try the Colab notebook with a GPU:
65
- [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://github.com/R3gm/SoniTranslate/blob/main/SoniTranslate_Colab.ipynb)
66
-
67
- πŸ“½οΈ **This a demo of SoniTranslate; GitHub repository: [SoniTranslate](https://github.com/R3gm/SoniTranslate)!**
68
-
69
- See the tab labeled `Help` for instructions on how to use it. Let's start having fun with video translation! πŸš€πŸŽ‰
70
- """
71
-
72
-
73
-
74
- tutorial = """
75
-
76
- # πŸ”° **Instructions for use:**
77
-
78
- 1. πŸ“€ **Upload a video** on the first tab or 🌐 **use a video link** on the second tab.
79
-
80
- 2. 🌍 Choose the language in which you want to **translate the video**.
81
-
82
- 3. πŸ—£οΈ Specify the **number of people speaking** in the video and **assign each one a text-to-speech voice** suitable for the translation language.
83
-
84
- 4. πŸš€ Press the '**Translate**' button to obtain the results.
85
-
86
-
87
-
88
-
89
- # 🎀 (Optional) How to Use RVC and RVC2 Voices 🎢
90
-
91
-
92
- The goal is to apply a RVC (Retrieval-based Voice Conversion) to the generated TTS (Text-to-Speech) πŸŽ™οΈ
93
-
94
- 1. In the `Custom Voice RVC` tab, download the models you need πŸ“₯ You can use links from Hugging Face and Google Drive in formats like zip, pth, or index. You can also download complete HF space repositories, but this option is not very stable πŸ˜•
95
-
96
- 2. Now, go to `Replace voice: TTS to RVC` and check the `enable` box βœ… After this, you can choose the models you want to apply to each TTS speaker πŸ‘©β€πŸ¦°πŸ‘¨β€πŸ¦±πŸ‘©β€πŸ¦³πŸ‘¨β€πŸ¦²
97
-
98
- 3. Adjust the F0 method that will be applied to all RVCs πŸŽ›οΈ
99
-
100
- 4. Press `APPLY CONFIGURATION` to apply the changes you made πŸ”„
101
-
102
- 5. Go back to the video translation tab and click on 'Translate' ▢️ Now, the translation will be done applying the RVCs πŸ—£οΈ
103
-
104
- Tip: You can use `Test RVC` to experiment and find the best TTS or configurations to apply to the RVC πŸ§ͺπŸ”
105
-
106
- """
107
-
108
-
109
-
110
- # Check GPU
111
# Pick the runtime device and the matching UI defaults in one step:
# (device, selectable compute types, default compute type, default whisper model).
(
    device,
    list_compute_type,
    compute_type_default,
    whisper_model_default,
) = (
    ("cuda", ['float16', 'float32'], 'float16', 'large-v2')
    if torch.cuda.is_available()
    else ("cpu", ['float32'], 'float32', 'medium')
)
print('Working in: ', device)
122
-
123
- list_tts = ['af-ZA-AdriNeural-Female', 'af-ZA-WillemNeural-Male', 'am-ET-AmehaNeural-Male', 'am-ET-MekdesNeural-Female', 'ar-AE-FatimaNeural-Female', 'ar-AE-HamdanNeural-Male', 'ar-BH-AliNeural-Male', 'ar-BH-LailaNeural-Female', 'ar-DZ-AminaNeural-Female', 'ar-DZ-IsmaelNeural-Male', 'ar-EG-SalmaNeural-Female', 'ar-EG-ShakirNeural-Male', 'ar-IQ-BasselNeural-Male', 'ar-IQ-RanaNeural-Female', 'ar-JO-SanaNeural-Female', 'ar-JO-TaimNeural-Male', 'ar-KW-FahedNeural-Male', 'ar-KW-NouraNeural-Female', 'ar-LB-LaylaNeural-Female', 'ar-LB-RamiNeural-Male', 'ar-LY-ImanNeural-Female', 'ar-LY-OmarNeural-Male', 'ar-MA-JamalNeural-Male', 'ar-MA-MounaNeural-Female', 'ar-OM-AbdullahNeural-Male', 'ar-OM-AyshaNeural-Female', 'ar-QA-AmalNeural-Female', 'ar-QA-MoazNeural-Male', 'ar-SA-HamedNeural-Male', 'ar-SA-ZariyahNeural-Female', 'ar-SY-AmanyNeural-Female', 'ar-SY-LaithNeural-Male', 'ar-TN-HediNeural-Male', 'ar-TN-ReemNeural-Female', 'ar-YE-MaryamNeural-Female', 'ar-YE-SalehNeural-Male', 'az-AZ-BabekNeural-Male', 'az-AZ-BanuNeural-Female', 'bg-BG-BorislavNeural-Male', 'bg-BG-KalinaNeural-Female', 'bn-BD-NabanitaNeural-Female', 'bn-BD-PradeepNeural-Male', 'bn-IN-BashkarNeural-Male', 'bn-IN-TanishaaNeural-Female', 'bs-BA-GoranNeural-Male', 'bs-BA-VesnaNeural-Female', 'ca-ES-EnricNeural-Male', 'ca-ES-JoanaNeural-Female', 'cs-CZ-AntoninNeural-Male', 'cs-CZ-VlastaNeural-Female', 'cy-GB-AledNeural-Male', 'cy-GB-NiaNeural-Female', 'da-DK-ChristelNeural-Female', 'da-DK-JeppeNeural-Male', 'de-AT-IngridNeural-Female', 'de-AT-JonasNeural-Male', 'de-CH-JanNeural-Male', 'de-CH-LeniNeural-Female', 'de-DE-AmalaNeural-Female', 'de-DE-ConradNeural-Male', 'de-DE-KatjaNeural-Female', 'de-DE-KillianNeural-Male', 'el-GR-AthinaNeural-Female', 'el-GR-NestorasNeural-Male', 'en-AU-NatashaNeural-Female', 'en-AU-WilliamNeural-Male', 'en-CA-ClaraNeural-Female', 'en-CA-LiamNeural-Male', 'en-GB-LibbyNeural-Female', 'en-GB-MaisieNeural-Female', 'en-GB-RyanNeural-Male', 'en-GB-SoniaNeural-Female', 
'en-GB-ThomasNeural-Male', 'en-HK-SamNeural-Male', 'en-HK-YanNeural-Female', 'en-IE-ConnorNeural-Male', 'en-IE-EmilyNeural-Female', 'en-IN-NeerjaExpressiveNeural-Female', 'en-IN-NeerjaNeural-Female', 'en-IN-PrabhatNeural-Male', 'en-KE-AsiliaNeural-Female', 'en-KE-ChilembaNeural-Male', 'en-NG-AbeoNeural-Male', 'en-NG-EzinneNeural-Female', 'en-NZ-MitchellNeural-Male', 'en-NZ-MollyNeural-Female', 'en-PH-JamesNeural-Male', 'en-PH-RosaNeural-Female', 'en-SG-LunaNeural-Female', 'en-SG-WayneNeural-Male', 'en-TZ-ElimuNeural-Male', 'en-TZ-ImaniNeural-Female', 'en-US-AnaNeural-Female', 'en-US-AriaNeural-Female', 'en-US-ChristopherNeural-Male', 'en-US-EricNeural-Male', 'en-US-GuyNeural-Male', 'en-US-JennyNeural-Female', 'en-US-MichelleNeural-Female', 'en-US-RogerNeural-Male', 'en-US-SteffanNeural-Male', 'en-ZA-LeahNeural-Female', 'en-ZA-LukeNeural-Male', 'es-AR-ElenaNeural-Female', 'es-AR-TomasNeural-Male', 'es-BO-MarceloNeural-Male', 'es-BO-SofiaNeural-Female', 'es-CL-CatalinaNeural-Female', 'es-CL-LorenzoNeural-Male', 'es-CO-GonzaloNeural-Male', 'es-CO-SalomeNeural-Female', 'es-CR-JuanNeural-Male', 'es-CR-MariaNeural-Female', 'es-CU-BelkysNeural-Female', 'es-CU-ManuelNeural-Male', 'es-DO-EmilioNeural-Male', 'es-DO-RamonaNeural-Female', 'es-EC-AndreaNeural-Female', 'es-EC-LuisNeural-Male', 'es-ES-AlvaroNeural-Male', 'es-ES-ElviraNeural-Female', 'es-GQ-JavierNeural-Male', 'es-GQ-TeresaNeural-Female', 'es-GT-AndresNeural-Male', 'es-GT-MartaNeural-Female', 'es-HN-CarlosNeural-Male', 'es-HN-KarlaNeural-Female', 'es-MX-DaliaNeural-Female', 'es-MX-JorgeNeural-Male', 'es-NI-FedericoNeural-Male', 'es-NI-YolandaNeural-Female', 'es-PA-MargaritaNeural-Female', 'es-PA-RobertoNeural-Male', 'es-PE-AlexNeural-Male', 'es-PE-CamilaNeural-Female', 'es-PR-KarinaNeural-Female', 'es-PR-VictorNeural-Male', 'es-PY-MarioNeural-Male', 'es-PY-TaniaNeural-Female', 'es-SV-LorenaNeural-Female', 'es-SV-RodrigoNeural-Male', 'es-US-AlonsoNeural-Male', 'es-US-PalomaNeural-Female', 'es-UY-MateoNeural-Male', 
'es-UY-ValentinaNeural-Female', 'es-VE-PaolaNeural-Female', 'es-VE-SebastianNeural-Male', 'et-EE-AnuNeural-Female', 'et-EE-KertNeural-Male', 'fa-IR-DilaraNeural-Female', 'fa-IR-FaridNeural-Male', 'fi-FI-HarriNeural-Male', 'fi-FI-NooraNeural-Female', 'fil-PH-AngeloNeural-Male', 'fil-PH-BlessicaNeural-Female', 'fr-BE-CharlineNeural-Female', 'fr-BE-GerardNeural-Male', 'fr-CA-AntoineNeural-Male', 'fr-CA-JeanNeural-Male', 'fr-CA-SylvieNeural-Female', 'fr-CH-ArianeNeural-Female', 'fr-CH-FabriceNeural-Male', 'fr-FR-DeniseNeural-Female', 'fr-FR-EloiseNeural-Female', 'fr-FR-HenriNeural-Male', 'ga-IE-ColmNeural-Male', 'ga-IE-OrlaNeural-Female', 'gl-ES-RoiNeural-Male', 'gl-ES-SabelaNeural-Female', 'gu-IN-DhwaniNeural-Female', 'gu-IN-NiranjanNeural-Male', 'he-IL-AvriNeural-Male', 'he-IL-HilaNeural-Female', 'hi-IN-MadhurNeural-Male', 'hi-IN-SwaraNeural-Female', 'hr-HR-GabrijelaNeural-Female', 'hr-HR-SreckoNeural-Male', 'hu-HU-NoemiNeural-Female', 'hu-HU-TamasNeural-Male', 'id-ID-ArdiNeural-Male', 'id-ID-GadisNeural-Female', 'is-IS-GudrunNeural-Female', 'is-IS-GunnarNeural-Male', 'it-IT-DiegoNeural-Male', 'it-IT-ElsaNeural-Female', 'it-IT-IsabellaNeural-Female', 'ja-JP-KeitaNeural-Male', 'ja-JP-NanamiNeural-Female', 'jv-ID-DimasNeural-Male', 'jv-ID-SitiNeural-Female', 'ka-GE-EkaNeural-Female', 'ka-GE-GiorgiNeural-Male', 'kk-KZ-AigulNeural-Female', 'kk-KZ-DauletNeural-Male', 'km-KH-PisethNeural-Male', 'km-KH-SreymomNeural-Female', 'kn-IN-GaganNeural-Male', 'kn-IN-SapnaNeural-Female', 'ko-KR-InJoonNeural-Male', 'ko-KR-SunHiNeural-Female', 'lo-LA-ChanthavongNeural-Male', 'lo-LA-KeomanyNeural-Female', 'lt-LT-LeonasNeural-Male', 'lt-LT-OnaNeural-Female', 'lv-LV-EveritaNeural-Female', 'lv-LV-NilsNeural-Male', 'mk-MK-AleksandarNeural-Male', 'mk-MK-MarijaNeural-Female', 'ml-IN-MidhunNeural-Male', 'ml-IN-SobhanaNeural-Female', 'mn-MN-BataaNeural-Male', 'mn-MN-YesuiNeural-Female', 'mr-IN-AarohiNeural-Female', 'mr-IN-ManoharNeural-Male', 'ms-MY-OsmanNeural-Male', 
'ms-MY-YasminNeural-Female', 'mt-MT-GraceNeural-Female', 'mt-MT-JosephNeural-Male', 'my-MM-NilarNeural-Female', 'my-MM-ThihaNeural-Male', 'nb-NO-FinnNeural-Male', 'nb-NO-PernilleNeural-Female', 'ne-NP-HemkalaNeural-Female', 'ne-NP-SagarNeural-Male', 'nl-BE-ArnaudNeural-Male', 'nl-BE-DenaNeural-Female', 'nl-NL-ColetteNeural-Female', 'nl-NL-FennaNeural-Female', 'nl-NL-MaartenNeural-Male', 'pl-PL-MarekNeural-Male', 'pl-PL-ZofiaNeural-Female', 'ps-AF-GulNawazNeural-Male', 'ps-AF-LatifaNeural-Female', 'pt-BR-AntonioNeural-Male', 'pt-BR-FranciscaNeural-Female', 'pt-PT-DuarteNeural-Male', 'pt-PT-RaquelNeural-Female', 'ro-RO-AlinaNeural-Female', 'ro-RO-EmilNeural-Male', 'ru-RU-DmitryNeural-Male', 'ru-RU-SvetlanaNeural-Female', 'si-LK-SameeraNeural-Male', 'si-LK-ThiliniNeural-Female', 'sk-SK-LukasNeural-Male', 'sk-SK-ViktoriaNeural-Female', 'sl-SI-PetraNeural-Female', 'sl-SI-RokNeural-Male', 'so-SO-MuuseNeural-Male', 'so-SO-UbaxNeural-Female', 'sq-AL-AnilaNeural-Female', 'sq-AL-IlirNeural-Male', 'sr-RS-NicholasNeural-Male', 'sr-RS-SophieNeural-Female', 'su-ID-JajangNeural-Male', 'su-ID-TutiNeural-Female', 'sv-SE-MattiasNeural-Male', 'sv-SE-SofieNeural-Female', 'sw-KE-RafikiNeural-Male', 'sw-KE-ZuriNeural-Female', 'sw-TZ-DaudiNeural-Male', 'sw-TZ-RehemaNeural-Female', 'ta-IN-PallaviNeural-Female', 'ta-IN-ValluvarNeural-Male', 'ta-LK-KumarNeural-Male', 'ta-LK-SaranyaNeural-Female', 'ta-MY-KaniNeural-Female', 'ta-MY-SuryaNeural-Male', 'ta-SG-AnbuNeural-Male', 'ta-SG-VenbaNeural-Female', 'te-IN-MohanNeural-Male', 'te-IN-ShrutiNeural-Female', 'th-TH-NiwatNeural-Male', 'th-TH-PremwadeeNeural-Female', 'tr-TR-AhmetNeural-Male', 'tr-TR-EmelNeural-Female', 'uk-UA-OstapNeural-Male', 'uk-UA-PolinaNeural-Female', 'ur-IN-GulNeural-Female', 'ur-IN-SalmanNeural-Male', 'ur-PK-AsadNeural-Male', 'ur-PK-UzmaNeural-Female', 'uz-UZ-MadinaNeural-Female', 'uz-UZ-SardorNeural-Male', 'vi-VN-HoaiMyNeural-Female', 'vi-VN-NamMinhNeural-Male', 'zh-CN-XiaoxiaoNeural-Female', 'zh-CN-XiaoyiNeural-Female', 
'zh-CN-YunjianNeural-Male', 'zh-CN-YunxiNeural-Male', 'zh-CN-YunxiaNeural-Male', 'zh-CN-YunyangNeural-Male', 'zh-CN-liaoning-XiaobeiNeural-Female', 'zh-CN-shaanxi-XiaoniNeural-Female']
124
-
125
- ### voices
126
# Create the working directories used for downloaded archives ("downloads"),
# index files ("logs") and model weights ("weights").
# os.makedirs(exist_ok=True) is silent when a directory already exists, so no
# output capture is needed and no shell is spawned.
for _working_dir in ("downloads", "logs", "weights"):
    os.makedirs(_working_dir, exist_ok=True)
del _working_dir
132
-
133
-
134
def print_tree_directory(root_dir, indent=''):
    """Print the contents of ``root_dir`` as an ASCII-art tree.

    Entries are listed in sorted order. Sub-directories are walked
    recursively; ``.zip`` files are opened and their member names are listed
    underneath the archive entry.

    Args:
        root_dir: Directory to list.
        indent: Prefix printed before every line (used by the recursion).

    Returns:
        None. Everything is written to stdout.
    """
    # os.listdir() only works on directories; a missing path or a plain file
    # is reported instead of raising.
    if not os.path.isdir(root_dir):
        print(f"{indent}Invalid directory or file: {root_dir}")
        return

    items = sorted(os.listdir(root_dir))
    last_index = len(items) - 1

    for index, item in enumerate(items):
        item_path = os.path.join(root_dir, item)
        is_last_item = index == last_index
        branch = '└──' if is_last_item else '├──'
        child_indent = '    ' if is_last_item else '│   '

        if os.path.isfile(item_path) and item_path.endswith('.zip'):
            print(f"{indent}{branch} {item} (zip file)")
            try:
                with zipfile.ZipFile(item_path, 'r') as zip_file:
                    zip_contents = sorted(zip_file.namelist())
            except zipfile.BadZipFile:
                # A broken download must not abort the whole listing.
                zip_contents = ["(unreadable zip archive)"]
            for zip_item in zip_contents:
                print(f"{indent}{child_indent}{zip_item}")
        else:
            print(f"{indent}{branch} {item}")

            if os.path.isdir(item_path):
                print_tree_directory(item_path, indent + child_indent)
157
-
158
-
159
def upload_model_list():
    """Collect the voice models and index files currently on disk.

    Returns:
        A ``(models, index_paths)`` tuple: ``models`` holds the ``.pth`` file
        names found in ``weights/`` and ``index_paths`` holds the ``.index``
        files found in ``logs/``, each prefixed with ``"logs/"``. Both lists
        are sorted; a missing directory yields an empty list.
    """
    def _names_with_suffix(folder, suffix):
        # The folders are created on a best-effort basis elsewhere, so treat
        # a missing folder as "nothing available" rather than crashing.
        try:
            names = os.listdir(folder)
        except FileNotFoundError:
            return []
        return sorted(name for name in names if name.endswith(suffix))

    models = _names_with_suffix("weights", ".pth")
    index_paths = ["logs/" + name for name in _names_with_suffix("logs", ".index")]

    print(models, index_paths)
    return models, index_paths
174
-
175
def manual_download(url, dst):
    """Download ``url`` into ``dst`` with the tool that matches the link type.

    * Google Drive links are fetched with ``gdown`` (``--folder`` for folder
      links).
    * Hugging Face ``/blob/`` or ``/resolve/`` file links are fetched with
      ``wget``; any other Hugging Face link is cloned with ``git`` into
      ``dst + 'repo/'``.
    * Any other ``http``/``magnet`` link is fetched with ``aria2c``.

    Links that match none of these are ignored. The external tool's exit
    status is not checked (best effort), matching the previous behaviour.

    Args:
        url: Link supplied by the user.
        dst: Destination directory, e.g. ``"downloads/"``.
    """
    # Local import: this module does not import subprocess at file level.
    import subprocess

    # The link comes straight from a text box. It is passed as a single argv
    # element (no shell), and anything that would be parsed as an option is
    # rejected.
    if url.startswith('-'):
        print(f"Ignoring invalid link: {url}")
        return

    if 'drive.google' in url:
        print("Drive link")
        if 'folders' in url:
            print("folder")
            command = ['gdown', '--folder', url, '-O', dst, '--fuzzy', '-c']
        else:
            print("single")
            command = ['gdown', url, '-O', dst, '--fuzzy', '-c']
    elif 'huggingface' in url:
        print("HuggingFace link")
        if '/blob/' in url or '/resolve/' in url:
            # "/blob/" is the web view; "/resolve/" serves the raw file.
            url = url.replace('/blob/', '/resolve/')
            command = ['wget', '-P', dst, url]
        else:
            command = ['git', 'clone', url, dst + 'repo/']
    elif 'http' in url or 'magnet' in url:
        command = [
            'aria2c', '--optimize-concurrent-downloads',
            '--console-log-level=error', '--summary-interval=10',
            '-j5', '-x16', '-s16', '-k1M', '-c', '-d', dst, '-Z', url,
        ]
    else:
        return

    try:
        subprocess.run(command, check=False)
    except OSError as err:
        # e.g. the download tool is not installed
        print(f"Could not run {command[0]}: {err}")
200
-
201
-
202
def download_list(text_downloads):
    """Download every link in a comma-separated string and install the models.

    Each link is fetched into ``downloads/`` with ``manual_download``, the
    resulting tree is printed, archives are unpacked and ``.pth``/``.index``
    files are moved into place by ``select_zip_and_rar_files``, and the
    temporary ``downloads/repo`` clone is removed.

    Args:
        text_downloads: Comma-separated links.

    Returns:
        ``"Downloaded = [...]"`` listing the models now available, or
        ``'No valid link'`` when the input is not a string or has no links.
    """
    try:
        urls = [elem.strip() for elem in text_downloads.split(',')]
    except AttributeError:
        # not a string (e.g. None from an empty UI field)
        return 'No valid link'

    urls = [url for url in urls if url]
    if not urls:
        return 'No valid link'

    for folder in ("downloads", "logs", "weights"):
        os.makedirs(folder, exist_ok=True)

    path_download = "downloads/"
    for url in urls:
        manual_download(url, path_download)

    # Tree
    print('####################################')
    print_tree_directory("downloads", indent='')
    print('####################################')

    # Place files
    select_zip_and_rar_files("downloads/")

    models, _ = upload_model_list()
    shutil.rmtree("downloads/repo", ignore_errors=True)

    return f"Downloaded = {models}"
227
-
228
-
229
def select_zip_and_rar_files(directory_path="downloads/"):
    """Unpack downloaded archives and move model files into place.

    Every ``.zip`` and ``.rar`` file directly inside ``directory_path`` is
    extracted there. Afterwards all ``.index`` files found anywhere below
    ``directory_path`` are moved to ``logs/`` and all ``.pth`` files to
    ``weights/``.

    Args:
        directory_path: Folder containing the downloaded files.

    Returns:
        The string ``'Download complete'``.
    """
    entries = os.listdir(directory_path)
    zip_files = [name for name in entries if name.endswith(".zip")]
    rar_files = [name for name in entries if name.endswith(".rar")]

    # Extract. A single broken archive is reported and skipped so the other
    # downloads are still installed.
    for file_name in zip_files:
        file_path = os.path.join(directory_path, file_name)
        try:
            with zipfile.ZipFile(file_path, 'r') as zip_ref:
                zip_ref.extractall(directory_path)
        except zipfile.BadZipFile as err:
            print(f"Skipping invalid zip archive {file_path}: {err}")

    for file_name in rar_files:
        file_path = os.path.join(directory_path, file_name)
        try:
            with rarfile.RarFile(file_path, 'r') as rar_ref:
                rar_ref.extractall(directory_path)
        except rarfile.Error as err:
            print(f"Skipping invalid rar archive {file_path}: {err}")

    def move_files_with_extension(src_dir, extension, destination_dir):
        # shutil.move needs the target folder to exist.
        os.makedirs(destination_dir, exist_ok=True)
        for root, _, files in os.walk(src_dir):
            for file_name in files:
                if file_name.endswith(extension):
                    source_file = os.path.join(root, file_name)
                    destination = os.path.join(destination_dir, file_name)
                    shutil.move(source_file, destination)

    move_files_with_extension(directory_path, ".index", "logs/")
    move_files_with_extension(directory_path, ".pth", "weights/")

    return 'Download complete'
264
-
265
def custom_model_voice_enable(enable_custom_voice):
    """Record in the ``VOICES_MODELS`` environment variable whether custom
    voices are switched on ('ENABLE') or off ('DISABLE')."""
    os.environ["VOICES_MODELS"] = 'ENABLE' if enable_custom_voice else 'DISABLE'
270
-
271
-
272
- models, index_paths = upload_model_list()
273
-
274
- f0_methods_voice = ["pm", "harvest", "crepe", "rmvpe"]
275
-
276
-
277
- from voice_main import ClassVoices
278
- voices = ClassVoices()
279
-
280
- '''
281
- def translate_from_video(video, WHISPER_MODEL_SIZE, batch_size, compute_type,
282
- TRANSLATE_AUDIO_TO, min_speakers, max_speakers,
283
- tts_voice00, tts_voice01,tts_voice02,tts_voice03,tts_voice04,tts_voice05):
284
-
285
- YOUR_HF_TOKEN = os.getenv("My_hf_token")
286
-
287
- create_translated_audio(result_diarize, audio_files, Output_name_file)
288
-
289
- os.system("rm audio_dub_stereo.wav")
290
- os.system("ffmpeg -i audio_dub_solo.wav -ac 1 audio_dub_stereo.wav")
291
-
292
- os.system(f"rm {mix_audio}")
293
- os.system(f'ffmpeg -y -i audio.wav -i audio_dub_stereo.wav -filter_complex "[0:0]volume=0.15[a];[1:0]volume=1.90[b];[a][b]amix=inputs=2:duration=longest" -c:a libmp3lame {mix_audio}')
294
-
295
- os.system(f"rm {video_output}")
296
- os.system(f"ffmpeg -i {OutputFile} -i {mix_audio} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {video_output}")
297
-
298
- return video_output
299
- '''
300
-
301
def translate_from_video(
    video,
    YOUR_HF_TOKEN,
    preview=False,
    WHISPER_MODEL_SIZE="large-v1",
    batch_size=16,
    compute_type="float16",
    SOURCE_LANGUAGE="Automatic detection",
    TRANSLATE_AUDIO_TO="English (en)",
    min_speakers=1,
    max_speakers=2,
    tts_voice00="en-AU-WilliamNeural-Male",
    tts_voice01="en-CA-ClaraNeural-Female",
    tts_voice02="en-GB-ThomasNeural-Male",
    tts_voice03="en-GB-SoniaNeural-Female",
    tts_voice04="en-NZ-MitchellNeural-Male",
    tts_voice05="en-GB-MaisieNeural-Female",
    video_output="video_dub.mp4",
    AUDIO_MIX_METHOD='Adjusting volumes and mixing audio',
    progress=gr.Progress(),
):
    """Dub a video into another language and return the output file name.

    Pipeline: obtain ``Video.mp4`` and ``audio.wav`` (from a local file or by
    downloading ``video`` with yt-dlp), transcribe with whisperx, align,
    diarize, translate the segments, synthesize one TTS clip per segment,
    adjust each clip's tempo to the segment length, optionally apply the
    custom voices, mix the dubbed track with the original audio and mux it
    onto the video.

    Args:
        video: Path of a local file, an object with a ``.name`` path, or a
            media link.
        YOUR_HF_TOKEN: Hugging Face token for the diarization pipeline. When
            empty, the ``YOUR_HF_TOKEN`` environment variable is used.
        preview: When true only a 10 second excerpt is processed.
        WHISPER_MODEL_SIZE, batch_size, compute_type: whisperx settings
            (``compute_type`` is forced to ``float32`` on CPU).
        SOURCE_LANGUAGE, TRANSLATE_AUDIO_TO: Keys of the ``LANGUAGES`` table
            below, e.g. ``"English (en)"``.
        min_speakers, max_speakers: Bounds passed to the diarization model.
        tts_voice00 .. tts_voice05: Voice for ``SPEAKER_00`` .. ``SPEAKER_05``.
        video_output: File name of the dubbed video.
        AUDIO_MIX_METHOD: ``'Adjusting volumes and mixing audio'`` selects the
            plain volume mix; any other value selects the side-chain mix.
        progress: Gradio progress callback.

    Returns:
        ``video_output`` on success, the string ``"No valid token"`` when no
        token is available, or ``None`` when a processing step fails (the
        reason is printed).
    """
    # Local import: this module does not import shlex at file level.
    import shlex

    if not YOUR_HF_TOKEN:
        YOUR_HF_TOKEN = os.getenv("YOUR_HF_TOKEN")
        if not YOUR_HF_TOKEN:
            print('No valid token')
            return "No valid token"
    else:
        os.environ["YOUR_HF_TOKEN"] = YOUR_HF_TOKEN

    video = video if isinstance(video, str) else video.name
    print(video)
    # `video` may be a user-typed link, so it is always shell-quoted.
    video_arg = shlex.quote(video)
    video_output_arg = shlex.quote(video_output)

    if "SET_LIMIT" == os.getenv("DEMO"):
        preview = True
        print("DEMO; set preview=True; The generation is **limited to 10 seconds** to prevent errors with the CPU. If you use a GPU, you won't have any of these limitations.")
        AUDIO_MIX_METHOD = 'Adjusting volumes and mixing audio'
        print("DEMO; set Adjusting volumes and mixing audio")
        WHISPER_MODEL_SIZE = "medium"
        print("DEMO; set whisper model to medium")

    LANGUAGES = {
        'Automatic detection': 'Automatic detection',
        'Arabic (ar)': 'ar',
        'Chinese (zh)': 'zh',
        'Czech (cs)': 'cs',
        'Danish (da)': 'da',
        'Dutch (nl)': 'nl',
        'English (en)': 'en',
        'Finnish (fi)': 'fi',
        'French (fr)': 'fr',
        'German (de)': 'de',
        'Greek (el)': 'el',
        'Hebrew (he)': 'he',
        'Hungarian (hu)': 'hu',
        'Italian (it)': 'it',
        'Japanese (ja)': 'ja',
        'Korean (ko)': 'ko',
        'Persian (fa)': 'fa',
        'Polish (pl)': 'pl',
        'Portuguese (pt)': 'pt',
        'Russian (ru)': 'ru',
        'Spanish (es)': 'es',
        'Turkish (tr)': 'tr',
        'Ukrainian (uk)': 'uk',
        'Urdu (ur)': 'ur',
        'Vietnamese (vi)': 'vi',
        'Hindi (hi)': 'hi',
    }

    TRANSLATE_AUDIO_TO = LANGUAGES[TRANSLATE_AUDIO_TO]
    SOURCE_LANGUAGE = LANGUAGES[SOURCE_LANGUAGE]

    os.makedirs('audio', exist_ok=True)
    os.makedirs('audio2/audio', exist_ok=True)

    # Check GPU
    device = "cuda" if torch.cuda.is_available() else "cpu"
    compute_type = "float32" if device == "cpu" else compute_type

    OutputFile = 'Video.mp4'
    audio_wav = "audio.wav"
    Output_name_file = "audio_dub_solo.ogg"
    mix_audio = "audio_mix.mp3"

    # Drop leftovers from a previous run.
    for stale in ("Video.mp4", "audio.webm", "audio.wav"):
        _remove_quietly(stale)

    progress(0.15, desc="Processing video...")
    if os.path.exists(video):
        if preview:
            print('Creating a preview video of 10 seconds, to disable this option, go to advanced settings and turn off preview.')
            os.system(f'ffmpeg -y -i {video_arg} -ss 00:00:20 -t 00:00:10 -c:v libx264 -c:a aac -strict experimental Video.mp4')
        elif video.endswith(".mp4"):
            destination_path = os.path.join(os.getcwd(), "Video.mp4")
            shutil.copy(video, destination_path)
        else:
            print("File does not have the '.mp4' extension. Converting video.")
            os.system(f'ffmpeg -y -i {video_arg} -c:v libx264 -c:a aac -strict experimental Video.mp4')

        # NOTE(review): os.system blocks until the command exits, so these
        # polling loops probably never need more than one pass - confirm
        # before simplifying.
        for i in range(120):
            time.sleep(1)
            print('process video...')
            if os.path.exists(OutputFile):
                time.sleep(1)
                os.system("ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav")
                time.sleep(1)
                break
            if i == 119:
                print('Error processing video')
                return

        for i in range(120):
            time.sleep(1)
            print('process audio...')
            if os.path.exists(audio_wav):
                break
            # After a minute without audio, and if Video.mp4 is (almost)
            # empty, extract 10 seconds of audio from the source instead.
            if i == 60 and round(os.path.getsize(OutputFile) / (1024 * 1024), 1) == 0.0:
                _remove_quietly('intermediate.aac')  # only for demo
                os.system(f'ffmpeg -i {video_arg} -ss 00:00:00 -t 00:00:10 -vn -acodec aac -strict -2 intermediate.aac')
                time.sleep(5)
                os.system('ffmpeg -i intermediate.aac -acodec pcm_s16le -ar 44100 -ac 2 audio.wav')
            if i == 119:
                print("Error can't create the audio")
                return
        _remove_quietly('intermediate.aac')
    else:
        if preview:
            print('Creating a preview from the link, 10 seconds to disable this option, go to advanced settings and turn off preview.')
            # https://github.com/yt-dlp/yt-dlp/issues/2220
            mp4_ = f'yt-dlp -f "mp4" --downloader ffmpeg --downloader-args "ffmpeg_i: -ss 00:00:20 -t 00:00:10" --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video_arg}'
            wav_ = "ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav"
            os.system(mp4_)
            os.system(wav_)
        else:
            mp4_ = f'yt-dlp -f "mp4" --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video_arg}'
            wav_ = f'python -m yt_dlp --output {audio_wav} --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --extract-audio --audio-format wav {video_arg}'

            os.system(wav_)

            for i in range(120):
                time.sleep(1)
                print('process audio...')
                if os.path.exists(audio_wav) and not os.path.exists('audio.webm'):
                    time.sleep(1)
                    os.system(mp4_)
                    break
                if i == 119:
                    print('Error donwloading the audio')
                    return

    print("Set file complete.")
    progress(0.30, desc="Transcribing...")

    SOURCE_LANGUAGE = None if SOURCE_LANGUAGE == 'Automatic detection' else SOURCE_LANGUAGE

    # 1. Transcribe with original whisper (batched)
    with capture.capture_output():
        model = whisperx.load_model(
            WHISPER_MODEL_SIZE,
            device,
            compute_type=compute_type,
            language=SOURCE_LANGUAGE,
        )
    audio = whisperx.load_audio(audio_wav)
    result = model.transcribe(audio, batch_size=batch_size)
    # Drop the reference first so the collection can actually free the model.
    del model
    gc.collect()
    torch.cuda.empty_cache()
    print("Transcript complete")

    # 2. Align whisper output
    progress(0.45, desc="Aligning...")
    DAMHF.update(DAMT)  # lang align
    EXTRA_ALIGN = {
        "hi": "theainerd/Wav2Vec2-large-xlsr-hindi"
    }  # add new align models here
    detected_language = result['language']
    if detected_language not in DAMHF and detected_language not in EXTRA_ALIGN:
        # Read the language before discarding the transcript; indexing the
        # cleared result used to raise a TypeError here.
        del audio, result
        print("Automatic detection: Source language not compatible")
        print(f"Detected language {LANG_TRANSCRIPT.get(detected_language, detected_language)} incompatible, you can select the source language to avoid this error.")
        return

    model_a, metadata = whisperx.load_align_model(
        language_code=detected_language,
        device=device,
        model_name=None if detected_language in DAMHF else EXTRA_ALIGN[detected_language],
    )
    result = whisperx.align(
        result["segments"],
        model_a,
        metadata,
        audio,
        device,
        return_char_alignments=True,
    )
    del model_a
    gc.collect()
    torch.cuda.empty_cache()
    print("Align complete")

    if result['segments'] == []:
        print('No active speech found in audio')
        return

    # 3. Assign speaker labels
    progress(0.60, desc="Diarizing...")
    with capture.capture_output():
        diarize_model = whisperx.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)
    diarize_segments = diarize_model(
        audio_wav,
        min_speakers=min_speakers,
        max_speakers=max_speakers)
    result_diarize = whisperx.assign_word_speakers(diarize_segments, result)
    del diarize_model
    gc.collect()
    torch.cuda.empty_cache()
    print("Diarize complete")

    progress(0.75, desc="Translating...")
    # The translator uses different codes for these two languages.
    if TRANSLATE_AUDIO_TO == "zh":
        TRANSLATE_AUDIO_TO = "zh-CN"
    if TRANSLATE_AUDIO_TO == "he":
        TRANSLATE_AUDIO_TO = "iw"
    result_diarize['segments'] = translate_text(result_diarize['segments'], TRANSLATE_AUDIO_TO)
    print("Translation complete")

    progress(0.85, desc="Text_to_speech...")
    audio_files = []
    speakers_list = []

    # Mapping speakers to voice variables
    speaker_to_voice = {
        'SPEAKER_00': tts_voice00,
        'SPEAKER_01': tts_voice01,
        'SPEAKER_02': tts_voice02,
        'SPEAKER_03': tts_voice03,
        'SPEAKER_04': tts_voice04,
        'SPEAKER_05': tts_voice05
    }

    for segment in tqdm(result_diarize['segments']):

        text = segment['text']
        start = segment['start']
        end = segment['end']

        if 'speaker' not in segment:
            segment['speaker'] = "SPEAKER_99"
            print(f"NO SPEAKER DETECT IN SEGMENT: TTS auxiliary will be used in the segment time {segment['start'], segment['text']}")
        speaker = segment['speaker']

        # make the tts audio
        filename = f"audio/{start}.ogg"

        if speaker_to_voice.get(speaker, 'None') != 'None':
            make_voice_gradio(text, speaker_to_voice[speaker], filename, TRANSLATE_AUDIO_TO)
        else:
            # No voice assigned (unlabelled segment, speaker beyond
            # SPEAKER_05, or voice set to 'None'): fall back to gTTS so the
            # segment file always exists for the steps below.
            try:
                tts = gTTS(text, lang=TRANSLATE_AUDIO_TO)
                tts.save(filename)
                print('Using GTTS')
            except Exception:
                tts = gTTS('a', lang=TRANSLATE_AUDIO_TO)
                tts.save(filename)
                print('Error: Audio will be replaced.')

        # Speed the clip up or slow it down so it fits the segment.
        duration_true = end - start
        duration_tts = librosa.get_duration(filename=filename)
        porcentaje = _tempo_factor(duration_tts, duration_true)

        # apply aceleration or opposite to the audio file in audio2 folder
        os.system(f"ffmpeg -y -loglevel panic -i {filename} -filter:a atempo={porcentaje} audio2/{filename}")

        audio_files.append(filename)
        speakers_list.append(speaker)

    # custom voice
    if os.getenv('VOICES_MODELS') == 'ENABLE':
        progress(0.90, desc="Applying customized voices...")
        voices(speakers_list, audio_files)

    # replace files with the accelerates
    os.system("mv -f audio2/audio/*.ogg audio/")

    _remove_quietly(Output_name_file)

    progress(0.95, desc="Creating final translated video...")

    create_translated_audio(result_diarize, audio_files, Output_name_file)

    _remove_quietly(mix_audio)

    # TYPE MIX AUDIO
    if AUDIO_MIX_METHOD == 'Adjusting volumes and mixing audio':
        # volume mix
        os.system(f'ffmpeg -y -i {audio_wav} -i {Output_name_file} -filter_complex "[0:0]volume=0.15[a];[1:0]volume=1.90[b];[a][b]amix=inputs=2:duration=longest" -c:a libmp3lame {mix_audio}')
    else:
        # background mix; os.system reports failure through its exit status
        # (it never raises), so that is what triggers the fallback.
        status = os.system(f'ffmpeg -i {audio_wav} -i {Output_name_file} -filter_complex "[1:a]asplit=2[sc][mix];[0:a][sc]sidechaincompress=threshold=0.003:ratio=20[bg]; [bg][mix]amerge[final]" -map [final] {mix_audio}')
        if status != 0:
            # volume mix except
            os.system(f'ffmpeg -y -i {audio_wav} -i {Output_name_file} -filter_complex "[0:0]volume=0.25[a];[1:0]volume=1.80[b];[a][b]amix=inputs=2:duration=longest" -c:a libmp3lame {mix_audio}')

    _remove_quietly(video_output)
    os.system(f"ffmpeg -i {OutputFile} -i {mix_audio} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {video_output_arg}")

    return video_output


def _remove_quietly(path):
    """Delete ``path`` if it exists; ignore it when it cannot be removed."""
    try:
        os.remove(path)
    except OSError:
        pass


def _tempo_factor(duration_tts, duration_true):
    """Return the ffmpeg ``atempo`` factor that fits a TTS clip to its segment.

    The raw ratio ``duration_tts / duration_true`` is capped at 2.1, snapped
    to 1.0 inside the 0.8-1.2 tolerance band, raised to 0.8 when lower, and
    rounded to one decimal. A segment with no duration yields 1.0.
    """
    if duration_true <= 0:
        return 1.0
    ratio = duration_tts / duration_true
    if ratio > 2.1:
        ratio = 2.1
    elif 0.8 <= ratio <= 1.2:
        ratio = 1.0
    elif ratio < 0.8:
        ratio = 0.8
    return round(ratio, 1)
632
-
633
- import sys
634
-
635
- class Logger:
636
- def __init__(self, filename):
637
- self.terminal = sys.stdout
638
- self.log = open(filename, "w")
639
-
640
- def write(self, message):
641
- self.terminal.write(message)
642
- self.log.write(message)
643
-
644
- def flush(self):
645
- self.terminal.flush()
646
- self.log.flush()
647
-
648
- def isatty(self):
649
- return False
650
-
651
- sys.stdout = Logger("output.log")
652
-
653
- def read_logs():
654
- sys.stdout.flush()
655
- with open("output.log", "r") as f:
656
- return f.read()
657
-
658
- def submit_file_func(file):
659
- print(file.name)
660
- return file.name, file.name
661
-
662
- # max tts
663
- MAX_TTS = 6
664
-
665
- theme='Taithrah/Minimal'
666
-
667
- with gr.Blocks(theme=theme) as demo:
668
- gr.Markdown(title)
669
- gr.Markdown(description)
670
-
671
-
672
- ### link
673
-
674
- with gr.Tab("Audio Translation via Video Link"):
675
- with gr.Row():
676
- with gr.Column():
677
-
678
- blink_input = gr.Textbox(label="Media link.", info="Example: www.youtube.com/watch?v=g_9rPvbENUw", placeholder="URL goes here...")
679
-
680
- bSOURCE_LANGUAGE = gr.Dropdown(['Automatic detection', 'Arabic (ar)', 'Chinese (zh)', 'Czech (cs)', 'Danish (da)', 'Dutch (nl)', 'English (en)', 'Finnish (fi)', 'French (fr)', 'German (de)', 'Greek (el)', 'Hebrew (he)', 'Hindi (hi)', 'Hungarian (hu)', 'Italian (it)', 'Japanese (ja)', 'Korean (ko)', 'Persian (fa)', 'Polish (pl)', 'Portuguese (pt)', 'Russian (ru)', 'Spanish (es)', 'Turkish (tr)', 'Ukrainian (uk)', 'Urdu (ur)', 'Vietnamese (vi)'], value='Automatic detection',label = 'Source language', info="This is the original language of the video")
681
- bTRANSLATE_AUDIO_TO = gr.Dropdown(['Arabic (ar)', 'Chinese (zh)', 'Czech (cs)', 'Danish (da)', 'Dutch (nl)', 'English (en)', 'Finnish (fi)', 'French (fr)', 'German (de)', 'Greek (el)', 'Hebrew (he)', 'Hindi (hi)', 'Hungarian (hu)', 'Italian (it)', 'Japanese (ja)', 'Korean (ko)', 'Persian (fa)', 'Polish (pl)', 'Portuguese (pt)', 'Russian (ru)', 'Spanish (es)', 'Turkish (tr)', 'Ukrainian (uk)', 'Urdu (ur)', 'Vietnamese (vi)'], value='English (en)',label = 'Translate audio to', info="Select the target language, and make sure to select the language corresponding to the speakers of the target language to avoid errors in the process.")
682
-
683
- bline_ = gr.HTML("<hr></h2>")
684
- gr.Markdown("Select how many people are speaking in the video.")
685
- bmin_speakers = gr.Slider(1, MAX_TTS, default=1, label="min_speakers", step=1, visible=False)
686
- bmax_speakers = gr.Slider(1, MAX_TTS, value=2, step=1, label="Max speakers", interative=True)
687
- gr.Markdown("Select the voice you want for each speaker.")
688
- def bsubmit(value):
689
- visibility_dict = {
690
- f'btts_voice{i:02d}': gr.update(visible=i < value) for i in range(6)
691
- }
692
- return [value for value in visibility_dict.values()]
693
- btts_voice00 = gr.Dropdown(list_tts, value='en-AU-WilliamNeural-Male', label = 'TTS Speaker 1', visible=True, interactive= True)
694
- btts_voice01 = gr.Dropdown(list_tts, value='en-CA-ClaraNeural-Female', label = 'TTS Speaker 2', visible=True, interactive= True)
695
- btts_voice02 = gr.Dropdown(list_tts, value='en-GB-ThomasNeural-Male', label = 'TTS Speaker 3', visible=False, interactive= True)
696
- btts_voice03 = gr.Dropdown(list_tts, value='en-GB-SoniaNeural-Female', label = 'TTS Speaker 4', visible=False, interactive= True)
697
- btts_voice04 = gr.Dropdown(list_tts, value='en-NZ-MitchellNeural-Male', label = 'TTS Speaker 5', visible=False, interactive= True)
698
- btts_voice05 = gr.Dropdown(list_tts, value='en-GB-MaisieNeural-Female', label = 'TTS Speaker 6', visible=False, interactive= True)
699
- bmax_speakers.change(bsubmit, bmax_speakers, [btts_voice00, btts_voice01, btts_voice02, btts_voice03, btts_voice04, btts_voice05])
700
-
701
-
702
- with gr.Column():
703
- with gr.Accordion("Advanced Settings", open=False):
704
-
705
- bAUDIO_MIX = gr.Dropdown(['Mixing audio with sidechain compression', 'Adjusting volumes and mixing audio'], value='Adjusting volumes and mixing audio', label = 'Audio Mixing Method', info="Mix original and translated audio files to create a customized, balanced output with two available mixing modes.")
706
-
707
- gr.HTML("<hr></h2>")
708
- gr.Markdown("Default configuration of Whisper.")
709
- bWHISPER_MODEL_SIZE = gr.inputs.Dropdown(['tiny', 'base', 'small', 'medium', 'large-v1', 'large-v2'], default=whisper_model_default, label="Whisper model")
710
- bbatch_size = gr.inputs.Slider(1, 32, default=16, label="Batch size", step=1)
711
- bcompute_type = gr.inputs.Dropdown(list_compute_type, default=compute_type_default, label="Compute type")
712
-
713
- gr.HTML("<hr></h2>")
714
- bVIDEO_OUTPUT_NAME = gr.Textbox(label="Translated file name" ,value="video_output.mp4", info="The name of the output file")
715
- bPREVIEW = gr.Checkbox(label="Preview", info="Preview cuts the video to only 10 seconds for testing purposes. Please deactivate it to retrieve the full video duration.")
716
-
717
- with gr.Column(variant='compact'):
718
- with gr.Row():
719
- text_button = gr.Button("TRANSLATE")
720
- with gr.Row():
721
- blink_output = gr.Video() #gr.outputs.File(label="DOWNLOAD TRANSLATED VIDEO") # gr.Video()
722
-
723
-
724
- bline_ = gr.HTML("<hr></h2>")
725
- if os.getenv("YOUR_HF_TOKEN") == None or os.getenv("YOUR_HF_TOKEN") == "":
726
- bHFKEY = gr.Textbox(visible= True, label="HF Token", info="One important step is to accept the license agreement for using Pyannote. You need to have an account on Hugging Face and accept the license to use the models: https://huggingface.co/pyannote/speaker-diarization and https://huggingface.co/pyannote/segmentation. Get your KEY TOKEN here: https://hf.co/settings/tokens", placeholder="Token goes here...")
727
- else:
728
- bHFKEY = gr.Textbox(visible= False, label="HF Token", info="One important step is to accept the license agreement for using Pyannote. You need to have an account on Hugging Face and accept the license to use the models: https://huggingface.co/pyannote/speaker-diarization and https://huggingface.co/pyannote/segmentation. Get your KEY TOKEN here: https://hf.co/settings/tokens", placeholder="Token goes here...")
729
-
730
- gr.Examples(
731
- examples=[
732
- [
733
- "https://www.youtube.com/watch?v=5ZeHtRKHl7Y",
734
- "",
735
- False,
736
- "large-v2",
737
- 16,
738
- "float16",
739
- "Japanese (ja)",
740
- "English (en)",
741
- 1,
742
- 2,
743
- 'en-CA-ClaraNeural-Female',
744
- 'en-AU-WilliamNeural-Male',
745
- 'en-GB-ThomasNeural-Male',
746
- 'en-GB-SoniaNeural-Female',
747
- 'en-NZ-MitchellNeural-Male',
748
- 'en-GB-MaisieNeural-Female',
749
- "video_output.mp4",
750
- 'Adjusting volumes and mixing audio',
751
- ],
752
- ],
753
- fn=translate_from_video,
754
- inputs=[
755
- blink_input,
756
- bHFKEY,
757
- bPREVIEW,
758
- bWHISPER_MODEL_SIZE,
759
- bbatch_size,
760
- bcompute_type,
761
- bSOURCE_LANGUAGE,
762
- bTRANSLATE_AUDIO_TO,
763
- bmin_speakers,
764
- bmax_speakers,
765
- btts_voice00,
766
- btts_voice01,
767
- btts_voice02,
768
- btts_voice03,
769
- btts_voice04,
770
- btts_voice05,
771
- bVIDEO_OUTPUT_NAME,
772
- bAUDIO_MIX
773
- ],
774
- outputs=[blink_output],
775
- cache_examples=False,
776
- )
777
-
778
-
779
- #### video
780
- with gr.Tab("Audio Translation for a Video"):
781
- with gr.Row():
782
- with gr.Column():
783
- #video_input = gr.UploadButton("Click to Upload a video", file_types=["video"], file_count="single") #gr.Video() # height=300,width=300
784
- video_input = gr.File(label="Submit a short Video")
785
- #link = gr.HTML()
786
- #video_input.change(submit_file_func, video_input, [video_input, link], show_progress='full')
787
-
788
- SOURCE_LANGUAGE = gr.Dropdown(['Automatic detection', 'Arabic (ar)', 'Chinese (zh)', 'Czech (cs)', 'Danish (da)', 'Dutch (nl)', 'English (en)', 'Finnish (fi)', 'French (fr)', 'German (de)', 'Greek (el)', 'Hebrew (he)', 'Hindi (hi)', 'Hungarian (hu)', 'Italian (it)', 'Japanese (ja)', 'Korean (ko)', 'Persian (fa)', 'Polish (pl)', 'Portuguese (pt)', 'Russian (ru)', 'Spanish (es)', 'Turkish (tr)', 'Ukrainian (uk)', 'Urdu (ur)', 'Vietnamese (vi)'], value='Automatic detection',label = 'Source language', info="This is the original language of the video")
789
- TRANSLATE_AUDIO_TO = gr.Dropdown(['Arabic (ar)', 'Chinese (zh)', 'Czech (cs)', 'Danish (da)', 'Dutch (nl)', 'English (en)', 'Finnish (fi)', 'French (fr)', 'German (de)', 'Greek (el)', 'Hebrew (he)', 'Hindi (hi)', 'Hungarian (hu)', 'Italian (it)', 'Japanese (ja)', 'Korean (ko)', 'Persian (fa)', 'Polish (pl)', 'Portuguese (pt)', 'Russian (ru)', 'Spanish (es)', 'Turkish (tr)', 'Ukrainian (uk)', 'Urdu (ur)', 'Vietnamese (vi)'], value='English (en)',label = 'Translate audio to', info="Select the target language, and make sure to select the language corresponding to the speakers of the target language to avoid errors in the process.")
790
-
791
- line_ = gr.HTML("<hr></h2>")
792
- gr.Markdown("Select how many people are speaking in the video.")
793
- min_speakers = gr.Slider(1, MAX_TTS, default=1, label="min_speakers", step=1, visible=False)
794
- max_speakers = gr.Slider(1, MAX_TTS, value=2, step=1, label="Max speakers", interative=True)
795
- gr.Markdown("Select the voice you want for each speaker.")
796
- def submit(value):
797
- visibility_dict = {
798
- f'tts_voice{i:02d}': gr.update(visible=i < value) for i in range(6)
799
- }
800
- return [value for value in visibility_dict.values()]
801
- tts_voice00 = gr.Dropdown(list_tts, value='en-AU-WilliamNeural-Male', label = 'TTS Speaker 1', visible=True, interactive= True)
802
- tts_voice01 = gr.Dropdown(list_tts, value='en-CA-ClaraNeural-Female', label = 'TTS Speaker 2', visible=True, interactive= True)
803
- tts_voice02 = gr.Dropdown(list_tts, value='en-GB-ThomasNeural-Male', label = 'TTS Speaker 3', visible=False, interactive= True)
804
- tts_voice03 = gr.Dropdown(list_tts, value='en-GB-SoniaNeural-Female', label = 'TTS Speaker 4', visible=False, interactive= True)
805
- tts_voice04 = gr.Dropdown(list_tts, value='en-NZ-MitchellNeural-Male', label = 'TTS Speaker 5', visible=False, interactive= True)
806
- tts_voice05 = gr.Dropdown(list_tts, value='en-GB-MaisieNeural-Female', label = 'TTS Speaker 6', visible=False, interactive= True)
807
- max_speakers.change(submit, max_speakers, [tts_voice00, tts_voice01, tts_voice02, tts_voice03, tts_voice04, tts_voice05])
808
-
809
- with gr.Column():
810
- with gr.Accordion("Advanced Settings", open=False):
811
-
812
- AUDIO_MIX = gr.Dropdown(['Mixing audio with sidechain compression', 'Adjusting volumes and mixing audio'], value='Adjusting volumes and mixing audio', label = 'Audio Mixing Method', info="Mix original and translated audio files to create a customized, balanced output with two available mixing modes.")
813
-
814
- gr.HTML("<hr></h2>")
815
- gr.Markdown("Default configuration of Whisper.")
816
- WHISPER_MODEL_SIZE = gr.inputs.Dropdown(['tiny', 'base', 'small', 'medium', 'large-v1', 'large-v2'], default=whisper_model_default, label="Whisper model")
817
- batch_size = gr.inputs.Slider(1, 32, default=16, label="Batch size", step=1)
818
- compute_type = gr.inputs.Dropdown(list_compute_type, default=compute_type_default, label="Compute type")
819
-
820
- gr.HTML("<hr></h2>")
821
- VIDEO_OUTPUT_NAME = gr.Textbox(label="Translated file name" ,value="video_output.mp4", info="The name of the output file")
822
- PREVIEW = gr.Checkbox(label="Preview", info="Preview cuts the video to only 10 seconds for testing purposes. Please deactivate it to retrieve the full video duration.")
823
-
824
- with gr.Column(variant='compact'):
825
- with gr.Row():
826
- video_button = gr.Button("TRANSLATE", )
827
- with gr.Row():
828
- video_output = gr.Video() #gr.outputs.File(label="DOWNLOAD TRANSLATED VIDEO")
829
-
830
- line_ = gr.HTML("<hr></h2>")
831
- if os.getenv("YOUR_HF_TOKEN") == None or os.getenv("YOUR_HF_TOKEN") == "":
832
- HFKEY = gr.Textbox(visible= True, label="HF Token", info="One important step is to accept the license agreement for using Pyannote. You need to have an account on Hugging Face and accept the license to use the models: https://huggingface.co/pyannote/speaker-diarization and https://huggingface.co/pyannote/segmentation. Get your KEY TOKEN here: https://hf.co/settings/tokens", placeholder="Token goes here...")
833
- else:
834
- HFKEY = gr.Textbox(visible= False, label="HF Token", info="One important step is to accept the license agreement for using Pyannote. You need to have an account on Hugging Face and accept the license to use the models: https://huggingface.co/pyannote/speaker-diarization and https://huggingface.co/pyannote/segmentation. Get your KEY TOKEN here: https://hf.co/settings/tokens", placeholder="Token goes here...")
835
-
836
- gr.Examples(
837
- examples=[
838
- [
839
- "./assets/Video_main.mp4",
840
- "",
841
- False,
842
- "large-v2",
843
- 16,
844
- "float16",
845
- "Spanish (es)",
846
- "English (en)",
847
- 1,
848
- 2,
849
- 'en-AU-WilliamNeural-Male',
850
- 'en-CA-ClaraNeural-Female',
851
- 'en-GB-ThomasNeural-Male',
852
- 'en-GB-SoniaNeural-Female',
853
- 'en-NZ-MitchellNeural-Male',
854
- 'en-GB-MaisieNeural-Female',
855
- "video_output.mp4",
856
- 'Adjusting volumes and mixing audio',
857
- ],
858
- ],
859
- fn=translate_from_video,
860
- inputs=[
861
- video_input,
862
- HFKEY,
863
- PREVIEW,
864
- WHISPER_MODEL_SIZE,
865
- batch_size,
866
- compute_type,
867
- SOURCE_LANGUAGE,
868
- TRANSLATE_AUDIO_TO,
869
- min_speakers,
870
- max_speakers,
871
- tts_voice00,
872
- tts_voice01,
873
- tts_voice02,
874
- tts_voice03,
875
- tts_voice04,
876
- tts_voice05,
877
- VIDEO_OUTPUT_NAME,
878
- AUDIO_MIX,
879
- ],
880
- outputs=[video_output],
881
- cache_examples=False,
882
- )
883
-
884
-
885
-
886
-
887
- with gr.Tab("Custom voice RVC"):
888
- with gr.Column():
889
- with gr.Accordion("Download RVC Models", open=True):
890
- url_links = gr.Textbox(label="URLs", value="",info="Automatically download the RVC models from the URL. You can use links from HuggingFace or Drive, and you can include several links, each one separated by a comma.", placeholder="urls here...", lines=1)
891
- download_finish = gr.HTML()
892
- download_button = gr.Button("DOWNLOAD MODELS")
893
-
894
- def update_models():
895
- models, index_paths = upload_model_list()
896
- for i in range(8):
897
- dict_models = {
898
- f'model_voice_path{i:02d}': gr.update(choices=models) for i in range(8)
899
- }
900
- dict_index = {
901
- f'file_index2_{i:02d}': gr.update(choices=index_paths) for i in range(8)
902
- }
903
- dict_changes = {**dict_models, **dict_index}
904
- return [value for value in dict_changes.values()]
905
-
906
- with gr.Column():
907
- with gr.Accordion("Replace voice: TTS to RVC", open=False):
908
- with gr.Column(variant='compact'):
909
- with gr.Column():
910
- gr.Markdown("### 1. To enable its use, mark it as enable.")
911
- enable_custom_voice = gr.Checkbox(label="ENABLE", info="Check this to enable the use of the models.")
912
- enable_custom_voice.change(custom_model_voice_enable, [enable_custom_voice], [])
913
-
914
- gr.Markdown("### 2. Select a voice that will be applied to each TTS of each corresponding speaker and apply the configurations.")
915
-
916
- gr.Markdown("Voice to apply to the first speaker.")
917
- with gr.Row():
918
- model_voice_path00 = gr.Dropdown(models, label = 'Model-1', visible=True, interactive= True)
919
- file_index2_00 = gr.Dropdown(index_paths, label = 'Index-1', visible=True, interactive= True)
920
- name_transpose00 = gr.Number(label = 'Transpose-1', value=0, visible=True, interactive= True)
921
- gr.HTML("<hr></h2>")
922
- gr.Markdown("Voice to apply to the second speaker.")
923
- with gr.Row():
924
- model_voice_path01 = gr.Dropdown(models, label='Model-2', visible=True, interactive=True)
925
- file_index2_01 = gr.Dropdown(index_paths, label='Index-2', visible=True, interactive=True)
926
- name_transpose01 = gr.Number(label='Transpose-2', value=0, visible=True, interactive=True)
927
- gr.HTML("<hr></h2>")
928
- gr.Markdown("Voice to apply to the third speaker.")
929
- with gr.Row():
930
- model_voice_path02 = gr.Dropdown(models, label='Model-3', visible=True, interactive=True)
931
- file_index2_02 = gr.Dropdown(index_paths, label='Index-3', visible=True, interactive=True)
932
- name_transpose02 = gr.Number(label='Transpose-3', value=0, visible=True, interactive=True)
933
- gr.HTML("<hr></h2>")
934
- gr.Markdown("Voice to apply to the fourth speaker.")
935
- with gr.Row():
936
- model_voice_path03 = gr.Dropdown(models, label='Model-4', visible=True, interactive=True)
937
- file_index2_03 = gr.Dropdown(index_paths, label='Index-4', visible=True, interactive=True)
938
- name_transpose03 = gr.Number(label='Transpose-4', value=0, visible=True, interactive=True)
939
- gr.HTML("<hr></h2>")
940
- gr.Markdown("Voice to apply to the fifth speaker.")
941
- with gr.Row():
942
- model_voice_path04 = gr.Dropdown(models, label='Model-5', visible=True, interactive=True)
943
- file_index2_04 = gr.Dropdown(index_paths, label='Index-5', visible=True, interactive=True)
944
- name_transpose04 = gr.Number(label='Transpose-5', value=0, visible=True, interactive=True)
945
- gr.HTML("<hr></h2>")
946
- gr.Markdown("Voice to apply to the sixth speaker.")
947
- with gr.Row():
948
- model_voice_path05 = gr.Dropdown(models, label='Model-6', visible=True, interactive=True)
949
- file_index2_05 = gr.Dropdown(index_paths, label='Index-6', visible=True, interactive=True)
950
- name_transpose05 = gr.Number(label='Transpose-6', value=0, visible=True, interactive=True)
951
- gr.HTML("<hr></h2>")
952
- gr.Markdown("- Voice to apply in case a speaker is not detected successfully.")
953
- with gr.Row():
954
- model_voice_path06 = gr.Dropdown(models, label='Model-Aux', visible=True, interactive=True)
955
- file_index2_06 = gr.Dropdown(index_paths, label='Index-Aux', visible=True, interactive=True)
956
- name_transpose06 = gr.Number(label='Transpose-Aux', value=0, visible=True, interactive=True)
957
- gr.HTML("<hr></h2>")
958
- with gr.Row():
959
- f0_method_global = gr.Dropdown(f0_methods_voice, value='pm', label = 'Global F0 method', visible=True, interactive= True)
960
-
961
- with gr.Row(variant='compact'):
962
- button_config = gr.Button("APPLY CONFIGURATION")
963
-
964
- confirm_conf = gr.HTML()
965
-
966
- button_config.click(voices.apply_conf, inputs=[
967
- f0_method_global,
968
- model_voice_path00, name_transpose00, file_index2_00,
969
- model_voice_path01, name_transpose01, file_index2_01,
970
- model_voice_path02, name_transpose02, file_index2_02,
971
- model_voice_path03, name_transpose03, file_index2_03,
972
- model_voice_path04, name_transpose04, file_index2_04,
973
- model_voice_path05, name_transpose05, file_index2_05,
974
- model_voice_path06, name_transpose06, file_index2_06,
975
- ], outputs=[confirm_conf])
976
-
977
-
978
- with gr.Column():
979
- with gr.Accordion("Test RVC", open=False):
980
-
981
- with gr.Row(variant='compact'):
982
- text_test = gr.Textbox(label="Text", value="This is an example",info="write a text", placeholder="...", lines=5)
983
- with gr.Column():
984
- tts_test = gr.Dropdown(list_tts, value='en-GB-ThomasNeural-Male', label = 'TTS', visible=True, interactive= True)
985
- model_voice_path07 = gr.Dropdown(models, label = 'Model', visible=True, interactive= True) #value=''
986
- file_index2_07 = gr.Dropdown(index_paths, label = 'Index', visible=True, interactive= True) #value=''
987
- transpose_test = gr.Number(label = 'Transpose', value=0, visible=True, interactive= True, info="integer, number of semitones, raise by an octave: 12, lower by an octave: -12")
988
- f0method_test = gr.Dropdown(f0_methods_voice, value='pm', label = 'F0 method', visible=True, interactive= True)
989
- with gr.Row(variant='compact'):
990
- button_test = gr.Button("Test audio")
991
-
992
- with gr.Column():
993
- with gr.Row():
994
- original_ttsvoice = gr.Audio()
995
- ttsvoice = gr.Audio()
996
-
997
- button_test.click(voices.make_test, inputs=[
998
- text_test,
999
- tts_test,
1000
- model_voice_path07,
1001
- file_index2_07,
1002
- transpose_test,
1003
- f0method_test,
1004
- ], outputs=[ttsvoice, original_ttsvoice])
1005
-
1006
- download_button.click(download_list, [url_links], [download_finish]).then(update_models, [],
1007
- [
1008
- model_voice_path00, model_voice_path01, model_voice_path02, model_voice_path03, model_voice_path04, model_voice_path05, model_voice_path06, model_voice_path07,
1009
- file_index2_00, file_index2_01, file_index2_02, file_index2_03, file_index2_04, file_index2_05, file_index2_06, file_index2_07
1010
- ])
1011
-
1012
-
1013
- with gr.Tab("Help"):
1014
- gr.Markdown(tutorial)
1015
- gr.Markdown(news)
1016
-
1017
- with gr.Accordion("Logs", open = False):
1018
- logs = gr.Textbox()
1019
- demo.load(read_logs, None, logs, every=1)
1020
-
1021
- # run
1022
- video_button.click(translate_from_video, inputs=[
1023
- video_input,
1024
- HFKEY,
1025
- PREVIEW,
1026
- WHISPER_MODEL_SIZE,
1027
- batch_size,
1028
- compute_type,
1029
- SOURCE_LANGUAGE,
1030
- TRANSLATE_AUDIO_TO,
1031
- min_speakers,
1032
- max_speakers,
1033
- tts_voice00,
1034
- tts_voice01,
1035
- tts_voice02,
1036
- tts_voice03,
1037
- tts_voice04,
1038
- tts_voice05,
1039
- VIDEO_OUTPUT_NAME,
1040
- AUDIO_MIX,
1041
- ], outputs=video_output)
1042
- text_button.click(translate_from_video, inputs=[
1043
- blink_input,
1044
- bHFKEY,
1045
- bPREVIEW,
1046
- bWHISPER_MODEL_SIZE,
1047
- bbatch_size,
1048
- bcompute_type,
1049
- bSOURCE_LANGUAGE,
1050
- bTRANSLATE_AUDIO_TO,
1051
- bmin_speakers,
1052
- bmax_speakers,
1053
- btts_voice00,
1054
- btts_voice01,
1055
- btts_voice02,
1056
- btts_voice03,
1057
- btts_voice04,
1058
- btts_voice05,
1059
- bVIDEO_OUTPUT_NAME,
1060
- bAUDIO_MIX,
1061
- ], outputs=blink_output)
1062
-
1063
- demo.launch(debug=False, enable_queue=True)
1064
- #demo.launch(share=True, enable_queue=True, quiet=True, debug=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
assets/Video_main.mp4 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:51ca9b5aac32a1b95b15420fc3cd339843d893d1d5e774e5b24b65127784e74f
3
- size 9106809
 
 
 
 
assets/Video_subtitled.mp4 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:98cb285e72372f3a141029571b8b5398c28196081f1f30e4590e1fc37faa0536
3
- size 3490389
 
 
 
 
configs/32k.json DELETED
@@ -1,46 +0,0 @@
1
- {
2
- "train": {
3
- "log_interval": 200,
4
- "seed": 1234,
5
- "epochs": 20000,
6
- "learning_rate": 1e-4,
7
- "betas": [0.8, 0.99],
8
- "eps": 1e-9,
9
- "batch_size": 4,
10
- "fp16_run": false,
11
- "lr_decay": 0.999875,
12
- "segment_size": 12800,
13
- "init_lr_ratio": 1,
14
- "warmup_epochs": 0,
15
- "c_mel": 45,
16
- "c_kl": 1.0
17
- },
18
- "data": {
19
- "max_wav_value": 32768.0,
20
- "sampling_rate": 32000,
21
- "filter_length": 1024,
22
- "hop_length": 320,
23
- "win_length": 1024,
24
- "n_mel_channels": 80,
25
- "mel_fmin": 0.0,
26
- "mel_fmax": null
27
- },
28
- "model": {
29
- "inter_channels": 192,
30
- "hidden_channels": 192,
31
- "filter_channels": 768,
32
- "n_heads": 2,
33
- "n_layers": 6,
34
- "kernel_size": 3,
35
- "p_dropout": 0,
36
- "resblock": "1",
37
- "resblock_kernel_sizes": [3,7,11],
38
- "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
- "upsample_rates": [10,4,2,2,2],
40
- "upsample_initial_channel": 512,
41
- "upsample_kernel_sizes": [16,16,4,4,4],
42
- "use_spectral_norm": false,
43
- "gin_channels": 256,
44
- "spk_embed_dim": 109
45
- }
46
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/32k_v2.json DELETED
@@ -1,46 +0,0 @@
1
- {
2
- "train": {
3
- "log_interval": 200,
4
- "seed": 1234,
5
- "epochs": 20000,
6
- "learning_rate": 1e-4,
7
- "betas": [0.8, 0.99],
8
- "eps": 1e-9,
9
- "batch_size": 4,
10
- "fp16_run": false,
11
- "lr_decay": 0.999875,
12
- "segment_size": 12800,
13
- "init_lr_ratio": 1,
14
- "warmup_epochs": 0,
15
- "c_mel": 45,
16
- "c_kl": 1.0
17
- },
18
- "data": {
19
- "max_wav_value": 32768.0,
20
- "sampling_rate": 32000,
21
- "filter_length": 1024,
22
- "hop_length": 320,
23
- "win_length": 1024,
24
- "n_mel_channels": 80,
25
- "mel_fmin": 0.0,
26
- "mel_fmax": null
27
- },
28
- "model": {
29
- "inter_channels": 192,
30
- "hidden_channels": 192,
31
- "filter_channels": 768,
32
- "n_heads": 2,
33
- "n_layers": 6,
34
- "kernel_size": 3,
35
- "p_dropout": 0,
36
- "resblock": "1",
37
- "resblock_kernel_sizes": [3,7,11],
38
- "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
- "upsample_rates": [10,8,2,2],
40
- "upsample_initial_channel": 512,
41
- "upsample_kernel_sizes": [20,16,4,4],
42
- "use_spectral_norm": false,
43
- "gin_channels": 256,
44
- "spk_embed_dim": 109
45
- }
46
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/40k.json DELETED
@@ -1,46 +0,0 @@
1
- {
2
- "train": {
3
- "log_interval": 200,
4
- "seed": 1234,
5
- "epochs": 20000,
6
- "learning_rate": 1e-4,
7
- "betas": [0.8, 0.99],
8
- "eps": 1e-9,
9
- "batch_size": 4,
10
- "fp16_run": false,
11
- "lr_decay": 0.999875,
12
- "segment_size": 12800,
13
- "init_lr_ratio": 1,
14
- "warmup_epochs": 0,
15
- "c_mel": 45,
16
- "c_kl": 1.0
17
- },
18
- "data": {
19
- "max_wav_value": 32768.0,
20
- "sampling_rate": 40000,
21
- "filter_length": 2048,
22
- "hop_length": 400,
23
- "win_length": 2048,
24
- "n_mel_channels": 125,
25
- "mel_fmin": 0.0,
26
- "mel_fmax": null
27
- },
28
- "model": {
29
- "inter_channels": 192,
30
- "hidden_channels": 192,
31
- "filter_channels": 768,
32
- "n_heads": 2,
33
- "n_layers": 6,
34
- "kernel_size": 3,
35
- "p_dropout": 0,
36
- "resblock": "1",
37
- "resblock_kernel_sizes": [3,7,11],
38
- "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
- "upsample_rates": [10,10,2,2],
40
- "upsample_initial_channel": 512,
41
- "upsample_kernel_sizes": [16,16,4,4],
42
- "use_spectral_norm": false,
43
- "gin_channels": 256,
44
- "spk_embed_dim": 109
45
- }
46
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/48k.json DELETED
@@ -1,46 +0,0 @@
1
- {
2
- "train": {
3
- "log_interval": 200,
4
- "seed": 1234,
5
- "epochs": 20000,
6
- "learning_rate": 1e-4,
7
- "betas": [0.8, 0.99],
8
- "eps": 1e-9,
9
- "batch_size": 4,
10
- "fp16_run": false,
11
- "lr_decay": 0.999875,
12
- "segment_size": 11520,
13
- "init_lr_ratio": 1,
14
- "warmup_epochs": 0,
15
- "c_mel": 45,
16
- "c_kl": 1.0
17
- },
18
- "data": {
19
- "max_wav_value": 32768.0,
20
- "sampling_rate": 48000,
21
- "filter_length": 2048,
22
- "hop_length": 480,
23
- "win_length": 2048,
24
- "n_mel_channels": 128,
25
- "mel_fmin": 0.0,
26
- "mel_fmax": null
27
- },
28
- "model": {
29
- "inter_channels": 192,
30
- "hidden_channels": 192,
31
- "filter_channels": 768,
32
- "n_heads": 2,
33
- "n_layers": 6,
34
- "kernel_size": 3,
35
- "p_dropout": 0,
36
- "resblock": "1",
37
- "resblock_kernel_sizes": [3,7,11],
38
- "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
- "upsample_rates": [10,6,2,2,2],
40
- "upsample_initial_channel": 512,
41
- "upsample_kernel_sizes": [16,16,4,4,4],
42
- "use_spectral_norm": false,
43
- "gin_channels": 256,
44
- "spk_embed_dim": 109
45
- }
46
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/48k_v2.json DELETED
@@ -1,46 +0,0 @@
1
- {
2
- "train": {
3
- "log_interval": 200,
4
- "seed": 1234,
5
- "epochs": 20000,
6
- "learning_rate": 1e-4,
7
- "betas": [0.8, 0.99],
8
- "eps": 1e-9,
9
- "batch_size": 4,
10
- "fp16_run": false,
11
- "lr_decay": 0.999875,
12
- "segment_size": 17280,
13
- "init_lr_ratio": 1,
14
- "warmup_epochs": 0,
15
- "c_mel": 45,
16
- "c_kl": 1.0
17
- },
18
- "data": {
19
- "max_wav_value": 32768.0,
20
- "sampling_rate": 48000,
21
- "filter_length": 2048,
22
- "hop_length": 480,
23
- "win_length": 2048,
24
- "n_mel_channels": 128,
25
- "mel_fmin": 0.0,
26
- "mel_fmax": null
27
- },
28
- "model": {
29
- "inter_channels": 192,
30
- "hidden_channels": 192,
31
- "filter_channels": 768,
32
- "n_heads": 2,
33
- "n_layers": 6,
34
- "kernel_size": 3,
35
- "p_dropout": 0,
36
- "resblock": "1",
37
- "resblock_kernel_sizes": [3,7,11],
38
- "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
- "upsample_rates": [12,10,2,2],
40
- "upsample_initial_channel": 512,
41
- "upsample_kernel_sizes": [24,20,4,4],
42
- "use_spectral_norm": false,
43
- "gin_channels": 256,
44
- "spk_embed_dim": 109
45
- }
46
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lib/audio.py DELETED
@@ -1,21 +0,0 @@
1
- import ffmpeg
2
- import numpy as np
3
-
4
-
5
- def load_audio(file, sr):
6
- try:
7
- # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
8
- # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
9
- # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
10
- file = (
11
- file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
12
- ) # To prevent beginners from copying paths with leading or trailing spaces, quotation marks, and line breaks.
13
- out, _ = (
14
- ffmpeg.input(file, threads=0)
15
- .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
16
- .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
17
- )
18
- except Exception as e:
19
- raise RuntimeError(f"Failed to load audio: {e}")
20
-
21
- return np.frombuffer(out, np.float32).flatten()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lib/infer_pack/attentions.py DELETED
@@ -1,417 +0,0 @@
1
- import copy
2
- import math
3
- import numpy as np
4
- import torch
5
- from torch import nn
6
- from torch.nn import functional as F
7
-
8
- from lib.infer_pack import commons
9
- from lib.infer_pack import modules
10
- from lib.infer_pack.modules import LayerNorm
11
-
12
-
13
- class Encoder(nn.Module):
14
- def __init__(
15
- self,
16
- hidden_channels,
17
- filter_channels,
18
- n_heads,
19
- n_layers,
20
- kernel_size=1,
21
- p_dropout=0.0,
22
- window_size=10,
23
- **kwargs
24
- ):
25
- super().__init__()
26
- self.hidden_channels = hidden_channels
27
- self.filter_channels = filter_channels
28
- self.n_heads = n_heads
29
- self.n_layers = n_layers
30
- self.kernel_size = kernel_size
31
- self.p_dropout = p_dropout
32
- self.window_size = window_size
33
-
34
- self.drop = nn.Dropout(p_dropout)
35
- self.attn_layers = nn.ModuleList()
36
- self.norm_layers_1 = nn.ModuleList()
37
- self.ffn_layers = nn.ModuleList()
38
- self.norm_layers_2 = nn.ModuleList()
39
- for i in range(self.n_layers):
40
- self.attn_layers.append(
41
- MultiHeadAttention(
42
- hidden_channels,
43
- hidden_channels,
44
- n_heads,
45
- p_dropout=p_dropout,
46
- window_size=window_size,
47
- )
48
- )
49
- self.norm_layers_1.append(LayerNorm(hidden_channels))
50
- self.ffn_layers.append(
51
- FFN(
52
- hidden_channels,
53
- hidden_channels,
54
- filter_channels,
55
- kernel_size,
56
- p_dropout=p_dropout,
57
- )
58
- )
59
- self.norm_layers_2.append(LayerNorm(hidden_channels))
60
-
61
- def forward(self, x, x_mask):
62
- attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
63
- x = x * x_mask
64
- for i in range(self.n_layers):
65
- y = self.attn_layers[i](x, x, attn_mask)
66
- y = self.drop(y)
67
- x = self.norm_layers_1[i](x + y)
68
-
69
- y = self.ffn_layers[i](x, x_mask)
70
- y = self.drop(y)
71
- x = self.norm_layers_2[i](x + y)
72
- x = x * x_mask
73
- return x
74
-
75
-
76
- class Decoder(nn.Module):
77
- def __init__(
78
- self,
79
- hidden_channels,
80
- filter_channels,
81
- n_heads,
82
- n_layers,
83
- kernel_size=1,
84
- p_dropout=0.0,
85
- proximal_bias=False,
86
- proximal_init=True,
87
- **kwargs
88
- ):
89
- super().__init__()
90
- self.hidden_channels = hidden_channels
91
- self.filter_channels = filter_channels
92
- self.n_heads = n_heads
93
- self.n_layers = n_layers
94
- self.kernel_size = kernel_size
95
- self.p_dropout = p_dropout
96
- self.proximal_bias = proximal_bias
97
- self.proximal_init = proximal_init
98
-
99
- self.drop = nn.Dropout(p_dropout)
100
- self.self_attn_layers = nn.ModuleList()
101
- self.norm_layers_0 = nn.ModuleList()
102
- self.encdec_attn_layers = nn.ModuleList()
103
- self.norm_layers_1 = nn.ModuleList()
104
- self.ffn_layers = nn.ModuleList()
105
- self.norm_layers_2 = nn.ModuleList()
106
- for i in range(self.n_layers):
107
- self.self_attn_layers.append(
108
- MultiHeadAttention(
109
- hidden_channels,
110
- hidden_channels,
111
- n_heads,
112
- p_dropout=p_dropout,
113
- proximal_bias=proximal_bias,
114
- proximal_init=proximal_init,
115
- )
116
- )
117
- self.norm_layers_0.append(LayerNorm(hidden_channels))
118
- self.encdec_attn_layers.append(
119
- MultiHeadAttention(
120
- hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout
121
- )
122
- )
123
- self.norm_layers_1.append(LayerNorm(hidden_channels))
124
- self.ffn_layers.append(
125
- FFN(
126
- hidden_channels,
127
- hidden_channels,
128
- filter_channels,
129
- kernel_size,
130
- p_dropout=p_dropout,
131
- causal=True,
132
- )
133
- )
134
- self.norm_layers_2.append(LayerNorm(hidden_channels))
135
-
136
- def forward(self, x, x_mask, h, h_mask):
137
- """
138
- x: decoder input
139
- h: encoder output
140
- """
141
- self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(
142
- device=x.device, dtype=x.dtype
143
- )
144
- encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
145
- x = x * x_mask
146
- for i in range(self.n_layers):
147
- y = self.self_attn_layers[i](x, x, self_attn_mask)
148
- y = self.drop(y)
149
- x = self.norm_layers_0[i](x + y)
150
-
151
- y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
152
- y = self.drop(y)
153
- x = self.norm_layers_1[i](x + y)
154
-
155
- y = self.ffn_layers[i](x, x_mask)
156
- y = self.drop(y)
157
- x = self.norm_layers_2[i](x + y)
158
- x = x * x_mask
159
- return x
160
-
161
-
162
- class MultiHeadAttention(nn.Module):
163
- def __init__(
164
- self,
165
- channels,
166
- out_channels,
167
- n_heads,
168
- p_dropout=0.0,
169
- window_size=None,
170
- heads_share=True,
171
- block_length=None,
172
- proximal_bias=False,
173
- proximal_init=False,
174
- ):
175
- super().__init__()
176
- assert channels % n_heads == 0
177
-
178
- self.channels = channels
179
- self.out_channels = out_channels
180
- self.n_heads = n_heads
181
- self.p_dropout = p_dropout
182
- self.window_size = window_size
183
- self.heads_share = heads_share
184
- self.block_length = block_length
185
- self.proximal_bias = proximal_bias
186
- self.proximal_init = proximal_init
187
- self.attn = None
188
-
189
- self.k_channels = channels // n_heads
190
- self.conv_q = nn.Conv1d(channels, channels, 1)
191
- self.conv_k = nn.Conv1d(channels, channels, 1)
192
- self.conv_v = nn.Conv1d(channels, channels, 1)
193
- self.conv_o = nn.Conv1d(channels, out_channels, 1)
194
- self.drop = nn.Dropout(p_dropout)
195
-
196
- if window_size is not None:
197
- n_heads_rel = 1 if heads_share else n_heads
198
- rel_stddev = self.k_channels**-0.5
199
- self.emb_rel_k = nn.Parameter(
200
- torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
201
- * rel_stddev
202
- )
203
- self.emb_rel_v = nn.Parameter(
204
- torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
205
- * rel_stddev
206
- )
207
-
208
- nn.init.xavier_uniform_(self.conv_q.weight)
209
- nn.init.xavier_uniform_(self.conv_k.weight)
210
- nn.init.xavier_uniform_(self.conv_v.weight)
211
- if proximal_init:
212
- with torch.no_grad():
213
- self.conv_k.weight.copy_(self.conv_q.weight)
214
- self.conv_k.bias.copy_(self.conv_q.bias)
215
-
216
- def forward(self, x, c, attn_mask=None):
217
- q = self.conv_q(x)
218
- k = self.conv_k(c)
219
- v = self.conv_v(c)
220
-
221
- x, self.attn = self.attention(q, k, v, mask=attn_mask)
222
-
223
- x = self.conv_o(x)
224
- return x
225
-
226
- def attention(self, query, key, value, mask=None):
227
- # reshape [b, d, t] -> [b, n_h, t, d_k]
228
- b, d, t_s, t_t = (*key.size(), query.size(2))
229
- query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
230
- key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
231
- value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
232
-
233
- scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
234
- if self.window_size is not None:
235
- assert (
236
- t_s == t_t
237
- ), "Relative attention is only available for self-attention."
238
- key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
239
- rel_logits = self._matmul_with_relative_keys(
240
- query / math.sqrt(self.k_channels), key_relative_embeddings
241
- )
242
- scores_local = self._relative_position_to_absolute_position(rel_logits)
243
- scores = scores + scores_local
244
- if self.proximal_bias:
245
- assert t_s == t_t, "Proximal bias is only available for self-attention."
246
- scores = scores + self._attention_bias_proximal(t_s).to(
247
- device=scores.device, dtype=scores.dtype
248
- )
249
- if mask is not None:
250
- scores = scores.masked_fill(mask == 0, -1e4)
251
- if self.block_length is not None:
252
- assert (
253
- t_s == t_t
254
- ), "Local attention is only available for self-attention."
255
- block_mask = (
256
- torch.ones_like(scores)
257
- .triu(-self.block_length)
258
- .tril(self.block_length)
259
- )
260
- scores = scores.masked_fill(block_mask == 0, -1e4)
261
- p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
262
- p_attn = self.drop(p_attn)
263
- output = torch.matmul(p_attn, value)
264
- if self.window_size is not None:
265
- relative_weights = self._absolute_position_to_relative_position(p_attn)
266
- value_relative_embeddings = self._get_relative_embeddings(
267
- self.emb_rel_v, t_s
268
- )
269
- output = output + self._matmul_with_relative_values(
270
- relative_weights, value_relative_embeddings
271
- )
272
- output = (
273
- output.transpose(2, 3).contiguous().view(b, d, t_t)
274
- ) # [b, n_h, t_t, d_k] -> [b, d, t_t]
275
- return output, p_attn
276
-
277
- def _matmul_with_relative_values(self, x, y):
278
- """
279
- x: [b, h, l, m]
280
- y: [h or 1, m, d]
281
- ret: [b, h, l, d]
282
- """
283
- ret = torch.matmul(x, y.unsqueeze(0))
284
- return ret
285
-
286
- def _matmul_with_relative_keys(self, x, y):
287
- """
288
- x: [b, h, l, d]
289
- y: [h or 1, m, d]
290
- ret: [b, h, l, m]
291
- """
292
- ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
293
- return ret
294
-
295
- def _get_relative_embeddings(self, relative_embeddings, length):
296
- max_relative_position = 2 * self.window_size + 1
297
- # Pad first before slice to avoid using cond ops.
298
- pad_length = max(length - (self.window_size + 1), 0)
299
- slice_start_position = max((self.window_size + 1) - length, 0)
300
- slice_end_position = slice_start_position + 2 * length - 1
301
- if pad_length > 0:
302
- padded_relative_embeddings = F.pad(
303
- relative_embeddings,
304
- commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
305
- )
306
- else:
307
- padded_relative_embeddings = relative_embeddings
308
- used_relative_embeddings = padded_relative_embeddings[
309
- :, slice_start_position:slice_end_position
310
- ]
311
- return used_relative_embeddings
312
-
313
- def _relative_position_to_absolute_position(self, x):
314
- """
315
- x: [b, h, l, 2*l-1]
316
- ret: [b, h, l, l]
317
- """
318
- batch, heads, length, _ = x.size()
319
- # Concat columns of pad to shift from relative to absolute indexing.
320
- x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
321
-
322
- # Concat extra elements so to add up to shape (len+1, 2*len-1).
323
- x_flat = x.view([batch, heads, length * 2 * length])
324
- x_flat = F.pad(
325
- x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
326
- )
327
-
328
- # Reshape and slice out the padded elements.
329
- x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
330
- :, :, :length, length - 1 :
331
- ]
332
- return x_final
333
-
334
- def _absolute_position_to_relative_position(self, x):
335
- """
336
- x: [b, h, l, l]
337
- ret: [b, h, l, 2*l-1]
338
- """
339
- batch, heads, length, _ = x.size()
340
- # padd along column
341
- x = F.pad(
342
- x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
343
- )
344
- x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
345
- # add 0's in the beginning that will skew the elements after reshape
346
- x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
347
- x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
348
- return x_final
349
-
350
- def _attention_bias_proximal(self, length):
351
- """Bias for self-attention to encourage attention to close positions.
352
- Args:
353
- length: an integer scalar.
354
- Returns:
355
- a Tensor with shape [1, 1, length, length]
356
- """
357
- r = torch.arange(length, dtype=torch.float32)
358
- diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
359
- return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
360
-
361
-
362
- class FFN(nn.Module):
363
- def __init__(
364
- self,
365
- in_channels,
366
- out_channels,
367
- filter_channels,
368
- kernel_size,
369
- p_dropout=0.0,
370
- activation=None,
371
- causal=False,
372
- ):
373
- super().__init__()
374
- self.in_channels = in_channels
375
- self.out_channels = out_channels
376
- self.filter_channels = filter_channels
377
- self.kernel_size = kernel_size
378
- self.p_dropout = p_dropout
379
- self.activation = activation
380
- self.causal = causal
381
-
382
- if causal:
383
- self.padding = self._causal_padding
384
- else:
385
- self.padding = self._same_padding
386
-
387
- self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
388
- self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
389
- self.drop = nn.Dropout(p_dropout)
390
-
391
- def forward(self, x, x_mask):
392
- x = self.conv_1(self.padding(x * x_mask))
393
- if self.activation == "gelu":
394
- x = x * torch.sigmoid(1.702 * x)
395
- else:
396
- x = torch.relu(x)
397
- x = self.drop(x)
398
- x = self.conv_2(self.padding(x * x_mask))
399
- return x * x_mask
400
-
401
- def _causal_padding(self, x):
402
- if self.kernel_size == 1:
403
- return x
404
- pad_l = self.kernel_size - 1
405
- pad_r = 0
406
- padding = [[0, 0], [0, 0], [pad_l, pad_r]]
407
- x = F.pad(x, commons.convert_pad_shape(padding))
408
- return x
409
-
410
- def _same_padding(self, x):
411
- if self.kernel_size == 1:
412
- return x
413
- pad_l = (self.kernel_size - 1) // 2
414
- pad_r = self.kernel_size // 2
415
- padding = [[0, 0], [0, 0], [pad_l, pad_r]]
416
- x = F.pad(x, commons.convert_pad_shape(padding))
417
- return x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lib/infer_pack/commons.py DELETED
@@ -1,166 +0,0 @@
1
- import math
2
- import numpy as np
3
- import torch
4
- from torch import nn
5
- from torch.nn import functional as F
6
-
7
-
8
- def init_weights(m, mean=0.0, std=0.01):
9
- classname = m.__class__.__name__
10
- if classname.find("Conv") != -1:
11
- m.weight.data.normal_(mean, std)
12
-
13
-
14
- def get_padding(kernel_size, dilation=1):
15
- return int((kernel_size * dilation - dilation) / 2)
16
-
17
-
18
- def convert_pad_shape(pad_shape):
19
- l = pad_shape[::-1]
20
- pad_shape = [item for sublist in l for item in sublist]
21
- return pad_shape
22
-
23
-
24
- def kl_divergence(m_p, logs_p, m_q, logs_q):
25
- """KL(P||Q)"""
26
- kl = (logs_q - logs_p) - 0.5
27
- kl += (
28
- 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
29
- )
30
- return kl
31
-
32
-
33
- def rand_gumbel(shape):
34
- """Sample from the Gumbel distribution, protect from overflows."""
35
- uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
36
- return -torch.log(-torch.log(uniform_samples))
37
-
38
-
39
- def rand_gumbel_like(x):
40
- g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
41
- return g
42
-
43
-
44
- def slice_segments(x, ids_str, segment_size=4):
45
- ret = torch.zeros_like(x[:, :, :segment_size])
46
- for i in range(x.size(0)):
47
- idx_str = ids_str[i]
48
- idx_end = idx_str + segment_size
49
- ret[i] = x[i, :, idx_str:idx_end]
50
- return ret
51
-
52
-
53
- def slice_segments2(x, ids_str, segment_size=4):
54
- ret = torch.zeros_like(x[:, :segment_size])
55
- for i in range(x.size(0)):
56
- idx_str = ids_str[i]
57
- idx_end = idx_str + segment_size
58
- ret[i] = x[i, idx_str:idx_end]
59
- return ret
60
-
61
-
62
- def rand_slice_segments(x, x_lengths=None, segment_size=4):
63
- b, d, t = x.size()
64
- if x_lengths is None:
65
- x_lengths = t
66
- ids_str_max = x_lengths - segment_size + 1
67
- ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
68
- ret = slice_segments(x, ids_str, segment_size)
69
- return ret, ids_str
70
-
71
-
72
- def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
73
- position = torch.arange(length, dtype=torch.float)
74
- num_timescales = channels // 2
75
- log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
76
- num_timescales - 1
77
- )
78
- inv_timescales = min_timescale * torch.exp(
79
- torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
80
- )
81
- scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
82
- signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
83
- signal = F.pad(signal, [0, 0, 0, channels % 2])
84
- signal = signal.view(1, channels, length)
85
- return signal
86
-
87
-
88
- def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
89
- b, channels, length = x.size()
90
- signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
91
- return x + signal.to(dtype=x.dtype, device=x.device)
92
-
93
-
94
- def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
95
- b, channels, length = x.size()
96
- signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
97
- return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
98
-
99
-
100
- def subsequent_mask(length):
101
- mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
102
- return mask
103
-
104
-
105
- @torch.jit.script
106
- def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
107
- n_channels_int = n_channels[0]
108
- in_act = input_a + input_b
109
- t_act = torch.tanh(in_act[:, :n_channels_int, :])
110
- s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
111
- acts = t_act * s_act
112
- return acts
113
-
114
-
115
- def convert_pad_shape(pad_shape):
116
- l = pad_shape[::-1]
117
- pad_shape = [item for sublist in l for item in sublist]
118
- return pad_shape
119
-
120
-
121
- def shift_1d(x):
122
- x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
123
- return x
124
-
125
-
126
- def sequence_mask(length, max_length=None):
127
- if max_length is None:
128
- max_length = length.max()
129
- x = torch.arange(max_length, dtype=length.dtype, device=length.device)
130
- return x.unsqueeze(0) < length.unsqueeze(1)
131
-
132
-
133
- def generate_path(duration, mask):
134
- """
135
- duration: [b, 1, t_x]
136
- mask: [b, 1, t_y, t_x]
137
- """
138
- device = duration.device
139
-
140
- b, _, t_y, t_x = mask.shape
141
- cum_duration = torch.cumsum(duration, -1)
142
-
143
- cum_duration_flat = cum_duration.view(b * t_x)
144
- path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
145
- path = path.view(b, t_x, t_y)
146
- path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
147
- path = path.unsqueeze(1).transpose(2, 3) * mask
148
- return path
149
-
150
-
151
- def clip_grad_value_(parameters, clip_value, norm_type=2):
152
- if isinstance(parameters, torch.Tensor):
153
- parameters = [parameters]
154
- parameters = list(filter(lambda p: p.grad is not None, parameters))
155
- norm_type = float(norm_type)
156
- if clip_value is not None:
157
- clip_value = float(clip_value)
158
-
159
- total_norm = 0
160
- for p in parameters:
161
- param_norm = p.grad.data.norm(norm_type)
162
- total_norm += param_norm.item() ** norm_type
163
- if clip_value is not None:
164
- p.grad.data.clamp_(min=-clip_value, max=clip_value)
165
- total_norm = total_norm ** (1.0 / norm_type)
166
- return total_norm
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lib/infer_pack/models.py DELETED
@@ -1,1142 +0,0 @@
1
- import math, pdb, os
2
- from time import time as ttime
3
- import torch
4
- from torch import nn
5
- from torch.nn import functional as F
6
- from lib.infer_pack import modules
7
- from lib.infer_pack import attentions
8
- from lib.infer_pack import commons
9
- from lib.infer_pack.commons import init_weights, get_padding
10
- from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
11
- from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
12
- from lib.infer_pack.commons import init_weights
13
- import numpy as np
14
- from lib.infer_pack import commons
15
-
16
-
17
- class TextEncoder256(nn.Module):
18
- def __init__(
19
- self,
20
- out_channels,
21
- hidden_channels,
22
- filter_channels,
23
- n_heads,
24
- n_layers,
25
- kernel_size,
26
- p_dropout,
27
- f0=True,
28
- ):
29
- super().__init__()
30
- self.out_channels = out_channels
31
- self.hidden_channels = hidden_channels
32
- self.filter_channels = filter_channels
33
- self.n_heads = n_heads
34
- self.n_layers = n_layers
35
- self.kernel_size = kernel_size
36
- self.p_dropout = p_dropout
37
- self.emb_phone = nn.Linear(256, hidden_channels)
38
- self.lrelu = nn.LeakyReLU(0.1, inplace=True)
39
- if f0 == True:
40
- self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
41
- self.encoder = attentions.Encoder(
42
- hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
43
- )
44
- self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
45
-
46
- def forward(self, phone, pitch, lengths):
47
- if pitch == None:
48
- x = self.emb_phone(phone)
49
- else:
50
- x = self.emb_phone(phone) + self.emb_pitch(pitch)
51
- x = x * math.sqrt(self.hidden_channels) # [b, t, h]
52
- x = self.lrelu(x)
53
- x = torch.transpose(x, 1, -1) # [b, h, t]
54
- x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
55
- x.dtype
56
- )
57
- x = self.encoder(x * x_mask, x_mask)
58
- stats = self.proj(x) * x_mask
59
-
60
- m, logs = torch.split(stats, self.out_channels, dim=1)
61
- return m, logs, x_mask
62
-
63
-
64
- class TextEncoder768(nn.Module):
65
- def __init__(
66
- self,
67
- out_channels,
68
- hidden_channels,
69
- filter_channels,
70
- n_heads,
71
- n_layers,
72
- kernel_size,
73
- p_dropout,
74
- f0=True,
75
- ):
76
- super().__init__()
77
- self.out_channels = out_channels
78
- self.hidden_channels = hidden_channels
79
- self.filter_channels = filter_channels
80
- self.n_heads = n_heads
81
- self.n_layers = n_layers
82
- self.kernel_size = kernel_size
83
- self.p_dropout = p_dropout
84
- self.emb_phone = nn.Linear(768, hidden_channels)
85
- self.lrelu = nn.LeakyReLU(0.1, inplace=True)
86
- if f0 == True:
87
- self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
88
- self.encoder = attentions.Encoder(
89
- hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
90
- )
91
- self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
92
-
93
- def forward(self, phone, pitch, lengths):
94
- if pitch == None:
95
- x = self.emb_phone(phone)
96
- else:
97
- x = self.emb_phone(phone) + self.emb_pitch(pitch)
98
- x = x * math.sqrt(self.hidden_channels) # [b, t, h]
99
- x = self.lrelu(x)
100
- x = torch.transpose(x, 1, -1) # [b, h, t]
101
- x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
102
- x.dtype
103
- )
104
- x = self.encoder(x * x_mask, x_mask)
105
- stats = self.proj(x) * x_mask
106
-
107
- m, logs = torch.split(stats, self.out_channels, dim=1)
108
- return m, logs, x_mask
109
-
110
-
111
- class ResidualCouplingBlock(nn.Module):
112
- def __init__(
113
- self,
114
- channels,
115
- hidden_channels,
116
- kernel_size,
117
- dilation_rate,
118
- n_layers,
119
- n_flows=4,
120
- gin_channels=0,
121
- ):
122
- super().__init__()
123
- self.channels = channels
124
- self.hidden_channels = hidden_channels
125
- self.kernel_size = kernel_size
126
- self.dilation_rate = dilation_rate
127
- self.n_layers = n_layers
128
- self.n_flows = n_flows
129
- self.gin_channels = gin_channels
130
-
131
- self.flows = nn.ModuleList()
132
- for i in range(n_flows):
133
- self.flows.append(
134
- modules.ResidualCouplingLayer(
135
- channels,
136
- hidden_channels,
137
- kernel_size,
138
- dilation_rate,
139
- n_layers,
140
- gin_channels=gin_channels,
141
- mean_only=True,
142
- )
143
- )
144
- self.flows.append(modules.Flip())
145
-
146
- def forward(self, x, x_mask, g=None, reverse=False):
147
- if not reverse:
148
- for flow in self.flows:
149
- x, _ = flow(x, x_mask, g=g, reverse=reverse)
150
- else:
151
- for flow in reversed(self.flows):
152
- x = flow(x, x_mask, g=g, reverse=reverse)
153
- return x
154
-
155
- def remove_weight_norm(self):
156
- for i in range(self.n_flows):
157
- self.flows[i * 2].remove_weight_norm()
158
-
159
-
160
- class PosteriorEncoder(nn.Module):
161
- def __init__(
162
- self,
163
- in_channels,
164
- out_channels,
165
- hidden_channels,
166
- kernel_size,
167
- dilation_rate,
168
- n_layers,
169
- gin_channels=0,
170
- ):
171
- super().__init__()
172
- self.in_channels = in_channels
173
- self.out_channels = out_channels
174
- self.hidden_channels = hidden_channels
175
- self.kernel_size = kernel_size
176
- self.dilation_rate = dilation_rate
177
- self.n_layers = n_layers
178
- self.gin_channels = gin_channels
179
-
180
- self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
181
- self.enc = modules.WN(
182
- hidden_channels,
183
- kernel_size,
184
- dilation_rate,
185
- n_layers,
186
- gin_channels=gin_channels,
187
- )
188
- self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
189
-
190
- def forward(self, x, x_lengths, g=None):
191
- x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
192
- x.dtype
193
- )
194
- x = self.pre(x) * x_mask
195
- x = self.enc(x, x_mask, g=g)
196
- stats = self.proj(x) * x_mask
197
- m, logs = torch.split(stats, self.out_channels, dim=1)
198
- z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
199
- return z, m, logs, x_mask
200
-
201
- def remove_weight_norm(self):
202
- self.enc.remove_weight_norm()
203
-
204
-
205
- class Generator(torch.nn.Module):
206
- def __init__(
207
- self,
208
- initial_channel,
209
- resblock,
210
- resblock_kernel_sizes,
211
- resblock_dilation_sizes,
212
- upsample_rates,
213
- upsample_initial_channel,
214
- upsample_kernel_sizes,
215
- gin_channels=0,
216
- ):
217
- super(Generator, self).__init__()
218
- self.num_kernels = len(resblock_kernel_sizes)
219
- self.num_upsamples = len(upsample_rates)
220
- self.conv_pre = Conv1d(
221
- initial_channel, upsample_initial_channel, 7, 1, padding=3
222
- )
223
- resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
224
-
225
- self.ups = nn.ModuleList()
226
- for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
227
- self.ups.append(
228
- weight_norm(
229
- ConvTranspose1d(
230
- upsample_initial_channel // (2**i),
231
- upsample_initial_channel // (2 ** (i + 1)),
232
- k,
233
- u,
234
- padding=(k - u) // 2,
235
- )
236
- )
237
- )
238
-
239
- self.resblocks = nn.ModuleList()
240
- for i in range(len(self.ups)):
241
- ch = upsample_initial_channel // (2 ** (i + 1))
242
- for j, (k, d) in enumerate(
243
- zip(resblock_kernel_sizes, resblock_dilation_sizes)
244
- ):
245
- self.resblocks.append(resblock(ch, k, d))
246
-
247
- self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
248
- self.ups.apply(init_weights)
249
-
250
- if gin_channels != 0:
251
- self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
252
-
253
- def forward(self, x, g=None):
254
- x = self.conv_pre(x)
255
- if g is not None:
256
- x = x + self.cond(g)
257
-
258
- for i in range(self.num_upsamples):
259
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
260
- x = self.ups[i](x)
261
- xs = None
262
- for j in range(self.num_kernels):
263
- if xs is None:
264
- xs = self.resblocks[i * self.num_kernels + j](x)
265
- else:
266
- xs += self.resblocks[i * self.num_kernels + j](x)
267
- x = xs / self.num_kernels
268
- x = F.leaky_relu(x)
269
- x = self.conv_post(x)
270
- x = torch.tanh(x)
271
-
272
- return x
273
-
274
- def remove_weight_norm(self):
275
- for l in self.ups:
276
- remove_weight_norm(l)
277
- for l in self.resblocks:
278
- l.remove_weight_norm()
279
-
280
-
281
- class SineGen(torch.nn.Module):
282
- """Definition of sine generator
283
- SineGen(samp_rate, harmonic_num = 0,
284
- sine_amp = 0.1, noise_std = 0.003,
285
- voiced_threshold = 0,
286
- flag_for_pulse=False)
287
- samp_rate: sampling rate in Hz
288
- harmonic_num: number of harmonic overtones (default 0)
289
- sine_amp: amplitude of sine-wavefrom (default 0.1)
290
- noise_std: std of Gaussian noise (default 0.003)
291
- voiced_thoreshold: F0 threshold for U/V classification (default 0)
292
- flag_for_pulse: this SinGen is used inside PulseGen (default False)
293
- Note: when flag_for_pulse is True, the first time step of a voiced
294
- segment is always sin(np.pi) or cos(0)
295
- """
296
-
297
- def __init__(
298
- self,
299
- samp_rate,
300
- harmonic_num=0,
301
- sine_amp=0.1,
302
- noise_std=0.003,
303
- voiced_threshold=0,
304
- flag_for_pulse=False,
305
- ):
306
- super(SineGen, self).__init__()
307
- self.sine_amp = sine_amp
308
- self.noise_std = noise_std
309
- self.harmonic_num = harmonic_num
310
- self.dim = self.harmonic_num + 1
311
- self.sampling_rate = samp_rate
312
- self.voiced_threshold = voiced_threshold
313
-
314
- def _f02uv(self, f0):
315
- # generate uv signal
316
- uv = torch.ones_like(f0)
317
- uv = uv * (f0 > self.voiced_threshold)
318
- return uv
319
-
320
- def forward(self, f0, upp):
321
- """sine_tensor, uv = forward(f0)
322
- input F0: tensor(batchsize=1, length, dim=1)
323
- f0 for unvoiced steps should be 0
324
- output sine_tensor: tensor(batchsize=1, length, dim)
325
- output uv: tensor(batchsize=1, length, 1)
326
- """
327
- with torch.no_grad():
328
- f0 = f0[:, None].transpose(1, 2)
329
- f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
330
- # fundamental component
331
- f0_buf[:, :, 0] = f0[:, :, 0]
332
- for idx in np.arange(self.harmonic_num):
333
- f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
334
- idx + 2
335
- ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
336
- rad_values = (f0_buf / self.sampling_rate) % 1 ###%1 means that the product of n_har cannot be post-processed and optimized
337
- rand_ini = torch.rand(
338
- f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
339
- )
340
- rand_ini[:, 0] = 0
341
- rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
342
- tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1 means that the following cumsum can no longer be optimized
343
- tmp_over_one *= upp
344
- tmp_over_one = F.interpolate(
345
- tmp_over_one.transpose(2, 1),
346
- scale_factor=upp,
347
- mode="linear",
348
- align_corners=True,
349
- ).transpose(2, 1)
350
- rad_values = F.interpolate(
351
- rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
352
- ).transpose(
353
- 2, 1
354
- ) #######
355
- tmp_over_one %= 1
356
- tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
357
- cumsum_shift = torch.zeros_like(rad_values)
358
- cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
359
- sine_waves = torch.sin(
360
- torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
361
- )
362
- sine_waves = sine_waves * self.sine_amp
363
- uv = self._f02uv(f0)
364
- uv = F.interpolate(
365
- uv.transpose(2, 1), scale_factor=upp, mode="nearest"
366
- ).transpose(2, 1)
367
- noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
368
- noise = noise_amp * torch.randn_like(sine_waves)
369
- sine_waves = sine_waves * uv + noise
370
- return sine_waves, uv, noise
371
-
372
-
373
- class SourceModuleHnNSF(torch.nn.Module):
374
- """SourceModule for hn-nsf
375
- SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
376
- add_noise_std=0.003, voiced_threshod=0)
377
- sampling_rate: sampling_rate in Hz
378
- harmonic_num: number of harmonic above F0 (default: 0)
379
- sine_amp: amplitude of sine source signal (default: 0.1)
380
- add_noise_std: std of additive Gaussian noise (default: 0.003)
381
- note that amplitude of noise in unvoiced is decided
382
- by sine_amp
383
- voiced_threshold: threhold to set U/V given F0 (default: 0)
384
- Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
385
- F0_sampled (batchsize, length, 1)
386
- Sine_source (batchsize, length, 1)
387
- noise_source (batchsize, length 1)
388
- uv (batchsize, length, 1)
389
- """
390
-
391
- def __init__(
392
- self,
393
- sampling_rate,
394
- harmonic_num=0,
395
- sine_amp=0.1,
396
- add_noise_std=0.003,
397
- voiced_threshod=0,
398
- is_half=True,
399
- ):
400
- super(SourceModuleHnNSF, self).__init__()
401
-
402
- self.sine_amp = sine_amp
403
- self.noise_std = add_noise_std
404
- self.is_half = is_half
405
- # to produce sine waveforms
406
- self.l_sin_gen = SineGen(
407
- sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
408
- )
409
-
410
- # to merge source harmonics into a single excitation
411
- self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
412
- self.l_tanh = torch.nn.Tanh()
413
-
414
- def forward(self, x, upp=None):
415
- sine_wavs, uv, _ = self.l_sin_gen(x, upp)
416
- if self.is_half:
417
- sine_wavs = sine_wavs.half()
418
- sine_merge = self.l_tanh(self.l_linear(sine_wavs))
419
- return sine_merge, None, None # noise, uv
420
-
421
-
422
- class GeneratorNSF(torch.nn.Module):
423
- def __init__(
424
- self,
425
- initial_channel,
426
- resblock,
427
- resblock_kernel_sizes,
428
- resblock_dilation_sizes,
429
- upsample_rates,
430
- upsample_initial_channel,
431
- upsample_kernel_sizes,
432
- gin_channels,
433
- sr,
434
- is_half=False,
435
- ):
436
- super(GeneratorNSF, self).__init__()
437
- self.num_kernels = len(resblock_kernel_sizes)
438
- self.num_upsamples = len(upsample_rates)
439
-
440
- self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
441
- self.m_source = SourceModuleHnNSF(
442
- sampling_rate=sr, harmonic_num=0, is_half=is_half
443
- )
444
- self.noise_convs = nn.ModuleList()
445
- self.conv_pre = Conv1d(
446
- initial_channel, upsample_initial_channel, 7, 1, padding=3
447
- )
448
- resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
449
-
450
- self.ups = nn.ModuleList()
451
- for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
452
- c_cur = upsample_initial_channel // (2 ** (i + 1))
453
- self.ups.append(
454
- weight_norm(
455
- ConvTranspose1d(
456
- upsample_initial_channel // (2**i),
457
- upsample_initial_channel // (2 ** (i + 1)),
458
- k,
459
- u,
460
- padding=(k - u) // 2,
461
- )
462
- )
463
- )
464
- if i + 1 < len(upsample_rates):
465
- stride_f0 = np.prod(upsample_rates[i + 1 :])
466
- self.noise_convs.append(
467
- Conv1d(
468
- 1,
469
- c_cur,
470
- kernel_size=stride_f0 * 2,
471
- stride=stride_f0,
472
- padding=stride_f0 // 2,
473
- )
474
- )
475
- else:
476
- self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
477
-
478
- self.resblocks = nn.ModuleList()
479
- for i in range(len(self.ups)):
480
- ch = upsample_initial_channel // (2 ** (i + 1))
481
- for j, (k, d) in enumerate(
482
- zip(resblock_kernel_sizes, resblock_dilation_sizes)
483
- ):
484
- self.resblocks.append(resblock(ch, k, d))
485
-
486
- self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
487
- self.ups.apply(init_weights)
488
-
489
- if gin_channels != 0:
490
- self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
491
-
492
- self.upp = np.prod(upsample_rates)
493
-
494
- def forward(self, x, f0, g=None):
495
- har_source, noi_source, uv = self.m_source(f0, self.upp)
496
- har_source = har_source.transpose(1, 2)
497
- x = self.conv_pre(x)
498
- if g is not None:
499
- x = x + self.cond(g)
500
-
501
- for i in range(self.num_upsamples):
502
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
503
- x = self.ups[i](x)
504
- x_source = self.noise_convs[i](har_source)
505
- x = x + x_source
506
- xs = None
507
- for j in range(self.num_kernels):
508
- if xs is None:
509
- xs = self.resblocks[i * self.num_kernels + j](x)
510
- else:
511
- xs += self.resblocks[i * self.num_kernels + j](x)
512
- x = xs / self.num_kernels
513
- x = F.leaky_relu(x)
514
- x = self.conv_post(x)
515
- x = torch.tanh(x)
516
- return x
517
-
518
- def remove_weight_norm(self):
519
- for l in self.ups:
520
- remove_weight_norm(l)
521
- for l in self.resblocks:
522
- l.remove_weight_norm()
523
-
524
-
525
- sr2sr = {
526
- "32k": 32000,
527
- "40k": 40000,
528
- "48k": 48000,
529
- }
530
-
531
-
532
- class SynthesizerTrnMs256NSFsid(nn.Module):
533
- def __init__(
534
- self,
535
- spec_channels,
536
- segment_size,
537
- inter_channels,
538
- hidden_channels,
539
- filter_channels,
540
- n_heads,
541
- n_layers,
542
- kernel_size,
543
- p_dropout,
544
- resblock,
545
- resblock_kernel_sizes,
546
- resblock_dilation_sizes,
547
- upsample_rates,
548
- upsample_initial_channel,
549
- upsample_kernel_sizes,
550
- spk_embed_dim,
551
- gin_channels,
552
- sr,
553
- **kwargs
554
- ):
555
- super().__init__()
556
- if type(sr) == type("strr"):
557
- sr = sr2sr[sr]
558
- self.spec_channels = spec_channels
559
- self.inter_channels = inter_channels
560
- self.hidden_channels = hidden_channels
561
- self.filter_channels = filter_channels
562
- self.n_heads = n_heads
563
- self.n_layers = n_layers
564
- self.kernel_size = kernel_size
565
- self.p_dropout = p_dropout
566
- self.resblock = resblock
567
- self.resblock_kernel_sizes = resblock_kernel_sizes
568
- self.resblock_dilation_sizes = resblock_dilation_sizes
569
- self.upsample_rates = upsample_rates
570
- self.upsample_initial_channel = upsample_initial_channel
571
- self.upsample_kernel_sizes = upsample_kernel_sizes
572
- self.segment_size = segment_size
573
- self.gin_channels = gin_channels
574
- # self.hop_length = hop_length#
575
- self.spk_embed_dim = spk_embed_dim
576
- self.enc_p = TextEncoder256(
577
- inter_channels,
578
- hidden_channels,
579
- filter_channels,
580
- n_heads,
581
- n_layers,
582
- kernel_size,
583
- p_dropout,
584
- )
585
- self.dec = GeneratorNSF(
586
- inter_channels,
587
- resblock,
588
- resblock_kernel_sizes,
589
- resblock_dilation_sizes,
590
- upsample_rates,
591
- upsample_initial_channel,
592
- upsample_kernel_sizes,
593
- gin_channels=gin_channels,
594
- sr=sr,
595
- is_half=kwargs["is_half"],
596
- )
597
- self.enc_q = PosteriorEncoder(
598
- spec_channels,
599
- inter_channels,
600
- hidden_channels,
601
- 5,
602
- 1,
603
- 16,
604
- gin_channels=gin_channels,
605
- )
606
- self.flow = ResidualCouplingBlock(
607
- inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
608
- )
609
- self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
610
- print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
611
-
612
- def remove_weight_norm(self):
613
- self.dec.remove_weight_norm()
614
- self.flow.remove_weight_norm()
615
- self.enc_q.remove_weight_norm()
616
-
617
- def forward(
618
- self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
619
- ): # Here ds is id, [bs,1]
620
- # print(1,pitch.shape)#[bs,t]
621
- g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1 is t, broadcast
622
- m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
623
- z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
624
- z_p = self.flow(z, y_mask, g=g)
625
- z_slice, ids_slice = commons.rand_slice_segments(
626
- z, y_lengths, self.segment_size
627
- )
628
- # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
629
- pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
630
- # print(-2,pitchf.shape,z_slice.shape)
631
- o = self.dec(z_slice, pitchf, g=g)
632
- return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
633
-
634
- def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None):
635
- g = self.emb_g(sid).unsqueeze(-1)
636
- m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
637
- z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
638
- if rate:
639
- head = int(z_p.shape[2] * rate)
640
- z_p = z_p[:, :, -head:]
641
- x_mask = x_mask[:, :, -head:]
642
- nsff0 = nsff0[:, -head:]
643
- z = self.flow(z_p, x_mask, g=g, reverse=True)
644
- o = self.dec(z * x_mask, nsff0, g=g)
645
- return o, x_mask, (z, z_p, m_p, logs_p)
646
-
647
-
648
- class SynthesizerTrnMs768NSFsid(nn.Module):
649
- def __init__(
650
- self,
651
- spec_channels,
652
- segment_size,
653
- inter_channels,
654
- hidden_channels,
655
- filter_channels,
656
- n_heads,
657
- n_layers,
658
- kernel_size,
659
- p_dropout,
660
- resblock,
661
- resblock_kernel_sizes,
662
- resblock_dilation_sizes,
663
- upsample_rates,
664
- upsample_initial_channel,
665
- upsample_kernel_sizes,
666
- spk_embed_dim,
667
- gin_channels,
668
- sr,
669
- **kwargs
670
- ):
671
- super().__init__()
672
- if type(sr) == type("strr"):
673
- sr = sr2sr[sr]
674
- self.spec_channels = spec_channels
675
- self.inter_channels = inter_channels
676
- self.hidden_channels = hidden_channels
677
- self.filter_channels = filter_channels
678
- self.n_heads = n_heads
679
- self.n_layers = n_layers
680
- self.kernel_size = kernel_size
681
- self.p_dropout = p_dropout
682
- self.resblock = resblock
683
- self.resblock_kernel_sizes = resblock_kernel_sizes
684
- self.resblock_dilation_sizes = resblock_dilation_sizes
685
- self.upsample_rates = upsample_rates
686
- self.upsample_initial_channel = upsample_initial_channel
687
- self.upsample_kernel_sizes = upsample_kernel_sizes
688
- self.segment_size = segment_size
689
- self.gin_channels = gin_channels
690
- # self.hop_length = hop_length#
691
- self.spk_embed_dim = spk_embed_dim
692
- self.enc_p = TextEncoder768(
693
- inter_channels,
694
- hidden_channels,
695
- filter_channels,
696
- n_heads,
697
- n_layers,
698
- kernel_size,
699
- p_dropout,
700
- )
701
- self.dec = GeneratorNSF(
702
- inter_channels,
703
- resblock,
704
- resblock_kernel_sizes,
705
- resblock_dilation_sizes,
706
- upsample_rates,
707
- upsample_initial_channel,
708
- upsample_kernel_sizes,
709
- gin_channels=gin_channels,
710
- sr=sr,
711
- is_half=kwargs["is_half"],
712
- )
713
- self.enc_q = PosteriorEncoder(
714
- spec_channels,
715
- inter_channels,
716
- hidden_channels,
717
- 5,
718
- 1,
719
- 16,
720
- gin_channels=gin_channels,
721
- )
722
- self.flow = ResidualCouplingBlock(
723
- inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
724
- )
725
- self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
726
- print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
727
-
728
- def remove_weight_norm(self):
729
- self.dec.remove_weight_norm()
730
- self.flow.remove_weight_norm()
731
- self.enc_q.remove_weight_norm()
732
-
733
- def forward(
734
- self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
735
- ): # Here ds is id,[bs,1]
736
- # print(1,pitch.shape)#[bs,t]
737
- g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1 is t, broadcast
738
- m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
739
- z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
740
- z_p = self.flow(z, y_mask, g=g)
741
- z_slice, ids_slice = commons.rand_slice_segments(
742
- z, y_lengths, self.segment_size
743
- )
744
- # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
745
- pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
746
- # print(-2,pitchf.shape,z_slice.shape)
747
- o = self.dec(z_slice, pitchf, g=g)
748
- return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
749
-
750
- def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None):
751
- g = self.emb_g(sid).unsqueeze(-1)
752
- m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
753
- z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
754
- if rate:
755
- head = int(z_p.shape[2] * rate)
756
- z_p = z_p[:, :, -head:]
757
- x_mask = x_mask[:, :, -head:]
758
- nsff0 = nsff0[:, -head:]
759
- z = self.flow(z_p, x_mask, g=g, reverse=True)
760
- o = self.dec(z * x_mask, nsff0, g=g)
761
- return o, x_mask, (z, z_p, m_p, logs_p)
762
-
763
-
764
- class SynthesizerTrnMs256NSFsid_nono(nn.Module):
765
- def __init__(
766
- self,
767
- spec_channels,
768
- segment_size,
769
- inter_channels,
770
- hidden_channels,
771
- filter_channels,
772
- n_heads,
773
- n_layers,
774
- kernel_size,
775
- p_dropout,
776
- resblock,
777
- resblock_kernel_sizes,
778
- resblock_dilation_sizes,
779
- upsample_rates,
780
- upsample_initial_channel,
781
- upsample_kernel_sizes,
782
- spk_embed_dim,
783
- gin_channels,
784
- sr=None,
785
- **kwargs
786
- ):
787
- super().__init__()
788
- self.spec_channels = spec_channels
789
- self.inter_channels = inter_channels
790
- self.hidden_channels = hidden_channels
791
- self.filter_channels = filter_channels
792
- self.n_heads = n_heads
793
- self.n_layers = n_layers
794
- self.kernel_size = kernel_size
795
- self.p_dropout = p_dropout
796
- self.resblock = resblock
797
- self.resblock_kernel_sizes = resblock_kernel_sizes
798
- self.resblock_dilation_sizes = resblock_dilation_sizes
799
- self.upsample_rates = upsample_rates
800
- self.upsample_initial_channel = upsample_initial_channel
801
- self.upsample_kernel_sizes = upsample_kernel_sizes
802
- self.segment_size = segment_size
803
- self.gin_channels = gin_channels
804
- # self.hop_length = hop_length#
805
- self.spk_embed_dim = spk_embed_dim
806
- self.enc_p = TextEncoder256(
807
- inter_channels,
808
- hidden_channels,
809
- filter_channels,
810
- n_heads,
811
- n_layers,
812
- kernel_size,
813
- p_dropout,
814
- f0=False,
815
- )
816
- self.dec = Generator(
817
- inter_channels,
818
- resblock,
819
- resblock_kernel_sizes,
820
- resblock_dilation_sizes,
821
- upsample_rates,
822
- upsample_initial_channel,
823
- upsample_kernel_sizes,
824
- gin_channels=gin_channels,
825
- )
826
- self.enc_q = PosteriorEncoder(
827
- spec_channels,
828
- inter_channels,
829
- hidden_channels,
830
- 5,
831
- 1,
832
- 16,
833
- gin_channels=gin_channels,
834
- )
835
- self.flow = ResidualCouplingBlock(
836
- inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
837
- )
838
- self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
839
- print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
840
-
841
- def remove_weight_norm(self):
842
- self.dec.remove_weight_norm()
843
- self.flow.remove_weight_norm()
844
- self.enc_q.remove_weight_norm()
845
-
846
- def forward(self, phone, phone_lengths, y, y_lengths, ds): # Here ds is id,[bs,1]
847
- g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1 is t, broadcast
848
- m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
849
- z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
850
- z_p = self.flow(z, y_mask, g=g)
851
- z_slice, ids_slice = commons.rand_slice_segments(
852
- z, y_lengths, self.segment_size
853
- )
854
- o = self.dec(z_slice, g=g)
855
- return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
856
-
857
- def infer(self, phone, phone_lengths, sid, rate=None):
858
- g = self.emb_g(sid).unsqueeze(-1)
859
- m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
860
- z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
861
- if rate:
862
- head = int(z_p.shape[2] * rate)
863
- z_p = z_p[:, :, -head:]
864
- x_mask = x_mask[:, :, -head:]
865
- z = self.flow(z_p, x_mask, g=g, reverse=True)
866
- o = self.dec(z * x_mask, g=g)
867
- return o, x_mask, (z, z_p, m_p, logs_p)
868
-
869
-
870
- class SynthesizerTrnMs768NSFsid_nono(nn.Module):
871
- def __init__(
872
- self,
873
- spec_channels,
874
- segment_size,
875
- inter_channels,
876
- hidden_channels,
877
- filter_channels,
878
- n_heads,
879
- n_layers,
880
- kernel_size,
881
- p_dropout,
882
- resblock,
883
- resblock_kernel_sizes,
884
- resblock_dilation_sizes,
885
- upsample_rates,
886
- upsample_initial_channel,
887
- upsample_kernel_sizes,
888
- spk_embed_dim,
889
- gin_channels,
890
- sr=None,
891
- **kwargs
892
- ):
893
- super().__init__()
894
- self.spec_channels = spec_channels
895
- self.inter_channels = inter_channels
896
- self.hidden_channels = hidden_channels
897
- self.filter_channels = filter_channels
898
- self.n_heads = n_heads
899
- self.n_layers = n_layers
900
- self.kernel_size = kernel_size
901
- self.p_dropout = p_dropout
902
- self.resblock = resblock
903
- self.resblock_kernel_sizes = resblock_kernel_sizes
904
- self.resblock_dilation_sizes = resblock_dilation_sizes
905
- self.upsample_rates = upsample_rates
906
- self.upsample_initial_channel = upsample_initial_channel
907
- self.upsample_kernel_sizes = upsample_kernel_sizes
908
- self.segment_size = segment_size
909
- self.gin_channels = gin_channels
910
- # self.hop_length = hop_length#
911
- self.spk_embed_dim = spk_embed_dim
912
- self.enc_p = TextEncoder768(
913
- inter_channels,
914
- hidden_channels,
915
- filter_channels,
916
- n_heads,
917
- n_layers,
918
- kernel_size,
919
- p_dropout,
920
- f0=False,
921
- )
922
- self.dec = Generator(
923
- inter_channels,
924
- resblock,
925
- resblock_kernel_sizes,
926
- resblock_dilation_sizes,
927
- upsample_rates,
928
- upsample_initial_channel,
929
- upsample_kernel_sizes,
930
- gin_channels=gin_channels,
931
- )
932
- self.enc_q = PosteriorEncoder(
933
- spec_channels,
934
- inter_channels,
935
- hidden_channels,
936
- 5,
937
- 1,
938
- 16,
939
- gin_channels=gin_channels,
940
- )
941
- self.flow = ResidualCouplingBlock(
942
- inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
943
- )
944
- self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
945
- print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
946
-
947
- def remove_weight_norm(self):
948
- self.dec.remove_weight_norm()
949
- self.flow.remove_weight_norm()
950
- self.enc_q.remove_weight_norm()
951
-
952
- def forward(self, phone, phone_lengths, y, y_lengths, ds): # Here ds is id,[bs,1]
953
- g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1 is t, broadcast
954
- m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
955
- z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
956
- z_p = self.flow(z, y_mask, g=g)
957
- z_slice, ids_slice = commons.rand_slice_segments(
958
- z, y_lengths, self.segment_size
959
- )
960
- o = self.dec(z_slice, g=g)
961
- return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
962
-
963
- def infer(self, phone, phone_lengths, sid, rate=None):
964
- g = self.emb_g(sid).unsqueeze(-1)
965
- m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
966
- z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
967
- if rate:
968
- head = int(z_p.shape[2] * rate)
969
- z_p = z_p[:, :, -head:]
970
- x_mask = x_mask[:, :, -head:]
971
- z = self.flow(z_p, x_mask, g=g, reverse=True)
972
- o = self.dec(z * x_mask, g=g)
973
- return o, x_mask, (z, z_p, m_p, logs_p)
974
-
975
-
976
- class MultiPeriodDiscriminator(torch.nn.Module):
977
- def __init__(self, use_spectral_norm=False):
978
- super(MultiPeriodDiscriminator, self).__init__()
979
- periods = [2, 3, 5, 7, 11, 17]
980
- # periods = [3, 5, 7, 11, 17, 23, 37]
981
-
982
- discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
983
- discs = discs + [
984
- DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
985
- ]
986
- self.discriminators = nn.ModuleList(discs)
987
-
988
- def forward(self, y, y_hat):
989
- y_d_rs = [] #
990
- y_d_gs = []
991
- fmap_rs = []
992
- fmap_gs = []
993
- for i, d in enumerate(self.discriminators):
994
- y_d_r, fmap_r = d(y)
995
- y_d_g, fmap_g = d(y_hat)
996
- # for j in range(len(fmap_r)):
997
- # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
998
- y_d_rs.append(y_d_r)
999
- y_d_gs.append(y_d_g)
1000
- fmap_rs.append(fmap_r)
1001
- fmap_gs.append(fmap_g)
1002
-
1003
- return y_d_rs, y_d_gs, fmap_rs, fmap_gs
1004
-
1005
-
1006
- class MultiPeriodDiscriminatorV2(torch.nn.Module):
1007
- def __init__(self, use_spectral_norm=False):
1008
- super(MultiPeriodDiscriminatorV2, self).__init__()
1009
- # periods = [2, 3, 5, 7, 11, 17]
1010
- periods = [2, 3, 5, 7, 11, 17, 23, 37]
1011
-
1012
- discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
1013
- discs = discs + [
1014
- DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
1015
- ]
1016
- self.discriminators = nn.ModuleList(discs)
1017
-
1018
- def forward(self, y, y_hat):
1019
- y_d_rs = [] #
1020
- y_d_gs = []
1021
- fmap_rs = []
1022
- fmap_gs = []
1023
- for i, d in enumerate(self.discriminators):
1024
- y_d_r, fmap_r = d(y)
1025
- y_d_g, fmap_g = d(y_hat)
1026
- # for j in range(len(fmap_r)):
1027
- # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
1028
- y_d_rs.append(y_d_r)
1029
- y_d_gs.append(y_d_g)
1030
- fmap_rs.append(fmap_r)
1031
- fmap_gs.append(fmap_g)
1032
-
1033
- return y_d_rs, y_d_gs, fmap_rs, fmap_gs
1034
-
1035
-
1036
- class DiscriminatorS(torch.nn.Module):
1037
- def __init__(self, use_spectral_norm=False):
1038
- super(DiscriminatorS, self).__init__()
1039
- norm_f = weight_norm if use_spectral_norm == False else spectral_norm
1040
- self.convs = nn.ModuleList(
1041
- [
1042
- norm_f(Conv1d(1, 16, 15, 1, padding=7)),
1043
- norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
1044
- norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
1045
- norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
1046
- norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
1047
- norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
1048
- ]
1049
- )
1050
- self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
1051
-
1052
- def forward(self, x):
1053
- fmap = []
1054
-
1055
- for l in self.convs:
1056
- x = l(x)
1057
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
1058
- fmap.append(x)
1059
- x = self.conv_post(x)
1060
- fmap.append(x)
1061
- x = torch.flatten(x, 1, -1)
1062
-
1063
- return x, fmap
1064
-
1065
-
1066
- class DiscriminatorP(torch.nn.Module):
1067
- def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
1068
- super(DiscriminatorP, self).__init__()
1069
- self.period = period
1070
- self.use_spectral_norm = use_spectral_norm
1071
- norm_f = weight_norm if use_spectral_norm == False else spectral_norm
1072
- self.convs = nn.ModuleList(
1073
- [
1074
- norm_f(
1075
- Conv2d(
1076
- 1,
1077
- 32,
1078
- (kernel_size, 1),
1079
- (stride, 1),
1080
- padding=(get_padding(kernel_size, 1), 0),
1081
- )
1082
- ),
1083
- norm_f(
1084
- Conv2d(
1085
- 32,
1086
- 128,
1087
- (kernel_size, 1),
1088
- (stride, 1),
1089
- padding=(get_padding(kernel_size, 1), 0),
1090
- )
1091
- ),
1092
- norm_f(
1093
- Conv2d(
1094
- 128,
1095
- 512,
1096
- (kernel_size, 1),
1097
- (stride, 1),
1098
- padding=(get_padding(kernel_size, 1), 0),
1099
- )
1100
- ),
1101
- norm_f(
1102
- Conv2d(
1103
- 512,
1104
- 1024,
1105
- (kernel_size, 1),
1106
- (stride, 1),
1107
- padding=(get_padding(kernel_size, 1), 0),
1108
- )
1109
- ),
1110
- norm_f(
1111
- Conv2d(
1112
- 1024,
1113
- 1024,
1114
- (kernel_size, 1),
1115
- 1,
1116
- padding=(get_padding(kernel_size, 1), 0),
1117
- )
1118
- ),
1119
- ]
1120
- )
1121
- self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
1122
-
1123
- def forward(self, x):
1124
- fmap = []
1125
-
1126
- # 1d to 2d
1127
- b, c, t = x.shape
1128
- if t % self.period != 0: # pad first
1129
- n_pad = self.period - (t % self.period)
1130
- x = F.pad(x, (0, n_pad), "reflect")
1131
- t = t + n_pad
1132
- x = x.view(b, c, t // self.period, self.period)
1133
-
1134
- for l in self.convs:
1135
- x = l(x)
1136
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
1137
- fmap.append(x)
1138
- x = self.conv_post(x)
1139
- fmap.append(x)
1140
- x = torch.flatten(x, 1, -1)
1141
-
1142
- return x, fmap
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lib/infer_pack/models_dml.py DELETED
@@ -1,1124 +0,0 @@
1
- import math, pdb, os
2
- from time import time as ttime
3
- import torch
4
- from torch import nn
5
- from torch.nn import functional as F
6
- from lib.infer_pack import modules
7
- from lib.infer_pack import attentions
8
- from lib.infer_pack import commons
9
- from lib.infer_pack.commons import init_weights, get_padding
10
- from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
11
- from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
12
- from lib.infer_pack.commons import init_weights
13
- import numpy as np
14
- from lib.infer_pack import commons
15
-
16
-
17
- class TextEncoder256(nn.Module):
18
- def __init__(
19
- self,
20
- out_channels,
21
- hidden_channels,
22
- filter_channels,
23
- n_heads,
24
- n_layers,
25
- kernel_size,
26
- p_dropout,
27
- f0=True,
28
- ):
29
- super().__init__()
30
- self.out_channels = out_channels
31
- self.hidden_channels = hidden_channels
32
- self.filter_channels = filter_channels
33
- self.n_heads = n_heads
34
- self.n_layers = n_layers
35
- self.kernel_size = kernel_size
36
- self.p_dropout = p_dropout
37
- self.emb_phone = nn.Linear(256, hidden_channels)
38
- self.lrelu = nn.LeakyReLU(0.1, inplace=True)
39
- if f0 == True:
40
- self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
41
- self.encoder = attentions.Encoder(
42
- hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
43
- )
44
- self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
45
-
46
- def forward(self, phone, pitch, lengths):
47
- if pitch == None:
48
- x = self.emb_phone(phone)
49
- else:
50
- x = self.emb_phone(phone) + self.emb_pitch(pitch)
51
- x = x * math.sqrt(self.hidden_channels) # [b, t, h]
52
- x = self.lrelu(x)
53
- x = torch.transpose(x, 1, -1) # [b, h, t]
54
- x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
55
- x.dtype
56
- )
57
- x = self.encoder(x * x_mask, x_mask)
58
- stats = self.proj(x) * x_mask
59
-
60
- m, logs = torch.split(stats, self.out_channels, dim=1)
61
- return m, logs, x_mask
62
-
63
-
64
- class TextEncoder768(nn.Module):
65
- def __init__(
66
- self,
67
- out_channels,
68
- hidden_channels,
69
- filter_channels,
70
- n_heads,
71
- n_layers,
72
- kernel_size,
73
- p_dropout,
74
- f0=True,
75
- ):
76
- super().__init__()
77
- self.out_channels = out_channels
78
- self.hidden_channels = hidden_channels
79
- self.filter_channels = filter_channels
80
- self.n_heads = n_heads
81
- self.n_layers = n_layers
82
- self.kernel_size = kernel_size
83
- self.p_dropout = p_dropout
84
- self.emb_phone = nn.Linear(768, hidden_channels)
85
- self.lrelu = nn.LeakyReLU(0.1, inplace=True)
86
- if f0 == True:
87
- self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
88
- self.encoder = attentions.Encoder(
89
- hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
90
- )
91
- self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
92
-
93
- def forward(self, phone, pitch, lengths):
94
- if pitch == None:
95
- x = self.emb_phone(phone)
96
- else:
97
- x = self.emb_phone(phone) + self.emb_pitch(pitch)
98
- x = x * math.sqrt(self.hidden_channels) # [b, t, h]
99
- x = self.lrelu(x)
100
- x = torch.transpose(x, 1, -1) # [b, h, t]
101
- x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
102
- x.dtype
103
- )
104
- x = self.encoder(x * x_mask, x_mask)
105
- stats = self.proj(x) * x_mask
106
-
107
- m, logs = torch.split(stats, self.out_channels, dim=1)
108
- return m, logs, x_mask
109
-
110
-
111
- class ResidualCouplingBlock(nn.Module):
112
- def __init__(
113
- self,
114
- channels,
115
- hidden_channels,
116
- kernel_size,
117
- dilation_rate,
118
- n_layers,
119
- n_flows=4,
120
- gin_channels=0,
121
- ):
122
- super().__init__()
123
- self.channels = channels
124
- self.hidden_channels = hidden_channels
125
- self.kernel_size = kernel_size
126
- self.dilation_rate = dilation_rate
127
- self.n_layers = n_layers
128
- self.n_flows = n_flows
129
- self.gin_channels = gin_channels
130
-
131
- self.flows = nn.ModuleList()
132
- for i in range(n_flows):
133
- self.flows.append(
134
- modules.ResidualCouplingLayer(
135
- channels,
136
- hidden_channels,
137
- kernel_size,
138
- dilation_rate,
139
- n_layers,
140
- gin_channels=gin_channels,
141
- mean_only=True,
142
- )
143
- )
144
- self.flows.append(modules.Flip())
145
-
146
- def forward(self, x, x_mask, g=None, reverse=False):
147
- if not reverse:
148
- for flow in self.flows:
149
- x, _ = flow(x, x_mask, g=g, reverse=reverse)
150
- else:
151
- for flow in reversed(self.flows):
152
- x = flow(x, x_mask, g=g, reverse=reverse)
153
- return x
154
-
155
- def remove_weight_norm(self):
156
- for i in range(self.n_flows):
157
- self.flows[i * 2].remove_weight_norm()
158
-
159
-
160
- class PosteriorEncoder(nn.Module):
161
- def __init__(
162
- self,
163
- in_channels,
164
- out_channels,
165
- hidden_channels,
166
- kernel_size,
167
- dilation_rate,
168
- n_layers,
169
- gin_channels=0,
170
- ):
171
- super().__init__()
172
- self.in_channels = in_channels
173
- self.out_channels = out_channels
174
- self.hidden_channels = hidden_channels
175
- self.kernel_size = kernel_size
176
- self.dilation_rate = dilation_rate
177
- self.n_layers = n_layers
178
- self.gin_channels = gin_channels
179
-
180
- self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
181
- self.enc = modules.WN(
182
- hidden_channels,
183
- kernel_size,
184
- dilation_rate,
185
- n_layers,
186
- gin_channels=gin_channels,
187
- )
188
- self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
189
-
190
- def forward(self, x, x_lengths, g=None):
191
- x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
192
- x.dtype
193
- )
194
- x = self.pre(x) * x_mask
195
- x = self.enc(x, x_mask, g=g)
196
- stats = self.proj(x) * x_mask
197
- m, logs = torch.split(stats, self.out_channels, dim=1)
198
- z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
199
- return z, m, logs, x_mask
200
-
201
- def remove_weight_norm(self):
202
- self.enc.remove_weight_norm()
203
-
204
-
205
- class Generator(torch.nn.Module):
206
- def __init__(
207
- self,
208
- initial_channel,
209
- resblock,
210
- resblock_kernel_sizes,
211
- resblock_dilation_sizes,
212
- upsample_rates,
213
- upsample_initial_channel,
214
- upsample_kernel_sizes,
215
- gin_channels=0,
216
- ):
217
- super(Generator, self).__init__()
218
- self.num_kernels = len(resblock_kernel_sizes)
219
- self.num_upsamples = len(upsample_rates)
220
- self.conv_pre = Conv1d(
221
- initial_channel, upsample_initial_channel, 7, 1, padding=3
222
- )
223
- resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
224
-
225
- self.ups = nn.ModuleList()
226
- for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
227
- self.ups.append(
228
- weight_norm(
229
- ConvTranspose1d(
230
- upsample_initial_channel // (2**i),
231
- upsample_initial_channel // (2 ** (i + 1)),
232
- k,
233
- u,
234
- padding=(k - u) // 2,
235
- )
236
- )
237
- )
238
-
239
- self.resblocks = nn.ModuleList()
240
- for i in range(len(self.ups)):
241
- ch = upsample_initial_channel // (2 ** (i + 1))
242
- for j, (k, d) in enumerate(
243
- zip(resblock_kernel_sizes, resblock_dilation_sizes)
244
- ):
245
- self.resblocks.append(resblock(ch, k, d))
246
-
247
- self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
248
- self.ups.apply(init_weights)
249
-
250
- if gin_channels != 0:
251
- self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
252
-
253
- def forward(self, x, g=None):
254
- x = self.conv_pre(x)
255
- if g is not None:
256
- x = x + self.cond(g)
257
-
258
- for i in range(self.num_upsamples):
259
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
260
- x = self.ups[i](x)
261
- xs = None
262
- for j in range(self.num_kernels):
263
- if xs is None:
264
- xs = self.resblocks[i * self.num_kernels + j](x)
265
- else:
266
- xs += self.resblocks[i * self.num_kernels + j](x)
267
- x = xs / self.num_kernels
268
- x = F.leaky_relu(x)
269
- x = self.conv_post(x)
270
- x = torch.tanh(x)
271
-
272
- return x
273
-
274
- def remove_weight_norm(self):
275
- for l in self.ups:
276
- remove_weight_norm(l)
277
- for l in self.resblocks:
278
- l.remove_weight_norm()
279
-
280
-
281
- class SineGen(torch.nn.Module):
282
- """Definition of sine generator
283
- SineGen(samp_rate, harmonic_num = 0,
284
- sine_amp = 0.1, noise_std = 0.003,
285
- voiced_threshold = 0,
286
- flag_for_pulse=False)
287
- samp_rate: sampling rate in Hz
288
- harmonic_num: number of harmonic overtones (default 0)
289
- sine_amp: amplitude of sine-wavefrom (default 0.1)
290
- noise_std: std of Gaussian noise (default 0.003)
291
- voiced_thoreshold: F0 threshold for U/V classification (default 0)
292
- flag_for_pulse: this SinGen is used inside PulseGen (default False)
293
- Note: when flag_for_pulse is True, the first time step of a voiced
294
- segment is always sin(np.pi) or cos(0)
295
- """
296
-
297
- def __init__(
298
- self,
299
- samp_rate,
300
- harmonic_num=0,
301
- sine_amp=0.1,
302
- noise_std=0.003,
303
- voiced_threshold=0,
304
- flag_for_pulse=False,
305
- ):
306
- super(SineGen, self).__init__()
307
- self.sine_amp = sine_amp
308
- self.noise_std = noise_std
309
- self.harmonic_num = harmonic_num
310
- self.dim = self.harmonic_num + 1
311
- self.sampling_rate = samp_rate
312
- self.voiced_threshold = voiced_threshold
313
-
314
- def _f02uv(self, f0):
315
- # generate uv signal
316
- uv = torch.ones_like(f0)
317
- uv = uv * (f0 > self.voiced_threshold)
318
- return uv.float()
319
-
320
- def forward(self, f0, upp):
321
- """sine_tensor, uv = forward(f0)
322
- input F0: tensor(batchsize=1, length, dim=1)
323
- f0 for unvoiced steps should be 0
324
- output sine_tensor: tensor(batchsize=1, length, dim)
325
- output uv: tensor(batchsize=1, length, 1)
326
- """
327
- with torch.no_grad():
328
- f0 = f0[:, None].transpose(1, 2)
329
- f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
330
- # fundamental component
331
- f0_buf[:, :, 0] = f0[:, :, 0]
332
- for idx in np.arange(self.harmonic_num):
333
- f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
334
- idx + 2
335
- ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
336
- rad_values = (f0_buf / self.sampling_rate) % 1 ###%1意味着n_harηš„δΉ˜η§―ζ— ζ³•εŽε€„η†δΌ˜εŒ–
337
- rand_ini = torch.rand(
338
- f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
339
- )
340
- rand_ini[:, 0] = 0
341
- rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
342
- tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1ζ„ε‘³η€εŽι’ηš„cumsumζ— ζ³•ε†δΌ˜εŒ–
343
- tmp_over_one *= upp
344
- tmp_over_one = F.interpolate(
345
- tmp_over_one.transpose(2, 1),
346
- scale_factor=upp,
347
- mode="linear",
348
- align_corners=True,
349
- ).transpose(2, 1)
350
- rad_values = F.interpolate(
351
- rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
352
- ).transpose(
353
- 2, 1
354
- ) #######
355
- tmp_over_one %= 1
356
- tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
357
- cumsum_shift = torch.zeros_like(rad_values)
358
- cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
359
- sine_waves = torch.sin(
360
- torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
361
- )
362
- sine_waves = sine_waves * self.sine_amp
363
- uv = self._f02uv(f0)
364
- uv = F.interpolate(
365
- uv.transpose(2, 1), scale_factor=upp, mode="nearest"
366
- ).transpose(2, 1)
367
- noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
368
- noise = noise_amp * torch.randn_like(sine_waves)
369
- sine_waves = sine_waves * uv + noise
370
- return sine_waves, uv, noise
371
-
372
-
373
- class SourceModuleHnNSF(torch.nn.Module):
374
- """SourceModule for hn-nsf
375
- SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
376
- add_noise_std=0.003, voiced_threshod=0)
377
- sampling_rate: sampling_rate in Hz
378
- harmonic_num: number of harmonic above F0 (default: 0)
379
- sine_amp: amplitude of sine source signal (default: 0.1)
380
- add_noise_std: std of additive Gaussian noise (default: 0.003)
381
- note that amplitude of noise in unvoiced is decided
382
- by sine_amp
383
- voiced_threshold: threhold to set U/V given F0 (default: 0)
384
- Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
385
- F0_sampled (batchsize, length, 1)
386
- Sine_source (batchsize, length, 1)
387
- noise_source (batchsize, length 1)
388
- uv (batchsize, length, 1)
389
- """
390
-
391
- def __init__(
392
- self,
393
- sampling_rate,
394
- harmonic_num=0,
395
- sine_amp=0.1,
396
- add_noise_std=0.003,
397
- voiced_threshod=0,
398
- is_half=True,
399
- ):
400
- super(SourceModuleHnNSF, self).__init__()
401
-
402
- self.sine_amp = sine_amp
403
- self.noise_std = add_noise_std
404
- self.is_half = is_half
405
- # to produce sine waveforms
406
- self.l_sin_gen = SineGen(
407
- sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
408
- )
409
-
410
- # to merge source harmonics into a single excitation
411
- self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
412
- self.l_tanh = torch.nn.Tanh()
413
-
414
- def forward(self, x, upp=None):
415
- sine_wavs, uv, _ = self.l_sin_gen(x, upp)
416
- if self.is_half:
417
- sine_wavs = sine_wavs.half()
418
- sine_merge = self.l_tanh(self.l_linear(sine_wavs))
419
- return sine_merge, None, None # noise, uv
420
-
421
-
422
- class GeneratorNSF(torch.nn.Module):
423
- def __init__(
424
- self,
425
- initial_channel,
426
- resblock,
427
- resblock_kernel_sizes,
428
- resblock_dilation_sizes,
429
- upsample_rates,
430
- upsample_initial_channel,
431
- upsample_kernel_sizes,
432
- gin_channels,
433
- sr,
434
- is_half=False,
435
- ):
436
- super(GeneratorNSF, self).__init__()
437
- self.num_kernels = len(resblock_kernel_sizes)
438
- self.num_upsamples = len(upsample_rates)
439
-
440
- self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
441
- self.m_source = SourceModuleHnNSF(
442
- sampling_rate=sr, harmonic_num=0, is_half=is_half
443
- )
444
- self.noise_convs = nn.ModuleList()
445
- self.conv_pre = Conv1d(
446
- initial_channel, upsample_initial_channel, 7, 1, padding=3
447
- )
448
- resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
449
-
450
- self.ups = nn.ModuleList()
451
- for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
452
- c_cur = upsample_initial_channel // (2 ** (i + 1))
453
- self.ups.append(
454
- weight_norm(
455
- ConvTranspose1d(
456
- upsample_initial_channel // (2**i),
457
- upsample_initial_channel // (2 ** (i + 1)),
458
- k,
459
- u,
460
- padding=(k - u) // 2,
461
- )
462
- )
463
- )
464
- if i + 1 < len(upsample_rates):
465
- stride_f0 = np.prod(upsample_rates[i + 1 :])
466
- self.noise_convs.append(
467
- Conv1d(
468
- 1,
469
- c_cur,
470
- kernel_size=stride_f0 * 2,
471
- stride=stride_f0,
472
- padding=stride_f0 // 2,
473
- )
474
- )
475
- else:
476
- self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
477
-
478
- self.resblocks = nn.ModuleList()
479
- for i in range(len(self.ups)):
480
- ch = upsample_initial_channel // (2 ** (i + 1))
481
- for j, (k, d) in enumerate(
482
- zip(resblock_kernel_sizes, resblock_dilation_sizes)
483
- ):
484
- self.resblocks.append(resblock(ch, k, d))
485
-
486
- self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
487
- self.ups.apply(init_weights)
488
-
489
- if gin_channels != 0:
490
- self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
491
-
492
- self.upp = np.prod(upsample_rates)
493
-
494
- def forward(self, x, f0, g=None):
495
- har_source, noi_source, uv = self.m_source(f0, self.upp)
496
- har_source = har_source.transpose(1, 2)
497
- x = self.conv_pre(x)
498
- if g is not None:
499
- x = x + self.cond(g)
500
-
501
- for i in range(self.num_upsamples):
502
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
503
- x = self.ups[i](x)
504
- x_source = self.noise_convs[i](har_source)
505
- x = x + x_source
506
- xs = None
507
- for j in range(self.num_kernels):
508
- if xs is None:
509
- xs = self.resblocks[i * self.num_kernels + j](x)
510
- else:
511
- xs += self.resblocks[i * self.num_kernels + j](x)
512
- x = xs / self.num_kernels
513
- x = F.leaky_relu(x)
514
- x = self.conv_post(x)
515
- x = torch.tanh(x)
516
- return x
517
-
518
- def remove_weight_norm(self):
519
- for l in self.ups:
520
- remove_weight_norm(l)
521
- for l in self.resblocks:
522
- l.remove_weight_norm()
523
-
524
-
525
- sr2sr = {
526
- "32k": 32000,
527
- "40k": 40000,
528
- "48k": 48000,
529
- }
530
-
531
-
532
- class SynthesizerTrnMs256NSFsid(nn.Module):
533
- def __init__(
534
- self,
535
- spec_channels,
536
- segment_size,
537
- inter_channels,
538
- hidden_channels,
539
- filter_channels,
540
- n_heads,
541
- n_layers,
542
- kernel_size,
543
- p_dropout,
544
- resblock,
545
- resblock_kernel_sizes,
546
- resblock_dilation_sizes,
547
- upsample_rates,
548
- upsample_initial_channel,
549
- upsample_kernel_sizes,
550
- spk_embed_dim,
551
- gin_channels,
552
- sr,
553
- **kwargs
554
- ):
555
- super().__init__()
556
- if type(sr) == type("strr"):
557
- sr = sr2sr[sr]
558
- self.spec_channels = spec_channels
559
- self.inter_channels = inter_channels
560
- self.hidden_channels = hidden_channels
561
- self.filter_channels = filter_channels
562
- self.n_heads = n_heads
563
- self.n_layers = n_layers
564
- self.kernel_size = kernel_size
565
- self.p_dropout = p_dropout
566
- self.resblock = resblock
567
- self.resblock_kernel_sizes = resblock_kernel_sizes
568
- self.resblock_dilation_sizes = resblock_dilation_sizes
569
- self.upsample_rates = upsample_rates
570
- self.upsample_initial_channel = upsample_initial_channel
571
- self.upsample_kernel_sizes = upsample_kernel_sizes
572
- self.segment_size = segment_size
573
- self.gin_channels = gin_channels
574
- # self.hop_length = hop_length#
575
- self.spk_embed_dim = spk_embed_dim
576
- self.enc_p = TextEncoder256(
577
- inter_channels,
578
- hidden_channels,
579
- filter_channels,
580
- n_heads,
581
- n_layers,
582
- kernel_size,
583
- p_dropout,
584
- )
585
- self.dec = GeneratorNSF(
586
- inter_channels,
587
- resblock,
588
- resblock_kernel_sizes,
589
- resblock_dilation_sizes,
590
- upsample_rates,
591
- upsample_initial_channel,
592
- upsample_kernel_sizes,
593
- gin_channels=gin_channels,
594
- sr=sr,
595
- is_half=kwargs["is_half"],
596
- )
597
- self.enc_q = PosteriorEncoder(
598
- spec_channels,
599
- inter_channels,
600
- hidden_channels,
601
- 5,
602
- 1,
603
- 16,
604
- gin_channels=gin_channels,
605
- )
606
- self.flow = ResidualCouplingBlock(
607
- inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
608
- )
609
- self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
610
- print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
611
-
612
- def remove_weight_norm(self):
613
- self.dec.remove_weight_norm()
614
- self.flow.remove_weight_norm()
615
- self.enc_q.remove_weight_norm()
616
-
617
- def forward(
618
- self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
619
- ): # θΏ™ι‡Œds是id,[bs,1]
620
- # print(1,pitch.shape)#[bs,t]
621
- g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是tοΌŒεΉΏζ’­ηš„
622
- m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
623
- z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
624
- z_p = self.flow(z, y_mask, g=g)
625
- z_slice, ids_slice = commons.rand_slice_segments(
626
- z, y_lengths, self.segment_size
627
- )
628
- # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
629
- pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
630
- # print(-2,pitchf.shape,z_slice.shape)
631
- o = self.dec(z_slice, pitchf, g=g)
632
- return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
633
-
634
- def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
635
- g = self.emb_g(sid).unsqueeze(-1)
636
- m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
637
- z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
638
- z = self.flow(z_p, x_mask, g=g, reverse=True)
639
- o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
640
- return o, x_mask, (z, z_p, m_p, logs_p)
641
-
642
-
643
- class SynthesizerTrnMs768NSFsid(nn.Module):
644
- def __init__(
645
- self,
646
- spec_channels,
647
- segment_size,
648
- inter_channels,
649
- hidden_channels,
650
- filter_channels,
651
- n_heads,
652
- n_layers,
653
- kernel_size,
654
- p_dropout,
655
- resblock,
656
- resblock_kernel_sizes,
657
- resblock_dilation_sizes,
658
- upsample_rates,
659
- upsample_initial_channel,
660
- upsample_kernel_sizes,
661
- spk_embed_dim,
662
- gin_channels,
663
- sr,
664
- **kwargs
665
- ):
666
- super().__init__()
667
- if type(sr) == type("strr"):
668
- sr = sr2sr[sr]
669
- self.spec_channels = spec_channels
670
- self.inter_channels = inter_channels
671
- self.hidden_channels = hidden_channels
672
- self.filter_channels = filter_channels
673
- self.n_heads = n_heads
674
- self.n_layers = n_layers
675
- self.kernel_size = kernel_size
676
- self.p_dropout = p_dropout
677
- self.resblock = resblock
678
- self.resblock_kernel_sizes = resblock_kernel_sizes
679
- self.resblock_dilation_sizes = resblock_dilation_sizes
680
- self.upsample_rates = upsample_rates
681
- self.upsample_initial_channel = upsample_initial_channel
682
- self.upsample_kernel_sizes = upsample_kernel_sizes
683
- self.segment_size = segment_size
684
- self.gin_channels = gin_channels
685
- # self.hop_length = hop_length#
686
- self.spk_embed_dim = spk_embed_dim
687
- self.enc_p = TextEncoder768(
688
- inter_channels,
689
- hidden_channels,
690
- filter_channels,
691
- n_heads,
692
- n_layers,
693
- kernel_size,
694
- p_dropout,
695
- )
696
- self.dec = GeneratorNSF(
697
- inter_channels,
698
- resblock,
699
- resblock_kernel_sizes,
700
- resblock_dilation_sizes,
701
- upsample_rates,
702
- upsample_initial_channel,
703
- upsample_kernel_sizes,
704
- gin_channels=gin_channels,
705
- sr=sr,
706
- is_half=kwargs["is_half"],
707
- )
708
- self.enc_q = PosteriorEncoder(
709
- spec_channels,
710
- inter_channels,
711
- hidden_channels,
712
- 5,
713
- 1,
714
- 16,
715
- gin_channels=gin_channels,
716
- )
717
- self.flow = ResidualCouplingBlock(
718
- inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
719
- )
720
- self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
721
- print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
722
-
723
- def remove_weight_norm(self):
724
- self.dec.remove_weight_norm()
725
- self.flow.remove_weight_norm()
726
- self.enc_q.remove_weight_norm()
727
-
728
- def forward(
729
- self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
730
- ): # θΏ™ι‡Œds是id,[bs,1]
731
- # print(1,pitch.shape)#[bs,t]
732
- g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是tοΌŒεΉΏζ’­ηš„
733
- m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
734
- z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
735
- z_p = self.flow(z, y_mask, g=g)
736
- z_slice, ids_slice = commons.rand_slice_segments(
737
- z, y_lengths, self.segment_size
738
- )
739
- # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
740
- pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
741
- # print(-2,pitchf.shape,z_slice.shape)
742
- o = self.dec(z_slice, pitchf, g=g)
743
- return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
744
-
745
- def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
746
- g = self.emb_g(sid).unsqueeze(-1)
747
- m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
748
- z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
749
- z = self.flow(z_p, x_mask, g=g, reverse=True)
750
- o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
751
- return o, x_mask, (z, z_p, m_p, logs_p)
752
-
753
-
754
- class SynthesizerTrnMs256NSFsid_nono(nn.Module):
755
- def __init__(
756
- self,
757
- spec_channels,
758
- segment_size,
759
- inter_channels,
760
- hidden_channels,
761
- filter_channels,
762
- n_heads,
763
- n_layers,
764
- kernel_size,
765
- p_dropout,
766
- resblock,
767
- resblock_kernel_sizes,
768
- resblock_dilation_sizes,
769
- upsample_rates,
770
- upsample_initial_channel,
771
- upsample_kernel_sizes,
772
- spk_embed_dim,
773
- gin_channels,
774
- sr=None,
775
- **kwargs
776
- ):
777
- super().__init__()
778
- self.spec_channels = spec_channels
779
- self.inter_channels = inter_channels
780
- self.hidden_channels = hidden_channels
781
- self.filter_channels = filter_channels
782
- self.n_heads = n_heads
783
- self.n_layers = n_layers
784
- self.kernel_size = kernel_size
785
- self.p_dropout = p_dropout
786
- self.resblock = resblock
787
- self.resblock_kernel_sizes = resblock_kernel_sizes
788
- self.resblock_dilation_sizes = resblock_dilation_sizes
789
- self.upsample_rates = upsample_rates
790
- self.upsample_initial_channel = upsample_initial_channel
791
- self.upsample_kernel_sizes = upsample_kernel_sizes
792
- self.segment_size = segment_size
793
- self.gin_channels = gin_channels
794
- # self.hop_length = hop_length#
795
- self.spk_embed_dim = spk_embed_dim
796
- self.enc_p = TextEncoder256(
797
- inter_channels,
798
- hidden_channels,
799
- filter_channels,
800
- n_heads,
801
- n_layers,
802
- kernel_size,
803
- p_dropout,
804
- f0=False,
805
- )
806
- self.dec = Generator(
807
- inter_channels,
808
- resblock,
809
- resblock_kernel_sizes,
810
- resblock_dilation_sizes,
811
- upsample_rates,
812
- upsample_initial_channel,
813
- upsample_kernel_sizes,
814
- gin_channels=gin_channels,
815
- )
816
- self.enc_q = PosteriorEncoder(
817
- spec_channels,
818
- inter_channels,
819
- hidden_channels,
820
- 5,
821
- 1,
822
- 16,
823
- gin_channels=gin_channels,
824
- )
825
- self.flow = ResidualCouplingBlock(
826
- inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
827
- )
828
- self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
829
- print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
830
-
831
- def remove_weight_norm(self):
832
- self.dec.remove_weight_norm()
833
- self.flow.remove_weight_norm()
834
- self.enc_q.remove_weight_norm()
835
-
836
- def forward(self, phone, phone_lengths, y, y_lengths, ds): # θΏ™ι‡Œds是id,[bs,1]
837
- g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是tοΌŒεΉΏζ’­ηš„
838
- m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
839
- z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
840
- z_p = self.flow(z, y_mask, g=g)
841
- z_slice, ids_slice = commons.rand_slice_segments(
842
- z, y_lengths, self.segment_size
843
- )
844
- o = self.dec(z_slice, g=g)
845
- return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
846
-
847
- def infer(self, phone, phone_lengths, sid, max_len=None):
848
- g = self.emb_g(sid).unsqueeze(-1)
849
- m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
850
- z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
851
- z = self.flow(z_p, x_mask, g=g, reverse=True)
852
- o = self.dec((z * x_mask)[:, :, :max_len], g=g)
853
- return o, x_mask, (z, z_p, m_p, logs_p)
854
-
855
-
856
- class SynthesizerTrnMs768NSFsid_nono(nn.Module):
857
- def __init__(
858
- self,
859
- spec_channels,
860
- segment_size,
861
- inter_channels,
862
- hidden_channels,
863
- filter_channels,
864
- n_heads,
865
- n_layers,
866
- kernel_size,
867
- p_dropout,
868
- resblock,
869
- resblock_kernel_sizes,
870
- resblock_dilation_sizes,
871
- upsample_rates,
872
- upsample_initial_channel,
873
- upsample_kernel_sizes,
874
- spk_embed_dim,
875
- gin_channels,
876
- sr=None,
877
- **kwargs
878
- ):
879
- super().__init__()
880
- self.spec_channels = spec_channels
881
- self.inter_channels = inter_channels
882
- self.hidden_channels = hidden_channels
883
- self.filter_channels = filter_channels
884
- self.n_heads = n_heads
885
- self.n_layers = n_layers
886
- self.kernel_size = kernel_size
887
- self.p_dropout = p_dropout
888
- self.resblock = resblock
889
- self.resblock_kernel_sizes = resblock_kernel_sizes
890
- self.resblock_dilation_sizes = resblock_dilation_sizes
891
- self.upsample_rates = upsample_rates
892
- self.upsample_initial_channel = upsample_initial_channel
893
- self.upsample_kernel_sizes = upsample_kernel_sizes
894
- self.segment_size = segment_size
895
- self.gin_channels = gin_channels
896
- # self.hop_length = hop_length#
897
- self.spk_embed_dim = spk_embed_dim
898
- self.enc_p = TextEncoder768(
899
- inter_channels,
900
- hidden_channels,
901
- filter_channels,
902
- n_heads,
903
- n_layers,
904
- kernel_size,
905
- p_dropout,
906
- f0=False,
907
- )
908
- self.dec = Generator(
909
- inter_channels,
910
- resblock,
911
- resblock_kernel_sizes,
912
- resblock_dilation_sizes,
913
- upsample_rates,
914
- upsample_initial_channel,
915
- upsample_kernel_sizes,
916
- gin_channels=gin_channels,
917
- )
918
- self.enc_q = PosteriorEncoder(
919
- spec_channels,
920
- inter_channels,
921
- hidden_channels,
922
- 5,
923
- 1,
924
- 16,
925
- gin_channels=gin_channels,
926
- )
927
- self.flow = ResidualCouplingBlock(
928
- inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
929
- )
930
- self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
931
- print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
932
-
933
- def remove_weight_norm(self):
934
- self.dec.remove_weight_norm()
935
- self.flow.remove_weight_norm()
936
- self.enc_q.remove_weight_norm()
937
-
938
- def forward(self, phone, phone_lengths, y, y_lengths, ds): # θΏ™ι‡Œds是id,[bs,1]
939
- g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是tοΌŒεΉΏζ’­ηš„
940
- m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
941
- z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
942
- z_p = self.flow(z, y_mask, g=g)
943
- z_slice, ids_slice = commons.rand_slice_segments(
944
- z, y_lengths, self.segment_size
945
- )
946
- o = self.dec(z_slice, g=g)
947
- return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
948
-
949
- def infer(self, phone, phone_lengths, sid, max_len=None):
950
- g = self.emb_g(sid).unsqueeze(-1)
951
- m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
952
- z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
953
- z = self.flow(z_p, x_mask, g=g, reverse=True)
954
- o = self.dec((z * x_mask)[:, :, :max_len], g=g)
955
- return o, x_mask, (z, z_p, m_p, logs_p)
956
-
957
-
958
- class MultiPeriodDiscriminator(torch.nn.Module):
959
- def __init__(self, use_spectral_norm=False):
960
- super(MultiPeriodDiscriminator, self).__init__()
961
- periods = [2, 3, 5, 7, 11, 17]
962
- # periods = [3, 5, 7, 11, 17, 23, 37]
963
-
964
- discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
965
- discs = discs + [
966
- DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
967
- ]
968
- self.discriminators = nn.ModuleList(discs)
969
-
970
- def forward(self, y, y_hat):
971
- y_d_rs = [] #
972
- y_d_gs = []
973
- fmap_rs = []
974
- fmap_gs = []
975
- for i, d in enumerate(self.discriminators):
976
- y_d_r, fmap_r = d(y)
977
- y_d_g, fmap_g = d(y_hat)
978
- # for j in range(len(fmap_r)):
979
- # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
980
- y_d_rs.append(y_d_r)
981
- y_d_gs.append(y_d_g)
982
- fmap_rs.append(fmap_r)
983
- fmap_gs.append(fmap_g)
984
-
985
- return y_d_rs, y_d_gs, fmap_rs, fmap_gs
986
-
987
-
988
- class MultiPeriodDiscriminatorV2(torch.nn.Module):
989
- def __init__(self, use_spectral_norm=False):
990
- super(MultiPeriodDiscriminatorV2, self).__init__()
991
- # periods = [2, 3, 5, 7, 11, 17]
992
- periods = [2, 3, 5, 7, 11, 17, 23, 37]
993
-
994
- discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
995
- discs = discs + [
996
- DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
997
- ]
998
- self.discriminators = nn.ModuleList(discs)
999
-
1000
- def forward(self, y, y_hat):
1001
- y_d_rs = [] #
1002
- y_d_gs = []
1003
- fmap_rs = []
1004
- fmap_gs = []
1005
- for i, d in enumerate(self.discriminators):
1006
- y_d_r, fmap_r = d(y)
1007
- y_d_g, fmap_g = d(y_hat)
1008
- # for j in range(len(fmap_r)):
1009
- # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
1010
- y_d_rs.append(y_d_r)
1011
- y_d_gs.append(y_d_g)
1012
- fmap_rs.append(fmap_r)
1013
- fmap_gs.append(fmap_g)
1014
-
1015
- return y_d_rs, y_d_gs, fmap_rs, fmap_gs
1016
-
1017
-
1018
- class DiscriminatorS(torch.nn.Module):
1019
- def __init__(self, use_spectral_norm=False):
1020
- super(DiscriminatorS, self).__init__()
1021
- norm_f = weight_norm if use_spectral_norm == False else spectral_norm
1022
- self.convs = nn.ModuleList(
1023
- [
1024
- norm_f(Conv1d(1, 16, 15, 1, padding=7)),
1025
- norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
1026
- norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
1027
- norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
1028
- norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
1029
- norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
1030
- ]
1031
- )
1032
- self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
1033
-
1034
- def forward(self, x):
1035
- fmap = []
1036
-
1037
- for l in self.convs:
1038
- x = l(x)
1039
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
1040
- fmap.append(x)
1041
- x = self.conv_post(x)
1042
- fmap.append(x)
1043
- x = torch.flatten(x, 1, -1)
1044
-
1045
- return x, fmap
1046
-
1047
-
1048
- class DiscriminatorP(torch.nn.Module):
1049
- def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
1050
- super(DiscriminatorP, self).__init__()
1051
- self.period = period
1052
- self.use_spectral_norm = use_spectral_norm
1053
- norm_f = weight_norm if use_spectral_norm == False else spectral_norm
1054
- self.convs = nn.ModuleList(
1055
- [
1056
- norm_f(
1057
- Conv2d(
1058
- 1,
1059
- 32,
1060
- (kernel_size, 1),
1061
- (stride, 1),
1062
- padding=(get_padding(kernel_size, 1), 0),
1063
- )
1064
- ),
1065
- norm_f(
1066
- Conv2d(
1067
- 32,
1068
- 128,
1069
- (kernel_size, 1),
1070
- (stride, 1),
1071
- padding=(get_padding(kernel_size, 1), 0),
1072
- )
1073
- ),
1074
- norm_f(
1075
- Conv2d(
1076
- 128,
1077
- 512,
1078
- (kernel_size, 1),
1079
- (stride, 1),
1080
- padding=(get_padding(kernel_size, 1), 0),
1081
- )
1082
- ),
1083
- norm_f(
1084
- Conv2d(
1085
- 512,
1086
- 1024,
1087
- (kernel_size, 1),
1088
- (stride, 1),
1089
- padding=(get_padding(kernel_size, 1), 0),
1090
- )
1091
- ),
1092
- norm_f(
1093
- Conv2d(
1094
- 1024,
1095
- 1024,
1096
- (kernel_size, 1),
1097
- 1,
1098
- padding=(get_padding(kernel_size, 1), 0),
1099
- )
1100
- ),
1101
- ]
1102
- )
1103
- self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
1104
-
1105
- def forward(self, x):
1106
- fmap = []
1107
-
1108
- # 1d to 2d
1109
- b, c, t = x.shape
1110
- if t % self.period != 0: # pad first
1111
- n_pad = self.period - (t % self.period)
1112
- x = F.pad(x, (0, n_pad), "reflect")
1113
- t = t + n_pad
1114
- x = x.view(b, c, t // self.period, self.period)
1115
-
1116
- for l in self.convs:
1117
- x = l(x)
1118
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
1119
- fmap.append(x)
1120
- x = self.conv_post(x)
1121
- fmap.append(x)
1122
- x = torch.flatten(x, 1, -1)
1123
-
1124
- return x, fmap
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lib/infer_pack/models_onnx.py DELETED
@@ -1,819 +0,0 @@
1
- import math, pdb, os
2
- from time import time as ttime
3
- import torch
4
- from torch import nn
5
- from torch.nn import functional as F
6
- from lib.infer_pack import modules
7
- from lib.infer_pack import attentions
8
- from lib.infer_pack import commons
9
- from lib.infer_pack.commons import init_weights, get_padding
10
- from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
11
- from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
12
- from lib.infer_pack.commons import init_weights
13
- import numpy as np
14
- from lib.infer_pack import commons
15
-
16
-
17
- class TextEncoder256(nn.Module):
18
- def __init__(
19
- self,
20
- out_channels,
21
- hidden_channels,
22
- filter_channels,
23
- n_heads,
24
- n_layers,
25
- kernel_size,
26
- p_dropout,
27
- f0=True,
28
- ):
29
- super().__init__()
30
- self.out_channels = out_channels
31
- self.hidden_channels = hidden_channels
32
- self.filter_channels = filter_channels
33
- self.n_heads = n_heads
34
- self.n_layers = n_layers
35
- self.kernel_size = kernel_size
36
- self.p_dropout = p_dropout
37
- self.emb_phone = nn.Linear(256, hidden_channels)
38
- self.lrelu = nn.LeakyReLU(0.1, inplace=True)
39
- if f0 == True:
40
- self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
41
- self.encoder = attentions.Encoder(
42
- hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
43
- )
44
- self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
45
-
46
- def forward(self, phone, pitch, lengths):
47
- if pitch == None:
48
- x = self.emb_phone(phone)
49
- else:
50
- x = self.emb_phone(phone) + self.emb_pitch(pitch)
51
- x = x * math.sqrt(self.hidden_channels) # [b, t, h]
52
- x = self.lrelu(x)
53
- x = torch.transpose(x, 1, -1) # [b, h, t]
54
- x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
55
- x.dtype
56
- )
57
- x = self.encoder(x * x_mask, x_mask)
58
- stats = self.proj(x) * x_mask
59
-
60
- m, logs = torch.split(stats, self.out_channels, dim=1)
61
- return m, logs, x_mask
62
-
63
-
64
- class TextEncoder768(nn.Module):
65
- def __init__(
66
- self,
67
- out_channels,
68
- hidden_channels,
69
- filter_channels,
70
- n_heads,
71
- n_layers,
72
- kernel_size,
73
- p_dropout,
74
- f0=True,
75
- ):
76
- super().__init__()
77
- self.out_channels = out_channels
78
- self.hidden_channels = hidden_channels
79
- self.filter_channels = filter_channels
80
- self.n_heads = n_heads
81
- self.n_layers = n_layers
82
- self.kernel_size = kernel_size
83
- self.p_dropout = p_dropout
84
- self.emb_phone = nn.Linear(768, hidden_channels)
85
- self.lrelu = nn.LeakyReLU(0.1, inplace=True)
86
- if f0 == True:
87
- self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
88
- self.encoder = attentions.Encoder(
89
- hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
90
- )
91
- self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
92
-
93
- def forward(self, phone, pitch, lengths):
94
- if pitch == None:
95
- x = self.emb_phone(phone)
96
- else:
97
- x = self.emb_phone(phone) + self.emb_pitch(pitch)
98
- x = x * math.sqrt(self.hidden_channels) # [b, t, h]
99
- x = self.lrelu(x)
100
- x = torch.transpose(x, 1, -1) # [b, h, t]
101
- x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
102
- x.dtype
103
- )
104
- x = self.encoder(x * x_mask, x_mask)
105
- stats = self.proj(x) * x_mask
106
-
107
- m, logs = torch.split(stats, self.out_channels, dim=1)
108
- return m, logs, x_mask
109
-
110
-
111
- class ResidualCouplingBlock(nn.Module):
112
- def __init__(
113
- self,
114
- channels,
115
- hidden_channels,
116
- kernel_size,
117
- dilation_rate,
118
- n_layers,
119
- n_flows=4,
120
- gin_channels=0,
121
- ):
122
- super().__init__()
123
- self.channels = channels
124
- self.hidden_channels = hidden_channels
125
- self.kernel_size = kernel_size
126
- self.dilation_rate = dilation_rate
127
- self.n_layers = n_layers
128
- self.n_flows = n_flows
129
- self.gin_channels = gin_channels
130
-
131
- self.flows = nn.ModuleList()
132
- for i in range(n_flows):
133
- self.flows.append(
134
- modules.ResidualCouplingLayer(
135
- channels,
136
- hidden_channels,
137
- kernel_size,
138
- dilation_rate,
139
- n_layers,
140
- gin_channels=gin_channels,
141
- mean_only=True,
142
- )
143
- )
144
- self.flows.append(modules.Flip())
145
-
146
- def forward(self, x, x_mask, g=None, reverse=False):
147
- if not reverse:
148
- for flow in self.flows:
149
- x, _ = flow(x, x_mask, g=g, reverse=reverse)
150
- else:
151
- for flow in reversed(self.flows):
152
- x = flow(x, x_mask, g=g, reverse=reverse)
153
- return x
154
-
155
- def remove_weight_norm(self):
156
- for i in range(self.n_flows):
157
- self.flows[i * 2].remove_weight_norm()
158
-
159
-
160
- class PosteriorEncoder(nn.Module):
161
- def __init__(
162
- self,
163
- in_channels,
164
- out_channels,
165
- hidden_channels,
166
- kernel_size,
167
- dilation_rate,
168
- n_layers,
169
- gin_channels=0,
170
- ):
171
- super().__init__()
172
- self.in_channels = in_channels
173
- self.out_channels = out_channels
174
- self.hidden_channels = hidden_channels
175
- self.kernel_size = kernel_size
176
- self.dilation_rate = dilation_rate
177
- self.n_layers = n_layers
178
- self.gin_channels = gin_channels
179
-
180
- self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
181
- self.enc = modules.WN(
182
- hidden_channels,
183
- kernel_size,
184
- dilation_rate,
185
- n_layers,
186
- gin_channels=gin_channels,
187
- )
188
- self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
189
-
190
- def forward(self, x, x_lengths, g=None):
191
- x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
192
- x.dtype
193
- )
194
- x = self.pre(x) * x_mask
195
- x = self.enc(x, x_mask, g=g)
196
- stats = self.proj(x) * x_mask
197
- m, logs = torch.split(stats, self.out_channels, dim=1)
198
- z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
199
- return z, m, logs, x_mask
200
-
201
- def remove_weight_norm(self):
202
- self.enc.remove_weight_norm()
203
-
204
-
205
- class Generator(torch.nn.Module):
206
- def __init__(
207
- self,
208
- initial_channel,
209
- resblock,
210
- resblock_kernel_sizes,
211
- resblock_dilation_sizes,
212
- upsample_rates,
213
- upsample_initial_channel,
214
- upsample_kernel_sizes,
215
- gin_channels=0,
216
- ):
217
- super(Generator, self).__init__()
218
- self.num_kernels = len(resblock_kernel_sizes)
219
- self.num_upsamples = len(upsample_rates)
220
- self.conv_pre = Conv1d(
221
- initial_channel, upsample_initial_channel, 7, 1, padding=3
222
- )
223
- resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
224
-
225
- self.ups = nn.ModuleList()
226
- for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
227
- self.ups.append(
228
- weight_norm(
229
- ConvTranspose1d(
230
- upsample_initial_channel // (2**i),
231
- upsample_initial_channel // (2 ** (i + 1)),
232
- k,
233
- u,
234
- padding=(k - u) // 2,
235
- )
236
- )
237
- )
238
-
239
- self.resblocks = nn.ModuleList()
240
- for i in range(len(self.ups)):
241
- ch = upsample_initial_channel // (2 ** (i + 1))
242
- for j, (k, d) in enumerate(
243
- zip(resblock_kernel_sizes, resblock_dilation_sizes)
244
- ):
245
- self.resblocks.append(resblock(ch, k, d))
246
-
247
- self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
248
- self.ups.apply(init_weights)
249
-
250
- if gin_channels != 0:
251
- self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
252
-
253
- def forward(self, x, g=None):
254
- x = self.conv_pre(x)
255
- if g is not None:
256
- x = x + self.cond(g)
257
-
258
- for i in range(self.num_upsamples):
259
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
260
- x = self.ups[i](x)
261
- xs = None
262
- for j in range(self.num_kernels):
263
- if xs is None:
264
- xs = self.resblocks[i * self.num_kernels + j](x)
265
- else:
266
- xs += self.resblocks[i * self.num_kernels + j](x)
267
- x = xs / self.num_kernels
268
- x = F.leaky_relu(x)
269
- x = self.conv_post(x)
270
- x = torch.tanh(x)
271
-
272
- return x
273
-
274
- def remove_weight_norm(self):
275
- for l in self.ups:
276
- remove_weight_norm(l)
277
- for l in self.resblocks:
278
- l.remove_weight_norm()
279
-
280
-
281
- class SineGen(torch.nn.Module):
282
- """Definition of sine generator
283
- SineGen(samp_rate, harmonic_num = 0,
284
- sine_amp = 0.1, noise_std = 0.003,
285
- voiced_threshold = 0,
286
- flag_for_pulse=False)
287
- samp_rate: sampling rate in Hz
288
- harmonic_num: number of harmonic overtones (default 0)
289
- sine_amp: amplitude of sine-wavefrom (default 0.1)
290
- noise_std: std of Gaussian noise (default 0.003)
291
- voiced_thoreshold: F0 threshold for U/V classification (default 0)
292
- flag_for_pulse: this SinGen is used inside PulseGen (default False)
293
- Note: when flag_for_pulse is True, the first time step of a voiced
294
- segment is always sin(np.pi) or cos(0)
295
- """
296
-
297
- def __init__(
298
- self,
299
- samp_rate,
300
- harmonic_num=0,
301
- sine_amp=0.1,
302
- noise_std=0.003,
303
- voiced_threshold=0,
304
- flag_for_pulse=False,
305
- ):
306
- super(SineGen, self).__init__()
307
- self.sine_amp = sine_amp
308
- self.noise_std = noise_std
309
- self.harmonic_num = harmonic_num
310
- self.dim = self.harmonic_num + 1
311
- self.sampling_rate = samp_rate
312
- self.voiced_threshold = voiced_threshold
313
-
314
- def _f02uv(self, f0):
315
- # generate uv signal
316
- uv = torch.ones_like(f0)
317
- uv = uv * (f0 > self.voiced_threshold)
318
- return uv
319
-
320
- def forward(self, f0, upp):
321
- """sine_tensor, uv = forward(f0)
322
- input F0: tensor(batchsize=1, length, dim=1)
323
- f0 for unvoiced steps should be 0
324
- output sine_tensor: tensor(batchsize=1, length, dim)
325
- output uv: tensor(batchsize=1, length, 1)
326
- """
327
- with torch.no_grad():
328
- f0 = f0[:, None].transpose(1, 2)
329
- f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
330
- # fundamental component
331
- f0_buf[:, :, 0] = f0[:, :, 0]
332
- for idx in np.arange(self.harmonic_num):
333
- f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
334
- idx + 2
335
- ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
336
- rad_values = (f0_buf / self.sampling_rate) % 1 ###%1意味着n_harηš„δΉ˜η§―ζ— ζ³•εŽε€„η†δΌ˜εŒ–
337
- rand_ini = torch.rand(
338
- f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
339
- )
340
- rand_ini[:, 0] = 0
341
- rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
342
- tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1ζ„ε‘³η€εŽι’ηš„cumsumζ— ζ³•ε†δΌ˜εŒ–
343
- tmp_over_one *= upp
344
- tmp_over_one = F.interpolate(
345
- tmp_over_one.transpose(2, 1),
346
- scale_factor=upp,
347
- mode="linear",
348
- align_corners=True,
349
- ).transpose(2, 1)
350
- rad_values = F.interpolate(
351
- rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
352
- ).transpose(
353
- 2, 1
354
- ) #######
355
- tmp_over_one %= 1
356
- tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
357
- cumsum_shift = torch.zeros_like(rad_values)
358
- cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
359
- sine_waves = torch.sin(
360
- torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
361
- )
362
- sine_waves = sine_waves * self.sine_amp
363
- uv = self._f02uv(f0)
364
- uv = F.interpolate(
365
- uv.transpose(2, 1), scale_factor=upp, mode="nearest"
366
- ).transpose(2, 1)
367
- noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
368
- noise = noise_amp * torch.randn_like(sine_waves)
369
- sine_waves = sine_waves * uv + noise
370
- return sine_waves, uv, noise
371
-
372
-
373
- class SourceModuleHnNSF(torch.nn.Module):
374
- """SourceModule for hn-nsf
375
- SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
376
- add_noise_std=0.003, voiced_threshod=0)
377
- sampling_rate: sampling_rate in Hz
378
- harmonic_num: number of harmonic above F0 (default: 0)
379
- sine_amp: amplitude of sine source signal (default: 0.1)
380
- add_noise_std: std of additive Gaussian noise (default: 0.003)
381
- note that amplitude of noise in unvoiced is decided
382
- by sine_amp
383
- voiced_threshold: threhold to set U/V given F0 (default: 0)
384
- Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
385
- F0_sampled (batchsize, length, 1)
386
- Sine_source (batchsize, length, 1)
387
- noise_source (batchsize, length 1)
388
- uv (batchsize, length, 1)
389
- """
390
-
391
- def __init__(
392
- self,
393
- sampling_rate,
394
- harmonic_num=0,
395
- sine_amp=0.1,
396
- add_noise_std=0.003,
397
- voiced_threshod=0,
398
- is_half=True,
399
- ):
400
- super(SourceModuleHnNSF, self).__init__()
401
-
402
- self.sine_amp = sine_amp
403
- self.noise_std = add_noise_std
404
- self.is_half = is_half
405
- # to produce sine waveforms
406
- self.l_sin_gen = SineGen(
407
- sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
408
- )
409
-
410
- # to merge source harmonics into a single excitation
411
- self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
412
- self.l_tanh = torch.nn.Tanh()
413
-
414
- def forward(self, x, upp=None):
415
- sine_wavs, uv, _ = self.l_sin_gen(x, upp)
416
- if self.is_half:
417
- sine_wavs = sine_wavs.half()
418
- sine_merge = self.l_tanh(self.l_linear(sine_wavs))
419
- return sine_merge, None, None # noise, uv
420
-
421
-
422
- class GeneratorNSF(torch.nn.Module):
423
- def __init__(
424
- self,
425
- initial_channel,
426
- resblock,
427
- resblock_kernel_sizes,
428
- resblock_dilation_sizes,
429
- upsample_rates,
430
- upsample_initial_channel,
431
- upsample_kernel_sizes,
432
- gin_channels,
433
- sr,
434
- is_half=False,
435
- ):
436
- super(GeneratorNSF, self).__init__()
437
- self.num_kernels = len(resblock_kernel_sizes)
438
- self.num_upsamples = len(upsample_rates)
439
-
440
- self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
441
- self.m_source = SourceModuleHnNSF(
442
- sampling_rate=sr, harmonic_num=0, is_half=is_half
443
- )
444
- self.noise_convs = nn.ModuleList()
445
- self.conv_pre = Conv1d(
446
- initial_channel, upsample_initial_channel, 7, 1, padding=3
447
- )
448
- resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
449
-
450
- self.ups = nn.ModuleList()
451
- for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
452
- c_cur = upsample_initial_channel // (2 ** (i + 1))
453
- self.ups.append(
454
- weight_norm(
455
- ConvTranspose1d(
456
- upsample_initial_channel // (2**i),
457
- upsample_initial_channel // (2 ** (i + 1)),
458
- k,
459
- u,
460
- padding=(k - u) // 2,
461
- )
462
- )
463
- )
464
- if i + 1 < len(upsample_rates):
465
- stride_f0 = np.prod(upsample_rates[i + 1 :])
466
- self.noise_convs.append(
467
- Conv1d(
468
- 1,
469
- c_cur,
470
- kernel_size=stride_f0 * 2,
471
- stride=stride_f0,
472
- padding=stride_f0 // 2,
473
- )
474
- )
475
- else:
476
- self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
477
-
478
- self.resblocks = nn.ModuleList()
479
- for i in range(len(self.ups)):
480
- ch = upsample_initial_channel // (2 ** (i + 1))
481
- for j, (k, d) in enumerate(
482
- zip(resblock_kernel_sizes, resblock_dilation_sizes)
483
- ):
484
- self.resblocks.append(resblock(ch, k, d))
485
-
486
- self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
487
- self.ups.apply(init_weights)
488
-
489
- if gin_channels != 0:
490
- self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
491
-
492
- self.upp = np.prod(upsample_rates)
493
-
494
- def forward(self, x, f0, g=None):
495
- har_source, noi_source, uv = self.m_source(f0, self.upp)
496
- har_source = har_source.transpose(1, 2)
497
- x = self.conv_pre(x)
498
- if g is not None:
499
- x = x + self.cond(g)
500
-
501
- for i in range(self.num_upsamples):
502
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
503
- x = self.ups[i](x)
504
- x_source = self.noise_convs[i](har_source)
505
- x = x + x_source
506
- xs = None
507
- for j in range(self.num_kernels):
508
- if xs is None:
509
- xs = self.resblocks[i * self.num_kernels + j](x)
510
- else:
511
- xs += self.resblocks[i * self.num_kernels + j](x)
512
- x = xs / self.num_kernels
513
- x = F.leaky_relu(x)
514
- x = self.conv_post(x)
515
- x = torch.tanh(x)
516
- return x
517
-
518
- def remove_weight_norm(self):
519
- for l in self.ups:
520
- remove_weight_norm(l)
521
- for l in self.resblocks:
522
- l.remove_weight_norm()
523
-
524
-
525
- sr2sr = {
526
- "32k": 32000,
527
- "40k": 40000,
528
- "48k": 48000,
529
- }
530
-
531
-
532
- class SynthesizerTrnMsNSFsidM(nn.Module):
533
- def __init__(
534
- self,
535
- spec_channels,
536
- segment_size,
537
- inter_channels,
538
- hidden_channels,
539
- filter_channels,
540
- n_heads,
541
- n_layers,
542
- kernel_size,
543
- p_dropout,
544
- resblock,
545
- resblock_kernel_sizes,
546
- resblock_dilation_sizes,
547
- upsample_rates,
548
- upsample_initial_channel,
549
- upsample_kernel_sizes,
550
- spk_embed_dim,
551
- gin_channels,
552
- sr,
553
- version,
554
- **kwargs
555
- ):
556
- super().__init__()
557
- if type(sr) == type("strr"):
558
- sr = sr2sr[sr]
559
- self.spec_channels = spec_channels
560
- self.inter_channels = inter_channels
561
- self.hidden_channels = hidden_channels
562
- self.filter_channels = filter_channels
563
- self.n_heads = n_heads
564
- self.n_layers = n_layers
565
- self.kernel_size = kernel_size
566
- self.p_dropout = p_dropout
567
- self.resblock = resblock
568
- self.resblock_kernel_sizes = resblock_kernel_sizes
569
- self.resblock_dilation_sizes = resblock_dilation_sizes
570
- self.upsample_rates = upsample_rates
571
- self.upsample_initial_channel = upsample_initial_channel
572
- self.upsample_kernel_sizes = upsample_kernel_sizes
573
- self.segment_size = segment_size
574
- self.gin_channels = gin_channels
575
- # self.hop_length = hop_length#
576
- self.spk_embed_dim = spk_embed_dim
577
- if version == "v1":
578
- self.enc_p = TextEncoder256(
579
- inter_channels,
580
- hidden_channels,
581
- filter_channels,
582
- n_heads,
583
- n_layers,
584
- kernel_size,
585
- p_dropout,
586
- )
587
- else:
588
- self.enc_p = TextEncoder768(
589
- inter_channels,
590
- hidden_channels,
591
- filter_channels,
592
- n_heads,
593
- n_layers,
594
- kernel_size,
595
- p_dropout,
596
- )
597
- self.dec = GeneratorNSF(
598
- inter_channels,
599
- resblock,
600
- resblock_kernel_sizes,
601
- resblock_dilation_sizes,
602
- upsample_rates,
603
- upsample_initial_channel,
604
- upsample_kernel_sizes,
605
- gin_channels=gin_channels,
606
- sr=sr,
607
- is_half=kwargs["is_half"],
608
- )
609
- self.enc_q = PosteriorEncoder(
610
- spec_channels,
611
- inter_channels,
612
- hidden_channels,
613
- 5,
614
- 1,
615
- 16,
616
- gin_channels=gin_channels,
617
- )
618
- self.flow = ResidualCouplingBlock(
619
- inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
620
- )
621
- self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
622
- self.speaker_map = None
623
- print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
624
-
625
- def remove_weight_norm(self):
626
- self.dec.remove_weight_norm()
627
- self.flow.remove_weight_norm()
628
- self.enc_q.remove_weight_norm()
629
-
630
- def construct_spkmixmap(self, n_speaker):
631
- self.speaker_map = torch.zeros((n_speaker, 1, 1, self.gin_channels))
632
- for i in range(n_speaker):
633
- self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]]))
634
- self.speaker_map = self.speaker_map.unsqueeze(0)
635
-
636
- def forward(self, phone, phone_lengths, pitch, nsff0, g, rnd, max_len=None):
637
- if self.speaker_map is not None: # [N, S] * [S, B, 1, H]
638
- g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1)) # [N, S, B, 1, 1]
639
- g = g * self.speaker_map # [N, S, B, 1, H]
640
- g = torch.sum(g, dim=1) # [N, 1, B, 1, H]
641
- g = g.transpose(0, -1).transpose(0, -2).squeeze(0) # [B, H, N]
642
- else:
643
- g = g.unsqueeze(0)
644
- g = self.emb_g(g).transpose(1, 2)
645
-
646
- m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
647
- z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask
648
- z = self.flow(z_p, x_mask, g=g, reverse=True)
649
- o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
650
- return o
651
-
652
-
653
- class MultiPeriodDiscriminator(torch.nn.Module):
654
- def __init__(self, use_spectral_norm=False):
655
- super(MultiPeriodDiscriminator, self).__init__()
656
- periods = [2, 3, 5, 7, 11, 17]
657
- # periods = [3, 5, 7, 11, 17, 23, 37]
658
-
659
- discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
660
- discs = discs + [
661
- DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
662
- ]
663
- self.discriminators = nn.ModuleList(discs)
664
-
665
- def forward(self, y, y_hat):
666
- y_d_rs = [] #
667
- y_d_gs = []
668
- fmap_rs = []
669
- fmap_gs = []
670
- for i, d in enumerate(self.discriminators):
671
- y_d_r, fmap_r = d(y)
672
- y_d_g, fmap_g = d(y_hat)
673
- # for j in range(len(fmap_r)):
674
- # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
675
- y_d_rs.append(y_d_r)
676
- y_d_gs.append(y_d_g)
677
- fmap_rs.append(fmap_r)
678
- fmap_gs.append(fmap_g)
679
-
680
- return y_d_rs, y_d_gs, fmap_rs, fmap_gs
681
-
682
-
683
- class MultiPeriodDiscriminatorV2(torch.nn.Module):
684
- def __init__(self, use_spectral_norm=False):
685
- super(MultiPeriodDiscriminatorV2, self).__init__()
686
- # periods = [2, 3, 5, 7, 11, 17]
687
- periods = [2, 3, 5, 7, 11, 17, 23, 37]
688
-
689
- discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
690
- discs = discs + [
691
- DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
692
- ]
693
- self.discriminators = nn.ModuleList(discs)
694
-
695
- def forward(self, y, y_hat):
696
- y_d_rs = [] #
697
- y_d_gs = []
698
- fmap_rs = []
699
- fmap_gs = []
700
- for i, d in enumerate(self.discriminators):
701
- y_d_r, fmap_r = d(y)
702
- y_d_g, fmap_g = d(y_hat)
703
- # for j in range(len(fmap_r)):
704
- # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
705
- y_d_rs.append(y_d_r)
706
- y_d_gs.append(y_d_g)
707
- fmap_rs.append(fmap_r)
708
- fmap_gs.append(fmap_g)
709
-
710
- return y_d_rs, y_d_gs, fmap_rs, fmap_gs
711
-
712
-
713
- class DiscriminatorS(torch.nn.Module):
714
- def __init__(self, use_spectral_norm=False):
715
- super(DiscriminatorS, self).__init__()
716
- norm_f = weight_norm if use_spectral_norm == False else spectral_norm
717
- self.convs = nn.ModuleList(
718
- [
719
- norm_f(Conv1d(1, 16, 15, 1, padding=7)),
720
- norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
721
- norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
722
- norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
723
- norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
724
- norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
725
- ]
726
- )
727
- self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
728
-
729
- def forward(self, x):
730
- fmap = []
731
-
732
- for l in self.convs:
733
- x = l(x)
734
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
735
- fmap.append(x)
736
- x = self.conv_post(x)
737
- fmap.append(x)
738
- x = torch.flatten(x, 1, -1)
739
-
740
- return x, fmap
741
-
742
-
743
- class DiscriminatorP(torch.nn.Module):
744
- def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
745
- super(DiscriminatorP, self).__init__()
746
- self.period = period
747
- self.use_spectral_norm = use_spectral_norm
748
- norm_f = weight_norm if use_spectral_norm == False else spectral_norm
749
- self.convs = nn.ModuleList(
750
- [
751
- norm_f(
752
- Conv2d(
753
- 1,
754
- 32,
755
- (kernel_size, 1),
756
- (stride, 1),
757
- padding=(get_padding(kernel_size, 1), 0),
758
- )
759
- ),
760
- norm_f(
761
- Conv2d(
762
- 32,
763
- 128,
764
- (kernel_size, 1),
765
- (stride, 1),
766
- padding=(get_padding(kernel_size, 1), 0),
767
- )
768
- ),
769
- norm_f(
770
- Conv2d(
771
- 128,
772
- 512,
773
- (kernel_size, 1),
774
- (stride, 1),
775
- padding=(get_padding(kernel_size, 1), 0),
776
- )
777
- ),
778
- norm_f(
779
- Conv2d(
780
- 512,
781
- 1024,
782
- (kernel_size, 1),
783
- (stride, 1),
784
- padding=(get_padding(kernel_size, 1), 0),
785
- )
786
- ),
787
- norm_f(
788
- Conv2d(
789
- 1024,
790
- 1024,
791
- (kernel_size, 1),
792
- 1,
793
- padding=(get_padding(kernel_size, 1), 0),
794
- )
795
- ),
796
- ]
797
- )
798
- self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
799
-
800
- def forward(self, x):
801
- fmap = []
802
-
803
- # 1d to 2d
804
- b, c, t = x.shape
805
- if t % self.period != 0: # pad first
806
- n_pad = self.period - (t % self.period)
807
- x = F.pad(x, (0, n_pad), "reflect")
808
- t = t + n_pad
809
- x = x.view(b, c, t // self.period, self.period)
810
-
811
- for l in self.convs:
812
- x = l(x)
813
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
814
- fmap.append(x)
815
- x = self.conv_post(x)
816
- fmap.append(x)
817
- x = torch.flatten(x, 1, -1)
818
-
819
- return x, fmap
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lib/infer_pack/modules.py DELETED
@@ -1,522 +0,0 @@
1
- import copy
2
- import math
3
- import numpy as np
4
- import scipy
5
- import torch
6
- from torch import nn
7
- from torch.nn import functional as F
8
-
9
- from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
10
- from torch.nn.utils import weight_norm, remove_weight_norm
11
-
12
- from lib.infer_pack import commons
13
- from lib.infer_pack.commons import init_weights, get_padding
14
- from lib.infer_pack.transforms import piecewise_rational_quadratic_transform
15
-
16
-
17
- LRELU_SLOPE = 0.1
18
-
19
-
20
- class LayerNorm(nn.Module):
21
- def __init__(self, channels, eps=1e-5):
22
- super().__init__()
23
- self.channels = channels
24
- self.eps = eps
25
-
26
- self.gamma = nn.Parameter(torch.ones(channels))
27
- self.beta = nn.Parameter(torch.zeros(channels))
28
-
29
- def forward(self, x):
30
- x = x.transpose(1, -1)
31
- x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
32
- return x.transpose(1, -1)
33
-
34
-
35
- class ConvReluNorm(nn.Module):
36
- def __init__(
37
- self,
38
- in_channels,
39
- hidden_channels,
40
- out_channels,
41
- kernel_size,
42
- n_layers,
43
- p_dropout,
44
- ):
45
- super().__init__()
46
- self.in_channels = in_channels
47
- self.hidden_channels = hidden_channels
48
- self.out_channels = out_channels
49
- self.kernel_size = kernel_size
50
- self.n_layers = n_layers
51
- self.p_dropout = p_dropout
52
- assert n_layers > 1, "Number of layers should be larger than 0."
53
-
54
- self.conv_layers = nn.ModuleList()
55
- self.norm_layers = nn.ModuleList()
56
- self.conv_layers.append(
57
- nn.Conv1d(
58
- in_channels, hidden_channels, kernel_size, padding=kernel_size // 2
59
- )
60
- )
61
- self.norm_layers.append(LayerNorm(hidden_channels))
62
- self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout))
63
- for _ in range(n_layers - 1):
64
- self.conv_layers.append(
65
- nn.Conv1d(
66
- hidden_channels,
67
- hidden_channels,
68
- kernel_size,
69
- padding=kernel_size // 2,
70
- )
71
- )
72
- self.norm_layers.append(LayerNorm(hidden_channels))
73
- self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
74
- self.proj.weight.data.zero_()
75
- self.proj.bias.data.zero_()
76
-
77
- def forward(self, x, x_mask):
78
- x_org = x
79
- for i in range(self.n_layers):
80
- x = self.conv_layers[i](x * x_mask)
81
- x = self.norm_layers[i](x)
82
- x = self.relu_drop(x)
83
- x = x_org + self.proj(x)
84
- return x * x_mask
85
-
86
-
87
- class DDSConv(nn.Module):
88
- """
89
- Dilated and Depth-Separable Convolution
90
- """
91
-
92
- def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0):
93
- super().__init__()
94
- self.channels = channels
95
- self.kernel_size = kernel_size
96
- self.n_layers = n_layers
97
- self.p_dropout = p_dropout
98
-
99
- self.drop = nn.Dropout(p_dropout)
100
- self.convs_sep = nn.ModuleList()
101
- self.convs_1x1 = nn.ModuleList()
102
- self.norms_1 = nn.ModuleList()
103
- self.norms_2 = nn.ModuleList()
104
- for i in range(n_layers):
105
- dilation = kernel_size**i
106
- padding = (kernel_size * dilation - dilation) // 2
107
- self.convs_sep.append(
108
- nn.Conv1d(
109
- channels,
110
- channels,
111
- kernel_size,
112
- groups=channels,
113
- dilation=dilation,
114
- padding=padding,
115
- )
116
- )
117
- self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
118
- self.norms_1.append(LayerNorm(channels))
119
- self.norms_2.append(LayerNorm(channels))
120
-
121
- def forward(self, x, x_mask, g=None):
122
- if g is not None:
123
- x = x + g
124
- for i in range(self.n_layers):
125
- y = self.convs_sep[i](x * x_mask)
126
- y = self.norms_1[i](y)
127
- y = F.gelu(y)
128
- y = self.convs_1x1[i](y)
129
- y = self.norms_2[i](y)
130
- y = F.gelu(y)
131
- y = self.drop(y)
132
- x = x + y
133
- return x * x_mask
134
-
135
-
136
- class WN(torch.nn.Module):
137
- def __init__(
138
- self,
139
- hidden_channels,
140
- kernel_size,
141
- dilation_rate,
142
- n_layers,
143
- gin_channels=0,
144
- p_dropout=0,
145
- ):
146
- super(WN, self).__init__()
147
- assert kernel_size % 2 == 1
148
- self.hidden_channels = hidden_channels
149
- self.kernel_size = (kernel_size,)
150
- self.dilation_rate = dilation_rate
151
- self.n_layers = n_layers
152
- self.gin_channels = gin_channels
153
- self.p_dropout = p_dropout
154
-
155
- self.in_layers = torch.nn.ModuleList()
156
- self.res_skip_layers = torch.nn.ModuleList()
157
- self.drop = nn.Dropout(p_dropout)
158
-
159
- if gin_channels != 0:
160
- cond_layer = torch.nn.Conv1d(
161
- gin_channels, 2 * hidden_channels * n_layers, 1
162
- )
163
- self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")
164
-
165
- for i in range(n_layers):
166
- dilation = dilation_rate**i
167
- padding = int((kernel_size * dilation - dilation) / 2)
168
- in_layer = torch.nn.Conv1d(
169
- hidden_channels,
170
- 2 * hidden_channels,
171
- kernel_size,
172
- dilation=dilation,
173
- padding=padding,
174
- )
175
- in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
176
- self.in_layers.append(in_layer)
177
-
178
- # last one is not necessary
179
- if i < n_layers - 1:
180
- res_skip_channels = 2 * hidden_channels
181
- else:
182
- res_skip_channels = hidden_channels
183
-
184
- res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
185
- res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight")
186
- self.res_skip_layers.append(res_skip_layer)
187
-
188
- def forward(self, x, x_mask, g=None, **kwargs):
189
- output = torch.zeros_like(x)
190
- n_channels_tensor = torch.IntTensor([self.hidden_channels])
191
-
192
- if g is not None:
193
- g = self.cond_layer(g)
194
-
195
- for i in range(self.n_layers):
196
- x_in = self.in_layers[i](x)
197
- if g is not None:
198
- cond_offset = i * 2 * self.hidden_channels
199
- g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
200
- else:
201
- g_l = torch.zeros_like(x_in)
202
-
203
- acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
204
- acts = self.drop(acts)
205
-
206
- res_skip_acts = self.res_skip_layers[i](acts)
207
- if i < self.n_layers - 1:
208
- res_acts = res_skip_acts[:, : self.hidden_channels, :]
209
- x = (x + res_acts) * x_mask
210
- output = output + res_skip_acts[:, self.hidden_channels :, :]
211
- else:
212
- output = output + res_skip_acts
213
- return output * x_mask
214
-
215
- def remove_weight_norm(self):
216
- if self.gin_channels != 0:
217
- torch.nn.utils.remove_weight_norm(self.cond_layer)
218
- for l in self.in_layers:
219
- torch.nn.utils.remove_weight_norm(l)
220
- for l in self.res_skip_layers:
221
- torch.nn.utils.remove_weight_norm(l)
222
-
223
-
224
- class ResBlock1(torch.nn.Module):
225
- def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
226
- super(ResBlock1, self).__init__()
227
- self.convs1 = nn.ModuleList(
228
- [
229
- weight_norm(
230
- Conv1d(
231
- channels,
232
- channels,
233
- kernel_size,
234
- 1,
235
- dilation=dilation[0],
236
- padding=get_padding(kernel_size, dilation[0]),
237
- )
238
- ),
239
- weight_norm(
240
- Conv1d(
241
- channels,
242
- channels,
243
- kernel_size,
244
- 1,
245
- dilation=dilation[1],
246
- padding=get_padding(kernel_size, dilation[1]),
247
- )
248
- ),
249
- weight_norm(
250
- Conv1d(
251
- channels,
252
- channels,
253
- kernel_size,
254
- 1,
255
- dilation=dilation[2],
256
- padding=get_padding(kernel_size, dilation[2]),
257
- )
258
- ),
259
- ]
260
- )
261
- self.convs1.apply(init_weights)
262
-
263
- self.convs2 = nn.ModuleList(
264
- [
265
- weight_norm(
266
- Conv1d(
267
- channels,
268
- channels,
269
- kernel_size,
270
- 1,
271
- dilation=1,
272
- padding=get_padding(kernel_size, 1),
273
- )
274
- ),
275
- weight_norm(
276
- Conv1d(
277
- channels,
278
- channels,
279
- kernel_size,
280
- 1,
281
- dilation=1,
282
- padding=get_padding(kernel_size, 1),
283
- )
284
- ),
285
- weight_norm(
286
- Conv1d(
287
- channels,
288
- channels,
289
- kernel_size,
290
- 1,
291
- dilation=1,
292
- padding=get_padding(kernel_size, 1),
293
- )
294
- ),
295
- ]
296
- )
297
- self.convs2.apply(init_weights)
298
-
299
- def forward(self, x, x_mask=None):
300
- for c1, c2 in zip(self.convs1, self.convs2):
301
- xt = F.leaky_relu(x, LRELU_SLOPE)
302
- if x_mask is not None:
303
- xt = xt * x_mask
304
- xt = c1(xt)
305
- xt = F.leaky_relu(xt, LRELU_SLOPE)
306
- if x_mask is not None:
307
- xt = xt * x_mask
308
- xt = c2(xt)
309
- x = xt + x
310
- if x_mask is not None:
311
- x = x * x_mask
312
- return x
313
-
314
- def remove_weight_norm(self):
315
- for l in self.convs1:
316
- remove_weight_norm(l)
317
- for l in self.convs2:
318
- remove_weight_norm(l)
319
-
320
-
321
- class ResBlock2(torch.nn.Module):
322
- def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
323
- super(ResBlock2, self).__init__()
324
- self.convs = nn.ModuleList(
325
- [
326
- weight_norm(
327
- Conv1d(
328
- channels,
329
- channels,
330
- kernel_size,
331
- 1,
332
- dilation=dilation[0],
333
- padding=get_padding(kernel_size, dilation[0]),
334
- )
335
- ),
336
- weight_norm(
337
- Conv1d(
338
- channels,
339
- channels,
340
- kernel_size,
341
- 1,
342
- dilation=dilation[1],
343
- padding=get_padding(kernel_size, dilation[1]),
344
- )
345
- ),
346
- ]
347
- )
348
- self.convs.apply(init_weights)
349
-
350
- def forward(self, x, x_mask=None):
351
- for c in self.convs:
352
- xt = F.leaky_relu(x, LRELU_SLOPE)
353
- if x_mask is not None:
354
- xt = xt * x_mask
355
- xt = c(xt)
356
- x = xt + x
357
- if x_mask is not None:
358
- x = x * x_mask
359
- return x
360
-
361
- def remove_weight_norm(self):
362
- for l in self.convs:
363
- remove_weight_norm(l)
364
-
365
-
366
- class Log(nn.Module):
367
- def forward(self, x, x_mask, reverse=False, **kwargs):
368
- if not reverse:
369
- y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
370
- logdet = torch.sum(-y, [1, 2])
371
- return y, logdet
372
- else:
373
- x = torch.exp(x) * x_mask
374
- return x
375
-
376
-
377
- class Flip(nn.Module):
378
- def forward(self, x, *args, reverse=False, **kwargs):
379
- x = torch.flip(x, [1])
380
- if not reverse:
381
- logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
382
- return x, logdet
383
- else:
384
- return x
385
-
386
-
387
- class ElementwiseAffine(nn.Module):
388
- def __init__(self, channels):
389
- super().__init__()
390
- self.channels = channels
391
- self.m = nn.Parameter(torch.zeros(channels, 1))
392
- self.logs = nn.Parameter(torch.zeros(channels, 1))
393
-
394
- def forward(self, x, x_mask, reverse=False, **kwargs):
395
- if not reverse:
396
- y = self.m + torch.exp(self.logs) * x
397
- y = y * x_mask
398
- logdet = torch.sum(self.logs * x_mask, [1, 2])
399
- return y, logdet
400
- else:
401
- x = (x - self.m) * torch.exp(-self.logs) * x_mask
402
- return x
403
-
404
-
405
- class ResidualCouplingLayer(nn.Module):
406
- def __init__(
407
- self,
408
- channels,
409
- hidden_channels,
410
- kernel_size,
411
- dilation_rate,
412
- n_layers,
413
- p_dropout=0,
414
- gin_channels=0,
415
- mean_only=False,
416
- ):
417
- assert channels % 2 == 0, "channels should be divisible by 2"
418
- super().__init__()
419
- self.channels = channels
420
- self.hidden_channels = hidden_channels
421
- self.kernel_size = kernel_size
422
- self.dilation_rate = dilation_rate
423
- self.n_layers = n_layers
424
- self.half_channels = channels // 2
425
- self.mean_only = mean_only
426
-
427
- self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
428
- self.enc = WN(
429
- hidden_channels,
430
- kernel_size,
431
- dilation_rate,
432
- n_layers,
433
- p_dropout=p_dropout,
434
- gin_channels=gin_channels,
435
- )
436
- self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
437
- self.post.weight.data.zero_()
438
- self.post.bias.data.zero_()
439
-
440
- def forward(self, x, x_mask, g=None, reverse=False):
441
- x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
442
- h = self.pre(x0) * x_mask
443
- h = self.enc(h, x_mask, g=g)
444
- stats = self.post(h) * x_mask
445
- if not self.mean_only:
446
- m, logs = torch.split(stats, [self.half_channels] * 2, 1)
447
- else:
448
- m = stats
449
- logs = torch.zeros_like(m)
450
-
451
- if not reverse:
452
- x1 = m + x1 * torch.exp(logs) * x_mask
453
- x = torch.cat([x0, x1], 1)
454
- logdet = torch.sum(logs, [1, 2])
455
- return x, logdet
456
- else:
457
- x1 = (x1 - m) * torch.exp(-logs) * x_mask
458
- x = torch.cat([x0, x1], 1)
459
- return x
460
-
461
- def remove_weight_norm(self):
462
- self.enc.remove_weight_norm()
463
-
464
-
465
- class ConvFlow(nn.Module):
466
- def __init__(
467
- self,
468
- in_channels,
469
- filter_channels,
470
- kernel_size,
471
- n_layers,
472
- num_bins=10,
473
- tail_bound=5.0,
474
- ):
475
- super().__init__()
476
- self.in_channels = in_channels
477
- self.filter_channels = filter_channels
478
- self.kernel_size = kernel_size
479
- self.n_layers = n_layers
480
- self.num_bins = num_bins
481
- self.tail_bound = tail_bound
482
- self.half_channels = in_channels // 2
483
-
484
- self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
485
- self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0)
486
- self.proj = nn.Conv1d(
487
- filter_channels, self.half_channels * (num_bins * 3 - 1), 1
488
- )
489
- self.proj.weight.data.zero_()
490
- self.proj.bias.data.zero_()
491
-
492
- def forward(self, x, x_mask, g=None, reverse=False):
493
- x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
494
- h = self.pre(x0)
495
- h = self.convs(h, x_mask, g=g)
496
- h = self.proj(h) * x_mask
497
-
498
- b, c, t = x0.shape
499
- h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
500
-
501
- unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels)
502
- unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt(
503
- self.filter_channels
504
- )
505
- unnormalized_derivatives = h[..., 2 * self.num_bins :]
506
-
507
- x1, logabsdet = piecewise_rational_quadratic_transform(
508
- x1,
509
- unnormalized_widths,
510
- unnormalized_heights,
511
- unnormalized_derivatives,
512
- inverse=reverse,
513
- tails="linear",
514
- tail_bound=self.tail_bound,
515
- )
516
-
517
- x = torch.cat([x0, x1], 1) * x_mask
518
- logdet = torch.sum(logabsdet * x_mask, [1, 2])
519
- if not reverse:
520
- return x, logdet
521
- else:
522
- return x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lib/infer_pack/modules/F0Predictor/DioF0Predictor.py DELETED
@@ -1,90 +0,0 @@
1
- from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
2
- import pyworld
3
- import numpy as np
4
-
5
-
6
- class DioF0Predictor(F0Predictor):
7
- def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
8
- self.hop_length = hop_length
9
- self.f0_min = f0_min
10
- self.f0_max = f0_max
11
- self.sampling_rate = sampling_rate
12
-
13
- def interpolate_f0(self, f0):
14
- """
15
- 对F0进行插值处理
16
- """
17
-
18
- data = np.reshape(f0, (f0.size, 1))
19
-
20
- vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
21
- vuv_vector[data > 0.0] = 1.0
22
- vuv_vector[data <= 0.0] = 0.0
23
-
24
- ip_data = data
25
-
26
- frame_number = data.size
27
- last_value = 0.0
28
- for i in range(frame_number):
29
- if data[i] <= 0.0:
30
- j = i + 1
31
- for j in range(i + 1, frame_number):
32
- if data[j] > 0.0:
33
- break
34
- if j < frame_number - 1:
35
- if last_value > 0.0:
36
- step = (data[j] - data[i - 1]) / float(j - i)
37
- for k in range(i, j):
38
- ip_data[k] = data[i - 1] + step * (k - i + 1)
39
- else:
40
- for k in range(i, j):
41
- ip_data[k] = data[j]
42
- else:
43
- for k in range(i, frame_number):
44
- ip_data[k] = last_value
45
- else:
46
- ip_data[i] = data[i] # 这里可能存在一个没有必要的拷贝
47
- last_value = data[i]
48
-
49
- return ip_data[:, 0], vuv_vector[:, 0]
50
-
51
- def resize_f0(self, x, target_len):
52
- source = np.array(x)
53
- source[source < 0.001] = np.nan
54
- target = np.interp(
55
- np.arange(0, len(source) * target_len, len(source)) / target_len,
56
- np.arange(0, len(source)),
57
- source,
58
- )
59
- res = np.nan_to_num(target)
60
- return res
61
-
62
- def compute_f0(self, wav, p_len=None):
63
- if p_len is None:
64
- p_len = wav.shape[0] // self.hop_length
65
- f0, t = pyworld.dio(
66
- wav.astype(np.double),
67
- fs=self.sampling_rate,
68
- f0_floor=self.f0_min,
69
- f0_ceil=self.f0_max,
70
- frame_period=1000 * self.hop_length / self.sampling_rate,
71
- )
72
- f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
73
- for index, pitch in enumerate(f0):
74
- f0[index] = round(pitch, 1)
75
- return self.interpolate_f0(self.resize_f0(f0, p_len))[0]
76
-
77
- def compute_f0_uv(self, wav, p_len=None):
78
- if p_len is None:
79
- p_len = wav.shape[0] // self.hop_length
80
- f0, t = pyworld.dio(
81
- wav.astype(np.double),
82
- fs=self.sampling_rate,
83
- f0_floor=self.f0_min,
84
- f0_ceil=self.f0_max,
85
- frame_period=1000 * self.hop_length / self.sampling_rate,
86
- )
87
- f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
88
- for index, pitch in enumerate(f0):
89
- f0[index] = round(pitch, 1)
90
- return self.interpolate_f0(self.resize_f0(f0, p_len))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lib/infer_pack/modules/F0Predictor/F0Predictor.py DELETED
@@ -1,16 +0,0 @@
1
- class F0Predictor(object):
2
- def compute_f0(self, wav, p_len):
3
- """
4
- input: wav:[signal_length]
5
- p_len:int
6
- output: f0:[signal_length//hop_length]
7
- """
8
- pass
9
-
10
- def compute_f0_uv(self, wav, p_len):
11
- """
12
- input: wav:[signal_length]
13
- p_len:int
14
- output: f0:[signal_length//hop_length],uv:[signal_length//hop_length]
15
- """
16
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py DELETED
@@ -1,86 +0,0 @@
1
- from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
2
- import pyworld
3
- import numpy as np
4
-
5
-
6
- class HarvestF0Predictor(F0Predictor):
7
- def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
8
- self.hop_length = hop_length
9
- self.f0_min = f0_min
10
- self.f0_max = f0_max
11
- self.sampling_rate = sampling_rate
12
-
13
- def interpolate_f0(self, f0):
14
- """
15
- 对F0进行插值处理
16
- """
17
-
18
- data = np.reshape(f0, (f0.size, 1))
19
-
20
- vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
21
- vuv_vector[data > 0.0] = 1.0
22
- vuv_vector[data <= 0.0] = 0.0
23
-
24
- ip_data = data
25
-
26
- frame_number = data.size
27
- last_value = 0.0
28
- for i in range(frame_number):
29
- if data[i] <= 0.0:
30
- j = i + 1
31
- for j in range(i + 1, frame_number):
32
- if data[j] > 0.0:
33
- break
34
- if j < frame_number - 1:
35
- if last_value > 0.0:
36
- step = (data[j] - data[i - 1]) / float(j - i)
37
- for k in range(i, j):
38
- ip_data[k] = data[i - 1] + step * (k - i + 1)
39
- else:
40
- for k in range(i, j):
41
- ip_data[k] = data[j]
42
- else:
43
- for k in range(i, frame_number):
44
- ip_data[k] = last_value
45
- else:
46
- ip_data[i] = data[i] # 这里可能存在一个没有必要的拷贝
47
- last_value = data[i]
48
-
49
- return ip_data[:, 0], vuv_vector[:, 0]
50
-
51
- def resize_f0(self, x, target_len):
52
- source = np.array(x)
53
- source[source < 0.001] = np.nan
54
- target = np.interp(
55
- np.arange(0, len(source) * target_len, len(source)) / target_len,
56
- np.arange(0, len(source)),
57
- source,
58
- )
59
- res = np.nan_to_num(target)
60
- return res
61
-
62
- def compute_f0(self, wav, p_len=None):
63
- if p_len is None:
64
- p_len = wav.shape[0] // self.hop_length
65
- f0, t = pyworld.harvest(
66
- wav.astype(np.double),
67
- fs=self.hop_length,
68
- f0_ceil=self.f0_max,
69
- f0_floor=self.f0_min,
70
- frame_period=1000 * self.hop_length / self.sampling_rate,
71
- )
72
- f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.fs)
73
- return self.interpolate_f0(self.resize_f0(f0, p_len))[0]
74
-
75
- def compute_f0_uv(self, wav, p_len=None):
76
- if p_len is None:
77
- p_len = wav.shape[0] // self.hop_length
78
- f0, t = pyworld.harvest(
79
- wav.astype(np.double),
80
- fs=self.sampling_rate,
81
- f0_floor=self.f0_min,
82
- f0_ceil=self.f0_max,
83
- frame_period=1000 * self.hop_length / self.sampling_rate,
84
- )
85
- f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
86
- return self.interpolate_f0(self.resize_f0(f0, p_len))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lib/infer_pack/modules/F0Predictor/PMF0Predictor.py DELETED
@@ -1,97 +0,0 @@
1
- from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
2
- import parselmouth
3
- import numpy as np
4
-
5
-
6
- class PMF0Predictor(F0Predictor):
7
- def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
8
- self.hop_length = hop_length
9
- self.f0_min = f0_min
10
- self.f0_max = f0_max
11
- self.sampling_rate = sampling_rate
12
-
13
- def interpolate_f0(self, f0):
14
- """
15
- 对F0进行插值处理
16
- """
17
-
18
- data = np.reshape(f0, (f0.size, 1))
19
-
20
- vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
21
- vuv_vector[data > 0.0] = 1.0
22
- vuv_vector[data <= 0.0] = 0.0
23
-
24
- ip_data = data
25
-
26
- frame_number = data.size
27
- last_value = 0.0
28
- for i in range(frame_number):
29
- if data[i] <= 0.0:
30
- j = i + 1
31
- for j in range(i + 1, frame_number):
32
- if data[j] > 0.0:
33
- break
34
- if j < frame_number - 1:
35
- if last_value > 0.0:
36
- step = (data[j] - data[i - 1]) / float(j - i)
37
- for k in range(i, j):
38
- ip_data[k] = data[i - 1] + step * (k - i + 1)
39
- else:
40
- for k in range(i, j):
41
- ip_data[k] = data[j]
42
- else:
43
- for k in range(i, frame_number):
44
- ip_data[k] = last_value
45
- else:
46
- ip_data[i] = data[i] # 这里可能存在一个没有必要的拷贝
47
- last_value = data[i]
48
-
49
- return ip_data[:, 0], vuv_vector[:, 0]
50
-
51
- def compute_f0(self, wav, p_len=None):
52
- x = wav
53
- if p_len is None:
54
- p_len = x.shape[0] // self.hop_length
55
- else:
56
- assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error"
57
- time_step = self.hop_length / self.sampling_rate * 1000
58
- f0 = (
59
- parselmouth.Sound(x, self.sampling_rate)
60
- .to_pitch_ac(
61
- time_step=time_step / 1000,
62
- voicing_threshold=0.6,
63
- pitch_floor=self.f0_min,
64
- pitch_ceiling=self.f0_max,
65
- )
66
- .selected_array["frequency"]
67
- )
68
-
69
- pad_size = (p_len - len(f0) + 1) // 2
70
- if pad_size > 0 or p_len - len(f0) - pad_size > 0:
71
- f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
72
- f0, uv = self.interpolate_f0(f0)
73
- return f0
74
-
75
- def compute_f0_uv(self, wav, p_len=None):
76
- x = wav
77
- if p_len is None:
78
- p_len = x.shape[0] // self.hop_length
79
- else:
80
- assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error"
81
- time_step = self.hop_length / self.sampling_rate * 1000
82
- f0 = (
83
- parselmouth.Sound(x, self.sampling_rate)
84
- .to_pitch_ac(
85
- time_step=time_step / 1000,
86
- voicing_threshold=0.6,
87
- pitch_floor=self.f0_min,
88
- pitch_ceiling=self.f0_max,
89
- )
90
- .selected_array["frequency"]
91
- )
92
-
93
- pad_size = (p_len - len(f0) + 1) // 2
94
- if pad_size > 0 or p_len - len(f0) - pad_size > 0:
95
- f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
96
- f0, uv = self.interpolate_f0(f0)
97
- return f0, uv
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lib/infer_pack/modules/F0Predictor/__init__.py DELETED
File without changes
lib/infer_pack/onnx_inference.py DELETED
@@ -1,145 +0,0 @@
1
- import onnxruntime
2
- import librosa
3
- import numpy as np
4
- import soundfile
5
-
6
-
7
- class ContentVec:
8
- def __init__(self, vec_path="pretrained/vec-768-layer-12.onnx", device=None):
9
- print("load model(s) from {}".format(vec_path))
10
- if device == "cpu" or device is None:
11
- providers = ["CPUExecutionProvider"]
12
- elif device == "cuda":
13
- providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
14
- elif device == "dml":
15
- providers = ["DmlExecutionProvider"]
16
- else:
17
- raise RuntimeError("Unsportted Device")
18
- self.model = onnxruntime.InferenceSession(vec_path, providers=providers)
19
-
20
- def __call__(self, wav):
21
- return self.forward(wav)
22
-
23
- def forward(self, wav):
24
- feats = wav
25
- if feats.ndim == 2: # double channels
26
- feats = feats.mean(-1)
27
- assert feats.ndim == 1, feats.ndim
28
- feats = np.expand_dims(np.expand_dims(feats, 0), 0)
29
- onnx_input = {self.model.get_inputs()[0].name: feats}
30
- logits = self.model.run(None, onnx_input)[0]
31
- return logits.transpose(0, 2, 1)
32
-
33
-
34
- def get_f0_predictor(f0_predictor, hop_length, sampling_rate, **kargs):
35
- if f0_predictor == "pm":
36
- from lib.infer_pack.modules.F0Predictor.PMF0Predictor import PMF0Predictor
37
-
38
- f0_predictor_object = PMF0Predictor(
39
- hop_length=hop_length, sampling_rate=sampling_rate
40
- )
41
- elif f0_predictor == "harvest":
42
- from lib.infer_pack.modules.F0Predictor.HarvestF0Predictor import (
43
- HarvestF0Predictor,
44
- )
45
-
46
- f0_predictor_object = HarvestF0Predictor(
47
- hop_length=hop_length, sampling_rate=sampling_rate
48
- )
49
- elif f0_predictor == "dio":
50
- from lib.infer_pack.modules.F0Predictor.DioF0Predictor import DioF0Predictor
51
-
52
- f0_predictor_object = DioF0Predictor(
53
- hop_length=hop_length, sampling_rate=sampling_rate
54
- )
55
- else:
56
- raise Exception("Unknown f0 predictor")
57
- return f0_predictor_object
58
-
59
-
60
- class OnnxRVC:
61
- def __init__(
62
- self,
63
- model_path,
64
- sr=40000,
65
- hop_size=512,
66
- vec_path="vec-768-layer-12",
67
- device="cpu",
68
- ):
69
- vec_path = f"pretrained/{vec_path}.onnx"
70
- self.vec_model = ContentVec(vec_path, device)
71
- if device == "cpu" or device is None:
72
- providers = ["CPUExecutionProvider"]
73
- elif device == "cuda":
74
- providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
75
- elif device == "dml":
76
- providers = ["DmlExecutionProvider"]
77
- else:
78
- raise RuntimeError("Unsportted Device")
79
- self.model = onnxruntime.InferenceSession(model_path, providers=providers)
80
- self.sampling_rate = sr
81
- self.hop_size = hop_size
82
-
83
- def forward(self, hubert, hubert_length, pitch, pitchf, ds, rnd):
84
- onnx_input = {
85
- self.model.get_inputs()[0].name: hubert,
86
- self.model.get_inputs()[1].name: hubert_length,
87
- self.model.get_inputs()[2].name: pitch,
88
- self.model.get_inputs()[3].name: pitchf,
89
- self.model.get_inputs()[4].name: ds,
90
- self.model.get_inputs()[5].name: rnd,
91
- }
92
- return (self.model.run(None, onnx_input)[0] * 32767).astype(np.int16)
93
-
94
- def inference(
95
- self,
96
- raw_path,
97
- sid,
98
- f0_method="dio",
99
- f0_up_key=0,
100
- pad_time=0.5,
101
- cr_threshold=0.02,
102
- ):
103
- f0_min = 50
104
- f0_max = 1100
105
- f0_mel_min = 1127 * np.log(1 + f0_min / 700)
106
- f0_mel_max = 1127 * np.log(1 + f0_max / 700)
107
- f0_predictor = get_f0_predictor(
108
- f0_method,
109
- hop_length=self.hop_size,
110
- sampling_rate=self.sampling_rate,
111
- threshold=cr_threshold,
112
- )
113
- wav, sr = librosa.load(raw_path, sr=self.sampling_rate)
114
- org_length = len(wav)
115
- if org_length / sr > 50.0:
116
- raise RuntimeError("Reached Max Length")
117
-
118
- wav16k = librosa.resample(wav, orig_sr=self.sampling_rate, target_sr=16000)
119
- wav16k = wav16k
120
-
121
- hubert = self.vec_model(wav16k)
122
- hubert = np.repeat(hubert, 2, axis=2).transpose(0, 2, 1).astype(np.float32)
123
- hubert_length = hubert.shape[1]
124
-
125
- pitchf = f0_predictor.compute_f0(wav, hubert_length)
126
- pitchf = pitchf * 2 ** (f0_up_key / 12)
127
- pitch = pitchf.copy()
128
- f0_mel = 1127 * np.log(1 + pitch / 700)
129
- f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
130
- f0_mel_max - f0_mel_min
131
- ) + 1
132
- f0_mel[f0_mel <= 1] = 1
133
- f0_mel[f0_mel > 255] = 255
134
- pitch = np.rint(f0_mel).astype(np.int64)
135
-
136
- pitchf = pitchf.reshape(1, len(pitchf)).astype(np.float32)
137
- pitch = pitch.reshape(1, len(pitch))
138
- ds = np.array([sid]).astype(np.int64)
139
-
140
- rnd = np.random.randn(1, 192, hubert_length).astype(np.float32)
141
- hubert_length = np.array([hubert_length]).astype(np.int64)
142
-
143
- out_wav = self.forward(hubert, hubert_length, pitch, pitchf, ds, rnd).squeeze()
144
- out_wav = np.pad(out_wav, (0, 2 * self.hop_size), "constant")
145
- return out_wav[0:org_length]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lib/infer_pack/transforms.py DELETED
@@ -1,209 +0,0 @@
1
- import torch
2
- from torch.nn import functional as F
3
-
4
- import numpy as np
5
-
6
-
7
- DEFAULT_MIN_BIN_WIDTH = 1e-3
8
- DEFAULT_MIN_BIN_HEIGHT = 1e-3
9
- DEFAULT_MIN_DERIVATIVE = 1e-3
10
-
11
-
12
- def piecewise_rational_quadratic_transform(
13
- inputs,
14
- unnormalized_widths,
15
- unnormalized_heights,
16
- unnormalized_derivatives,
17
- inverse=False,
18
- tails=None,
19
- tail_bound=1.0,
20
- min_bin_width=DEFAULT_MIN_BIN_WIDTH,
21
- min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
22
- min_derivative=DEFAULT_MIN_DERIVATIVE,
23
- ):
24
- if tails is None:
25
- spline_fn = rational_quadratic_spline
26
- spline_kwargs = {}
27
- else:
28
- spline_fn = unconstrained_rational_quadratic_spline
29
- spline_kwargs = {"tails": tails, "tail_bound": tail_bound}
30
-
31
- outputs, logabsdet = spline_fn(
32
- inputs=inputs,
33
- unnormalized_widths=unnormalized_widths,
34
- unnormalized_heights=unnormalized_heights,
35
- unnormalized_derivatives=unnormalized_derivatives,
36
- inverse=inverse,
37
- min_bin_width=min_bin_width,
38
- min_bin_height=min_bin_height,
39
- min_derivative=min_derivative,
40
- **spline_kwargs
41
- )
42
- return outputs, logabsdet
43
-
44
-
45
- def searchsorted(bin_locations, inputs, eps=1e-6):
46
- bin_locations[..., -1] += eps
47
- return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1
48
-
49
-
50
- def unconstrained_rational_quadratic_spline(
51
- inputs,
52
- unnormalized_widths,
53
- unnormalized_heights,
54
- unnormalized_derivatives,
55
- inverse=False,
56
- tails="linear",
57
- tail_bound=1.0,
58
- min_bin_width=DEFAULT_MIN_BIN_WIDTH,
59
- min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
60
- min_derivative=DEFAULT_MIN_DERIVATIVE,
61
- ):
62
- inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
63
- outside_interval_mask = ~inside_interval_mask
64
-
65
- outputs = torch.zeros_like(inputs)
66
- logabsdet = torch.zeros_like(inputs)
67
-
68
- if tails == "linear":
69
- unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
70
- constant = np.log(np.exp(1 - min_derivative) - 1)
71
- unnormalized_derivatives[..., 0] = constant
72
- unnormalized_derivatives[..., -1] = constant
73
-
74
- outputs[outside_interval_mask] = inputs[outside_interval_mask]
75
- logabsdet[outside_interval_mask] = 0
76
- else:
77
- raise RuntimeError("{} tails are not implemented.".format(tails))
78
-
79
- (
80
- outputs[inside_interval_mask],
81
- logabsdet[inside_interval_mask],
82
- ) = rational_quadratic_spline(
83
- inputs=inputs[inside_interval_mask],
84
- unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
85
- unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
86
- unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
87
- inverse=inverse,
88
- left=-tail_bound,
89
- right=tail_bound,
90
- bottom=-tail_bound,
91
- top=tail_bound,
92
- min_bin_width=min_bin_width,
93
- min_bin_height=min_bin_height,
94
- min_derivative=min_derivative,
95
- )
96
-
97
- return outputs, logabsdet
98
-
99
-
100
- def rational_quadratic_spline(
101
- inputs,
102
- unnormalized_widths,
103
- unnormalized_heights,
104
- unnormalized_derivatives,
105
- inverse=False,
106
- left=0.0,
107
- right=1.0,
108
- bottom=0.0,
109
- top=1.0,
110
- min_bin_width=DEFAULT_MIN_BIN_WIDTH,
111
- min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
112
- min_derivative=DEFAULT_MIN_DERIVATIVE,
113
- ):
114
- if torch.min(inputs) < left or torch.max(inputs) > right:
115
- raise ValueError("Input to a transform is not within its domain")
116
-
117
- num_bins = unnormalized_widths.shape[-1]
118
-
119
- if min_bin_width * num_bins > 1.0:
120
- raise ValueError("Minimal bin width too large for the number of bins")
121
- if min_bin_height * num_bins > 1.0:
122
- raise ValueError("Minimal bin height too large for the number of bins")
123
-
124
- widths = F.softmax(unnormalized_widths, dim=-1)
125
- widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
126
- cumwidths = torch.cumsum(widths, dim=-1)
127
- cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0)
128
- cumwidths = (right - left) * cumwidths + left
129
- cumwidths[..., 0] = left
130
- cumwidths[..., -1] = right
131
- widths = cumwidths[..., 1:] - cumwidths[..., :-1]
132
-
133
- derivatives = min_derivative + F.softplus(unnormalized_derivatives)
134
-
135
- heights = F.softmax(unnormalized_heights, dim=-1)
136
- heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
137
- cumheights = torch.cumsum(heights, dim=-1)
138
- cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0)
139
- cumheights = (top - bottom) * cumheights + bottom
140
- cumheights[..., 0] = bottom
141
- cumheights[..., -1] = top
142
- heights = cumheights[..., 1:] - cumheights[..., :-1]
143
-
144
- if inverse:
145
- bin_idx = searchsorted(cumheights, inputs)[..., None]
146
- else:
147
- bin_idx = searchsorted(cumwidths, inputs)[..., None]
148
-
149
- input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
150
- input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
151
-
152
- input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
153
- delta = heights / widths
154
- input_delta = delta.gather(-1, bin_idx)[..., 0]
155
-
156
- input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
157
- input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
158
-
159
- input_heights = heights.gather(-1, bin_idx)[..., 0]
160
-
161
- if inverse:
162
- a = (inputs - input_cumheights) * (
163
- input_derivatives + input_derivatives_plus_one - 2 * input_delta
164
- ) + input_heights * (input_delta - input_derivatives)
165
- b = input_heights * input_derivatives - (inputs - input_cumheights) * (
166
- input_derivatives + input_derivatives_plus_one - 2 * input_delta
167
- )
168
- c = -input_delta * (inputs - input_cumheights)
169
-
170
- discriminant = b.pow(2) - 4 * a * c
171
- assert (discriminant >= 0).all()
172
-
173
- root = (2 * c) / (-b - torch.sqrt(discriminant))
174
- outputs = root * input_bin_widths + input_cumwidths
175
-
176
- theta_one_minus_theta = root * (1 - root)
177
- denominator = input_delta + (
178
- (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
179
- * theta_one_minus_theta
180
- )
181
- derivative_numerator = input_delta.pow(2) * (
182
- input_derivatives_plus_one * root.pow(2)
183
- + 2 * input_delta * theta_one_minus_theta
184
- + input_derivatives * (1 - root).pow(2)
185
- )
186
- logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
187
-
188
- return outputs, -logabsdet
189
- else:
190
- theta = (inputs - input_cumwidths) / input_bin_widths
191
- theta_one_minus_theta = theta * (1 - theta)
192
-
193
- numerator = input_heights * (
194
- input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta
195
- )
196
- denominator = input_delta + (
197
- (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
198
- * theta_one_minus_theta
199
- )
200
- outputs = input_cumheights + numerator / denominator
201
-
202
- derivative_numerator = input_delta.pow(2) * (
203
- input_derivatives_plus_one * theta.pow(2)
204
- + 2 * input_delta * theta_one_minus_theta
205
- + input_derivatives * (1 - theta).pow(2)
206
- )
207
- logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
208
-
209
- return outputs, logabsdet
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lib/rmvpe.py DELETED
@@ -1,432 +0,0 @@
1
- import torch, numpy as np
2
- import torch.nn as nn
3
- import torch.nn.functional as F
4
-
5
-
6
-
7
- class BiGRU(nn.Module):
8
- def __init__(self, input_features, hidden_features, num_layers):
9
- super(BiGRU, self).__init__()
10
- self.gru = nn.GRU(
11
- input_features,
12
- hidden_features,
13
- num_layers=num_layers,
14
- batch_first=True,
15
- bidirectional=True,
16
- )
17
-
18
- def forward(self, x):
19
- return self.gru(x)[0]
20
-
21
-
22
- class ConvBlockRes(nn.Module):
23
- def __init__(self, in_channels, out_channels, momentum=0.01):
24
- super(ConvBlockRes, self).__init__()
25
- self.conv = nn.Sequential(
26
- nn.Conv2d(
27
- in_channels=in_channels,
28
- out_channels=out_channels,
29
- kernel_size=(3, 3),
30
- stride=(1, 1),
31
- padding=(1, 1),
32
- bias=False,
33
- ),
34
- nn.BatchNorm2d(out_channels, momentum=momentum),
35
- nn.ReLU(),
36
- nn.Conv2d(
37
- in_channels=out_channels,
38
- out_channels=out_channels,
39
- kernel_size=(3, 3),
40
- stride=(1, 1),
41
- padding=(1, 1),
42
- bias=False,
43
- ),
44
- nn.BatchNorm2d(out_channels, momentum=momentum),
45
- nn.ReLU(),
46
- )
47
- if in_channels != out_channels:
48
- self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))
49
- self.is_shortcut = True
50
- else:
51
- self.is_shortcut = False
52
-
53
- def forward(self, x):
54
- if self.is_shortcut:
55
- return self.conv(x) + self.shortcut(x)
56
- else:
57
- return self.conv(x) + x
58
-
59
-
60
- class Encoder(nn.Module):
61
- def __init__(
62
- self,
63
- in_channels,
64
- in_size,
65
- n_encoders,
66
- kernel_size,
67
- n_blocks,
68
- out_channels=16,
69
- momentum=0.01,
70
- ):
71
- super(Encoder, self).__init__()
72
- self.n_encoders = n_encoders
73
- self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
74
- self.layers = nn.ModuleList()
75
- self.latent_channels = []
76
- for i in range(self.n_encoders):
77
- self.layers.append(
78
- ResEncoderBlock(
79
- in_channels, out_channels, kernel_size, n_blocks, momentum=momentum
80
- )
81
- )
82
- self.latent_channels.append([out_channels, in_size])
83
- in_channels = out_channels
84
- out_channels *= 2
85
- in_size //= 2
86
- self.out_size = in_size
87
- self.out_channel = out_channels
88
-
89
- def forward(self, x):
90
- concat_tensors = []
91
- x = self.bn(x)
92
- for i in range(self.n_encoders):
93
- _, x = self.layers[i](x)
94
- concat_tensors.append(_)
95
- return x, concat_tensors
96
-
97
-
98
- class ResEncoderBlock(nn.Module):
99
- def __init__(
100
- self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01
101
- ):
102
- super(ResEncoderBlock, self).__init__()
103
- self.n_blocks = n_blocks
104
- self.conv = nn.ModuleList()
105
- self.conv.append(ConvBlockRes(in_channels, out_channels, momentum))
106
- for i in range(n_blocks - 1):
107
- self.conv.append(ConvBlockRes(out_channels, out_channels, momentum))
108
- self.kernel_size = kernel_size
109
- if self.kernel_size is not None:
110
- self.pool = nn.AvgPool2d(kernel_size=kernel_size)
111
-
112
- def forward(self, x):
113
- for i in range(self.n_blocks):
114
- x = self.conv[i](x)
115
- if self.kernel_size is not None:
116
- return x, self.pool(x)
117
- else:
118
- return x
119
-
120
-
121
- class Intermediate(nn.Module): #
122
- def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):
123
- super(Intermediate, self).__init__()
124
- self.n_inters = n_inters
125
- self.layers = nn.ModuleList()
126
- self.layers.append(
127
- ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum)
128
- )
129
- for i in range(self.n_inters - 1):
130
- self.layers.append(
131
- ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum)
132
- )
133
-
134
- def forward(self, x):
135
- for i in range(self.n_inters):
136
- x = self.layers[i](x)
137
- return x
138
-
139
-
140
- class ResDecoderBlock(nn.Module):
141
- def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01):
142
- super(ResDecoderBlock, self).__init__()
143
- out_padding = (0, 1) if stride == (1, 2) else (1, 1)
144
- self.n_blocks = n_blocks
145
- self.conv1 = nn.Sequential(
146
- nn.ConvTranspose2d(
147
- in_channels=in_channels,
148
- out_channels=out_channels,
149
- kernel_size=(3, 3),
150
- stride=stride,
151
- padding=(1, 1),
152
- output_padding=out_padding,
153
- bias=False,
154
- ),
155
- nn.BatchNorm2d(out_channels, momentum=momentum),
156
- nn.ReLU(),
157
- )
158
- self.conv2 = nn.ModuleList()
159
- self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum))
160
- for i in range(n_blocks - 1):
161
- self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum))
162
-
163
- def forward(self, x, concat_tensor):
164
- x = self.conv1(x)
165
- x = torch.cat((x, concat_tensor), dim=1)
166
- for i in range(self.n_blocks):
167
- x = self.conv2[i](x)
168
- return x
169
-
170
-
171
- class Decoder(nn.Module):
172
- def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
173
- super(Decoder, self).__init__()
174
- self.layers = nn.ModuleList()
175
- self.n_decoders = n_decoders
176
- for i in range(self.n_decoders):
177
- out_channels = in_channels // 2
178
- self.layers.append(
179
- ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum)
180
- )
181
- in_channels = out_channels
182
-
183
- def forward(self, x, concat_tensors):
184
- for i in range(self.n_decoders):
185
- x = self.layers[i](x, concat_tensors[-1 - i])
186
- return x
187
-
188
-
189
- class DeepUnet(nn.Module):
190
- def __init__(
191
- self,
192
- kernel_size,
193
- n_blocks,
194
- en_de_layers=5,
195
- inter_layers=4,
196
- in_channels=1,
197
- en_out_channels=16,
198
- ):
199
- super(DeepUnet, self).__init__()
200
- self.encoder = Encoder(
201
- in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels
202
- )
203
- self.intermediate = Intermediate(
204
- self.encoder.out_channel // 2,
205
- self.encoder.out_channel,
206
- inter_layers,
207
- n_blocks,
208
- )
209
- self.decoder = Decoder(
210
- self.encoder.out_channel, en_de_layers, kernel_size, n_blocks
211
- )
212
-
213
- def forward(self, x):
214
- x, concat_tensors = self.encoder(x)
215
- x = self.intermediate(x)
216
- x = self.decoder(x, concat_tensors)
217
- return x
218
-
219
-
220
- class E2E(nn.Module):
221
- def __init__(
222
- self,
223
- n_blocks,
224
- n_gru,
225
- kernel_size,
226
- en_de_layers=5,
227
- inter_layers=4,
228
- in_channels=1,
229
- en_out_channels=16,
230
- ):
231
- super(E2E, self).__init__()
232
- self.unet = DeepUnet(
233
- kernel_size,
234
- n_blocks,
235
- en_de_layers,
236
- inter_layers,
237
- in_channels,
238
- en_out_channels,
239
- )
240
- self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
241
- if n_gru:
242
- self.fc = nn.Sequential(
243
- BiGRU(3 * 128, 256, n_gru),
244
- nn.Linear(512, 360),
245
- nn.Dropout(0.25),
246
- nn.Sigmoid(),
247
- )
248
- else:
249
- self.fc = nn.Sequential(
250
- nn.Linear(3 * nn.N_MELS, nn.N_CLASS), nn.Dropout(0.25), nn.Sigmoid()
251
- )
252
-
253
- def forward(self, mel):
254
- mel = mel.transpose(-1, -2).unsqueeze(1)
255
- x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2)
256
- x = self.fc(x)
257
- return x
258
-
259
-
260
- from librosa.filters import mel
261
-
262
-
263
- class MelSpectrogram(torch.nn.Module):
264
- def __init__(
265
- self,
266
- is_half,
267
- n_mel_channels,
268
- sampling_rate,
269
- win_length,
270
- hop_length,
271
- n_fft=None,
272
- mel_fmin=0,
273
- mel_fmax=None,
274
- clamp=1e-5,
275
- ):
276
- super().__init__()
277
- n_fft = win_length if n_fft is None else n_fft
278
- self.hann_window = {}
279
- mel_basis = mel(
280
- sr=sampling_rate,
281
- n_fft=n_fft,
282
- n_mels=n_mel_channels,
283
- fmin=mel_fmin,
284
- fmax=mel_fmax,
285
- htk=True,
286
- )
287
- mel_basis = torch.from_numpy(mel_basis).float()
288
- self.register_buffer("mel_basis", mel_basis)
289
- self.n_fft = win_length if n_fft is None else n_fft
290
- self.hop_length = hop_length
291
- self.win_length = win_length
292
- self.sampling_rate = sampling_rate
293
- self.n_mel_channels = n_mel_channels
294
- self.clamp = clamp
295
- self.is_half = is_half
296
-
297
- def forward(self, audio, keyshift=0, speed=1, center=True):
298
- factor = 2 ** (keyshift / 12)
299
- n_fft_new = int(np.round(self.n_fft * factor))
300
- win_length_new = int(np.round(self.win_length * factor))
301
- hop_length_new = int(np.round(self.hop_length * speed))
302
- keyshift_key = str(keyshift) + "_" + str(audio.device)
303
- if keyshift_key not in self.hann_window:
304
- self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(
305
- audio.device
306
- )
307
- fft = torch.stft(
308
- audio,
309
- n_fft=n_fft_new,
310
- hop_length=hop_length_new,
311
- win_length=win_length_new,
312
- window=self.hann_window[keyshift_key],
313
- center=center,
314
- return_complex=True,
315
- )
316
- magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2))
317
- if keyshift != 0:
318
- size = self.n_fft // 2 + 1
319
- resize = magnitude.size(1)
320
- if resize < size:
321
- magnitude = F.pad(magnitude, (0, 0, 0, size - resize))
322
- magnitude = magnitude[:, :size, :] * self.win_length / win_length_new
323
- mel_output = torch.matmul(self.mel_basis, magnitude)
324
- if self.is_half == True:
325
- mel_output = mel_output.half()
326
- log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp))
327
- return log_mel_spec
328
-
329
-
330
- class RMVPE:
331
- def __init__(self, model_path, is_half, device=None):
332
- self.resample_kernel = {}
333
- model = E2E(4, 1, (2, 2))
334
- ckpt = torch.load(model_path, map_location="cpu")
335
- model.load_state_dict(ckpt)
336
- model.eval()
337
- if is_half == True:
338
- model = model.half()
339
- self.model = model
340
- self.resample_kernel = {}
341
- self.is_half = is_half
342
- if device is None:
343
- device = "cuda" if torch.cuda.is_available() else "cpu"
344
- self.device = device
345
- self.mel_extractor = MelSpectrogram(
346
- is_half, 128, 16000, 1024, 160, None, 30, 8000
347
- ).to(device)
348
- self.model = self.model.to(device)
349
- cents_mapping = 20 * np.arange(360) + 1997.3794084376191
350
- self.cents_mapping = np.pad(cents_mapping, (4, 4)) # 368
351
-
352
- def mel2hidden(self, mel):
353
- with torch.no_grad():
354
- n_frames = mel.shape[-1]
355
- mel = F.pad(
356
- mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="reflect"
357
- )
358
- hidden = self.model(mel)
359
- return hidden[:, :n_frames]
360
-
361
- def decode(self, hidden, thred=0.03):
362
- cents_pred = self.to_local_average_cents(hidden, thred=thred)
363
- f0 = 10 * (2 ** (cents_pred / 1200))
364
- f0[f0 == 10] = 0
365
- # f0 = np.array([10 * (2 ** (cent_pred / 1200)) if cent_pred else 0 for cent_pred in cents_pred])
366
- return f0
367
-
368
- def infer_from_audio(self, audio, thred=0.03):
369
- audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0)
370
- # torch.cuda.synchronize()
371
- # t0=ttime()
372
- mel = self.mel_extractor(audio, center=True)
373
- # torch.cuda.synchronize()
374
- # t1=ttime()
375
- hidden = self.mel2hidden(mel)
376
- # torch.cuda.synchronize()
377
- # t2=ttime()
378
- hidden = hidden.squeeze(0).cpu().numpy()
379
- if self.is_half == True:
380
- hidden = hidden.astype("float32")
381
- f0 = self.decode(hidden, thred=thred)
382
- # torch.cuda.synchronize()
383
- # t3=ttime()
384
- # print("hmvpe:%s\t%s\t%s\t%s"%(t1-t0,t2-t1,t3-t2,t3-t0))
385
- return f0
386
-
387
- def to_local_average_cents(self, salience, thred=0.05):
388
- # t0 = ttime()
389
- center = np.argmax(salience, axis=1) # frame length#index
390
- salience = np.pad(salience, ((0, 0), (4, 4))) # frame length,368
391
- # t1 = ttime()
392
- center += 4
393
- todo_salience = []
394
- todo_cents_mapping = []
395
- starts = center - 4
396
- ends = center + 5
397
- for idx in range(salience.shape[0]):
398
- todo_salience.append(salience[:, starts[idx] : ends[idx]][idx])
399
- todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]])
400
- # t2 = ttime()
401
- todo_salience = np.array(todo_salience) # frame length,9
402
- todo_cents_mapping = np.array(todo_cents_mapping) # frame length,9
403
- product_sum = np.sum(todo_salience * todo_cents_mapping, 1)
404
- weight_sum = np.sum(todo_salience, 1) # frame length
405
- devided = product_sum / weight_sum # frame length
406
- # t3 = ttime()
407
- maxx = np.max(salience, axis=1) # frame length
408
- devided[maxx <= thred] = 0
409
- # t4 = ttime()
410
- # print("decode:%s\t%s\t%s\t%s" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))
411
- return devided
412
-
413
-
414
- # if __name__ == '__main__':
415
- # audio, sampling_rate = sf.read("Quotations~1.wav") ### edit
416
- # if len(audio.shape) > 1:
417
- # audio = librosa.to_mono(audio.transpose(1, 0))
418
- # audio_bak = audio.copy()
419
- # if sampling_rate != 16000:
420
- # audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
421
- # model_path = "/bili-coeus/jupyter/jupyterhub-liujing04/vits_ch/test-RMVPE/weights/rmvpe_llc_half.pt"
422
- # thred = 0.03 # 0.01
423
- # device = 'cuda' if torch.cuda.is_available() else 'cpu'
424
- # rmvpe = RMVPE(model_path,is_half=False, device=device)
425
- # t0=ttime()
426
- # f0 = rmvpe.infer_from_audio(audio, thred=thred)
427
- # f0 = rmvpe.infer_from_audio(audio, thred=thred)
428
- # f0 = rmvpe.infer_from_audio(audio, thred=thred)
429
- # f0 = rmvpe.infer_from_audio(audio, thred=thred)
430
- # f0 = rmvpe.infer_from_audio(audio, thred=thred)
431
- # t1=ttime()
432
- # print(f0.shape,t1-t0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
no_requirements.txt DELETED
@@ -1,12 +0,0 @@
1
- torch
2
- torchvision
3
- git+https://github.com/m-bain/whisperx.git
4
- yt-dlp
5
- gTTS
6
- pydub
7
- edge_tts
8
- deep_translator
9
- torchaudio==2.0.0
10
- gradio
11
- nest_asyncio
12
- gradio_client==0.2.7
 
 
 
 
 
 
 
 
 
 
 
 
 
packages.txt DELETED
@@ -1,3 +0,0 @@
1
- git-lfs
2
- aria2 -y
3
- ffmpeg
 
 
 
 
requirements_colab.txt DELETED
@@ -1,11 +0,0 @@
1
- --extra-index-url https://download.pytorch.org/whl/cu117 torch==2.0.0+cu117 torchvision==0.15.2+cu117 torchaudio==2.0.0
2
- yt-dlp
3
- nest_asyncio
4
- gradio==3.35.2
5
- pydub==0.25.1
6
- edge_tts==6.1.7
7
- deep_translator==1.11.4
8
- git+https://github.com/R3gm/whisperX.git@cuda_11_8
9
- gTTS
10
- gradio_client==0.2.7
11
- ipython
 
 
 
 
 
 
 
 
 
 
 
 
requirements_extra.txt DELETED
@@ -1,8 +0,0 @@
1
- praat-parselmouth>=0.4.3
2
- pyworld==0.3.2
3
- faiss-cpu==1.7.3
4
- torchcrepe==0.0.20
5
- ffmpeg-python>=0.2.0
6
- fairseq==0.12.2
7
- gdown
8
- rarfile
 
 
 
 
 
 
 
 
 
soni_translate/audio_segments.py DELETED
@@ -1,27 +0,0 @@
1
- from pydub import AudioSegment
2
- from tqdm import tqdm
3
- import os
4
-
5
- def create_translated_audio(result_diarize, audio_files, Output_name_file):
6
- total_duration = result_diarize['segments'][-1]['end'] # in seconds
7
-
8
- # silent audio with total_duration
9
- combined_audio = AudioSegment.silent(duration=int(total_duration * 1000))
10
- print(round((total_duration / 60),2), 'minutes of video')
11
-
12
- for line, audio_file in tqdm(zip(result_diarize['segments'], audio_files)):
13
- start = float(line['start'])
14
-
15
- # Overlay each audio at the corresponding time
16
- try:
17
- audio = AudioSegment.from_file(audio_file)
18
- ###audio_a = audio.speedup(playback_speed=1.5)
19
- start_time = start * 1000 # to ms
20
- combined_audio = combined_audio.overlay(audio, position=start_time)
21
- except:
22
- print(f'ERROR AUDIO FILE {audio_file}')
23
-
24
- os.system("rm -rf audio/*")
25
-
26
- # combined audio as a file
27
- combined_audio.export(Output_name_file, format="wav") # best than ogg, change if the audio is anomalous
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
soni_translate/text_to_speech.py DELETED
@@ -1,33 +0,0 @@
1
- from gtts import gTTS
2
- import edge_tts
3
- import asyncio
4
- import nest_asyncio
5
-
6
- def make_voice(tts_text, tts_voice, filename,language):
7
- #print(tts_text, filename)
8
- try:
9
- nest_asyncio.apply()
10
- asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save(filename))
11
- except:
12
- try:
13
- tts = gTTS(tts_text, lang=language)
14
- tts.save(filename)
15
- print(f'No audio was received. Please change the tts voice for {tts_voice}. TTS auxiliary will be used in the segment')
16
- except:
17
- tts = gTTS('a', lang=language)
18
- tts.save(filename)
19
- print('Error: Audio will be replaced.')
20
-
21
- def make_voice_gradio(tts_text, tts_voice, filename, language):
22
- print(tts_text, filename)
23
- try:
24
- asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save(filename))
25
- except:
26
- try:
27
- tts = gTTS(tts_text, lang=language)
28
- tts.save(filename)
29
- print(f'No audio was received. Please change the tts voice for {tts_voice}. TTS auxiliary will be used in the segment')
30
- except:
31
- tts = gTTS('a', lang=language)
32
- tts.save(filename)
33
- print('Error: Audio will be replaced.')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
soni_translate/translate_segments.py DELETED
@@ -1,13 +0,0 @@
1
- from tqdm import tqdm
2
- from deep_translator import GoogleTranslator
3
-
4
- def translate_text(segments, TRANSLATE_AUDIO_TO):
5
-
6
- translator = GoogleTranslator(source='auto', target=TRANSLATE_AUDIO_TO)
7
-
8
- for line in tqdm(range(len(segments))):
9
- text = segments[line]['text']
10
- translated_line = translator.translate(text.strip())
11
- segments[line]['text'] = translated_line
12
-
13
- return segments
 
 
 
 
 
 
 
 
 
 
 
 
 
 
soni_translate/video_dubbing.py DELETED
@@ -1,217 +0,0 @@
1
- import numpy as np
2
- import gradio as gr
3
- import whisperx
4
- import torch
5
- from gtts import gTTS
6
- import librosa
7
- import edge_tts
8
- import gc
9
- from pydub import AudioSegment
10
- from tqdm import tqdm
11
- from deep_translator import GoogleTranslator
12
- import os
13
- from soni_translate.audio_segments import create_translated_audio
14
- from soni_translate.text_to_speech import make_voice
15
- from soni_translate.translate_segments import translate_text
16
- import time
17
-
18
- def translate_from_video(
19
- video,
20
- YOUR_HF_TOKEN,
21
- preview=False,
22
- WHISPER_MODEL_SIZE="large-v1",
23
- batch_size=16,
24
- compute_type="float16",
25
- SOURCE_LANGUAGE= "Automatic detection",
26
- TRANSLATE_AUDIO_TO="en",
27
- min_speakers=1,
28
- max_speakers=2,
29
- tts_voice00="en-AU-WilliamNeural-Male",
30
- tts_voice01="en-CA-ClaraNeural-Female",
31
- tts_voice02="en-GB-ThomasNeural-Male",
32
- tts_voice03="en-GB-SoniaNeural-Female",
33
- tts_voice04="en-NZ-MitchellNeural-Male",
34
- tts_voice05="en-GB-MaisieNeural-Female",
35
- video_output="video_dub.mp4"
36
- ):
37
-
38
- if YOUR_HF_TOKEN == "" or YOUR_HF_TOKEN == None:
39
- YOUR_HF_TOKEN = os.getenv("YOUR_HF_TOKEN")
40
-
41
- if not os.path.exists('audio'):
42
- os.makedirs('audio')
43
-
44
- if not os.path.exists('audio2/audio'):
45
- os.makedirs('audio2/audio')
46
-
47
- # Check GPU
48
- device = "cuda" if torch.cuda.is_available() else "cpu"
49
- compute_type = "float32" if device == "cpu" else compute_type
50
-
51
- OutputFile = 'Video.mp4'
52
- audio_wav = "audio.wav"
53
- Output_name_file = "audio_dub_solo.ogg"
54
- mix_audio = "audio_mix.mp3"
55
-
56
- os.system("rm Video.mp4")
57
- os.system("rm audio.webm")
58
- os.system("rm audio.wav")
59
-
60
- if os.path.exists(video):
61
- if preview:
62
- print('Creating preview video, 10 seconds')
63
- os.system(f'ffmpeg -y -i "{video}" -ss 00:00:20 -t 00:00:10 -c:v libx264 -c:a aac -strict experimental Video.mp4')
64
- else:
65
- os.system(f'ffmpeg -y -i "{video}" -c:v libx264 -c:a aac -strict experimental Video.mp4')
66
-
67
- os.system("ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav")
68
- else:
69
- if preview:
70
- print('Creating preview from link, 10 seconds')
71
- #https://github.com/yt-dlp/yt-dlp/issues/2220
72
- mp4_ = f'yt-dlp -f "mp4" --downloader ffmpeg --downloader-args "ffmpeg_i: -ss 00:00:20 -t 00:00:10" --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}'
73
- wav_ = "ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav"
74
- os.system(mp4_)
75
- os.system(wav_)
76
- else:
77
- mp4_ = f'yt-dlp -f "mp4" --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}'
78
- wav_ = f'python -m yt_dlp --output {audio_wav} --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --extract-audio --audio-format wav {video}'
79
-
80
- os.system(wav_)
81
-
82
- for i in range (120):
83
- time.sleep(1)
84
- print('process audio')
85
- if os.path.exists(audio_wav) and not os.path.exists('audio.webm'):
86
- time.sleep(1)
87
- os.system(mp4_)
88
- break
89
- if i == 119:
90
- print('Error donwloading the audio')
91
- return
92
-
93
- print("Set file complete.")
94
-
95
- SOURCE_LANGUAGE = None if SOURCE_LANGUAGE == 'Automatic detection' else SOURCE_LANGUAGE
96
-
97
- # 1. Transcribe with original whisper (batched)
98
- model = whisperx.load_model(
99
- WHISPER_MODEL_SIZE,
100
- device,
101
- compute_type=compute_type,
102
- language= SOURCE_LANGUAGE,
103
- )
104
- audio = whisperx.load_audio(audio_wav)
105
- result = model.transcribe(audio, batch_size=batch_size)
106
- gc.collect(); torch.cuda.empty_cache(); del model
107
- print("Transcript complete")
108
-
109
- # 2. Align whisper output
110
- model_a, metadata = whisperx.load_align_model(
111
- language_code=result["language"],
112
- device=device
113
- )
114
- result = whisperx.align(
115
- result["segments"],
116
- model_a,
117
- metadata,
118
- audio,
119
- device,
120
- return_char_alignments=True,
121
- )
122
- gc.collect(); torch.cuda.empty_cache(); del model_a
123
- print("Align complete")
124
-
125
- if result['segments'] == []:
126
- print('No active speech found in audio')
127
- return
128
-
129
- # 3. Assign speaker labels
130
- diarize_model = whisperx.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)
131
- diarize_segments = diarize_model(
132
- audio_wav,
133
- min_speakers=min_speakers,
134
- max_speakers=max_speakers)
135
- result_diarize = whisperx.assign_word_speakers(diarize_segments, result)
136
- gc.collect(); torch.cuda.empty_cache(); del diarize_model
137
- print("Diarize complete")
138
-
139
- result_diarize['segments'] = translate_text(result_diarize['segments'], TRANSLATE_AUDIO_TO)
140
- print("Translation complete")
141
-
142
- audio_files = []
143
-
144
- # Mapping speakers to voice variables
145
- speaker_to_voice = {
146
- 'SPEAKER_00': tts_voice00,
147
- 'SPEAKER_01': tts_voice01,
148
- 'SPEAKER_02': tts_voice02,
149
- 'SPEAKER_03': tts_voice03,
150
- 'SPEAKER_04': tts_voice04,
151
- 'SPEAKER_05': tts_voice05
152
- }
153
-
154
- for segment in tqdm(result_diarize['segments']):
155
-
156
- text = segment['text']
157
- start = segment['start']
158
- end = segment['end']
159
-
160
- try:
161
- speaker = segment['speaker']
162
- except KeyError:
163
- segment['speaker'] = "SPEAKER_99"
164
- speaker = segment['speaker']
165
- print("NO SPEAKER DETECT IN SEGMENT")
166
-
167
- # make the tts audio
168
- filename = f"audio/{start}.ogg"
169
-
170
- if speaker in speaker_to_voice and speaker_to_voice[speaker] != 'None':
171
- make_voice(text, speaker_to_voice[speaker], filename, TRANSLATE_AUDIO_TO)
172
- elif speaker == "SPEAKER_99":
173
- try:
174
- tts = gTTS(text, lang=TRANSLATE_AUDIO_TO)
175
- tts.save(filename)
176
- print('Using GTTS')
177
- except:
178
- tts = gTTS('a', lang=TRANSLATE_AUDIO_TO)
179
- tts.save(filename)
180
- print('Error: Audio will be replaced.')
181
-
182
- # duration
183
- duration_true = end - start
184
- duration_tts = librosa.get_duration(filename=filename)
185
-
186
- # porcentaje
187
- porcentaje = duration_tts / duration_true
188
-
189
- if porcentaje > 2.1:
190
- porcentaje = 2.1
191
- elif porcentaje <= 1.2 and porcentaje >= 0.8:
192
- porcentaje = 1.0
193
- elif porcentaje <= 0.79:
194
- porcentaje = 0.8
195
-
196
- # Smoth and round
197
- porcentaje = round(porcentaje+0.0, 1)
198
-
199
- # apply aceleration or opposite to the audio file in audio2 folder
200
- os.system(f"ffmpeg -y -loglevel panic -i {filename} -filter:a atempo={porcentaje} audio2/{filename}")
201
-
202
- duration_create = librosa.get_duration(filename=f"audio2/{filename}")
203
- audio_files.append(filename)
204
-
205
- # replace files with the accelerates
206
- os.system("mv -f audio2/audio/*.ogg audio/")
207
-
208
- os.system(f"rm {Output_name_file}")
209
- create_translated_audio(result_diarize, audio_files, Output_name_file)
210
-
211
- os.system(f"rm {mix_audio}")
212
- os.system(f'ffmpeg -i {audio_wav} -i {Output_name_file} -filter_complex "[1:a]asplit=2[sc][mix];[0:a][sc]sidechaincompress=threshold=0.003:ratio=20[bg]; [bg][mix]amerge[final]" -map [final] {mix_audio}')
213
-
214
- os.system(f"rm {video_output}")
215
- os.system(f"ffmpeg -i {OutputFile} -i {mix_audio} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {video_output}")
216
-
217
- return video_output
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
vc_infer_pipeline.py DELETED
@@ -1,445 +0,0 @@
1
- import numpy as np, parselmouth, torch, pdb, sys, os
2
- from time import time as ttime
3
- import torch.nn.functional as F
4
- import scipy.signal as signal
5
- import pyworld, os, traceback, faiss, librosa, torchcrepe
6
- from scipy import signal
7
- from functools import lru_cache
8
-
9
# Make modules that live in the current working directory importable.
now_dir = os.getcwd()
sys.path.append(now_dir)

# Coefficients of a 5th-order Butterworth high-pass filter (48 Hz cutoff at a
# 16 kHz sampling rate); `VC.pipeline` applies it to the input audio.
bh, ah = signal.butter(5, 48, btype="high", fs=16000)

# Registry mapping an input audio path to its waveform. `VC.get_f0` fills it
# and `cache_harvest_f0` reads from it.
input_audio_path2wav = dict()
15
-
16
-
17
@lru_cache
def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
    """Return the Harvest F0 contour of a registered waveform, memoised.

    The waveform is looked up in the module-level ``input_audio_path2wav``
    mapping under ``input_audio_path``. Results are memoised on the argument
    tuple by ``lru_cache``, so the waveform content itself is not part of the
    cache key.

    Args:
        input_audio_path: Key of the waveform in ``input_audio_path2wav``.
        fs: Sampling rate passed to pyworld.
        f0max: Upper F0 bound (``f0_ceil``).
        f0min: Lower F0 bound (``f0_floor``).
        frame_period: Frame period passed to ``pyworld.harvest``.

    Returns:
        The F0 array produced by ``pyworld.stonemask``.
    """
    waveform = input_audio_path2wav[input_audio_path]
    raw_f0, time_axis = pyworld.harvest(
        waveform,
        fs=fs,
        f0_ceil=f0max,
        f0_floor=f0min,
        frame_period=frame_period,
    )
    # Refine the Harvest estimate with StoneMask before returning it.
    return pyworld.stonemask(waveform, raw_f0, time_axis, fs)
29
-
30
-
31
def change_rms(data1, sr1, data2, sr2, rate):
    """Scale ``data2`` so its RMS envelope is mixed with that of ``data1``.

    ``data1`` is the input audio and ``data2`` the output audio; ``rate`` is
    the proportion of ``data2``'s own envelope that is kept. ``data2`` is
    modified in place and also returned.

    Args:
        data1: Input audio samples.
        sr1: Sample rate of ``data1``.
        data2: Output audio samples (scaled in place).
        sr2: Sample rate of ``data2``.
        rate: Mixing proportion; the gain is ``rms1**(1-rate) * rms2**(rate-1)``.

    Returns:
        ``data2`` after scaling.
    """

    def _envelope(samples, sample_rate):
        # One RMS value every half second, stretched to one value per sample
        # of data2 by linear interpolation.
        hop = sample_rate // 2
        rms = librosa.feature.rms(y=samples, frame_length=hop * 2, hop_length=hop)
        stretched = F.interpolate(
            torch.from_numpy(rms).unsqueeze(0), size=data2.shape[0], mode="linear"
        )
        return stretched.squeeze()

    envelope_in = _envelope(data1, sr1)
    envelope_out = _envelope(data2, sr2)
    # Floor the output envelope so the negative power below cannot blow up.
    envelope_out = torch.max(envelope_out, torch.zeros_like(envelope_out) + 1e-6)
    gain = torch.pow(envelope_in, torch.tensor(1 - rate)) * torch.pow(
        envelope_out, torch.tensor(rate - 1)
    )
    data2 *= gain.numpy()
    return data2
51
-
52
-
53
class VC(object):
    """Voice-conversion inference pipeline.

    Extracts features with a feature model, optionally blends them with
    entries retrieved from a faiss index, and renders audio with a
    synthesizer (``net_g``). Input audio is expected at ``self.sr`` (16 kHz).
    """

    def __init__(self, tgt_sr, config):
        """Derive the sample-count windows used by the pipeline.

        Args:
            tgt_sr: Sample rate of the synthesizer output.
            config: Object exposing ``x_pad``, ``x_query``, ``x_center``,
                ``x_max``, ``is_half`` and ``device``.
        """
        self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
            config.x_pad,
            config.x_query,
            config.x_center,
            config.x_max,
            config.is_half,
        )
        self.sr = 16000  # hubert input sampling rate
        self.window = 160  # points per frame
        self.t_pad = self.sr * self.x_pad  # Pad time before and after each bar
        self.t_pad_tgt = tgt_sr * self.x_pad
        self.t_pad2 = self.t_pad * 2
        self.t_query = self.sr * self.x_query  # Query time before and after the cut point
        self.t_center = self.sr * self.x_center  # Query point cut position
        self.t_max = self.sr * self.x_max  # Query-free duration threshold
        self.device = config.device

    def get_f0(
        self,
        input_audio_path,
        x,
        p_len,
        f0_up_key,
        f0_method,
        filter_radius,
        inp_f0=None,
    ):
        """Estimate F0 for ``x`` and quantise it to coarse mel bins.

        Args:
            input_audio_path: Key under which ``x`` is registered for the
                cached "harvest" estimator.
            x: Audio samples at ``self.sr``.
            p_len: Expected number of frames (used to pad the "pm" result).
            f0_up_key: Transposition in semitones.
            f0_method: One of "pm", "harvest", "crepe" or "rmvpe".
            filter_radius: Values above 2 enable a median filter for "harvest".
            inp_f0: Optional array with a time column and an F0 column that
                overrides the estimated curve after the leading padding.

        Returns:
            ``(f0_coarse, f0)``: integer bins in ``[1, 255]`` and the float
            F0 curve.

        Raises:
            ValueError: If ``f0_method`` is not a supported estimator.
        """
        global input_audio_path2wav
        time_step = self.window / self.sr * 1000
        f0_min = 50
        f0_max = 1100
        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
        if f0_method == "pm":
            f0 = (
                parselmouth.Sound(x, self.sr)
                .to_pitch_ac(
                    time_step=time_step / 1000,
                    voicing_threshold=0.6,
                    pitch_floor=f0_min,
                    pitch_ceiling=f0_max,
                )
                .selected_array["frequency"]
            )
            pad_size = (p_len - len(f0) + 1) // 2
            if pad_size > 0 or p_len - len(f0) - pad_size > 0:
                f0 = np.pad(
                    f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
                )
        elif f0_method == "harvest":
            # NOTE(review): cache_harvest_f0 is memoised on the path, so a path
            # reused with different audio appears to get the earlier result.
            input_audio_path2wav[input_audio_path] = x.astype(np.double)
            f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
            if filter_radius > 2:
                f0 = signal.medfilt(f0, 3)
        elif f0_method == "crepe":
            model = "full"
            # Pick a batch size that doesn't cause memory errors on your gpu
            batch_size = 512
            audio = torch.tensor(np.copy(x))[None].float()
            f0, pd = torchcrepe.predict(
                audio,
                self.sr,
                self.window,
                f0_min,
                f0_max,
                model,
                batch_size=batch_size,
                device=self.device,
                return_periodicity=True,
            )
            pd = torchcrepe.filter.median(pd, 3)
            f0 = torchcrepe.filter.mean(f0, 3)
            f0[pd < 0.1] = 0
            f0 = f0[0].cpu().numpy()
        elif f0_method == "rmvpe":
            if not hasattr(self, "model_rmvpe"):
                # Loaded lazily on first use and kept on the instance.
                from lib.rmvpe import RMVPE

                print("loading rmvpe model")
                self.model_rmvpe = RMVPE(
                    "rmvpe.pt", is_half=self.is_half, device=self.device
                )
            f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
        else:
            raise ValueError(f"Unsupported f0_method: {f0_method!r}")
        f0 *= pow(2, f0_up_key / 12)
        tf0 = self.sr // self.window  # f0 points per second
        if inp_f0 is not None:
            # Plain Python int: the previous int16 cast wrapped around for
            # curves longer than ~327 s, silently discarding the override.
            delta_t = int(
                np.round((inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1)
            )
            replace_f0 = np.interp(
                list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
            )
            start = self.x_pad * tf0
            shape = f0[start : start + len(replace_f0)].shape[0]
            f0[start : start + len(replace_f0)] = replace_f0[:shape]
        f0bak = f0.copy()
        f0_mel = 1127 * np.log(1 + f0 / 700)
        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
            f0_mel_max - f0_mel_min
        ) + 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > 255] = 255
        f0_coarse = np.rint(f0_mel).astype(int)
        return f0_coarse, f0bak  # 1-0

    def vc(
        self,
        model,
        net_g,
        sid,
        audio0,
        pitch,
        pitchf,
        times,
        index,
        big_npy,
        index_rate,
        version,
        protect,
    ):
        """Convert one padded audio chunk.

        Args:
            model: Feature model exposing ``extract_features`` (and
                ``final_proj`` for version "v1").
            net_g: Synthesizer exposing ``infer``.
            sid: Speaker-id tensor passed to ``net_g.infer``.
            audio0: Chunk samples as a NumPy array.
            pitch: Coarse pitch tensor, or None for models without F0.
            pitchf: Float pitch tensor, or None for models without F0.
            times: Three-element list; entries 0 and 2 are incremented with
                the feature and synthesis durations.
            index: faiss index or None.
            big_npy: Vectors reconstructed from ``index`` or None.
            index_rate: Weight of the retrieved features (0 disables retrieval).
            version: "v1" selects layer 9 plus ``final_proj``; anything else
                selects layer 12.
            protect: Below 0.5, frames with ``pitchf < 1`` keep a share of
                the un-retrieved features.

        Returns:
            The synthesized chunk as a float NumPy array.
        """
        has_pitch = pitch is not None and pitchf is not None
        use_protect = protect < 0.5 and has_pitch

        feats = torch.from_numpy(audio0)
        feats = feats.half() if self.is_half else feats.float()
        if feats.dim() == 2:  # double channels
            feats = feats.mean(-1)
        assert feats.dim() == 1, feats.dim()
        feats = feats.view(1, -1)
        padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)

        inputs = {
            "source": feats.to(self.device),
            "padding_mask": padding_mask,
            "output_layer": 9 if version == "v1" else 12,
        }
        t0 = ttime()
        with torch.no_grad():
            logits = model.extract_features(**inputs)
            feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
        if use_protect:
            feats0 = feats.clone()
        if index is not None and big_npy is not None and index_rate != 0:
            npy = feats[0].cpu().numpy()
            if self.is_half:
                npy = npy.astype("float32")

            # Blend the 8 nearest index entries, weighted by inverse squared score.
            score, ix = index.search(npy, k=8)
            weight = np.square(1 / score)
            weight /= weight.sum(axis=1, keepdims=True)
            npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)

            if self.is_half:
                npy = npy.astype("float16")
            feats = (
                torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
                + (1 - index_rate) * feats
            )

        feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
        if use_protect:
            feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
                0, 2, 1
            )
        t1 = ttime()
        p_len = audio0.shape[0] // self.window
        if feats.shape[1] < p_len:
            p_len = feats.shape[1]
            if has_pitch:
                pitch = pitch[:, :p_len]
                pitchf = pitchf[:, :p_len]

        if use_protect:
            pitchff = pitchf.clone()
            pitchff[pitchf > 0] = 1
            pitchff[pitchf < 1] = protect
            pitchff = pitchff.unsqueeze(-1)
            feats = feats * pitchff + feats0 * (1 - pitchff)
            feats = feats.to(feats0.dtype)
        p_len = torch.tensor([p_len], device=self.device).long()
        with torch.no_grad():
            if has_pitch:
                rendered = net_g.infer(feats, p_len, pitch, pitchf, sid)
            else:
                rendered = net_g.infer(feats, p_len, sid)
            audio1 = rendered[0][0, 0].data.cpu().float().numpy()
        del feats, p_len, padding_mask
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        t2 = ttime()
        times[0] += t1 - t0
        times[2] += t2 - t1
        return audio1

    def pipeline(
        self,
        model,
        net_g,
        sid,
        audio,
        input_audio_path,
        times,
        f0_up_key,
        f0_method,
        file_index,
        # file_big_npy,
        index_rate,
        if_f0,
        filter_radius,
        tgt_sr,
        resample_sr,
        rms_mix_rate,
        version,
        protect,
        f0_file=None,
    ):
        """Convert a whole recording and return int16 samples.

        The audio is high-pass filtered, split at low-energy points when it
        is longer than ``self.t_max`` samples, converted chunk by chunk with
        :meth:`vc`, optionally loudness-matched and resampled, then scaled
        to int16.

        Args:
            model, net_g, version, protect, index_rate, times: Forwarded to
                :meth:`vc`.
            sid: Speaker id (converted to a tensor here).
            audio: Input samples at 16 kHz.
            input_audio_path, f0_up_key, f0_method, filter_radius: Forwarded
                to :meth:`get_f0`.
            file_index: Path of a faiss index, or "" for none.
            if_f0: 1 when the model uses pitch.
            tgt_sr: Sample rate of the synthesizer output.
            resample_sr: Output rate; applied when >= 16000 and != ``tgt_sr``.
            rms_mix_rate: Passed to ``change_rms`` unless equal to 1.
            f0_file: Optional object with a ``name`` attribute pointing to a
                comma-separated F0 curve file.

        Returns:
            The converted audio as a NumPy int16 array.
        """
        if file_index != "" and os.path.exists(file_index) and index_rate != 0:
            try:
                index = faiss.read_index(file_index)
                big_npy = index.reconstruct_n(0, index.ntotal)
            except Exception:
                # Retrieval is optional: report the problem and go on without it.
                traceback.print_exc()
                index = big_npy = None
        else:
            index = big_npy = None
            print("File index Not found, set None")

        audio = signal.filtfilt(bh, ah, audio)
        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
        opt_ts = []
        if audio_pad.shape[0] > self.t_max:
            # Sliding-window sum used to find the quietest sample near each
            # nominal cut position.
            audio_sum = np.zeros_like(audio)
            for i in range(self.window):
                audio_sum += audio_pad[i : i - self.window]
            for t in range(self.t_center, audio.shape[0], self.t_center):
                neighbourhood = np.abs(audio_sum[t - self.t_query : t + self.t_query])
                opt_ts.append(
                    t
                    - self.t_query
                    + np.where(neighbourhood == neighbourhood.min())[0][0]
                )
        s = 0
        audio_opt = []
        t = None
        t1 = ttime()
        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
        p_len = audio_pad.shape[0] // self.window
        inp_f0 = None
        if hasattr(f0_file, "name"):
            try:
                with open(f0_file.name, "r") as f:
                    lines = f.read().strip("\n").split("\n")
                inp_f0 = np.array(
                    [[float(i) for i in line.split(",")] for line in lines],
                    dtype="float32",
                )
            except Exception:
                traceback.print_exc()
        sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
        pitch, pitchf = None, None
        if if_f0 == 1:
            pitch, pitchf = self.get_f0(
                input_audio_path,
                audio_pad,
                p_len,
                f0_up_key,
                f0_method,
                filter_radius,
                inp_f0,
            )
            pitch = pitch[:p_len]
            pitchf = pitchf[:p_len]
            if self.device == "mps":
                pitchf = pitchf.astype(np.float32)
            pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
            pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
        t2 = ttime()
        times[1] += t2 - t1

        def convert(chunk, frames):
            # Convert one chunk and trim the padding from both ends. ``frames``
            # selects the matching pitch frames; pitch is None without F0.
            converted = self.vc(
                model,
                net_g,
                sid,
                chunk,
                pitch[:, frames] if pitch is not None else None,
                pitchf[:, frames] if pitchf is not None else None,
                times,
                index,
                big_npy,
                index_rate,
                version,
                protect,
            )
            return converted[self.t_pad_tgt : -self.t_pad_tgt]

        for t in opt_ts:
            t = t // self.window * self.window
            audio_opt.append(
                convert(
                    audio_pad[s : t + self.t_pad2 + self.window],
                    slice(s // self.window, (t + self.t_pad2) // self.window),
                )
            )
            s = t
        # Remainder after the last cut, or the whole signal when never split.
        tail_frames = slice(t // self.window, None) if t is not None else slice(None)
        audio_opt.append(convert(audio_pad[t:], tail_frames))

        audio_opt = np.concatenate(audio_opt)
        if rms_mix_rate != 1:
            audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
        if resample_sr >= 16000 and tgt_sr != resample_sr:
            audio_opt = librosa.resample(
                audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
            )
        # Scale to int16, shrinking the gain when the peak would exceed 0.99.
        audio_max = np.abs(audio_opt).max() / 0.99
        max_int16 = 32768
        if audio_max > 1:
            max_int16 /= audio_max
        audio_opt = (audio_opt * max_int16).astype(np.int16)
        del pitch, pitchf, sid
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return audio_opt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
voice_main.py DELETED
@@ -1,554 +0,0 @@
1
- import torch
2
- from lib.infer_pack.models import (
3
- SynthesizerTrnMs256NSFsid,
4
- SynthesizerTrnMs256NSFsid_nono,
5
- SynthesizerTrnMs768NSFsid,
6
- SynthesizerTrnMs768NSFsid_nono,
7
- )
8
- from vc_infer_pipeline import VC
9
- import traceback, pdb
10
- from lib.audio import load_audio
11
- import numpy as np
12
- import os
13
- from fairseq import checkpoint_utils
14
- import soundfile as sf
15
- from gtts import gTTS
16
- import edge_tts
17
- import asyncio
18
- import nest_asyncio
19
-
20
- # model load
21
def get_vc(sid, to_return_protect0, to_return_protect1):
    """Load the voice model named ``sid`` into the module-level globals.

    With an empty ``sid`` the currently loaded objects are released instead
    and a single "hide" update dict is returned. Otherwise the checkpoint
    ``weight_root/sid`` is loaded, ``net_g``, ``vc``, ``tgt_sr``, ``n_spk``,
    ``cpt`` and ``version`` are (re)bound, and a tuple of three update dicts
    is returned.

    Args:
        sid: Checkpoint file name inside ``weight_root``, or "" / [] to unload.
        to_return_protect0: Value echoed in the first protect update dict.
        to_return_protect1: Value echoed in the second protect update dict.
    """
    global n_spk, tgt_sr, net_g, vc, cpt, version, hubert_model

    def _build_synthesizer(checkpoint, model_version, uses_f0):
        # Pick the synthesizer class for the checkpoint; returns None for an
        # unrecognised version so the caller leaves net_g untouched.
        families = {
            "v1": (SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono),
            "v2": (SynthesizerTrnMs768NSFsid, SynthesizerTrnMs768NSFsid_nono),
        }
        if model_version not in families:
            return None
        with_f0, without_f0 = families[model_version]
        if uses_f0 == 1:
            return with_f0(*checkpoint["config"], is_half=config.is_half)
        return without_f0(*checkpoint["config"])

    if sid == "" or sid == []:
        if hubert_model is not None:  # change model or not
            print("clean_empty_cache")
            del net_g, n_spk, vc, hubert_model, tgt_sr  # ,cpt
            hubert_model = net_g = n_spk = vc = hubert_model = tgt_sr = None
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            # Rebuild the synthesizer once more and delete it again before
            # the second cache flush.
            if_f0 = cpt.get("f0", 1)
            version = cpt.get("version", "v1")
            rebuilt = _build_synthesizer(cpt, version, if_f0)
            if rebuilt is not None:
                net_g = rebuilt
            del net_g, cpt
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        return {"visible": False, "__type__": "update"}

    person = "%s/%s" % (weight_root, sid)
    print("loading %s" % person)
    cpt = torch.load(person, map_location="cpu")
    tgt_sr = cpt["config"][-1]
    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
    if_f0 = cpt.get("f0", 1)
    if if_f0 != 0:
        to_return_protect0 = {
            "visible": True,
            "value": to_return_protect0,
            "__type__": "update",
        }
        to_return_protect1 = {
            "visible": True,
            "value": to_return_protect1,
            "__type__": "update",
        }
    else:
        # Models without F0: both protect updates share one hidden dict.
        to_return_protect0 = to_return_protect1 = {
            "visible": False,
            "value": 0.5,
            "__type__": "update",
        }
    version = cpt.get("version", "v1")
    loaded = _build_synthesizer(cpt, version, if_f0)
    if loaded is not None:
        net_g = loaded
    del net_g.enc_q
    print(net_g.load_state_dict(cpt["weight"], strict=False))
    net_g.eval().to(config.device)
    net_g = net_g.half() if config.is_half else net_g.float()
    vc = VC(tgt_sr, config)
    n_spk = cpt["config"][-3]
    speaker_update = {"visible": True, "maximum": n_spk, "__type__": "update"}
    return (
        speaker_update,
        to_return_protect0,
        to_return_protect1,
    )
100
-
101
-
102
-
103
- # inference
104
def vc_single(
    sid,
    input_audio_path,
    f0_up_key,
    f0_file,
    f0_method,
    file_index,
    file_index2,
    # file_big_npy,
    index_rate,
    filter_radius,
    resample_sr,
    rms_mix_rate,
    protect,
):
    """Convert one audio file with the currently loaded voice model.

    Relies on the module-level state prepared by ``get_vc`` (``cpt``,
    ``net_g``, ``vc``, ``tgt_sr``, ``version``) and loads the feature model
    on first use.

    Args:
        sid: Speaker id forwarded to the pipeline.
        input_audio_path: Path of the audio to convert; None returns early.
        f0_up_key: Transposition in semitones (coerced to int).
        f0_file: Optional F0 curve file object forwarded to the pipeline.
        f0_method: Pitch estimator name.
        file_index: Index path typed by the user; surrounding quotes and
            whitespace are stripped and "trained" is replaced by "added".
        file_index2: Index path used when ``file_index`` is empty.
        index_rate, filter_radius, rms_mix_rate, protect: Forwarded to the
            pipeline.
        resample_sr: Requested output rate; used when >= 16000 and different
            from the model rate.

    Returns:
        ``(message, (sample_rate, audio))`` on success, ``(traceback_text,
        (None, None))`` on failure, or ``("You need to upload an audio",
        None)`` when no input path is given.
    """
    global tgt_sr, net_g, vc, hubert_model, version, cpt
    if input_audio_path is None:
        return "You need to upload an audio", None
    f0_up_key = int(f0_up_key)
    try:
        audio = load_audio(input_audio_path, 16000)
        # Normalise only when the peak exceeds 0.95.
        audio_max = np.abs(audio).max() / 0.95
        if audio_max > 1:
            audio /= audio_max
        times = [0, 0, 0]
        if not hubert_model:
            load_hubert()
        if_f0 = cpt.get("f0", 1)
        file_index = (
            (
                file_index.strip(" ")
                .strip('"')
                .strip("\n")
                .strip('"')
                .strip(" ")
                .replace("trained", "added")
            )
            if file_index != ""
            else file_index2
        )
        audio_opt = vc.pipeline(
            hubert_model,
            net_g,
            sid,
            audio,
            input_audio_path,
            times,
            f0_up_key,
            f0_method,
            file_index,
            # file_big_npy,
            index_rate,
            if_f0,
            filter_radius,
            tgt_sr,
            resample_sr,
            rms_mix_rate,
            version,
            protect,
            f0_file=f0_file,
        )
        # Report the rate of the returned audio in a local variable. The
        # global tgt_sr must keep the model's native rate, otherwise the next
        # call would hand the wrong rate to the pipeline.
        output_sr = resample_sr if tgt_sr != resample_sr >= 16000 else tgt_sr
        index_info = (
            "Using index:%s." % file_index
            if os.path.exists(file_index)
            else "Index not used."
        )
        return "Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % (
            index_info,
            times[0],
            times[1],
            times[2],
        ), (output_sr, audio_opt)
    except Exception:
        # Failures are reported to the caller as text rather than raised.
        info = traceback.format_exc()
        print(info)
        return info, (None, None)
185
-
186
-
187
-
188
- # hubert model
189
def load_hubert():
    """Load ``hubert_base.pt`` into the global ``hubert_model``.

    The first model of the loaded ensemble is moved to ``config.device``,
    cast to half or single precision according to ``config.is_half`` and
    switched to evaluation mode.
    """
    global hubert_model
    ensemble, _, _ = checkpoint_utils.load_model_ensemble_and_task(
        ["hubert_base.pt"],
        suffix="",
    )
    feature_model = ensemble[0].to(config.device)
    feature_model = feature_model.half() if config.is_half else feature_model.float()
    hubert_model = feature_model
    hubert_model.eval()
202
-
203
- # config cpu
204
def use_fp32_config():
    """Rewrite the JSON files under ``configs/`` replacing "true" with "false".

    Every occurrence of the text "true" in each listed file is replaced, and
    the file is written back in place.
    """
    config_names = (
        "32k.json",
        "40k.json",
        "48k.json",
        "48k_v2.json",
        "32k_v2.json",
    )
    for config_name in config_names:
        config_path = f"configs/{config_name}"
        with open(config_path, "r") as reader:
            patched = reader.read().replace("true", "false")
        with open(config_path, "w") as writer:
            writer.write(patched)
216
-
217
- # config device and torch type
218
class Config:
    """Device and precision settings plus the chunking sizes derived from them."""

    def __init__(self, device, is_half):
        """Store the requested device/precision and resolve the chunk sizes.

        Args:
            device: Device string such as "cuda:0" or "cpu".
            is_half: Whether half precision is requested; may be switched
                off by :meth:`device_config`.
        """
        self.device = device
        self.is_half = is_half
        self.n_cpu = 2  # set cpu cores
        self.gpu_name = None
        self.gpu_mem = None
        self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()

    @staticmethod
    def _patch_preprocess_script():
        # Replace "3.7" with "3.0" in the preprocessing script, in place.
        script = "trainset_preprocess_pipeline_print.py"
        with open(script, "r") as f:
            strr = f.read().replace("3.7", "3.0")
        with open(script, "w") as f:
            f.write(strr)

    def device_config(self) -> tuple:
        """Pick the effective device/precision and return the chunk sizes.

        May change ``self.device`` and ``self.is_half`` and rewrites config
        files on disk when single precision is forced.

        Returns:
            ``(x_pad, x_query, x_center, x_max)``.
        """
        device_name = str(self.device)
        # Take the CUDA path only for a CUDA device string; previously "cpu"
        # (or "cuda" without an index) on a CUDA host failed in int().
        if torch.cuda.is_available() and device_name.startswith("cuda"):
            _, _, index_text = device_name.partition(":")
            i_device = int(index_text) if index_text else 0
            self.gpu_name = torch.cuda.get_device_name(i_device)
            if (
                ("16" in self.gpu_name and "V100" not in self.gpu_name.upper())
                or "P40" in self.gpu_name.upper()
                or "1060" in self.gpu_name
                or "1070" in self.gpu_name
                or "1080" in self.gpu_name
            ):
                print("16 series / 10 series graphics cards and P40 force single precision")
                self.is_half = False
                for config_file in ["32k.json", "40k.json", "48k.json"]:
                    with open(f"configs/{config_file}", "r") as f:
                        strr = f.read().replace("true", "false")
                    with open(f"configs/{config_file}", "w") as f:
                        f.write(strr)
                self._patch_preprocess_script()
            else:
                self.gpu_name = None
            # Total device memory in GiB, rounded with a 0.4 offset.
            self.gpu_mem = int(
                torch.cuda.get_device_properties(i_device).total_memory
                / 1024
                / 1024
                / 1024
                + 0.4
            )
            if self.gpu_mem <= 4:
                self._patch_preprocess_script()
        elif torch.backends.mps.is_available():
            print("Supported N-card not found, using MPS for inference")
            self.device = "mps"
        else:
            print("No supported N-card found, using CPU for inference")
            self.device = "cpu"
            self.is_half = False
            use_fp32_config()

        if self.n_cpu == 0:
            # `cpu_count` was never imported in this module; use os instead.
            self.n_cpu = os.cpu_count()

        if self.is_half:
            # 6GB VRAM configuration
            x_pad = 3
            x_query = 10
            x_center = 60
            x_max = 65
        else:
            # 5GB VRAM configuration
            x_pad = 1
            x_query = 6
            x_center = 38
            x_max = 41

        if self.gpu_mem is not None and self.gpu_mem <= 4:
            x_pad = 1
            x_query = 5
            x_center = 30
            x_max = 32

        print(self.device, self.is_half)

        return x_pad, x_query, x_center, x_max
300
-
301
- # call inference
302
class ClassVoices:
    """Applies a configured voice model to the audio segments of each speaker."""

    # Speaker-label suffixes ("SPEAKER_<slot>") that can be configured.
    _SPEAKER_SLOTS = ("00", "01", "02", "03", "04", "05", "99")

    def __init__(self):
        self.file_index = ""  # root

    def apply_conf(self, f0method,
                   model_voice_path00, transpose00, file_index2_00,
                   model_voice_path01, transpose01, file_index2_01,
                   model_voice_path02, transpose02, file_index2_02,
                   model_voice_path03, transpose03, file_index2_03,
                   model_voice_path04, transpose04, file_index2_04,
                   model_voice_path05, transpose05, file_index2_05,
                   model_voice_path99, transpose99, file_index2_99):
        """Store the pitch method and the per-speaker model settings.

        For every slot the attributes ``model_voice_path<slot>``,
        ``transpose<slot>`` and ``file_index2<slot>`` are set.

        Returns:
            The string "CONFIGURATION APPLIED".
        """
        self.f0method = f0method  # pm

        per_slot = (
            ("00", model_voice_path00, transpose00, file_index2_00),
            ("01", model_voice_path01, transpose01, file_index2_01),
            ("02", model_voice_path02, transpose02, file_index2_02),
            ("03", model_voice_path03, transpose03, file_index2_03),
            ("04", model_voice_path04, transpose04, file_index2_04),
            ("05", model_voice_path05, transpose05, file_index2_05),
            ("99", model_voice_path99, transpose99, file_index2_99),
        )
        for slot, model_path, transpose, index_path in per_slot:
            setattr(self, f"model_voice_path{slot}", model_path)
            setattr(self, f"transpose{slot}", transpose)
            setattr(self, f"file_index2{slot}", index_path)
        return "CONFIGURATION APPLIED"

    def custom_voice(self,
                     _values,  # filter indices
                     audio_files,  # all audio files
                     model_voice_path='',
                     transpose=0,
                     f0method='pm',
                     file_index='',
                     file_index2='',
                     ):
        """Convert the selected audio files in place with one voice model.

        Args:
            _values: Indices into ``audio_files``; the special value "test"
                selects ``audio_files[0]`` without the "audio2/" prefix.
            audio_files: File names of all audio segments.
            model_voice_path: Model file name passed to ``get_vc``.
            transpose: Transposition in semitones.
            f0method: Pitch estimator name.
            file_index: Index path (takes precedence when not empty).
            file_index2: Fallback index path.

        Raises:
            RuntimeError: If the conversion of a file fails.
        """
        get_vc(
            sid=model_voice_path,  # model path
            to_return_protect0=0.33,
            to_return_protect1=0.33
        )

        for _value_item in _values:
            if _value_item == "test":
                filename = audio_files[0]
            else:
                filename = "audio2/" + audio_files[_value_item]
                print(audio_files[_value_item], model_voice_path)

            info_, (sample_, audio_output_) = vc_single(
                sid=0,
                input_audio_path=filename,
                f0_up_key=transpose,  # transpose for m to f and reverse 0 12
                f0_file=None,
                f0_method=f0method,
                file_index=file_index,
                file_index2=file_index2,
                index_rate=float(0.66),
                filter_radius=int(3),
                resample_sr=int(0),
                rms_mix_rate=float(0.25),
                protect=float(0.33),
            )
            if audio_output_ is None:
                # vc_single reports failures as text; writing None to disk
                # would only raise a less helpful error from soundfile.
                raise RuntimeError(f"Voice conversion failed for {filename}:\n{info_}")

            sf.write(
                file=filename,
                samplerate=sample_,
                data=audio_output_
            )

    def make_test(self,
                  tts_text,
                  tts_voice,
                  model_path,
                  index_path,
                  transpose,
                  f0_method,
                  ):
        """Synthesize ``tts_text`` and convert it with the given model.

        Returns:
            ``("test/test.wav", "test/real_test.wav")``: the converted audio
            and a copy of the unconverted TTS audio.
        """
        import shutil

        shutil.rmtree("test", ignore_errors=True)
        filename = "test/test.wav"

        if "SET_LIMIT" == os.getenv("DEMO"):
            if len(tts_text) > 60:
                tts_text = tts_text[:60]
                print("DEMO; limit to 60 characters")

        language = tts_voice[:2]
        os.makedirs("test", exist_ok=True)
        try:
            asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save(filename))
        except Exception:
            # edge-tts failed: fall back to gTTS, then to a placeholder clip.
            try:
                tts = gTTS(tts_text, lang=language)
                tts.save(filename)
                print(f'No audio was received. Please change the tts voice for {tts_voice}. USING gTTS.')
            except Exception:
                tts = gTTS('a', lang=language)
                tts.save(filename)
                print('Error: Audio will be replaced.')

        shutil.copyfile("test/test.wav", "test/real_test.wav")

        self([], [])  # start modules

        self.custom_voice(
            ["test"],  # filter indices
            ["test/test.wav"],  # all audio files
            model_voice_path=model_path,
            transpose=transpose,
            f0method=f0_method,
            file_index='',
            file_index2=index_path,
        )
        return "test/test.wav", "test/real_test.wav"

    def __call__(self, speakers_list, audio_files):
        """Convert every segment with the model configured for its speaker.

        Args:
            speakers_list: Speaker label of each segment ("SPEAKER_00", ...).
            audio_files: File name of each segment, aligned with
                ``speakers_list``.

        Side effects:
            Rebinds the module globals ``weight_root``, ``index_root``,
            ``config`` and ``hubert_model``.
        """
        # Group segment indices by speaker label, keeping first-seen order.
        speakers_indices = {}
        for index, speak_ in enumerate(speakers_list):
            speakers_indices.setdefault(speak_, []).append(index)

        # find models and index
        global weight_root, index_root, config, hubert_model
        weight_root = "weights"
        names = [name for name in os.listdir(weight_root) if name.endswith(".pth")]

        index_root = "logs"
        index_paths = [name for name in os.listdir(index_root) if name.endswith(".index")]

        print(names, index_paths)
        # config machine
        hubert_model = None
        config = Config('cuda:0', is_half=True)  # config = Config('cpu', is_half=False) # cpu

        prefix = "SPEAKER_"
        for _speak, _values in speakers_indices.items():
            is_label = isinstance(_speak, str) and _speak.startswith(prefix)
            slot = _speak[len(prefix):] if is_label else None
            if slot not in self._SPEAKER_SLOTS:
                # Labels without a configured slot are left unconverted.
                continue
            self.custom_voice(
                _values,
                audio_files,
                model_voice_path=getattr(self, f"model_voice_path{slot}"),
                file_index2=getattr(self, f"file_index2{slot}"),
                transpose=getattr(self, f"transpose{slot}"),
                f0method=self.f0method,
                file_index=self.file_index,
            )