<!doctype html>
<html lang="en">
  <head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>Matcha-TTS Onnx Benchmarks</title>
    <link rel="stylesheet" href="style.css">
  </head>
  <body>
    <h1>Matcha-TTS Onnx Benchmarks</h1>

    <!-- NOTE(review): this CDN URL is not version-pinned, so Subresource
         Integrity cannot be applied. Consider pinning a version and adding
         integrity + crossorigin attributes. -->
    <script src="https://cdn.jsdelivr.net/npm/onnxruntime-web/dist/ort.webgpu.min.js"></script>

    <script type="module">
      import { MatchaTTSRaw } from "./js-esm/matcha_tts_raw.js";
      import { webWavPlay } from "./js-esm/web_wav_play.js";
      import { arpa_to_ipa } from "./js-esm/arpa_to_ipa.js";
      import { loadCmudict } from "./js-esm/cmudict_loader.js";
      import { env, textToArpa } from "./js-esm/text_to_arpa.js";

      env.allowLocalModels = true;
      env.localModelPath = "./models/";
      env.backends.onnx.logLevel = "error";

      // Model used when main() is invoked without an explicit model name
      // (i.e. from the "Text To Speak" button before any model was chosen).
      const DEFAULT_MODEL_NAME = "en001_ep6399_univ_simplify";

      // Buttons shown in #buttons, one inner array per visual row:
      // [button label, model file name without the ".onnx" extension].
      const MODEL_BUTTON_ROWS = [
        [
          ["ljspeech", "ljspeech_sim"],
          ["ljspeech-quantized", "ljspeech_sim_q8"],
          ["vctk", "vctk_univ_simplify"],
          ["vctk-quantized", "vctk_univ_simplify_q8"],
          ["en001", "en001_ep6399_univ_simplify"],
          ["en001-quantized", "en001_ep6399_univ_simplify_q8"],
        ],
        [
          ["en001-t2-step01", "en001_6399_T2_step01"],
          ["en001-t2-step05", "en001_6399_T2_step05"],
          ["en001-t2-step10", "en001_6399_T2_step10"],
          // ["en001-t2-step20", "en001_6399_T2_step20"],
        ],
        [
          ["en001-univ-step01", "en001_6399_univ_step01"],
          ["en001-univ-step05", "en001_6399_univ_step05"],
          ["en001-univ-step10", "en001_6399_univ_step10"],
          // ["en001-univ-step20", "en001_6399_univ_step20"],
        ],
      ];

      // Currently loaded TTS session; null/undefined means "load on next main()".
      let matcha_tts_raw;
      // Dictionary object handed to loadCmudict() and textToArpa().
      let cmudict = {};
      // True while a main() run (load and/or inference) is in progress.
      let speaking = false;
      // Benchmark accumulators for the currently loaded model.
      let total_infer_time = 0; // milliseconds
      let count_infer = 0;
      let loaded_model_name;
      let load_time; // seconds

      /**
       * Load the model if none is loaded, then synthesize and play the text
       * currently in #textInput, updating the benchmark read-outs.
       *
       * @param {string|Event} [model_name] Model file name (without ".onnx").
       *   Anything that is not a string (e.g. the click event when used as an
       *   event handler) selects DEFAULT_MODEL_NAME.
       * @returns {Promise<void>} Rejects if loading, G2P or inference throws;
       *   the busy flag is always cleared.
       */
      async function main(model_name) {
        if (typeof model_name !== "string") {
          // Called via button click: the argument is the event object.
          model_name = DEFAULT_MODEL_NAME;
        }
        console.log(model_name);

        if (speaking) {
          // A run is already in progress; do not start an overlapping one.
          console.log("speaking return");
          return;
        }
        speaking = true;
        console.log("main called");

        try {
          if (!matcha_tts_raw) {
            const load_startTime = performance.now();
            const new_session = new MatchaTTSRaw();
            const model_path = `./models/matcha-tts/${model_name}.onnx`;
            console.log(model_path);

            console.time("load model");
            try {
              await new_session.load_model(model_path, {
                executionProviders: ["webgpu", "wasm"],
              });
            } finally {
              console.timeEnd("load model");
            }
            load_time = (performance.now() - load_startTime) / 1000; // sec
            loaded_model_name = model_name;

            await loadCmudict(cmudict, "./dictionaries/cmudict-0.7b");

            // Publish the session only after everything loaded, so a failed
            // load is retried on the next call instead of being reused.
            matcha_tts_raw = new_session;
            update_infer_bench1();
          } else {
            console.log("session exist skip load model");
          }

          // Keep a local reference so this run is unaffected by later
          // reassignment of the module-level variable.
          const session = matcha_tts_raw;

          // The measured time deliberately starts here, so it includes the
          // text -> ARPAbet -> IPA conversion as well as the model inference.
          const startTime = performance.now();
          const text = document.getElementById("textInput").value;

          console.log("### textToArpa call");
          const arpa_text = await textToArpa(cmudict, text);
          console.log("### arpa returned");

          const ipa_text = arpa_to_ipa(arpa_text).replace(/\s/g, "");
          //console.log(ipa_text)

          const spks = 0;
          const speed = document.getElementById("speed").value;
          const temperature = document.getElementById("temperature").value;

          console.time("infer");
          let result;
          try {
            result = await session.infer(ipa_text, temperature, speed, spks);
          } finally {
            // Always close the timer, otherwise the next run warns that the
            // "infer" timer already exists.
            console.timeEnd("infer");
          }

          if (result != null) {
            const infer_time = performance.now() - startTime;
            total_infer_time += infer_time;
            count_infer += 1;
            update_infer_bench2();
            webWavPlay(result);
          }
        } finally {
          speaking = false;
        }
      }

      /** Show the loaded model's name and its load time in #result1. */
      function update_infer_bench1() {
        const text = `${loaded_model_name} load time ${load_time.toFixed(1)} sec`;
        document.getElementById("result1").innerText = text;
      }

      /** Show the inference count and average time (seconds) in #result2. */
      function update_infer_bench2() {
        const avg = total_infer_time / count_infer / 1000;
        const text = `Infer Count ${count_infer} avg infer-time ${avg.toFixed(1)} sec`;
        document.getElementById("result2").innerText = text;
      }

      /** Mirror the temperature slider's value into its read-out. */
      function update_range2() {
        const value = document.getElementById("temperature").value;
        document.getElementById("tempature_label").textContent = value;
      }

      /** Mirror the speed slider's value into its read-out. */
      function update_range3() {
        const value = document.getElementById("speed").value;
        document.getElementById("speed_label").textContent = value;
      }

      /**
       * Reset the benchmark counters, drop the current session and run main()
       * with the given model. Ignored while a run is in progress, because
       * dropping the session mid-run would break the in-flight inference.
       *
       * @param {string} model_name Model file name (without ".onnx").
       */
      function loadModel(model_name) {
        if (speaking) {
          console.log("busy: model switch ignored");
          return;
        }
        total_infer_time = 0;
        count_infer = 0;
        matcha_tts_raw = null;
        main(model_name);
      }

      /**
       * Create a button that loads (and then speaks with) the given model.
       *
       * @param {string} label Text shown on the button.
       * @param {string} model_name Model file name (without ".onnx").
       * @returns {HTMLButtonElement} The new, not yet attached button.
       */
      function create_button(label, model_name) {
        // Create the button.
        const button = document.createElement("button");
        button.type = "button";
        button.style.margin = "4px";
        button.textContent = label;
        // Set the click handler.
        button.addEventListener("click", () => loadModel(model_name));
        return button;
      }

      // Module scripts are deferred, so the whole document is parsed by the
      // time this code runs and the elements below already exist.
      const buttons_container = document.getElementById("buttons");
      MODEL_BUTTON_ROWS.forEach((row, row_index) => {
        if (row_index > 0) {
          buttons_container.appendChild(document.createElement("br"));
        }
        for (const [label, model_name] of row) {
          buttons_container.appendChild(create_button(label, model_name));
        }
      });

      //document.getElementById('textInput').onchange = main;
      document.getElementById("myButton").addEventListener("click", main);
      // "input" fires while dragging, so the read-outs follow the sliders live.
      document.getElementById("temperature").addEventListener("input", update_range2);
      document.getElementById("speed").addEventListener("input", update_range3);
    </script>

    <div id="result1" role="status">Click button to load a model</div>
    <div id="buttons"></div>
    <br>
    <div id="result2" role="status">en001-T2 and en001-univ are experimental</div>
    <br><br>

    <input type="text" id="textInput" value="Hello Huggingface." placeholder="Enter some text here..." aria-label="Text to speak">
    <button type="button" id="myButton">Text To Speak</button><br>

    <label for="temperature" style="width: 110px;display: inline-block;">Temperature</label>
    <input type="range" id="temperature" min="0" max="1.0" value="0.5" step="0.1">
    <output for="temperature" id="tempature_label">0.5</output><br>

    <label for="speed" style="width: 110px;display: inline-block;">Speed</label>
    <input type="range" id="speed" min="0.1" max="2.0" value="1.0" step="0.1">
    <output for="speed" id="speed_label">1.0</output>
    <br>
    <br>

    <div>Load time is about 15 sec; TTS for a short text takes about 2 sec (on my 2070 Super GPU).</div><br>
    <div>The quantized versions are too slow; so far they exist only because of the GitHub Pages 100 MB limitation.</div><br>
    <div>Multi-speaker (vctk) is a little slower than single-speaker. The default number of timesteps is 5 (the smallest, 1, is about 300 msec faster, but the audio quality becomes lower).</div>
    <br>

    <div id="footer">
      <b>Spaces</b><br>
      <a href="https://huggingface.co/spaces/Akjava/matcha-tts_vctk-onnx" style="font-size: 9px" target="link">Matcha-TTS VCTK-ONNX</a> |
      <a href="https://huggingface.co/spaces/Akjava/matcha-tts-onnx-benchmarks" style="font-size: 9px" target="link">Matcha-TTS ONNX-Benchmark</a> |
      <br><br>
      <b>Credits</b><br>
      <a href="https://github.com/akjava/Matcha-TTS-Japanese" style="font-size: 9px" target="link">Matcha-TTS-Japanese</a> |
      <a href="http://www.udialogue.org/download/cstr-vctk-corpus.html" style="font-size: 9px" target="link">CSTR VCTK Corpus</a> |
      <a href="https://github.com/cmusphinx/cmudict" style="font-size: 9px" target="link">CMUDict</a> |
      <a href="https://huggingface.co/docs/transformers.js/index" style="font-size: 9px" target="link">Transformers.js</a> |
      <a href="https://huggingface.co/cisco-ai/mini-bart-g2p" style="font-size: 9px" target="link">mini-bart-g2p</a> |
      <a href="https://onnxruntime.ai/docs/get-started/with-javascript/web.html" style="font-size: 9px" target="link">ONNXRuntime-Web</a> |
      <a href="https://github.com/akjava/English-To-IPA-Collections" style="font-size: 9px" target="link">English-To-IPA-Collections</a> |
      <a href="https://huggingface.co/papers/2309.03199" style="font-size: 9px" target="link">Matcha-TTS Paper</a>
    </div>
  </body>
</html>