Spaces: FunAudioLLM/CosyVoice-300M

Runtime error

App | Files | Community

change model repo to hf

by wenmengzhou - opened Jul 18

base: refs/heads/main ← from: refs/pr/1

Discussion Files changed

-11

Files changed (3) hide show

cosyvoice/cli/cosyvoice.py +2 -2
cosyvoice/cli/model.py +4 -6
css/utils.py +3 -3

cosyvoice/cli/cosyvoice.py CHANGED Viewed

@@ -14,7 +14,7 @@
 import os
 import torch
 from hyperpyyaml import load_hyperpyyaml
-from modelscope import snapshot_download
 from cosyvoice.cli.frontend import CosyVoiceFrontEnd
 from cosyvoice.cli.model import CosyVoiceModel
@@ -24,7 +24,7 @@ class CosyVoice:
         instruct = True if '-Instruct' in model_dir else False
         self.model_dir = model_dir
         if not os.path.exists(model_dir):
-            model_dir = snapshot_download(model_dir)
         with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
             configs = load_hyperpyyaml(f)
         self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],

 import os
 import torch
 from hyperpyyaml import load_hyperpyyaml
+from huggingface_hub import snapshot_download
 from cosyvoice.cli.frontend import CosyVoiceFrontEnd
 from cosyvoice.cli.model import CosyVoiceModel
         instruct = True if '-Instruct' in model_dir else False
         self.model_dir = model_dir
         if not os.path.exists(model_dir):
+            model_dir = snapshot_download(model_dir, local_dir=model_dir)
         with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
             configs = load_hyperpyyaml(f)
         self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],

cosyvoice/cli/model.py CHANGED Viewed

@@ -19,18 +19,17 @@ class CosyVoiceModel:
                  llm: torch.nn.Module,
                  flow: torch.nn.Module,
                  hift: torch.nn.Module):
-        #self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        self.device = 'cpu'
         self.llm = llm
         self.flow = flow
         self.hift = hift
     def load(self, llm_model, flow_model, hift_model):
-        self.llm.load_state_dict(torch.load(llm_model, map_location=self.device))
         self.llm.to(self.device).eval()
-        self.flow.load_state_dict(torch.load(flow_model, map_location=self.device))
         self.flow.to(self.device).eval()
-        self.hift.load_state_dict(torch.load(hift_model, map_location=self.device))
         self.hift.to(self.device).eval()
     def inference(self, text, text_len, flow_embedding, llm_embedding=torch.zeros(0, 192),
@@ -38,7 +37,6 @@ class CosyVoiceModel:
                   llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), llm_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
                   flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), flow_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
                   prompt_speech_feat=torch.zeros(1, 0, 80), prompt_speech_feat_len=torch.zeros(1, dtype=torch.int32)):
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         tts_speech_token = self.llm.inference(text=text.to(self.device),
                                               text_len=text_len.to(self.device),
                                               prompt_text=prompt_text.to(self.device),

                  llm: torch.nn.Module,
                  flow: torch.nn.Module,
                  hift: torch.nn.Module):
+        self.device = torch.device('cuda')
         self.llm = llm
         self.flow = flow
         self.hift = hift
     def load(self, llm_model, flow_model, hift_model):
+        self.llm.load_state_dict(torch.load(llm_model, map_location='cpu'))
         self.llm.to(self.device).eval()
+        self.flow.load_state_dict(torch.load(flow_model, map_location='cpu'))
         self.flow.to(self.device).eval()
+        self.hift.load_state_dict(torch.load(hift_model, map_location='cpu'))
         self.hift.to(self.device).eval()
     def inference(self, text, text_len, flow_embedding, llm_embedding=torch.zeros(0, 192),
                   llm_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), llm_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
                   flow_prompt_speech_token=torch.zeros(1, 0, dtype=torch.int32), flow_prompt_speech_token_len=torch.zeros(1, dtype=torch.int32),
                   prompt_speech_feat=torch.zeros(1, 0, 80), prompt_speech_feat_len=torch.zeros(1, dtype=torch.int32)):
         tts_speech_token = self.llm.inference(text=text.to(self.device),
                                               text_len=text_len.to(self.device),
                                               prompt_text=prompt_text.to(self.device),

css/utils.py CHANGED Viewed

@@ -13,9 +13,9 @@ from cosyvoice.utils.file_utils import load_wav
 from cosyvoice.cli.cosyvoice import CosyVoice
-cosyvoice= CosyVoice('speech_tts/CosyVoice-300M')
-cosyvoice_sft= CosyVoice('speech_tts/CosyVoice-300M-SFT')
-cosyvoice_instruct= CosyVoice('speech_tts/CosyVoice-300M-Instruct')
 example_tts_text = ["我们走的每一步，都是我们策略的一部分；你看到的所有一切，包括我此刻与你交谈，所做的一切，所说的每一句话，都有深远的含义。",
                     "那位喜剧演员真有才，[laughter]一开口就让全场观众爆笑。",

 from cosyvoice.cli.cosyvoice import CosyVoice
+cosyvoice= CosyVoice('FunAudioLLM/CosyVoice-300M')
+cosyvoice_sft= CosyVoice('FunAudioLLM/CosyVoice-300M-SFT')
+cosyvoice_instruct= CosyVoice('FunAudioLLM/CosyVoice-300M-Instruct')
 example_tts_text = ["我们走的每一步，都是我们策略的一部分；你看到的所有一切，包括我此刻与你交谈，所做的一切，所说的每一句话，都有深远的含义。",
                     "那位喜剧演员真有才，[laughter]一开口就让全场观众爆笑。",