gdnartea committed on
Commit
c5f8e1d
1 Parent(s): 80cc08d

Create Chatty_Ashe.py

Files changed (1)
  1. Chatty_Ashe.py +45 -0
Chatty_Ashe.py ADDED
@@ -0,0 +1,45 @@
+ import torch
+ import numpy as np
+ import gradio as gr
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, GPT2LMHeadModel, GPT2Tokenizer, VitsModel, AutoTokenizer
+
+ # Load the ASR model and processor
+ asr_processor = Wav2Vec2Processor.from_pretrained("/path/to/canary/processor")
+ asr_model = Wav2Vec2ForCTC.from_pretrained("/path/to/canary/model")
+
+ # Load the text processing model and tokenizer
+ proc_tokenizer = GPT2Tokenizer.from_pretrained("/path/to/phi3/tokenizer")
+ proc_model = GPT2LMHeadModel.from_pretrained("/path/to/phi3/model")
+
+ # Load the TTS model and tokenizer (VITS checkpoints pair with a tokenizer, not a processor)
+ tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
+ tts_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
+
+ def process_speech(speech):
+     # Gradio delivers microphone audio as a (sample_rate, int16 array) tuple
+     sample_rate, audio = speech
+     audio = audio.astype(np.float32) / 32768.0  # int16 -> float32 in [-1.0, 1.0]
+
+     # Convert the speech to text (assumes 16 kHz capture; resample first if needed)
+     inputs = asr_processor(audio, sampling_rate=16_000, return_tensors="pt", padding=True)
+     with torch.no_grad():
+         logits = asr_model(inputs.input_values, attention_mask=inputs.attention_mask).logits
+     predicted_ids = torch.argmax(logits, dim=-1)
+     transcription = asr_processor.decode(predicted_ids[0])
+
+     # Process the text (do_sample=True so temperature actually takes effect)
+     inputs = proc_tokenizer.encode(transcription + proc_tokenizer.eos_token, return_tensors="pt")
+     outputs = proc_model.generate(inputs, max_length=100, do_sample=True, temperature=0.7, pad_token_id=proc_tokenizer.eos_token_id)
+     processed_text = proc_tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+     # Convert the processed text to speech (VITS emits a waveform directly)
+     inputs = tts_tokenizer(processed_text, return_tensors="pt")
+     with torch.no_grad():
+         waveform = tts_model(**inputs).waveform[0]
+
+     # Gradio's audio output expects a (sample_rate, array) tuple
+     return tts_model.config.sampling_rate, waveform.numpy()
+
+ iface = gr.Interface(fn=process_speech, inputs=gr.Audio(sources=["microphone"], type="numpy"), outputs="audio")
+
+ iface.launch()
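
For a quick sanity check without the browser UI, the same pipeline can be driven from a WAV file. The sketch below is a hypothetical usage example, not part of the commit: it assumes a 16 kHz mono recording saved as sample.wav and the third-party soundfile package, and it calls process_speech directly with the same (rate, samples) tuple Gradio's microphone widget would pass in.

import soundfile as sf

# Read a test clip as int16, matching the format the microphone widget delivers
audio, rate = sf.read("sample.wav", dtype="int16")

# Run ASR -> text generation -> TTS end to end
out_rate, out_audio = process_speech((rate, audio))

# Save the synthesized reply for listening
sf.write("reply.wav", out_audio, out_rate)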