KingNish committed on
Commit
5c47ebc
1 Parent(s): 29ebca1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -41
app.py CHANGED
@@ -12,13 +12,6 @@ from huggingface_hub import hf_hub_download, InferenceClient
12
  import requests
13
  from bs4 import BeautifulSoup
14
  import urllib
15
- import random
16
- from functools import lru_cache
17
- import concurrent.futures
18
-
19
- # Configuration for concurrency
20
- MAX_WORKERS = 4 # Adjust based on your system resources
21
- executor = concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)
22
 
23
  def extract_text_from_webpage(html_content):
24
  """Extracts visible text from HTML content using BeautifulSoup."""
@@ -37,7 +30,7 @@ def search(term, num_results=3, lang="en", advanced=True, timeout=5, safe="activ
37
  start = 0
38
  all_results = []
39
  # Limit the number of characters from each webpage to stay under the token limit
40
- max_chars_per_page = 4000 # Adjust this value based on your token limit and average webpage length
41
 
42
  with requests.Session() as session:
43
  while start < num_results:
@@ -71,29 +64,24 @@ def search(term, num_results=3, lang="en", advanced=True, timeout=5, safe="activ
71
  # Truncate text if it's too long
72
  if len(visible_text) > max_chars_per_page:
73
  visible_text = visible_text[:max_chars_per_page] + "..."
74
- all_results.append({"link": link, "text": visible_text})
75
  except requests.exceptions.RequestException as e:
76
  print(f"Error fetching or processing {link}: {e}")
77
- all_results.append({"link": link, "text": None})
78
  else:
79
- all_results.append({"link": None, "text": None})
80
  start += len(result_block)
81
  return all_results
82
 
83
- @lru_cache(maxsize=1) # Cache the models to avoid reloading
84
- def load_speech_recognition_models():
85
- """Loads and caches speech recognition models."""
86
- model_name = "neongeckocom/stt_en_citrinet_512_gamma_0_25"
87
- sample_rate = 16000
88
- preprocessor = torch.jit.load(hf_hub_download(model_name, "preprocessor.ts", subfolder="onnx"))
89
- encoder = ort.InferenceSession(hf_hub_download(model_name, "model.onnx", subfolder="onnx"))
90
- tokenizer = spm.SentencePieceProcessor(hf_hub_download(model_name, "tokenizer.spm", subfolder="onnx"))
91
- return preprocessor, encoder, tokenizer
92
-
93
  # Speech Recognition Model Configuration
94
  model_name = "neongeckocom/stt_en_citrinet_512_gamma_0_25"
95
  sample_rate = 16000
96
 
 
 
 
 
 
97
  # Mistral Model Configuration
98
  client1 = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
99
  system_instructions1 = "<s>[SYSTEM] Answer as Real OpenGPT 4o, Made by 'KingNish', Keep conversation very short, clear, friendly and concise. The text provided is a request for a specific type of response from you, the virtual assistant. The request asks you to provide friendly responses. The expectation is that I will avoid introductions and start answering the query directly, elaborating on all aspects of the request. As an AI-powered assistant, your task is to generate responses that appear as if they are created by the user and not an AI-powered assistant. [USER]"
@@ -105,8 +93,6 @@ def to_float32(audio_buffer):
105
  return np.divide(audio_buffer, np.iinfo(audio_buffer.dtype).max, dtype=np.float32)
106
 
107
  def transcribe(audio_path):
108
- """Transcribes audio using cached models."""
109
- preprocessor, encoder, tokenizer = load_speech_recognition_models()
110
  audio_file = AudioSegment.from_file(audio_path)
111
  sr = audio_file.frame_rate
112
  audio_buffer = np.array(audio_file.get_array_of_samples())
@@ -126,38 +112,34 @@ def transcribe(audio_path):
126
 
127
  return text
128
 
129
- async def run_model(text, web_search):
130
- """Runs the language model asynchronously."""
131
- if web_search:
132
- web_results = await asyncio.get_event_loop().run_in_executor(executor, search, text) # Run search in executor
133
- web2 = ' '.join([f"Link: {res['link']}\nText: {res['text']}\n\n" for res in web_results])
134
  formatted_prompt = system_instructions1 + text + "[WEB]" + str(web2) + "[ANSWER]"
 
 
135
  else:
136
  formatted_prompt = system_instructions1 + text + "[JARVIS]"
137
- stream = client1.text_generation(formatted_prompt, max_new_tokens=512, stream=True, details=True, return_full_text=False)
138
- return "".join([response.token.text for response in stream if response.token.text != "</s>"])
139
 
140
- async def generate_speech(reply):
141
- """Generates speech asynchronously."""
 
142
  communicate = edge_tts.Communicate(reply)
143
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
144
  tmp_path = tmp_file.name
145
  await communicate.save(tmp_path)
146
  return tmp_path
147
 
148
- async def respond(audio, web_search):
149
- """Handles user input, model processing, and response generation."""
150
- user = await asyncio.get_event_loop().run_in_executor(executor, transcribe, audio) # Run transcription in executor
151
- reply = await run_model(user, web_search)
152
- audio_path = await generate_speech(reply)
153
- return audio_path
154
-
155
  with gr.Blocks() as demo:
156
  with gr.Row():
157
  web_search = gr.Checkbox(label="Web Search", value=False)
158
- input = gr.Audio(label="Voice Chat", sources="microphone", type="numpy")
159
  output = gr.Audio(label="AI",autoplay=True)
160
- gr.Interface(fn=respond, inputs=[input, web_search], outputs=[output], live=True)
161
 
162
  if __name__ == "__main__":
163
  demo.queue(max_size=200).launch()
 
12
  import requests
13
  from bs4 import BeautifulSoup
14
  import urllib
 
 
 
 
 
 
 
15
 
16
  def extract_text_from_webpage(html_content):
17
  """Extracts visible text from HTML content using BeautifulSoup."""
 
30
  start = 0
31
  all_results = []
32
  # Limit the number of characters from each webpage to stay under the token limit
33
+ max_chars_per_page = 3000 # Adjust this value based on your token limit and average webpage length
34
 
35
  with requests.Session() as session:
36
  while start < num_results:
 
64
  # Truncate text if it's too long
65
  if len(visible_text) > max_chars_per_page:
66
  visible_text = visible_text[:max_chars_per_page] + "..."
67
+ all_results.append({"text": visible_text})
68
  except requests.exceptions.RequestException as e:
69
  print(f"Error fetching or processing {link}: {e}")
70
+ all_results.append({"text": None})
71
  else:
72
+ all_results.append({"text": None})
73
  start += len(result_block)
74
  return all_results
75
 
 
 
 
 
 
 
 
 
 
 
76
# Speech Recognition Model Configuration
# Hub repository holding the speech-to-text artifacts, and the sample rate
# configured for the recognizer.
model_name = "neongeckocom/stt_en_citrinet_512_gamma_0_25"
sample_rate = 16000

# Fetch the three speech-recognition artifacts from the "onnx" subfolder of the
# repository and load them once at import time: a TorchScript preprocessor, an
# inference session for model.onnx, and a SentencePiece tokenizer.
preprocessor = torch.jit.load(
    hf_hub_download(model_name, "preprocessor.ts", subfolder="onnx")
)
encoder = ort.InferenceSession(
    hf_hub_download(model_name, "model.onnx", subfolder="onnx")
)
tokenizer = spm.SentencePieceProcessor(
    hf_hub_download(model_name, "tokenizer.spm", subfolder="onnx")
)

# Mistral Model Configuration
# Inference client for the hosted LLM, plus the fixed prompt prefix that is
# placed in front of every user query.
client1 = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
system_instructions1 = (
    "<s>[SYSTEM] Answer as Real OpenGPT 4o, Made by 'KingNish', "
    "Keep conversation very short, clear, friendly and concise. "
    "The text provided is a request for a specific type of response from you, the virtual assistant. "
    "The request asks you to provide friendly responses. "
    "The expectation is that I will avoid introductions and start answering the query directly, "
    "elaborating on all aspects of the request. "
    "As an AI-powered assistant, your task is to generate responses that appear as if they are "
    "created by the user and not an AI-powered assistant. [USER]"
)
 
93
  return np.divide(audio_buffer, np.iinfo(audio_buffer.dtype).max, dtype=np.float32)
94
 
95
  def transcribe(audio_path):
 
 
96
  audio_file = AudioSegment.from_file(audio_path)
97
  sr = audio_file.frame_rate
98
  audio_buffer = np.array(audio_file.get_array_of_samples())
 
112
 
113
  return text
114
 
115
def model(text, web_search):
    """Generate a reply to *text*, optionally grounded in web search results.

    Args:
        text: The user's query; it is appended to the system instructions.
        web_search: When truthy, ``search(text)`` is run and the text of the
            pages it returns is inserted into the prompt after a ``[WEB]``
            marker. Otherwise the prompt ends with ``[JARVIS]``.

    Returns:
        The generated reply as one string, with ``"</s>"`` tokens dropped.
    """
    if web_search:
        web_results = search(text)
        # search() records pages it could not fetch as {"text": None}; leave
        # those out so the literal string "None" never reaches the prompt.
        web2 = ' '.join(
            f"Text: {res['text']}\n\n"
            for res in web_results
            if res['text'] is not None
        )
        formatted_prompt = system_instructions1 + text + "[WEB]" + str(web2) + "[ANSWER]"
    else:
        formatted_prompt = system_instructions1 + text + "[JARVIS]"

    # Both prompt variants go through the same streaming generation call.
    stream = client1.text_generation(
        formatted_prompt,
        max_new_tokens=512,
        stream=True,
        details=True,
        return_full_text=False,
    )
    return "".join(
        response.token.text for response in stream if response.token.text != "</s>"
    )
127
 
128
async def respond(audio, web_search):
    """Turn a recorded voice query into a spoken reply.

    Args:
        audio: The recording, passed unchanged to ``transcribe``.
        web_search: Passed unchanged to ``model`` to toggle web search.

    Returns:
        Path of the temporary file the synthesized reply was saved to. The
        file is created with ``delete=False`` and is not removed here.
    """
    import asyncio  # local import keeps this block self-contained

    # transcribe() and model() are synchronous calls; run them in the default
    # executor so this coroutine does not block the event loop while they work.
    loop = asyncio.get_running_loop()
    user = await loop.run_in_executor(None, transcribe, audio)
    reply = await loop.run_in_executor(None, model, user, web_search)

    communicate = edge_tts.Communicate(reply)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        tmp_path = tmp_file.name
        await communicate.save(tmp_path)
    return tmp_path
136
 
 
 
 
 
 
 
 
137
# Voice-chat UI: a web-search toggle, a microphone input and an autoplaying
# audio output, wired to respond() through a live Interface.
with gr.Blocks() as demo:
    with gr.Row():
        web_search = gr.Checkbox(label="Web Search", value=False)
        # respond() hands the recording to transcribe(), which opens it with
        # AudioSegment.from_file, so the component must deliver a file path
        # rather than Gradio's default numpy format.
        input = gr.Audio(label="Voice Chat", sources="microphone", type="filepath")
        output = gr.Audio(label="AI", autoplay=True)
        # batch=True is deliberately not set: in batch mode Gradio passes a
        # list per parameter and expects lists back, whereas respond() handles
        # exactly one recording per call.
        gr.Interface(
            fn=respond,
            inputs=[input, web_search],
            outputs=[output],
            live=True,
            delete_cache=(60, 60),
        )

if __name__ == "__main__":
    demo.queue(max_size=200).launch()