KingNish committed on
Commit 7e36853
1 Parent(s): c787a85

Update app.py

Files changed (1)
  1. app.py +21 -14
app.py CHANGED
@@ -21,7 +21,7 @@ import io
 import datasets
 
 import gradio as gr
-from transformers import AutoProcessor, TextIteratorStreamer
+from transformers import AutoModel, AutoProcessor, TextIteratorStreamer
 from transformers import Idefics2ForConditionalGeneration
 import tempfile
 from streaming_stt_nemo import Model
@@ -30,17 +30,24 @@ import edge_tts
 import asyncio
 from transformers import pipeline
 
-oracle = pipeline(model="dandelin/vilt-b32-finetuned-vqa")
-
-async def answer_question(image, question):
-    response = oracle(question=question, image=image)
-    response2 = response[0]['answer']
-    answer2 = str(response2)
-    communicate = edge_tts.Communicate(answer2)
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
-        tmp_path = tmp_file.name
-        await communicate.save(tmp_path)
-    yield tmp_path
+model = AutoModel.from_pretrained("unum-cloud/uform-gen2-dpo", trust_remote_code=True)
+processor = AutoProcessor.from_pretrained("unum-cloud/uform-gen2-dpo", trust_remote_code=True)
+
+@spaces.GPU(duration=10, queue=False)
+def answer_question(image, prompt):
+    inputs = processor(text=[prompt], images=[image], return_tensors="pt")
+    with torch.inference_mode():
+        output = model.generate(
+            **inputs,
+            do_sample=False,
+            use_cache=True,
+            max_new_tokens=256,
+            eos_token_id=151645,
+            pad_token_id=processor.tokenizer.pad_token_id
+        )
+    prompt_len = inputs["input_ids"].shape[1]
+    decoded_text = processor.batch_decode(output[:, prompt_len:])[0]
+    return decoded_text
 
 from gradio import Image, Textbox
 
@@ -307,7 +314,7 @@ def extract_images_from_msg_list(msg_list):
     return all_images
 
 
-@spaces.GPU(duration=60, queue=False)
+@spaces.GPU(duration=30, queue=False)
 def model_inference(
     user_prompt,
     chat_history,
@@ -535,7 +542,7 @@ with gr.Blocks() as voice2:
         outputs=[output], live=True)
 
 with gr.Blocks() as video:
-    gr.Markdown(" ## Live Chat Beta")
+    gr.Markdown(" ## Live Chat")
     gr.Markdown("### Click camera option to update image")
     gr.Interface(
         fn=answer_question,
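For reference, the replacement answer_question in the second hunk follows the generation recipe published for the unum-cloud/uform-gen2-dpo checkpoint. A minimal standalone sketch of that same path, runnable outside the Space, could look like the following; the PIL import, the example image path, and running on CPU are illustrative assumptions, and torch plus spaces are expected to be imported elsewhere in app.py (they are not visible in these hunks).

import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor

# Load the VQA model and processor used by the commit (remote code required).
model = AutoModel.from_pretrained("unum-cloud/uform-gen2-dpo", trust_remote_code=True)
processor = AutoProcessor.from_pretrained("unum-cloud/uform-gen2-dpo", trust_remote_code=True)

def answer_question(image, prompt):
    # Preprocess the prompt and image into model inputs.
    inputs = processor(text=[prompt], images=[image], return_tensors="pt")
    with torch.inference_mode():
        output = model.generate(
            **inputs,
            do_sample=False,               # greedy decoding, as in the commit
            use_cache=True,
            max_new_tokens=256,
            eos_token_id=151645,           # end-of-turn token id used by the commit
            pad_token_id=processor.tokenizer.pad_token_id,
        )
    # Drop the prompt tokens and decode only the generated answer.
    prompt_len = inputs["input_ids"].shape[1]
    return processor.batch_decode(output[:, prompt_len:])[0]

print(answer_question(Image.open("example.jpg"), "What is in this picture?"))  # illustrative image path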
 
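The video tab's gr.Interface call is truncated at fn=answer_question in the last hunk, so only the function binding is certain. A sketch of how the wiring could look is below; the input and output components, their labels, and live=True are assumptions chosen for illustration (the file does import Image and Textbox from gradio), and answer_question is the function added earlier in app.py.

import gradio as gr
from gradio import Image, Textbox

with gr.Blocks() as video:
    gr.Markdown(" ## Live Chat")
    gr.Markdown("### Click camera option to update image")
    gr.Interface(
        fn=answer_question,                                      # VQA function defined earlier in app.py
        inputs=[Image(type="pil"), Textbox(label="Question")],   # assumed components
        outputs=Textbox(label="Answer"),                         # assumed component
        live=True,                                               # assumed, mirroring the voice2 interface
    )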