Files changed (1)
app.py +26 -31
app.py CHANGED
@@ -2,35 +2,33 @@ import time
 from threading import Thread
 
 import gradio as gr
+import spaces
 import torch
 from PIL import Image
-from transformers import AutoProcessor, LlavaForConditionalGeneration
+from transformers import AutoProcessor, AutoModelForCausalLM
 from transformers import TextIteratorStreamer
 
-import spaces
-
-
 PLACEHOLDER = """
 <div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
-<img src="https://cdn-uploads.huggingface.co/production/uploads/64ccdc322e592905f922a06e/DDIW0kbWmdOQWwy4XMhwX.png" style="width: 80%; max-width: 550px; height: auto; opacity: 0.55; ">
-<h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">LLaVA-Llama-3-8B</h1>
-<p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">Llava-Llama-3-8b is a LLaVA model fine-tuned from Meta-Llama-3-8B-Instruct and CLIP-ViT-Large-patch14-336 with ShareGPT4V-PT and InternVL-SFT by XTuner</p>
+<h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">microsoft/Phi-3-vision-128k-instruct</h1>
 </div>
 """
+user_prompt = '<|user|>\n'
+assistant_prompt = '<|assistant|>\n'
+prompt_suffix = "<|end|>\n"
 
+model_id = "microsoft/Phi-3-vision-128k-instruct"
 
-model_id = "xtuner/llava-llama-3-8b-v1_1-transformers"
-
-processor = AutoProcessor.from_pretrained(model_id)
+processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
 
-model = LlavaForConditionalGeneration.from_pretrained(
+model = AutoModelForCausalLM.from_pretrained(
     model_id,
     torch_dtype=torch.float16,
     low_cpu_mem_usage=True,
+    trust_remote_code=True,
 )
 
 model.to("cuda:0")
-model.generation_config.eos_token_id = 128009
 
 
 @spaces.GPU
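The three chat-format constants added above follow Phi-3's `<|user|>`/`<|assistant|>`/`<|end|>` turn markup. As a rough sketch of how they compose into a single-image prompt (the `build_prompt` helper is hypothetical; the Space builds its prompt inline in the next hunk):

```python
# Hypothetical helper showing how the constants above compose into a
# Phi-3-vision prompt; <|image_1|> stands for the first attached image.
user_prompt = '<|user|>\n'
assistant_prompt = '<|assistant|>\n'
prompt_suffix = "<|end|>\n"

def build_prompt(text: str) -> str:
    return f"{user_prompt}<|image_1|>\n{text}{prompt_suffix}{assistant_prompt}"

print(build_prompt("What is shown in this image?"))
# <|user|>
# <|image_1|>
# What is shown in this image?<|end|>
# <|assistant|>
```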
@@ -51,15 +49,15 @@ def bot_streaming(message, history):
     try:
         if image is None:
             # Handle the case where image is None
-            gr.Error("You need to upload an image for LLaVA to work.")
+            gr.Error("You need to upload an image for Phi-3-vision to work.")
     except NameError:
         # Handle the case where 'image' is not defined at all
-        gr.Error("You need to upload an image for LLaVA to work.")
+        gr.Error("You need to upload an image for Phi-3-vision to work.")
 
-    prompt = f"<|start_header_id|>user<|end_header_id|>\n\n<image>\n{message['text']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
+    prompt = f"{message['text']}<|image_1|>\nCan you convert the table to markdown format?{prompt_suffix}{assistant_prompt}"
     # print(f"prompt: {prompt}")
     image = Image.open(image)
-    inputs = processor(prompt, image, return_tensors='pt').to(0, torch.float16)
+    inputs = processor(prompt, [image], return_tensors='pt').to(0, torch.float16)
 
     streamer = TextIteratorStreamer(processor, **{"skip_special_tokens": False, "skip_prompt": True})
     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024, do_sample=False)
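For comparison, a minimal non-streaming version of this generation path, assuming the `processor` and `model` loaded in the first hunk and the prompt constants defined there (a sketch, not the Space's code):

```python
# Non-streaming sketch of the same path, assuming `processor`, `model`,
# and the prompt constants from above; "example.jpg" is a placeholder.
from PIL import Image

image = Image.open("example.jpg")
prompt = f"{user_prompt}<|image_1|>\nDescribe this image.{prompt_suffix}{assistant_prompt}"

inputs = processor(prompt, [image], return_tensors="pt").to("cuda:0")
output_ids = model.generate(**inputs, max_new_tokens=256, do_sample=False)

# Decode only the newly generated tokens, not the echoed prompt.
new_tokens = output_ids[:, inputs["input_ids"].shape[1]:]
print(processor.batch_decode(new_tokens, skip_special_tokens=True)[0])
```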
@@ -67,9 +65,6 @@ def bot_streaming(message, history):
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
 
-    text_prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{message['text']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
-    # print(f"text_prompt: {text_prompt}")
-
     buffer = ""
     time.sleep(0.5)
     for new_text in streamer:
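Running `model.generate` on a worker thread while the caller iterates a `TextIteratorStreamer` is the standard transformers streaming idiom; restated as one self-contained unit (assuming the `model`, `processor`, and `inputs` objects from above):

```python
# The streaming idiom from this hunk as one unit: generate() runs on a
# worker thread and pushes decoded chunks into the streamer, which the
# main thread consumes as an iterator. Assumes `model`, `processor`,
# and `inputs` from above.
from threading import Thread
from transformers import TextIteratorStreamer

streamer = TextIteratorStreamer(processor, skip_special_tokens=False, skip_prompt=True)
generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024, do_sample=False)

thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()

buffer = ""
for new_text in streamer:  # blocks until the next decoded chunk arrives
    buffer += new_text
thread.join()  # generation is done once the streamer is exhausted
print(buffer)
```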
@@ -78,7 +73,6 @@ def bot_streaming(message, history):
         new_text = new_text.split("<|eot_id|>")[0]
         buffer += new_text
 
-        # generated_text_without_prompt = buffer[len(text_prompt):]
         generated_text_without_prompt = buffer
         # print(generated_text_without_prompt)
         time.sleep(0.06)
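One leftover from the LLaVA version survives here: the loop still splits on Llama-3's `<|eot_id|>` stop marker, while the Phi-3 format defined above ends turns with `<|end|>`. A defensive trim covering both might look like this (hypothetical helper, not in the diff):

```python
# Hypothetical: trim on either stop marker, since this revision keeps the
# Llama-3 "<|eot_id|>" split but Phi-3 emits "<|end|>".
def trim_stop_markers(chunk: str) -> str:
    for stop in ("<|end|>", "<|eot_id|>"):
        chunk = chunk.split(stop)[0]
    return chunk
```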
@@ -86,19 +80,20 @@ def bot_streaming(message, history):
         yield generated_text_without_prompt
 
 
-chatbot=gr.Chatbot(placeholder=PLACEHOLDER,scale=1)
-chat_input = gr.MultimodalTextbox(interactive=True, file_types=["image"], placeholder="Enter message or upload file...", show_label=False)
+chatbot = gr.Chatbot(placeholder=PLACEHOLDER, scale=1)
+chat_input = gr.MultimodalTextbox(interactive=True, file_types=["image"], placeholder="Enter message or upload file...",
+                                  show_label=False)
 with gr.Blocks(fill_height=True, ) as demo:
     gr.ChatInterface(
-        fn=bot_streaming,
-        title="LLaVA Llama-3-8B",
-        examples=[{"text": "What is on the flower?", "files": ["./bee.jpg"]},
-                  {"text": "How to make this pastry?", "files": ["./baklava.png"]}],
-        description="Try [LLaVA Llama-3-8B](https://huggingface.co/xtuner/llava-llama-3-8b-v1_1-transformers). Upload an image and start chatting about it, or simply try one of the examples below. If you don't upload an image, you will receive an error.",
-        stop_btn="Stop Generation",
-        multimodal=True,
-        textbox=chat_input,
-        chatbot=chatbot,
+        fn=bot_streaming,
+        title="Phi-3 Vision 128k Instruct",
+        examples=[{"text": "What is on the flower?", "files": ["./bee.jpg"]},
+                  {"text": "How to make this pastry?", "files": ["./baklava.png"]}],
+        description="Try [microsoft/Phi-3-vision-128k-instruct](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct). Upload an image and start chatting about it, or simply try one of the examples below. If you don't upload an image, you will receive an error.",
+        stop_btn="Stop Generation",
+        multimodal=True,
+        textbox=chat_input,
+        chatbot=chatbot,
     )
 
 demo.queue(api_open=False)
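With `multimodal=True`, `gr.ChatInterface` hands `fn` each turn as a dict rather than a plain string, which is why `bot_streaming` reads `message['text']` and takes the image path from the uploaded files; the shape mirrors the `examples` entries above:

```python
# Shape of the `message` argument passed to bot_streaming when
# multimodal=True (mirrors the examples above; values illustrative).
message = {
    "text": "What is on the flower?",  # the typed prompt
    "files": ["./bee.jpg"],            # paths of uploaded files
}
```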
 