Spaces: Running on Zero

moondream #2
by harshnarayan12 - opened

app.py CHANGED
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import spaces
 import torch
 from PIL import Image
 from einops import rearrange
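This hunk drops `import spaces`, and a later hunk removes the matching `@spaces.GPU` decorator, so the Space no longer requests on-demand ZeroGPU allocation per call. As a rough, hedged sketch of the pattern being removed (assuming the standard `spaces` package available on ZeroGPU hardware; the function body is illustrative, not the Space's actual code):

```python
# Sketch of the ZeroGPU pattern this change removes (illustrative only).
import spaces
import torch

@spaces.GPU(duration=10)  # request a GPU for roughly 10 s per call
def run_on_gpu(batch: torch.Tensor) -> float:
    # Inside the decorated function, CUDA is available on ZeroGPU hardware.
    return batch.to("cuda").sum().item()
```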
@@ -1164,7 +1163,7 @@ from transformers import TextIteratorStreamer
 import hashlib
 import os
 
-model_path = snapshot_download("vikhyatk/moondream1"
+model_path = snapshot_download("vikhyatk/moondream1")
 
 vision_encoder = VisionEncoder(model_path).to(DEVICE, dtype=DTYPE)
 text_model = TextModel(model_path).to(DEVICE, dtype=DTYPE)
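For context, `snapshot_download` comes from `huggingface_hub` and returns the local path of the downloaded repo, which the Space's own `VisionEncoder` and `TextModel` classes then load from. A minimal sketch of that flow; the `DEVICE`/`DTYPE` values here are assumptions, since their definitions sit outside this diff:

```python
# Minimal sketch: download the model repo, then load weights from the local path.
import torch
from huggingface_hub import snapshot_download

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"      # assumed setup
DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32  # assumed setup

model_path = snapshot_download("vikhyatk/moondream1")  # local directory with the weights
# VisionEncoder / TextModel are defined earlier in this app.py and not shown here:
# vision_encoder = VisionEncoder(model_path).to(DEVICE, dtype=DTYPE)
# text_model = TextModel(model_path).to(DEVICE, dtype=DTYPE)
```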
@@ -1186,7 +1185,6 @@ def cached_vision_encoder(image):
     return image_vec.to(DEVICE, dtype=DTYPE)
 
 
-@spaces.GPU(duration=10)
 def answer_question(image, question):
     yield "Encoding image..."
 
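The hunk header points at `cached_vision_encoder`, which, together with the `hashlib` and `os` imports above, suggests the image embedding is cached keyed by a hash of the image bytes. A hedged sketch of that idea; the helper name and on-disk cache layout are assumptions, not the Space's actual implementation:

```python
# Hypothetical sketch of caching a vision-encoder output keyed by an image hash.
import hashlib
import os
import torch

CACHE_DIR = "vision_cache"  # assumed location
os.makedirs(CACHE_DIR, exist_ok=True)

def cached_encode(image, encoder):
    # Hash the raw image bytes so identical uploads reuse the stored embedding.
    key = hashlib.sha256(image.tobytes()).hexdigest()
    path = os.path.join(CACHE_DIR, f"{key}.pt")
    if os.path.exists(path):
        return torch.load(path)
    vec = encoder(image)  # expensive forward pass only on a cache miss
    torch.save(vec, path)
    return vec
```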
@@ -1205,9 +1203,9 @@ def answer_question(image, question):
 
 
 with gr.Blocks() as demo:
-    gr.HTML("<h1 class='gradio-heading'><center>🌔
+    gr.HTML("<h1 class='gradio-heading'><center>🌔 moondream</center></h1>")
     gr.HTML(
-        "<center><p class='gradio-sub-heading'>moondream1 is
+        "<center><p class='gradio-sub-heading'>moondream1 is a tiny (1.6B parameter) vision language model trained by <a href='https://x.com/vikhyatk'>@vikhyatk</a> that performs on par with models twice its size. It is trained on the LLaVa training dataset, and initialized with SigLIP as the vision tower and Phi-1.5 as the text encoder. Check out the <a href='https://huggingface.co/vikhyatk/moondream1'>HuggingFace model card</a> for more details.</p></center>"
     )
     with gr.Group():
         with gr.Row():
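`answer_question` is a generator (it yields "Encoding image..." before the answer), which Gradio streams into the output component as partial updates when the queue is enabled. A minimal sketch of how the UI pieces visible in this hunk typically wire together; the output component, button, and stub function body are assumptions, since the rest of the layout sits outside the hunk:

```python
# Hypothetical sketch of wiring a streaming generator into a Blocks UI.
import gradio as gr

def answer_question(image, question):
    yield "Encoding image..."            # partial update shown immediately
    yield f"(answer about: {question})"  # final text replaces the placeholder

with gr.Blocks() as demo:
    gr.HTML("<h1><center>🌔 moondream</center></h1>")
    with gr.Group():
        with gr.Row():
            img = gr.Image(type="pil", label="Image")
            prompt = gr.Textbox(lines=2, label="Question")
    output = gr.TextArea(label="Answer")
    submit = gr.Button("Submit")
    submit.click(answer_question, inputs=[img, prompt], outputs=output)

demo.queue().launch()
```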
@@ -1228,3 +1226,24 @@ with gr.Blocks() as demo:
 
 demo.queue().launch(debug=True)
 
+# gr.Interface(
+#     title="🌔 moondream1",
+#     description="""
+#     moondream1 is a tiny (1.6B parameter) vision language model trained by <a href="https://x.com/vikhyatk">@vikhyatk</a> that performs on par with models twice its size. It is trained on the LLaVa training dataset, and initialized with SigLIP as the vision tower and Phi-1.5 as the text encoder. Check out the <a href="https://huggingface.co/vikhyatk/moondream1">HuggingFace model card</a> for more details.
+#     """,
+#     fn=answer_question,
+#     inputs=[gr.Image(type="pil"), gr.Textbox(lines=2, label="Question")],
+#     examples=[
+#         [Image.open("assets/demo-1.jpg"), "Who is the author of this book?"],
+#         [Image.open("assets/demo-2.jpg"), "What type of food is the girl eating?"],
+#         [
+#             Image.open("assets/demo-3.jpg"),
+#             "What kind of public transportation is in the image?",
+#         ],
+#         [Image.open("assets/demo-4.jpg"), "What is the girl looking at?"],
+#         [Image.open("assets/demo-5.jpg"), "What kind of dog is in the picture?"],
+#     ],
+#     outputs=gr.TextArea(label="Answer"),
+#     allow_flagging="never",
+#     cache_examples=False,
+# ).launch()
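The newly added block is the previous `gr.Interface`-based UI, kept as a comment rather than deleted. If it were ever revived, the essential pattern is just an Interface over the same generator function; a minimal, hedged sketch with the examples and description omitted and a stub standing in for the real function:

```python
# Minimal sketch of the Interface variant preserved in the comment above.
import gradio as gr

def answer_question(image, question):   # stub standing in for the real generator
    yield f"(answer about: {question})"

gr.Interface(
    fn=answer_question,
    inputs=[gr.Image(type="pil"), gr.Textbox(lines=2, label="Question")],
    outputs=gr.TextArea(label="Answer"),
    title="🌔 moondream1",
    allow_flagging="never",
).launch()
```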