Spaces: Running on Zero

moondream #2
by harshnarayan12 - opened

app.py CHANGED
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import spaces
 import torch
 from PIL import Image
 from einops import rearrange
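This hunk drops `import spaces`, and a later hunk removes the matching `@spaces.GPU` decorator, so the Space no longer requests on-demand ZeroGPU allocation per call. As a rough, hedged sketch of the pattern being removed (assuming the standard `spaces` package available on ZeroGPU hardware; the function body is illustrative, not the Space's actual code):

```python
# Sketch of the ZeroGPU pattern this change removes (illustrative only).
import spaces
import torch

@spaces.GPU(duration=10)  # request a GPU for roughly 10 s per call
def run_on_gpu(batch: torch.Tensor) -> float:
    # Inside the decorated function, CUDA is available on ZeroGPU hardware.
    return batch.to("cuda").sum().item()
```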
@@ -1164,7 +1163,7 @@ from transformers import TextIteratorStreamer
 import hashlib
 import os
 
-model_path = snapshot_download("vikhyatk/moondream1"
+model_path = snapshot_download("vikhyatk/moondream1")
 
 vision_encoder = VisionEncoder(model_path).to(DEVICE, dtype=DTYPE)
 text_model = TextModel(model_path).to(DEVICE, dtype=DTYPE)
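For context, `snapshot_download` comes from `huggingface_hub` and returns the local path of the downloaded repo, which the Space's own `VisionEncoder` and `TextModel` classes then load from. A minimal sketch of that flow; the `DEVICE`/`DTYPE` values here are assumptions, since their definitions sit outside this diff:

```python
# Minimal sketch: download the model repo, then load weights from the local path.
import torch
from huggingface_hub import snapshot_download

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"      # assumed setup
DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32  # assumed setup

model_path = snapshot_download("vikhyatk/moondream1")  # local directory with the weights
# VisionEncoder / TextModel are defined earlier in this app.py and not shown here:
# vision_encoder = VisionEncoder(model_path).to(DEVICE, dtype=DTYPE)
# text_model = TextModel(model_path).to(DEVICE, dtype=DTYPE)
```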
@@ -1186,7 +1185,6 @@ def cached_vision_encoder(image):
     return image_vec.to(DEVICE, dtype=DTYPE)
 
 
-@spaces.GPU(duration=10)
 def answer_question(image, question):
     yield "Encoding image..."
 
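The hunk header points at `cached_vision_encoder`, which, together with the `hashlib` and `os` imports above, suggests the image embedding is cached keyed by a hash of the image bytes. A hedged sketch of that idea; the helper name and on-disk cache layout are assumptions, not the Space's actual implementation:

```python
# Hypothetical sketch of caching a vision-encoder output keyed by an image hash.
import hashlib
import os
import torch

CACHE_DIR = "vision_cache"  # assumed location
os.makedirs(CACHE_DIR, exist_ok=True)

def cached_encode(image, encoder):
    # Hash the raw image bytes so identical uploads reuse the stored embedding.
    key = hashlib.sha256(image.tobytes()).hexdigest()
    path = os.path.join(CACHE_DIR, f"{key}.pt")
    if os.path.exists(path):
        return torch.load(path)
    vec = encoder(image)  # expensive forward pass only on a cache miss
    torch.save(vec, path)
    return vec
```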
@@ -1205,9 +1203,9 @@ def answer_question(image, question):
 
 
 with gr.Blocks() as demo:
-    gr.HTML("<h1 class='gradio-heading'><center>🌔
+    gr.HTML("<h1 class='gradio-heading'><center>🌔 moondream</center></h1>")
     gr.HTML(
-        "<center><p class='gradio-sub-heading'>moondream1 is
+        "<center><p class='gradio-sub-heading'>moondream1 is a tiny (1.6B parameter) vision language model trained by <a href='https://x.com/vikhyatk'>@vikhyatk</a> that performs on par with models twice its size. It is trained on the LLaVa training dataset, and initialized with SigLIP as the vision tower and Phi-1.5 as the text encoder. Check out the <a href='https://huggingface.co/vikhyatk/moondream1'>HuggingFace model card</a> for more details.</p></center>"
     )
     with gr.Group():
         with gr.Row():
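`answer_question` is a generator (it yields "Encoding image..." before the answer), which Gradio streams into the output component as partial updates when the queue is enabled. A minimal sketch of how the UI pieces visible in this hunk typically wire together; the output component, button, and stub function body are assumptions, since the rest of the layout sits outside the hunk:

```python
# Hypothetical sketch of wiring a streaming generator into a Blocks UI.
import gradio as gr

def answer_question(image, question):
    yield "Encoding image..."            # partial update shown immediately
    yield f"(answer about: {question})"  # final text replaces the placeholder

with gr.Blocks() as demo:
    gr.HTML("<h1><center>🌔 moondream</center></h1>")
    with gr.Group():
        with gr.Row():
            img = gr.Image(type="pil", label="Image")
            prompt = gr.Textbox(lines=2, label="Question")
    output = gr.TextArea(label="Answer")
    submit = gr.Button("Submit")
    submit.click(answer_question, inputs=[img, prompt], outputs=output)

demo.queue().launch()
```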
@@ -1228,3 +1226,24 @@ with gr.Blocks() as demo:
 
 demo.queue().launch(debug=True)
 
+# gr.Interface(
+#     title="🌔 moondream1",
+#     description="""
+#     moondream1 is a tiny (1.6B parameter) vision language model trained by <a href="https://x.com/vikhyatk">@vikhyatk</a> that performs on par with models twice its size. It is trained on the LLaVa training dataset, and initialized with SigLIP as the vision tower and Phi-1.5 as the text encoder. Check out the <a href="https://huggingface.co/vikhyatk/moondream1">HuggingFace model card</a> for more details.
+#     """,
+#     fn=answer_question,
+#     inputs=[gr.Image(type="pil"), gr.Textbox(lines=2, label="Question")],
+#     examples=[
+#         [Image.open("assets/demo-1.jpg"), "Who is the author of this book?"],
+#         [Image.open("assets/demo-2.jpg"), "What type of food is the girl eating?"],
+#         [
+#             Image.open("assets/demo-3.jpg"),
+#             "What kind of public transportation is in the image?",
+#         ],
+#         [Image.open("assets/demo-4.jpg"), "What is the girl looking at?"],
+#         [Image.open("assets/demo-5.jpg"), "What kind of dog is in the picture?"],
+#     ],
+#     outputs=gr.TextArea(label="Answer"),
+#     allow_flagging="never",
+#     cache_examples=False,
+# ).launch()
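The newly added block is the previous `gr.Interface`-based UI, kept as a comment rather than deleted. If it were ever revived, the essential pattern is just an Interface over the same generator function; a minimal, hedged sketch with the examples and description omitted and a stub standing in for the real function:

```python
# Minimal sketch of the Interface variant preserved in the comment above.
import gradio as gr

def answer_question(image, question):   # stub standing in for the real generator
    yield f"(answer about: {question})"

gr.Interface(
    fn=answer_question,
    inputs=[gr.Image(type="pil"), gr.Textbox(lines=2, label="Question")],
    outputs=gr.TextArea(label="Answer"),
    title="🌔 moondream1",
    allow_flagging="never",
).launch()
```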