Spaces:

h2oai
/

h2ovl-mississippi

Running on A10G

App Files Files Community

Shanshan Wang committed 4 days ago

Commit

bcfef20

•

1 Parent(s): d6bfd67

Track binary files with Git LFS

Browse files

Files changed (3) hide show

.gitattributes +1 -0
app.py +69 -4
assets/rental_application.png +3 -0

.gitattributes CHANGED Viewed

@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 assets/handwritten-note-example.jpg filter=lfs diff=lfs merge=lfs -text

 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 assets/handwritten-note-example.jpg filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text

app.py CHANGED Viewed

@@ -30,6 +30,29 @@ example_prompts = [
 ]
 def load_model_and_set_image_function(model_name):
     # Get the model path from the model_paths dictionary
     model_path = model_paths[model_name]
@@ -245,10 +268,34 @@ def regenerate_response(chatbot,
 def clear_all():
     return [], None, None, ""  # Clear chatbot, state, reset image_input
 # Build the Gradio interface
 with gr.Blocks() as demo:
-    gr.Markdown("# **H2OVL-Mississippi**")
     state= gr.State()
     model_state = gr.State()
@@ -258,7 +305,12 @@ with gr.Blocks() as demo:
             label="Select Model",
             value="H2OVL-Mississippi-2B"
         )
     with gr.Row(equal_height=True):
         # First column with image input
@@ -293,7 +345,7 @@ with gr.Blocks() as demo:
         inputs=None,
         outputs=[chatbot, state]
     )
     # Reset chatbot and state when image input changes
     image_input.change(
@@ -343,6 +395,18 @@ with gr.Blocks() as demo:
                 label="Tile Number (default: 6)"
             )
     with gr.Row():
         submit_button = gr.Button("Submit")
         regenerate_button = gr.Button("Regenerate")
@@ -394,6 +458,7 @@ with gr.Blocks() as demo:
     gr.Examples(
         examples=[
             ["assets/handwritten-note-example.jpg", "Read the text on the image"],
             ["assets/receipt.jpg", "Extract the text from the image."],
             ["assets/driver_license.png", "Extract the text from the image and fill the following json {'license_number':'',\n'full_name':'',\n'date_of_birth':'',\n'address':'',\n'issue_date':'',\n'expiration_date':'',\n}"],
             ["assets/invoice.png", "Please extract the following fields, and return the result in JSON format: supplier_name, supplier_address, customer_name, customer_address, invoice_number, invoice_total_amount, invoice_tax_amount"],

 ]
+# Function to handle task type logic
+# Returns the max_new_tokens value for the selected task type:
+# 3072 when task_type is "OCR", 1024 for any other value.
+# NOTE(review): model_name is accepted but never read in this body.
+def handle_task_type(task_type, model_name):
+    max_new_tokens = 1024  # Default value
+    if task_type == "OCR":
+        max_new_tokens = 3072  # Adjust for OCR
+    return max_new_tokens
+# Function to handle task type logic and default question
+# Returns a (max_new_tokens, default_question) pair for the selected task type.
+# default_question is example_prompts[0] when task_type is "OCR", else None.
+def handle_task_type_and_prompt(task_type, model_name):
+    # Delegate to handle_task_type so the token limit is decided in one place.
+    max_new_tokens = handle_task_type(task_type, model_name)
+    # example_prompts is a module-level list defined outside this function.
+    default_question = example_prompts[0] if task_type == "OCR" else None
+    return max_new_tokens, default_question
+# Picks a default task type from the model name and returns it together with
+# the matching max_new_tokens value, as a (task_type, max_new_tokens) pair.
+# Matching is a case-insensitive substring test on model_name:
+#   contains "2b"   -> "Document extractor"
+#   contains "0.8b" -> "OCR"
+#   otherwise       -> "Chat"
+def update_task_type_on_model_change(model_name):
+    # Set default task type and max_new_tokens based on the model
+    if '2b' in model_name.lower():
+        return "Document extractor", handle_task_type("Document extractor", model_name)
+    elif '0.8b' in model_name.lower():
+        return "OCR", handle_task_type("OCR", model_name)
+    else:
+        # Fallback for any model name matching neither size tag.
+        return "Chat", handle_task_type("Chat", model_name)
 def load_model_and_set_image_function(model_name):
     # Get the model path from the model_paths dictionary
     model_path = model_paths[model_name]
 def clear_all():
     return [], None, None, ""  # Clear chatbot, state, reset image_input
+title_html = """
+<h1> <span class="gradient-text" id="text">H2OVL-Mississippi</span><span class="plain-text">: Lightweight Vision Language Models for OCR and Doc AI tasks</span></h1>
+<a href="https://huggingface.co/collections/h2oai/h2ovl-mississippi-66e492da45da0a1b7ea7cf39">[😊 Hugging Face]</a>
+<a href="https://arxiv.org/abs/2410.13611">[📜 Paper]</a>
+<a href="https://huggingface.co/spaces/h2oai/h2ovl-mississippi-benchmarks">[🌟 Benchmarks]</a>
+"""
 # Build the Gradio interface
 with gr.Blocks() as demo:
+    gr.HTML(title_html)
+    gr.HTML("""
+        <style>
+        .gradient-text {
+            font-size: 36px !important;
+            font-weight: bold !important;
+        }
+        .plain-text {
+            font-size: 32px !important;
+        }
+        h1 {
+            margin-bottom: 20px !important;
+        }
+        </style>
+    """)
     state= gr.State()
     model_state = gr.State()
             label="Select Model",
             value="H2OVL-Mississippi-2B"
         )
+        task_type_dropdown = gr.Dropdown(
+            choices=["OCR", "Document extractor", "Chat"],
+            label="Select Task Type",
+            value="Document extractor"
+        )
     with gr.Row(equal_height=True):
         # First column with image input
         inputs=None,
         outputs=[chatbot, state]
     )
     # Reset chatbot and state when image input changes
     image_input.change(
                 label="Tile Number (default: 6)"
             )
+    model_dropdown.change(
+        fn=update_task_type_on_model_change,
+        inputs=[model_dropdown],
+        outputs=[task_type_dropdown, max_new_tokens_input]
+    )
+    task_type_dropdown.change(
+        fn=handle_task_type_and_prompt,
+        inputs=[task_type_dropdown, model_dropdown],
+        outputs=[max_new_tokens_input, user_input]
+    )
     with gr.Row():
         submit_button = gr.Button("Submit")
         regenerate_button = gr.Button("Regenerate")
     gr.Examples(
         examples=[
             ["assets/handwritten-note-example.jpg", "Read the text on the image"],
+            ["assets/rental_application.png", "Read the text and provide word by word ocr for the document. <doc>"],
             ["assets/receipt.jpg", "Extract the text from the image."],
             ["assets/driver_license.png", "Extract the text from the image and fill the following json {'license_number':'',\n'full_name':'',\n'date_of_birth':'',\n'address':'',\n'issue_date':'',\n'expiration_date':'',\n}"],
             ["assets/invoice.png", "Please extract the following fields, and return the result in JSON format: supplier_name, supplier_address, customer_name, customer_address, invoice_number, invoice_total_amount, invoice_tax_amount"],

assets/rental_application.png ADDED Viewed

Git LFS Details

SHA256: ca7934a2f163d98ac0f893f73cd3dfcf45b4b94001a779b735ad8b481d4aebab
Pointer size: 132 Bytes
Size of remote file: 2.34 MB