Shanshan Wang commited on
Commit
bcfef20
β€’
1 Parent(s): d6bfd67

Track binary files with Git LFS

Browse files
Files changed (3) hide show
  1. .gitattributes +1 -0
  2. app.py +69 -4
  3. assets/rental_application.png +3 -0
.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  assets/handwritten-note-example.jpg filter=lfs diff=lfs merge=lfs -text
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  assets/handwritten-note-example.jpg filter=lfs diff=lfs merge=lfs -text
37
+ *.png filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -30,6 +30,29 @@ example_prompts = [
30
  ]
31
 
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  def load_model_and_set_image_function(model_name):
34
  # Get the model path from the model_paths dictionary
35
  model_path = model_paths[model_name]
@@ -245,10 +268,34 @@ def regenerate_response(chatbot,
245
  def clear_all():
246
  return [], None, None, "" # Clear chatbot, state, reset image_input
247
 
 
 
 
 
 
 
 
 
 
 
248
  # Build the Gradio interface
249
  with gr.Blocks() as demo:
250
- gr.Markdown("# **H2OVL-Mississippi**")
251
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
  state= gr.State()
253
  model_state = gr.State()
254
 
@@ -258,7 +305,12 @@ with gr.Blocks() as demo:
258
  label="Select Model",
259
  value="H2OVL-Mississippi-2B"
260
  )
261
-
 
 
 
 
 
262
 
263
  with gr.Row(equal_height=True):
264
  # First column with image input
@@ -293,7 +345,7 @@ with gr.Blocks() as demo:
293
  inputs=None,
294
  outputs=[chatbot, state]
295
  )
296
-
297
 
298
  # Reset chatbot and state when image input changes
299
  image_input.change(
@@ -343,6 +395,18 @@ with gr.Blocks() as demo:
343
  label="Tile Number (default: 6)"
344
  )
345
 
 
 
 
 
 
 
 
 
 
 
 
 
346
  with gr.Row():
347
  submit_button = gr.Button("Submit")
348
  regenerate_button = gr.Button("Regenerate")
@@ -394,6 +458,7 @@ with gr.Blocks() as demo:
394
  gr.Examples(
395
  examples=[
396
  ["assets/handwritten-note-example.jpg", "Read the text on the image"],
 
397
  ["assets/receipt.jpg", "Extract the text from the image."],
398
  ["assets/driver_license.png", "Extract the text from the image and fill the following json {'license_number':'',\n'full_name':'',\n'date_of_birth':'',\n'address':'',\n'issue_date':'',\n'expiration_date':'',\n}"],
399
  ["assets/invoice.png", "Please extract the following fields, and return the result in JSON format: supplier_name, supplier_address, customer_name, customer_address, invoice_number, invoice_total_amount, invoice_tax_amount"],
 
30
  ]
31
 
32
 
33
+ # Function to handle task type logic
34
+ def handle_task_type(task_type, model_name):
35
+ max_new_tokens = 1024 # Default value
36
+ if task_type == "OCR":
37
+ max_new_tokens = 3072 # Adjust for OCR
38
+ return max_new_tokens
39
+
40
+ # Function to handle task type logic and default question
41
+ def handle_task_type_and_prompt(task_type, model_name):
42
+ max_new_tokens = handle_task_type(task_type, model_name)
43
+ default_question = example_prompts[0] if task_type == "OCR" else None
44
+ return max_new_tokens, default_question
45
+
46
+ def update_task_type_on_model_change(model_name):
47
+ # Set default task type and max_new_tokens based on the model
48
+ if '2b' in model_name.lower():
49
+ return "Document extractor", handle_task_type("Document extractor", model_name)
50
+ elif '0.8b' in model_name.lower():
51
+ return "OCR", handle_task_type("OCR", model_name)
52
+ else:
53
+ return "Chat", handle_task_type("Chat", model_name)
54
+
55
+
56
  def load_model_and_set_image_function(model_name):
57
  # Get the model path from the model_paths dictionary
58
  model_path = model_paths[model_name]
 
268
  def clear_all():
269
  return [], None, None, "" # Clear chatbot, state, reset image_input
270
 
271
+
272
+ title_html = """
273
+ <h1> <span class="gradient-text" id="text">H2OVL-Mississippi</span><span class="plain-text">: Lightweight Vision Language Models for OCR and Doc AI tasks</span></h1>
274
+ <a href="https://huggingface.co/collections/h2oai/h2ovl-mississippi-66e492da45da0a1b7ea7cf39">[😊 Hugging Face]</a>
275
+ <a href="https://arxiv.org/abs/2410.13611">[πŸ“œ Paper]</a>
276
+ <a href="https://huggingface.co/spaces/h2oai/h2ovl-mississippi-benchmarks">[🌟 Benchmarks]</a>
277
+ """
278
+
279
+
280
+
281
  # Build the Gradio interface
282
  with gr.Blocks() as demo:
283
+ gr.HTML(title_html)
284
+ gr.HTML("""
285
+ <style>
286
+ .gradient-text {
287
+ font-size: 36px !important;
288
+ font-weight: bold !important;
289
+ }
290
+ .plain-text {
291
+ font-size: 32px !important;
292
+ }
293
+ h1 {
294
+ margin-bottom: 20px !important;
295
+ }
296
+ </style>
297
+ """)
298
+
299
  state= gr.State()
300
  model_state = gr.State()
301
 
 
305
  label="Select Model",
306
  value="H2OVL-Mississippi-2B"
307
  )
308
+
309
+ task_type_dropdown = gr.Dropdown(
310
+ choices=["OCR", "Document extractor", "Chat"],
311
+ label="Select Task Type",
312
+ value="Document extractor"
313
+ )
314
 
315
  with gr.Row(equal_height=True):
316
  # First column with image input
 
345
  inputs=None,
346
  outputs=[chatbot, state]
347
  )
348
+
349
 
350
  # Reset chatbot and state when image input changes
351
  image_input.change(
 
395
  label="Tile Number (default: 6)"
396
  )
397
 
398
+ model_dropdown.change(
399
+ fn=update_task_type_on_model_change,
400
+ inputs=[model_dropdown],
401
+ outputs=[task_type_dropdown, max_new_tokens_input]
402
+ )
403
+
404
+ task_type_dropdown.change(
405
+ fn=handle_task_type_and_prompt,
406
+ inputs=[task_type_dropdown, model_dropdown],
407
+ outputs=[max_new_tokens_input, user_input]
408
+ )
409
+
410
  with gr.Row():
411
  submit_button = gr.Button("Submit")
412
  regenerate_button = gr.Button("Regenerate")
 
458
  gr.Examples(
459
  examples=[
460
  ["assets/handwritten-note-example.jpg", "Read the text on the image"],
461
+ ["assets/rental_application.png", "Read the text and provide word by word ocr for the document. <doc>"],
462
  ["assets/receipt.jpg", "Extract the text from the image."],
463
  ["assets/driver_license.png", "Extract the text from the image and fill the following json {'license_number':'',\n'full_name':'',\n'date_of_birth':'',\n'address':'',\n'issue_date':'',\n'expiration_date':'',\n}"],
464
  ["assets/invoice.png", "Please extract the following fields, and return the result in JSON format: supplier_name, supplier_address, customer_name, customer_address, invoice_number, invoice_total_amount, invoice_tax_amount"],
assets/rental_application.png ADDED

Git LFS Details

  • SHA256: ca7934a2f163d98ac0f893f73cd3dfcf45b4b94001a779b735ad8b481d4aebab
  • Pointer size: 132 Bytes
  • Size of remote file: 2.34 MB