from transformers import AutoModel, AutoTokenizer, Qwen2VLForConditionalGeneration, AutoProcessor, MllamaForConditionalGeneration
import streamlit as st
import os
from PIL import Image
import requests
import torch
from torchvision import io
from typing import Dict
import base64
@st.cache_resource
def init_model():
    # Cache the CPU build of GOT-OCR2.0 so the weights are loaded only once per session.
    tokenizer = AutoTokenizer.from_pretrained('srimanth-d/GOT_CPU', trust_remote_code=True)
    model = AutoModel.from_pretrained('srimanth-d/GOT_CPU', trust_remote_code=True, use_safetensors=True, pad_token_id=tokenizer.eos_token_id)
    model = model.eval()
    return model, tokenizer
def init_gpu_model():
    # GPU variant of the GOT-OCR2.0 loader (not used by default; see the commented-out call below).
    tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
    model = AutoModel.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True, low_cpu_mem_usage=True, device_map='cuda', use_safetensors=True, pad_token_id=tokenizer.eos_token_id)
    model = model.eval().cuda()
    return model, tokenizer
def init_qwen_model():
    # Qwen2-VL on CPU; float32 matches the input cast in get_quen_op and avoids half-precision ops that are poorly supported on CPU.
    model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", device_map="cpu", torch_dtype=torch.float32)
    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
    return model, processor
def get_quen_op(image_file, model, processor):
    try:
        image = Image.open(image_file).convert('RGB')
        conversation = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                    },
                    {
                        "type": "text",
                        "text": "Extract text from this image."
                    }
                ]
            }
        ]
        text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
        inputs = processor(text=[text_prompt], images=[image], padding=True, return_tensors="pt")
        # Keep floating-point tensors in float32 to match the CPU model loaded above.
        inputs = {k: v.to(torch.float32) if torch.is_floating_point(v) else v for k, v in inputs.items()}

        generation_config = {
            "max_new_tokens": 32,
            "do_sample": False,
            "top_k": 20,
            "top_p": 0.90,
            "temperature": 0.4,
            "num_return_sequences": 1,
            "pad_token_id": processor.tokenizer.pad_token_id,
            "eos_token_id": processor.tokenizer.eos_token_id,
        }

        output_ids = model.generate(**inputs, **generation_config)

        # Drop the prompt tokens so only the newly generated text is decoded.
        if 'input_ids' in inputs:
            generated_ids = output_ids[:, inputs['input_ids'].shape[1]:]
        else:
            generated_ids = output_ids

        output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        return output_text[0] if output_text else "No text extracted from the image."
    except Exception as e:
        return f"An error occurred: {str(e)}"
@st.cache_resource
def init_llama():
    # Llama 3.2 Vision is a gated model; a Hugging Face token is read from the access_token env var.
    model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
    model = MllamaForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        token=os.getenv("access_token")
    )
    processor = AutoProcessor.from_pretrained(model_id, token=os.getenv("access_token"))
    return model, processor
def get_llama_op(image_file, model, processor):
    # The base64 string is not used by the processor below; the PIL image is passed directly.
    with open(image_file, "rb") as f:
        image = base64.b64encode(f.read()).decode('utf-8')

    image = Image.open(image_file)
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": "You are an accurate OCR engine. From the given image, extract the text."}
        ]}
    ]
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(images=image, text=input_text, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens=20)
    return processor.decode(output[0])
def get_text(image_file, model, tokenizer):
    # GOT-OCR2.0 exposes OCR through model.chat; ocr_type='ocr' returns plain text.
    res = model.chat(tokenizer, image_file, ocr_type='ocr')
    return res
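
# Optional extra, not called by the UI below: the GOT-OCR2_0 model card also documents
# ocr_type='format' for formatted (markdown/LaTeX-style) output. This helper is a small
# sketch and assumes the loaded checkpoint supports that option.
def get_formatted_text(image_file, model, tokenizer):
    return model.chat(tokenizer, image_file, ocr_type='format')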
st.title("Image - Text OCR (General OCR Theory - GOT)")
st.write("Upload an image for OCR")

MODEL, PROCESSOR = init_model()

image_file = st.file_uploader("Upload Image", type=['jpg', 'png', 'jpeg'])

if image_file:
    if not os.path.exists("images"):
        os.makedirs("images")
    # GOT's model.chat expects a file path, so persist the uploaded bytes to disk first.
    with open(f"images/{image_file.name}", "wb") as f:
        f.write(image_file.getbuffer())
    image_file = f"images/{image_file.name}"

    # model, tokenizer = init_gpu_model()
    # model, tokenizer = init_model()
    text = get_text(image_file, MODEL, PROCESSOR)

    # model, processor = init_llama()
    # text = get_llama_op(image_file, MODEL, PROCESSOR)

    # model, processor = init_qwen_model()
    # text = get_quen_op(image_file, model, processor)

    print(text)
    st.write(text)
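
# To run locally (assuming this file is saved as app.py and the packages imported above
# are installed): `streamlit run app.py`, then open the local URL Streamlit prints.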