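# Streamlit OCR demo: upload an image and extract its text with one of several
# vision-language models (GOT-OCR2.0 on CPU or GPU, Qwen2-VL-2B, or Llama-3.2-11B-Vision).
# Only the Llama path is active below; the other backends are kept as commented alternatives.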
from transformers import AutoModel, AutoTokenizer, Qwen2VLForConditionalGeneration, AutoProcessor, MllamaForConditionalGeneration
import streamlit as st
import os
from PIL import Image
import torch
import random

# GOT-OCR2.0 CPU checkpoint (srimanth-d/GOT_CPU); pad token is mapped to EOS for generation.
def init_model():
    tokenizer = AutoTokenizer.from_pretrained('srimanth-d/GOT_CPU', trust_remote_code=True)
    model = AutoModel.from_pretrained('srimanth-d/GOT_CPU', trust_remote_code=True, use_safetensors=True, pad_token_id=tokenizer.eos_token_id)
    model = model.eval()
    return model, tokenizer

# GOT-OCR2.0 reference checkpoint (ucaslcl/GOT-OCR2_0); requires a CUDA device.
def init_gpu_model():
    tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
    model = AutoModel.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True, low_cpu_mem_usage=True, device_map='cuda', use_safetensors=True, pad_token_id=tokenizer.eos_token_id)
    model = model.eval().cuda()
    return model, tokenizer

def init_qwen_model():
    # Qwen2-VL-2B on CPU; float32 is used because half precision is not reliably
    # supported on CPU and the inputs are cast to float32 in get_qwen_op below.
    model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", device_map="cpu", torch_dtype=torch.float32)
    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
    return model, processor

def get_qwen_op(image_file, model, processor):
    try:
        image = Image.open(image_file).convert('RGB')
        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": "Extract text from this image."}
                ]
            }
        ]
        text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
        inputs = processor(text=[text_prompt], images=[image], padding=True, return_tensors="pt")
        # Keep floating-point inputs in float32 to match the CPU model loaded in init_qwen_model.
        inputs = {k: v.to(torch.float32) if torch.is_floating_point(v) else v for k, v in inputs.items()}

        # Greedy decoding; sampling parameters (top_k/top_p/temperature) are omitted
        # because do_sample=False would make them inert anyway.
        generation_config = {
            "max_new_tokens": 32,
            "do_sample": False,
            "num_return_sequences": 1,
            "pad_token_id": processor.tokenizer.pad_token_id,
            "eos_token_id": processor.tokenizer.eos_token_id,
        }

        output_ids = model.generate(**inputs, **generation_config)
        # Strip the prompt tokens so only the newly generated text is decoded.
        if 'input_ids' in inputs:
            generated_ids = output_ids[:, inputs['input_ids'].shape[1]:]
        else:
            generated_ids = output_ids

        output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)

        return output_text[0] if output_text else "No text extracted from the image."

    except Exception as e:
        return f"An error occurred: {str(e)}"

def init_llama():
    # Llama-3.2-11B-Vision-Instruct is gated; the Hugging Face token is read from
    # the "access_token" environment variable.
    model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

    model = MllamaForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        token=os.getenv("access_token")
    )
    processor = AutoProcessor.from_pretrained(model_id, token=os.getenv("access_token"))
    return model, processor

def get_llama_op(image_file, model, processor):
    image = Image.open(image_file)
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": "You are an accurate OCR engine. From the given image, extract the text."}
        ]}
    ]
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(images=image, text=input_text, return_tensors="pt").to(model.device)

    output = model.generate(**inputs, max_new_tokens=128)
    # Decode only the tokens generated after the prompt and drop special tokens.
    generated = output[0][inputs["input_ids"].shape[-1]:]
    return processor.decode(generated, skip_special_tokens=True)

def get_text(image_file, model, tokenizer):
    # GOT-OCR2.0 exposes OCR through its custom chat() method (requires trust_remote_code).
    res = model.chat(tokenizer, image_file, ocr_type='ocr')
    return res

st.title("Image - Text OCR")
st.write("Upload an image for OCR")

# MODEL, PROCESSOR = init_llama()
random_value = random.randint(0, 100)
st.write(f"Model loaded: build number - {random_value}")

image_file = st.file_uploader("Upload Image", type=['jpg', 'png', 'jpeg'])

if image_file:

    # Save the upload to disk because the model helpers expect a file path.
    os.makedirs("images", exist_ok=True)
    with open(f"images/{image_file.name}", "wb") as f:
        f.write(image_file.getbuffer())

    image_file = f"images/{image_file.name}"

    # model, tokenizer = init_gpu_model()
    # model, tokenizer = init_model()
    # text = get_text(image_file, model, tokenizer)

    model, processor = init_llama()
    text = get_llama_op(image_file, model, processor)

    # model, processor = init_qwen_model()
    # text = get_qwen_op(image_file, model, processor)
    print(text)
    st.write(text)
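
# To run locally (assuming this script is saved as app.py and the required models
# are accessible): streamlit run app.py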