akhil-vaidya committed
Commit: c542962 (0 parents)
Commit message: commit message
.devcontainer/devcontainer.json ADDED
@@ -0,0 +1,33 @@
+ {
+     "name": "Python 3",
+     // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
+     "image": "mcr.microsoft.com/devcontainers/python:1-3.11-bullseye",
+     "customizations": {
+         "codespaces": {
+             "openFiles": [
+                 "README.md",
+                 "app.py"
+             ]
+         },
+         "vscode": {
+             "settings": {},
+             "extensions": [
+                 "ms-python.python",
+                 "ms-python.vscode-pylance"
+             ]
+         }
+     },
+     "updateContentCommand": "[ -f packages.txt ] && sudo apt update && sudo apt upgrade -y && sudo xargs apt install -y <packages.txt; [ -f requirements.txt ] && pip3 install --user -r requirements.txt; pip3 install --user streamlit; echo '✅ Packages installed and Requirements met'",
+     "postAttachCommand": {
+         "server": "streamlit run app.py --server.enableCORS false --server.enableXsrfProtection false"
+     },
+     "portsAttributes": {
+         "8501": {
+             "label": "Application",
+             "onAutoForward": "openPreview"
+         }
+     },
+     "forwardPorts": [
+         8501
+     ]
+ }
.github/workflows/sync_to_huggingface.yml ADDED
@@ -0,0 +1,20 @@
+ name: Sync to Hugging Face hub
+ on:
+   push:
+     branches: [main]
+
+   # to run this workflow manually from the Actions tab
+   workflow_dispatch:
+
+ jobs:
+   sync-to-hub:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v3
+         with:
+           fetch-depth: 0
+           lfs: true
+       - name: Push to hub
+         env:
+           HF_TOKEN: ${{ secrets.HF_TOKEN }}
+         run: git push --force https://akhil-vaidya:$HF_TOKEN@huggingface.co/spaces/akhil-vaidya/GOT-OCR main
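Note: the force push above only works if the HF_TOKEN repository secret holds a Hugging Face access token with write access to the akhil-vaidya/GOT-OCR Space. A minimal sketch (not part of this commit; the helper name is hypothetical) for checking such a token locally with huggingface_hub before storing it as the secret:

# check_hf_token.py -- hypothetical helper, not part of this commit.
# Verifies that the token in the HF_TOKEN environment variable is accepted
# by the Hugging Face Hub before it is stored as a GitHub Actions secret.
import os

from huggingface_hub import HfApi


def check_hf_token(token: str) -> None:
    api = HfApi()
    info = api.whoami(token=token)  # raises an error if the token is invalid
    print(f"Token is valid for user: {info['name']}")
    # The 'auth' field, when present, describes the token's access level.
    print(f"Auth details: {info.get('auth', 'n/a')}")


if __name__ == "__main__":
    check_hf_token(os.environ["HF_TOKEN"])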
README.md ADDED
@@ -0,0 +1,11 @@
+ ---
+ title: GOT OCR
+ emoji: 👀
+ colorFrom: green
+ colorTo: indigo
+ sdk: streamlit
+ sdk_version: 1.38.0
+ app_file: app.py
+ pinned: false
+ license: mit
+ ---
app.py ADDED
@@ -0,0 +1,134 @@
+ from transformers import AutoModel, AutoTokenizer, Qwen2VLForConditionalGeneration, AutoProcessor, MllamaForConditionalGeneration
+ import streamlit as st
+ import os
+ from PIL import Image
+ import requests
+ import torch
+ from torchvision import io
+ from typing import Dict
+ import base64
+ import random
+
+ def init_model():
+     tokenizer = AutoTokenizer.from_pretrained('srimanth-d/GOT_CPU', trust_remote_code=True)
+     model = AutoModel.from_pretrained('srimanth-d/GOT_CPU', trust_remote_code=True, use_safetensors=True, pad_token_id=tokenizer.eos_token_id)
+     model = model.eval()
+     return model, tokenizer
+
+ def init_gpu_model():
+     tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
+     model = AutoModel.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True, low_cpu_mem_usage=True, device_map='cuda', use_safetensors=True, pad_token_id=tokenizer.eos_token_id)
+     model = model.eval().cuda()
+     return model, tokenizer
+
+ def init_qwen_model():
+     model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", device_map="cpu", torch_dtype=torch.float16)
+     processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
+     return model, processor
+
+ def get_quen_op(image_file, model, processor):
+     try:
+         image = Image.open(image_file).convert('RGB')
+         conversation = [
+             {
+                 "role":"user",
+                 "content":[
+                     {
+                         "type":"image",
+                     },
+                     {
+                         "type":"text",
+                         "text":"Extract text from this image."
+                     }
+                 ]
+             }
+         ]
+         text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+         inputs = processor(text=[text_prompt], images=[image], padding=True, return_tensors="pt")
+         inputs = {k: v.to(torch.float32) if torch.is_floating_point(v) else v for k, v in inputs.items()}
+
+         generation_config = {
+             "max_new_tokens": 32,
+             "do_sample": False,
+             "top_k": 20,
+             "top_p": 0.90,
+             "temperature": 0.4,
+             "num_return_sequences": 1,
+             "pad_token_id": processor.tokenizer.pad_token_id,
+             "eos_token_id": processor.tokenizer.eos_token_id,
+         }
+
+         output_ids = model.generate(**inputs, **generation_config)
+         if 'input_ids' in inputs:
+             generated_ids = output_ids[:, inputs['input_ids'].shape[1]:]
+         else:
+             generated_ids = output_ids
+
+         output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+
+         return output_text[:] if output_text else "No text extracted from the image."
+
+     except Exception as e:
+         return f"An error occurred: {str(e)}"
+
+ def init_llama():
+     model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+
+     model = MllamaForConditionalGeneration.from_pretrained(
+         model_id,
+         torch_dtype=torch.bfloat16,
+         device_map="auto",
+         token=os.getenv("access_token")
+     )
+     processor = AutoProcessor.from_pretrained(model_id, token=os.getenv("access_token"))
+     return model, processor
+
+ def get_llama_op(image_file, model, processor):
+
+     with open(image_file, "rb") as f:
+         image = base64.b64encode(f.read()).decode('utf-8')
+     messages = [
+         {"role": "user", "content": [
+             {"type": "image"},
+             {"type": "text", "text": "You are an accurate OCR engine. From the given image, extract the text."}
+         ]}
+     ]
+     input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
+     inputs = processor(image, input_text, return_tensors="pt").to(model.device)
+
+     output = model.generate(**inputs, max_new_tokens=30)
+     return processor.decode(output[0])
+
+ def get_text(image_file, model, tokenizer):
+     res = model.chat(tokenizer, image_file, ocr_type='ocr')
+     return res
+
+ st.title("Image - Text OCR")
+ st.write("Upload an image for OCR")
+
+ MODEL, PROCESSOR = init_llama()
+ random_value = random.randint(0, 100)
+ st.write(f"Model loaded: build number - {random_value}")
+
+ image_file = st.file_uploader("Upload Image", type=['jpg', 'png', 'jpeg'])
+
+ if image_file:
+
+     if not os.path.exists("images"):
+         os.makedirs("images")
+     with open(f"images/{image_file.name}", "wb") as f:
+         f.write(image_file.getbuffer())
+
+     image_file = f"images/{image_file.name}"
+
+     # model, tokenizer = init_gpu_model()
+     # model, tokenizer = init_model()
+     # text = get_text(image_file, model, tokenizer)
+
+     # model, processor = init_llama()
+     text = get_llama_op(image_file, MODEL, PROCESSOR)
+
+     # model, processor = init_qwen_model()
+     # text = get_quen_op(image_file, model, processor)
+     print(text)
+     st.write(text)
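A note on get_llama_op above: it base64-encodes the uploaded file and hands the resulting string to the processor, while MllamaProcessor is normally given an image object such as a PIL.Image. The sketch below is a hypothetical variant, not part of this commit, that passes a decoded PIL.Image and decodes only the newly generated tokens; the larger max_new_tokens budget is likewise an assumption, since the 30-token limit used above truncates most OCR output.

# Hypothetical alternative to get_llama_op (not part of the committed app.py):
# feeds the processor a PIL.Image instead of a base64 string.
from PIL import Image


def get_llama_op_pil(image_path, model, processor):
    image = Image.open(image_path).convert("RGB")
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": "You are an accurate OCR engine. From the given image, extract the text."},
        ]}
    ]
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(image, input_text, return_tensors="pt").to(model.device)
    # Assumed larger budget so longer OCR results are not cut off.
    output = model.generate(**inputs, max_new_tokens=256)
    # Strip the prompt tokens and special tokens from the decoded output.
    new_tokens = output[0][inputs["input_ids"].shape[-1]:]
    return processor.decode(new_tokens, skip_special_tokens=True)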
archive/qwen_test.ipynb ADDED
@@ -0,0 +1,324 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": 1,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from PIL import Image\n",
+     "import requests\n",
+     "import torch\n",
+     "from torchvision import io\n",
+     "from typing import Dict\n",
+     "from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 2,
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "29ac356cdb05492d8a2da9bceea03b37",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "config.json: 0%| | 0.00/1.20k [00:00<?, ?B/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "c:\\Users\\Akhil PC\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\huggingface_hub\\file_download.py:157: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\Akhil PC\\.cache\\huggingface\\hub\\models--Qwen--Qwen2-VL-2B-Instruct. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
+       "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
+       " warnings.warn(message)\n",
+       "Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}\n"
+      ]
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "3ca08388cd3a4bc58b5b3c84b57fcd7f",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "model.safetensors.index.json: 0%| | 0.00/56.4k [00:00<?, ?B/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "7f667bff4c014fce85cb222f40508c78",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "Downloading shards: 0%| | 0/2 [00:00<?, ?it/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "c4289d2bd8f0466586d20564fb8fef84",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "model-00001-of-00002.safetensors: 0%| | 0.00/3.99G [00:00<?, ?B/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "47d67996509f431abb0f99bab97a03d6",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "model-00002-of-00002.safetensors: 0%| | 0.00/429M [00:00<?, ?B/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46\n"
+      ]
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "d3e49e52f64147e2b5043c76d9a507e6",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "5060e7d44d5b40fd8ca2d7e90542be21",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "generation_config.json: 0%| | 0.00/272 [00:00<?, ?B/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "ac0500d6289442d88db22065e94c6df2",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "preprocessor_config.json: 0%| | 0.00/347 [00:00<?, ?B/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "99ff45911ba848f2bd3ccd3f57029641",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "tokenizer_config.json: 0%| | 0.00/4.19k [00:00<?, ?B/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "9d484f67779348d7b242a12de0505324",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "vocab.json: 0%| | 0.00/2.78M [00:00<?, ?B/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "b0e6345cf4cd4b61b7d6b10ab7ae6f23",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "merges.txt: 0%| | 0.00/1.67M [00:00<?, ?B/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "c108ffe24eab4d82a8aa8d5bda088bf7",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "tokenizer.json: 0%| | 0.00/7.03M [00:00<?, ?B/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "c9385ab1782f49fcb59fbe2aa73a81c5",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "chat_template.json: 0%| | 0.00/1.05k [00:00<?, ?B/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     }
+    ],
+    "source": [
+     "# Load the model in half-precision on the available device(s)\n",
+     "model = Qwen2VLForConditionalGeneration.from_pretrained(\"Qwen/Qwen2-VL-2B-Instruct\", device_map=\"cpu\", torch_dtype=torch.float16)\n",
+     "processor = AutoProcessor.from_pretrained(\"Qwen/Qwen2-VL-2B-Instruct\")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 3,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Image\n",
+     "url = \"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg\"\n",
+     "image = Image.open(requests.get(url, stream=True).raw)\n",
+     "\n",
+     "conversation = [\n",
+     "    {\n",
+     "        \"role\":\"user\",\n",
+     "        \"content\":[\n",
+     "            {\n",
+     "                \"type\":\"image\",\n",
+     "            },\n",
+     "            {\n",
+     "                \"type\":\"text\",\n",
+     "                \"text\":\"Describe this image.\"\n",
+     "            }\n",
+     "        ]\n",
+     "    }\n",
+     "]"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 4,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Preprocess the inputs\n",
+     "text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 5,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "inputs = processor(text=[text_prompt], images=[image], padding=True, return_tensors=\"pt\")\n",
+     "# inputs = inputs.to('cuda')"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Inference: Generation of the output\n",
+     "output_ids = model.generate(**inputs, max_new_tokens=128)\n",
+     "generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)\n",
+     "print(output_text)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "Python 3",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.12.0"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+ }
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ transformers==4.45.0
+ streamlit==1.30.0
+ torch --index-url https://download.pytorch.org/whl/cpu
+ torchvision --index-url https://download.pytorch.org/whl/cpu
+ tiktoken
+ verovio
+ accelerate==0.28.0
+ Pillow==10.3.0