Commit a187193 by Aekanun
Parent(s): 1a517f1

fixed app.py

Files changed (1): app.py (+86 -43)
app.py CHANGED
@@ -1,81 +1,124 @@
 import os
-from huggingface_hub import login
-from transformers import AutoProcessor, AutoModelForVision2Seq
+import warnings
 import torch
+import gc
+from transformers import AutoModelForVision2Seq, AutoProcessor, BitsAndBytesConfig
 from PIL import Image
 import gradio as gr
+from huggingface_hub import login
 
-# Login to Hugging Face Hub
-if 'HUGGING_FACE_HUB_TOKEN' in os.environ:
-    print("Logging in to Hugging Face Hub...")
-    login(token=os.environ['HUGGING_FACE_HUB_TOKEN'])
-else:
-    print("Warning: HUGGING_FACE_HUB_TOKEN not found")
+# Basic settings
+warnings.filterwarnings('ignore')
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 
 # Global variables
 model = None
 processor = None
 
-def load_model():
+# Clear CUDA cache
+if torch.cuda.is_available():
+    torch.cuda.empty_cache()
+    gc.collect()
+    print("เคลียร์ CUDA cache เรียบร้อยแล้ว")
+
+# Login to Hugging Face Hub
+if 'HUGGING_FACE_HUB_TOKEN' in os.environ:
+    print("กำลังเข้าสู่ระบบ Hugging Face Hub...")
+    login(token=os.environ['HUGGING_FACE_HUB_TOKEN'])
+else:
+    print("คำเตือน: ไม่พบ HUGGING_FACE_HUB_TOKEN")
+
+def load_model_and_processor():
+    """โหลดโมเดลและ processor"""
     global model, processor
+    print("กำลังโหลดโมเดลและ processor...")
+
     try:
-        model_path = "Aekanun/thai-handwriting-llm"
-        print(f"Loading model and processor from {model_path}...")
+        # Model paths
+        base_model_path = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+        hub_model_path = "Aekanun/thai-handwriting-llm"
+
+        # BitsAndBytes config
+        bnb_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=torch.bfloat16
+        )
+
+        # Load processor from base model
+        processor = AutoProcessor.from_pretrained(base_model_path)
 
-        processor = AutoProcessor.from_pretrained(model_path)
-        model = AutoModelForVision2Seq.from_pretrained(model_path)
+        # Load model from Hub
+        print("กำลังโหลดโมเดลจาก Hub...")
+        model = AutoModelForVision2Seq.from_pretrained(
+            hub_model_path,
+            device_map="auto",
+            torch_dtype=torch.bfloat16,
+            quantization_config=bnb_config,
+            trust_remote_code=True
+        )
+        print("โหลดโมเดลสำเร็จ!")
 
-        if torch.cuda.is_available():
-            model = model.to("cuda")
-
         return True
     except Exception as e:
-        print(f"Error loading model: {str(e)}")
+        print(f"เกิดข้อผิดพลาดในการโหลดโมเดล: {str(e)}")
        return False
 
-def process_image(image):
+def process_handwriting(image):
+    """ฟังก์ชันสำหรับ Gradio interface"""
+    global model, processor
+
     if image is None:
         return "กรุณาอัพโหลดรูปภาพ"
-
+
     try:
         # Ensure image is in PIL format
         if not isinstance(image, Image.Image):
             image = Image.fromarray(image)
 
-        # Convert to RGB if needed
-        if image.mode != "RGB":
-            image = image.convert("RGB")
-
-        # Process image
-        inputs = processor(images=image, return_tensors="pt")
-
-        # Move to GPU if available
-        if torch.cuda.is_available():
-            inputs = {k: v.to("cuda") for k, v in inputs.items()}
-
-        # Generate text
+        # Create prompt
+        prompt = """Transcribe the Thai handwritten text from the provided image.
+Only return the transcription in Thai language."""
+
+        # Create model inputs
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": prompt},
+                    {"type": "image", "image": image}
+                ],
+            }
+        ]
+
+        # Process with model
+        text = processor.apply_chat_template(messages, tokenize=False)
+        inputs = processor(text=text, images=image, return_tensors="pt")
+        inputs = {k: v.to(model.device) for k, v in inputs.items()}
+
+        # Generate
         with torch.no_grad():
             outputs = model.generate(
                 **inputs,
-                max_new_tokens=100,
-                num_beams=4,
-                pad_token_id=processor.tokenizer.pad_token_id,
-                eos_token_id=processor.tokenizer.eos_token_id
+                max_new_tokens=256,
+                do_sample=False,
+                pad_token_id=processor.tokenizer.pad_token_id
             )
-
+
         # Decode output
-        predicted_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
-        return predicted_text.strip()
+        transcription = processor.decode(outputs[0], skip_special_tokens=True)
+        return transcription.strip()
 
     except Exception as e:
         return f"เกิดข้อผิดพลาด: {str(e)}"
 
-# Initialize
-print("Initializing application...")
-if load_model():
+# Initialize application
+print("กำลังเริ่มต้นแอปพลิเคชัน...")
+if load_model_and_processor():
     # Create Gradio interface
     demo = gr.Interface(
-        fn=process_image,
+        fn=process_handwriting,
         inputs=gr.Image(type="pil", label="อัพโหลดรูปลายมือเขียนภาษาไทย"),
         outputs=gr.Textbox(label="ข้อความที่แปลงได้"),
         title="Thai Handwriting Recognition",
@@ -86,4 +129,4 @@ if load_model():
     if __name__ == "__main__":
         demo.launch()
 else:
-    print("Failed to initialize the application")
+    print("ไม่สามารถเริ่มต้นแอปพลิเคชันได้")