taesiri committed
Commit cc14163
1 Parent(s): c19b490
Files changed (1)
  1. app.py +9 -33
app.py CHANGED
@@ -6,18 +6,15 @@ from transformers import MllamaForConditionalGeneration, AutoProcessor
 from peft import PeftModel
 from huggingface_hub import login
 import spaces
-import json

 # Login to Hugging Face
 if "HF_TOKEN" not in os.environ:
-    raise ValueError(
-        "Please set the HF_TOKEN environment variable with your Hugging Face token"
-    )
+    raise ValueError("Please set the HF_TOKEN environment variable with your Hugging Face token")
 login(token=os.environ["HF_TOKEN"])

 # Load model and processor (do this outside the inference function to avoid reloading)
 base_model_path = "meta-llama/Llama-3.2-11B-Vision-Instruct"
-lora_weights_path = "taesiri/BunsBunny-LLama-3.2-11B-Vision-Instruct-DummyTask2"
+lora_weights_path = "taesiri/BungsBunny-LLama-3.2-11B-Vision-Instruct-Medium"

 processor = AutoProcessor.from_pretrained(base_model_path)
 model = MllamaForConditionalGeneration.from_pretrained(
@@ -27,50 +24,29 @@ model = MllamaForConditionalGeneration.from_pretrained(
 )
 model = PeftModel.from_pretrained(model, lora_weights_path)

-
 @spaces.GPU
 def inference(image, question):
     # Prepare input
     messages = [
-        {
-            "role": "user",
-            "content": [{"type": "image"}, {"type": "text", "text": question}],
-        }
+        {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": question}]}
     ]
     input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
-    inputs = processor(
-        image, input_text, add_special_tokens=False, return_tensors="pt"
-    ).to(model.device)
-
+    inputs = processor(image, input_text, add_special_tokens=False, return_tensors="pt").to(model.device)
+
     # Run inference
     with torch.no_grad():
         output = model.generate(**inputs, max_new_tokens=2048)
-
+
     # Decode output
     result = processor.decode(output[0], skip_special_tokens=True)
-
-    # Try to extract and parse JSON from the response
-    try:
-        # Split the result to get content after "assistant"
-        text_after_assistant = result.strip().split("assistant\n")[1].strip()
-
-        # Attempt to parse as JSON (double-loaded)
-        json_data = json.loads(json.loads(text_after_assistant))
-        return json.dumps(json_data, indent=2)
-    except (IndexError, json.JSONDecodeError):
-        # If JSON parsing fails, return the text after "assistant" or the full result
-        try:
-            return result.strip().split("assistant\n")[1].strip()
-        except IndexError:
-            return result.strip()
-
+    return result.strip().split("assistant\n")[1].strip()

 # Create Gradio interface
 demo = gr.Interface(
     fn=inference,
     inputs=[
         gr.Image(type="pil", label="Upload Image"),
-        gr.Textbox(label="Enter your question"),
+        gr.Textbox(label="Enter your question")
     ],
     outputs=gr.Textbox(label="Response"),
     title="Image Analysis AI",
@@ -78,4 +54,4 @@ demo = gr.Interface(
 )

 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()