AdrienB134 committed on
Commit
8573be6
·
1 Parent(s): e030870
app.py CHANGED
@@ -13,7 +13,8 @@ from pdf2image import convert_from_path
13
  from PIL import Image
14
  from torch.utils.data import DataLoader
15
  from tqdm import tqdm
16
- from transformers import AutoProcessor, Idefics3ForConditionalGeneration
 
17
  import re
18
  import time
19
  from PIL import Image
@@ -28,76 +29,70 @@ subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENT
28
 
29
  @spaces.GPU
30
  def model_inference(
31
- images, text, assistant_prefix= "Réfléchis step by step. Répond uniquement avec les informations du document fourni.", decoding_strategy = "Greedy", temperature= 0.4, max_new_tokens=512,
32
- repetition_penalty=1.2, top_p=0.8
33
  ):
34
- ## Load idefics
35
- id_processor = AutoProcessor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3")
36
-
37
- id_model = Idefics3ForConditionalGeneration.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3",
38
- torch_dtype=torch.bfloat16,
39
- #_attn_implementation="flash_attention_2"
40
- ).to("cuda")
41
-
42
- BAD_WORDS_IDS = id_processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
43
- EOS_WORDS_IDS = [id_processor.tokenizer.eos_token_id]
44
  print(type(images))
45
  print(images[0])
46
  images = Image.open(images[0][0])
47
  print(images)
48
  print(type(images))
49
- if text == "" and not images:
50
- gr.Error("Please input a query and optionally image(s).")
51
-
52
- if text == "" and images:
53
- gr.Error("Please input a text query along the image(s).")
 
 
 
 
 
 
54
 
55
- if isinstance(images, Image.Image):
56
- images = [images]
57
 
 
 
 
 
58
 
59
- resulting_messages = [
 
 
 
60
  {
61
- "role": "user",
62
- "content": [{"type": "image"}] + [
63
- {"type": "text", "text": text}
64
- ]
65
- }
66
- ]
67
-
68
- if assistant_prefix:
69
- text = f"{assistant_prefix} {text}"
70
-
71
-
72
- prompt = id_processor.apply_chat_template(resulting_messages, add_generation_prompt=True)
73
- inputs = id_processor(text=prompt, images=[images], return_tensors="pt")
74
- inputs = {k: v.to("cuda") for k, v in inputs.items()}
75
-
76
- generation_args = {
77
- "max_new_tokens": max_new_tokens,
78
- "repetition_penalty": repetition_penalty,
79
-
80
- }
81
-
82
- assert decoding_strategy in [
83
- "Greedy",
84
- "Top P Sampling",
85
  ]
86
- if decoding_strategy == "Greedy":
87
- generation_args["do_sample"] = False
88
- elif decoding_strategy == "Top P Sampling":
89
- generation_args["temperature"] = temperature
90
- generation_args["do_sample"] = True
91
- generation_args["top_p"] = top_p
92
-
93
-
94
- generation_args.update(inputs)
95
 
96
- # Generate
97
- generated_ids = id_model.generate(**generation_args)
 
 
 
 
 
 
 
 
 
 
 
98
 
99
- generated_texts = id_processor.batch_decode(generated_ids[:, generation_args["input_ids"].size(1):], skip_special_tokens=True)
100
- return generated_texts[0]
 
 
 
 
 
 
 
101
 
102
 
103
 
 
13
  from PIL import Image
14
  from torch.utils.data import DataLoader
15
  from tqdm import tqdm
16
+ from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
17
+ from qwen_vl_utils import process_vision_info
18
  import re
19
  import time
20
  from PIL import Image
 
29
 
30
  @spaces.GPU
31
  def model_inference(
32
+ images, text,
 
33
  ):
34
+
 
 
 
 
 
 
 
 
 
35
  print(type(images))
36
  print(images[0])
37
  images = Image.open(images[0][0])
38
  print(images)
39
  print(type(images))
40
+ # model = Qwen2VLForConditionalGeneration.from_pretrained(
41
+ # "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
42
+ # )
43
+
44
+ #We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
45
+ model = Qwen2VLForConditionalGeneration.from_pretrained(
46
+ "Qwen/Qwen2-VL-7B-Instruct",
47
+ torch_dtype=torch.bfloat16,
48
+ attn_implementation="flash_attention_2",
49
+ device_map="auto",
50
+ )
51
 
52
+ # default processor
53
+ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
54
 
55
+ # The default range for the number of visual tokens per image in the model is 4-16384. You can set min_pixels and max_pixels according to your needs, such as a token count range of 256-1280, to balance speed and memory usage.
56
+ # min_pixels = 256*28*28
57
+ # max_pixels = 1280*28*28
58
+ # processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)
59
 
60
+ messages = [
61
+ {
62
+ "role": "user",
63
+ "content": [
64
  {
65
+ "type": "image",
66
+ "image": images,
67
+ },
68
+ {"type": "text", "text": text},
69
+ ],
70
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  ]
 
 
 
 
 
 
 
 
 
72
 
73
+ # Preparation for inference
74
+ text = processor.apply_chat_template(
75
+ messages, tokenize=False, add_generation_prompt=True
76
+ )
77
+ image_inputs, video_inputs = process_vision_info(messages)
78
+ inputs = processor(
79
+ text=[text],
80
+ images=image_inputs,
81
+ videos=video_inputs,
82
+ padding=True,
83
+ return_tensors="pt",
84
+ )
85
+ inputs = inputs.to("cuda")
86
 
87
+ # Inference: Generation of the output
88
+ generated_ids = model.generate(**inputs, max_new_tokens=128)
89
+ generated_ids_trimmed = [
90
+ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
91
+ ]
92
+ output_text = processor.batch_decode(
93
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
94
+ )
95
+ return output_text[0]
96
 
97
 
98
 
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor from qwen_vl_utils import process_vision_info ADDED
File without changes