LuozzzzzzzzzzzzzzY committed on
Commit 717dd41 · verified · 1 Parent(s): d7c6a54

Upload inference.py

Files changed (1)
  1. inference.py +94 -0
inference.py ADDED
@@ -0,0 +1,94 @@
+ import re
+
+ import torch
+ import pandas as pd
+ from transformers import AutoTokenizer, MllamaForConditionalGeneration
+ from peft import PeftModel
+ from datasets import load_from_disk
+ from tqdm import tqdm
+
+
+ model_path = '/gemini/pretrain/meta-llamaLlama-3.2-11B-Vision-Instruct'
+ lora_path = '/gemini/code/FMD/model/final_model_4/checkpoint-2440'  # checkpoint path of the LoRA output
+
+ # Load the tokenizer
+ tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+
+ # Load the base model in bfloat16 and switch to eval mode
+ model = MllamaForConditionalGeneration.from_pretrained(model_path, device_map="auto", torch_dtype=torch.bfloat16, trust_remote_code=True).eval()
+
+ # Load the LoRA weights on top of the base model
+ model = PeftModel.from_pretrained(model, model_id=lora_path)
+ test_dataset = load_from_disk("/gemini/code/FMD/final_dataset/Test")
+ results = []
+ with torch.no_grad():
+     for data in tqdm(test_dataset):
+         # Pre-truncate the instruction to 3000 tokens, then decode it back to a string
+         model_input = tokenizer(
+             data['instruction_1'],     # input text
+             add_special_tokens=False,  # do not add special tokens
+             truncation=True,           # enable truncation
+             max_length=3000            # maximum length
+         )
+         model_input = tokenizer.decode(model_input["input_ids"], skip_special_tokens=False)
+
+         # Wrap the truncated instruction and the image information in the Llama 3 chat format
+         model_inputs = tokenizer(f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are an expert in financial misinformation detection.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{model_input}\nimage information: {data['image_info']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", truncation=True, max_length=3600, add_special_tokens=False, return_tensors="pt").to('cuda')
+         # Generate the model output
+         generated_ids = model.generate(**model_inputs, max_new_tokens=1024)
+
+         # Strip the prompt tokens so only the generated prediction remains
+         generated_ids = [
+             output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+         ]
+
+         # Decode the generated prediction
+         responses = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+         print(responses)
+         # Store each result in order
+         results.append({
+             "ID": data['ID'],
+             "response": responses
+         })
+ def split_response(text):
+     # Extract the Prediction field (False / True / NEI at the end of a line)
+     prediction_pattern = r"Prediction:\s*(False|True|NEI)\s*$"
+     prediction_match = re.search(prediction_pattern, text, re.MULTILINE)
+     if prediction_match:
+         prediction = prediction_match.group(1).strip()
+     else:
+         prediction = 'None'
+         print("No matching content found")
+     # Extract the Explanation field
+     explanation_pattern = r"Explanation:\s*(.*)"
+     explanation_match = re.search(explanation_pattern, text, re.MULTILINE)
+     if explanation_match:
+         explanation = explanation_match.group(1).strip()
+     else:
+         explanation = None  # set to None when there is no match
+     return prediction, explanation
+
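+ # A quick sanity check of split_response on a hypothetical response string
+ # (the sample text is illustrative only, not taken from the dataset):
+ #   split_response("Prediction: False\nExplanation: The revenue claim is misstated.")
+ #   -> ('False', 'The revenue claim is misstated.')
+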
+ if results:
+     df = pd.DataFrame(results)
+
+     for index, row in df.iterrows():
+         text = row['response']
+         prediction, explanation = split_response(text)
+         df.at[index, 'Prediction'] = prediction
+         df.at[index, 'Explanation'] = explanation
+
+     # Normalize IDs and column names, then map labels to integer codes
+     df['ID'] = df['ID'].str.replace('FMD_test_', '', regex=False)
+     df = df.rename(columns={'ID': 'id', 'Prediction': 'pred', 'Explanation': 'explanation'})
+     df = df.drop('response', axis=1)
+     mapping = {
+         'False': 0,
+         'True': 1,
+         'NEI': 2
+     }
+     df['pred'] = df['pred'].replace(mapping)
+     df.to_csv("/gemini/code/FMD/inference/result_final_model_4/result.csv", index=False)
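For reference, the hand-built prompt string in the generation loop follows the Llama 3 chat format. Assuming the checkpoint ships a chat template, an equivalent prompt could be built inside the loop with tokenizer.apply_chat_template (a sketch using the loop's model_input and data variables, not part of the committed script):

messages = [
    {"role": "system", "content": "You are an expert in financial misinformation detection."},
    {"role": "user", "content": f"{model_input}\nimage information: {data['image_info']}"},
]
# tokenize=False returns the formatted prompt string; add_generation_prompt=True
# appends the assistant header so generation starts in the assistant turn.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)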