XufengDuan committed
Commit 9da8cd9
Parent(s): 63a1401

updated scripts
Files changed:
- .DS_Store (+0 -0)
- src/.DS_Store (+0 -0)
- src/backend/model_operations.py (+147 -23)
- src/envs.py (+3 -2)
.DS_Store
ADDED
Binary file (6.15 kB).
src/.DS_Store
CHANGED
Binary files a/src/.DS_Store and b/src/.DS_Store differ
src/backend/model_operations.py
CHANGED
@@ -21,11 +21,19 @@ import cohere
 from openai import OpenAI
 # import google
 import google.generativeai as genai
-from huggingface_hub import InferenceClient
 
 import src.backend.util as util
 import src.envs as envs
 
+# import pandas as pd
+import scipy
+from scipy.spatial.distance import jensenshannon
+# import numpy as np
+
+
+
+
+
 # litellm.set_verbose=False
 litellm.set_verbose=True
 # Set up basic configuration for logging
@@ -196,8 +204,19 @@ class SummaryGenerator:
                         break
                     if i == 5:
                         print(_response)
-
-                _response1, _response2 = _response.split('\n\n')
+                if _response == None:
+                    _response1, _response2 = "", ""
+                else:
+                    try:
+                        import re
+                        _response1, _response2 = re.split(r'\n\s*\n', _response.strip())
+                    except:
+                        _response1 = _response.split('\n\n')
+                        if len(_response) == 2:
+                            _response1, _response2 = _response[0], _response[1]
+                        else:
+                            _response1, _response2 = _response[0], ""
+
                 Experiment_ID.append(ID)
                 Questions_ID.append(q_column[j])
                 User_prompt.append(_user_prompt)
@@ -261,15 +280,8 @@ class SummaryGenerator:
 
     def generate_summary(self, system_prompt: str, user_prompt: str):
         # Using Together AI API
-
-        client = InferenceClient(self.model_id, token = envs.TOKEN)
-        result = client.chat_completion(messages=[{"role": "system", "content": system_prompt},
-                                                  {"role": "user", "content": user_prompt}], max_tokens=50, stream=False)
-        print(result.choices[0].message.content)
-        return result.choices[0].message.content
-
         using_together_api = False
-        together_ai_api_models = ['mixtral', 'dbrx', 'wizardlm'
+        together_ai_api_models = ['mixtral', 'dbrx', 'wizardlm']
         for together_ai_api_model in together_ai_api_models:
            if together_ai_api_model in self.model_id.lower():
                using_together_api = True
@@ -335,6 +347,7 @@ class SummaryGenerator:
                 max_tokens=250,
             )
             result = response['choices'][0]['message']['content']
+            # print()
             print(result)
             return result
 
@@ -379,17 +392,31 @@ class SummaryGenerator:
 
         # Using HF API or download checkpoints
         elif self.local_model is None:
-
-
+            # print(self.model_id)
+            # print(self.api_base)
+            # mistralai/Mistral-7B-Instruct-v0.1
+            # https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.1
             try: # try use HuggingFace API
+                # response = litellm.completion(
+                #     model="huggingface/"+'command-r-plus' if 'command' in self.model_id else self.model_id,
+                #     messages=[{"role": "system", "content": system_prompt},
+                #               {"role": "user", "content": user_prompt}],
+                #     temperature=0.0,
+                #     max_tokens=1024,
+                #     api_base= "https://api-inference.huggingface.co/models/" + self.model_id,
+                # )
+                self.model_id = 'command-r-plus' if 'command' in self.model_id else self.model_id
                 response = litellm.completion(
-
-
+                    model="huggingface/" + self.model_id,
+                    # mistralai/Mistral-7B-Instruct-v0.1",
+                    messages=[{"role": "system", "content": system_prompt},
                               {"role": "user", "content": user_prompt}],
-
-
-
-                )
+                    temperature=0.0,
+                    max_tokens=1024,
+                    api_base="https://api-inference.huggingface.co/models/" + self.model_id)
+                print("Model response:", response)
+                print("End of model response")
+                # exit()
                 result = response['choices'][0]['message']['content']
                 print(result)
                 return result
@@ -399,7 +426,7 @@ class SummaryGenerator:
             print("Tokenizer loaded")
             self.local_model = AutoModelForCausalLM.from_pretrained(self.model_id, trust_remote_code=True, device_map="auto", torch_dtype="auto", cache_dir='/home/paperspace/cache')
             print("Local model loaded")
-
+            # exit()
         # Using local model
         if self.local_model: # cannot call API. using local model
             messages=[
@@ -1025,7 +1052,100 @@ class EvaluationModel:
 
 
 
-
+
+
+    def calculate_js_divergence(self, file_path_1, file_path_2):
+        """
+        Calculate the Jensen-Shannon divergence for response distributions between two datasets.
+        - Extracts E5 and E51 pairs, creates new data based on comparison,
+          removes the original E5 and E51, and then calculates the JS divergence between the datasets.
+
+        Parameters:
+        file_path_1 (str): Path to the first dataset file (Excel format).
+        file_path_2 (str): Path to the second dataset file (CSV format).
+
+        Returns:
+        float: The average JS divergence across all common Question_IDs.
+        """
+        # Load the datasets
+        human_df = pd.read_excel(file_path_1)
+        llm_df = pd.read_csv(file_path_2)
+
+        def create_e5_entries(df):
+            new_entries = []
+            for i in range(len(df) - 1):
+                if 'E51' in df.iloc[i]['Experiment']:
+                    priming_id = df.iloc[i][0]-1
+                    priming_row_id = df[df.iloc[:, 0] == priming_id].index[0]
+                    new_question_id = df.iloc[priming_row_id]['Question_ID']
+                    label = 1 if df.iloc[i]['Coding'] == df.iloc[priming_row_id]['Coding'] else 0
+                    new_entries.append({
+                        'Question_ID': new_question_id,
+                        'Response': f'{df.iloc[i]["Coding"]}-{df.iloc[priming_row_id]["Coding"]}',
+                        'Coding': label
+                    })
+            return pd.DataFrame(new_entries)
+
+        # Create new E5 entries for both datasets
+        human_e5 = create_e5_entries(human_df)
+        llm_e5 = create_e5_entries(llm_df)
+
+        # Remove E5 and E51 entries from both datasets
+        human_df = human_df[~human_df['Question_ID'].str.contains('E5')]
+        llm_df = llm_df[~llm_df['Question_ID'].str.contains('E5')]
+
+        # Append new E5 entries to the cleaned dataframes
+        human_df = pd.concat([human_df, human_e5], ignore_index=True)
+        llm_df = pd.concat([llm_df, llm_e5], ignore_index=True)
+
+        ### Calculate Average JS Divergence ###
+
+        # Extract the relevant columns for JS divergence calculation
+        human_responses = human_df[['Question_ID', 'Coding']]
+        llm_responses = llm_df[['Question_ID', 'Coding']]
+
+        # Get unique Question_IDs present in both datasets
+        common_question_ids = set(human_responses['Question_ID']).intersection(set(llm_responses['Question_ID']))
+
+        # Initialize a list to store JS divergence for each Question_ID
+        js_divergence_list = []
+        js_divergence = {}
+
+        # Calculate JS divergence for each common Question_ID
+        for q_id in common_question_ids:
+            # Get response distributions for the current Question_ID in both datasets
+            human_dist = human_responses[human_responses['Question_ID'] == q_id]['Coding'].value_counts(normalize=True)
+            llm_dist = llm_responses[llm_responses['Question_ID'] == q_id]['Coding'].value_counts(normalize=True)
+
+            # Reindex the distributions to have the same index, filling missing values with 0
+            all_responses = set(human_dist.index).union(set(llm_dist.index))
+            human_dist = human_dist.reindex(all_responses, fill_value=0)
+            llm_dist = llm_dist.reindex(all_responses, fill_value=0)
+
+            # Calculate JS divergence and add to the list
+            js_div = jensenshannon(human_dist, llm_dist, base=2)
+            experiment_id = q_id.split('_')[1]
+            if experiment_id not in js_divergence:
+                js_divergence[experiment_id] = []
+            js_divergence[experiment_id].append(js_div)
+
+            js_divergence_list.append(js_div)
+            # js_divergence[q_id] = js_div
+
+
+        # Calculate the average JS divergence
+        # JS per experiment
+        avg_js_divergence_per_experiment = {exp: 1 - np.nanmean(divs) for exp, divs in js_divergence.items()}
+        print(avg_js_divergence_per_experiment)
+
+        # JS overall
+        avg_js_divergence = 1 - np.nanmean(js_divergence_list)
+        print("avg_js_divergence:", avg_js_divergence)
+
+        return avg_js_divergence
+
+
     def evaluate_humanlike(self, summaries_df, human_data_path, result_save_path):
         '''
         evaluate humanlike score
@@ -1036,13 +1156,15 @@ class EvaluationModel:
         '''coding human data'''
         # self.huamn_df = pd.read_csv(human_data_path)
        # self.data = self.code_results(self.huamn_df)
-
+        save_path = human_data_path.replace('.csv','_coding.csv')
+        human_save_path = "./src/datasets/coding_human.xlsx"
         # if save_path is not None:
        #     print(f'Save human coding results to {save_path}')
        #     fpath = Path(save_path)
        #     fpath.parent.mkdir(parents=True, exist_ok=True)
        #     self.data.to_csv(fpath)
 
+
         '''coding llm data'''
         save_path = result_save_path.replace('.csv','_coding.csv')
         self.llm_df = self.code_results_llm(summaries_df)
@@ -1051,9 +1173,11 @@ class EvaluationModel:
             fpath = Path(save_path)
             fpath.parent.mkdir(parents=True, exist_ok=True)
             self.llm_df.to_csv(fpath)
-        #
+        # file_path_1 = '/Users/simon/Downloads/coding_human.xlsx'
+        # file_path_2 = '/Users/simon/Downloads/Meta-Llama-3.1-70B-Instruct_coding.csv'
+        avg_js_divergence = self.calculate_js_divergence("./src/datasets/coding_human.xlsx", save_path)
 
-        return
+        return avg_js_divergence
 
 
 
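Editor's note (not part of the commit): the changes above do two things worth illustrating. First, response parsing now splits on a blank line that may contain stray whitespace instead of a strict '\n\n'. Second, the new humanlike score is one minus the average Jensen-Shannon value between human and LLM coding distributions. The two sketches below use invented data (`_response`, the toy codings, and the category labels are assumptions, not values from the repository); also note that SciPy's `jensenshannon` returns the Jensen-Shannon distance (the square root of the divergence) for the given base.

# Sketch 1: splitting a two-part model response on a possibly whitespace-padded blank line.
import re

_response = "Part one of the answer.\n   \nPart two of the answer."   # toy example
parts = re.split(r'\n\s*\n', _response.strip())
_response1, _response2 = (parts + ["", ""])[:2]   # pad defensively if only one part comes back
print(_response1)   # -> Part one of the answer.
print(_response2)   # -> Part two of the answer.

# Sketch 2: per-question agreement between human and LLM coding distributions,
# scored as 1 - Jensen-Shannon distance (base 2, so the score lies in [0, 1]).
import numpy as np
from scipy.spatial.distance import jensenshannon

human_codings = [1, 1, 0, 1, 0, 1]   # invented codings for one Question_ID
llm_codings   = [1, 0, 0, 0, 1, 0]

categories = sorted(set(human_codings) | set(llm_codings))
human_dist = np.array([human_codings.count(c) for c in categories], dtype=float)
llm_dist   = np.array([llm_codings.count(c) for c in categories], dtype=float)
human_dist /= human_dist.sum()
llm_dist   /= llm_dist.sum()

js = jensenshannon(human_dist, llm_dist, base=2)
print("JS distance:", js, "-> humanlike score for this question:", 1 - js)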
src/envs.py
CHANGED
@@ -4,7 +4,8 @@ from huggingface_hub import HfApi
 
 
 # replace this with our token
-TOKEN = os.environ.get("HF_TOKEN", None)
+# TOKEN = os.environ.get("HF_TOKEN", None)
+TOKEN = os.getenv("HF_TOKEN")
 # print(TOKEN)
 # OWNER = "vectara"
 # REPO_ID = f"{OWNER}/Humanlike"
@@ -12,7 +13,7 @@ TOKEN = os.environ.get("HF_TOKEN", None)
 # RESULTS_REPO = f"{OWNER}/results"
 
 
-OWNER = "
+OWNER = "tangtang1995" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
 
 REPO_ID = f"{OWNER}/Humanlike"
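Editor's note (not part of the commit): os.getenv("HF_TOKEN") behaves the same as the replaced os.environ.get("HF_TOKEN", None), returning None when the variable is unset, so a missing token only surfaces later at API-call time. A stricter variant, shown below purely as an assumption rather than the Space's actual behaviour, would fail fast at startup:

import os

TOKEN = os.getenv("HF_TOKEN")
if TOKEN is None:
    # Hypothetical guard; the committed code simply passes None downstream.
    raise RuntimeError("HF_TOKEN is not set; export it before starting the Space.")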