Spaces:
Running
Running
XufengDuan
committed on
Commit
•
d0d5660
1
Parent(s):
3776314
update scripts
Browse files- src/backend/model_operations.py +383 -28
src/backend/model_operations.py
CHANGED
@@ -35,7 +35,7 @@ import spacy_transformers
|
|
35 |
import subprocess
|
36 |
|
37 |
# Run the command to download the spaCy model
|
38 |
-
subprocess.run(["python", "-m", "spacy", "download", "en_core_web_lg"], check=True)
|
39 |
# subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"], check=True)
|
40 |
# subprocess.run(["pip", "install", "spacy-transformers"], check=True)
|
41 |
# subprocess.run(["pip", "install", "curated-transformers"], check=True)
|
@@ -43,7 +43,7 @@ subprocess.run(["python", "-m", "spacy", "download", "en_core_web_lg"], check=Tr
|
|
43 |
# Load spacy model for word tokenization
|
44 |
# nlp = spacy.load("en_core_web_sm")
|
45 |
try:
|
46 |
-
nlp1 = spacy.load("
|
47 |
except OSError:
|
48 |
print("Can not load spacy model")
|
49 |
|
@@ -171,7 +171,8 @@ class ResponseGenerator:
|
|
171 |
# print(ID, q_ID, prompt_value)
|
172 |
system_prompt = envs.SYSTEM_PROMPT
|
173 |
_user_prompt = prompt_value
|
174 |
-
|
|
|
175 |
# user_prompt = f"{envs.USER_PROMPT}\nPassage:\n{_source}"
|
176 |
while True:
|
177 |
try:
|
@@ -179,6 +180,7 @@ class ResponseGenerator:
|
|
179 |
print(self.model_id.lower(),'-',ID,'-',j,'-',ii)
|
180 |
|
181 |
_response = self.send_request(system_prompt, _user_prompt)
|
|
|
182 |
# print(f"Finish index {index}")
|
183 |
break
|
184 |
except Exception as e:
|
@@ -205,6 +207,7 @@ class ResponseGenerator:
|
|
205 |
time.sleep(wait_time)
|
206 |
try:
|
207 |
_response = self.send_request(system_prompt, _user_prompt)
|
|
|
208 |
break
|
209 |
except Exception as ee:
|
210 |
exceptions.append(ee)
|
@@ -512,7 +515,7 @@ class EvaluationModel:
|
|
512 |
self.scores = []
|
513 |
self.humanlike_score = None
|
514 |
|
515 |
-
def
|
516 |
'''code results from LLM's response'''
|
517 |
output = []
|
518 |
'''database for Exp4'''
|
@@ -738,25 +741,365 @@ class EvaluationModel:
|
|
738 |
doc = nlp1(sentence)
|
739 |
subject = "None"
|
740 |
obj = "None"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
741 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
742 |
|
743 |
for token in doc:
|
744 |
if token.dep_ == "nsubj":
|
745 |
subject = token.text
|
746 |
elif token.dep_ == "dobj":
|
747 |
obj = token.text
|
748 |
-
|
749 |
-
|
750 |
-
|
|
|
|
|
751 |
output.append("Other")
|
752 |
-
elif subject in
|
753 |
-
#print(rs, subject, obj, "VP")
|
754 |
output.append("VP")
|
755 |
-
elif obj in
|
756 |
-
#print(rs, subject, obj, "NP")
|
757 |
output.append("NP")
|
758 |
else:
|
759 |
-
#print(rs, subject, obj, "Other")
|
760 |
output.append("Other")
|
761 |
|
762 |
'''Exp7'''
|
@@ -834,11 +1177,22 @@ class EvaluationModel:
|
|
834 |
# exit()
|
835 |
'''LLM'''
|
836 |
print(len(output))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
837 |
self.data = pd.DataFrame(list(
|
838 |
-
zip(responses_df["Experiment"], responses_df["Question_ID"], responses_df["Item"], responses_df["Response"],
|
839 |
-
|
840 |
-
columns=["Experiment", "Question_ID", "Item", "Response", "Factor 2", "Stimuli 1",
|
841 |
-
"Coding"])
|
842 |
|
843 |
return self.data
|
844 |
|
@@ -848,6 +1202,8 @@ class EvaluationModel:
|
|
848 |
|
849 |
|
850 |
|
|
|
|
|
851 |
def calculate_js_divergence(self, file_path_1, file_path_2):
|
852 |
"""
|
853 |
Calculate the Jensen-Shannon divergence for response distributions between two datasets.
|
@@ -855,7 +1211,7 @@ class EvaluationModel:
|
|
855 |
removes the original E5 and E51, and then calculates the JS divergence between the datasets.
|
856 |
|
857 |
Parameters:
|
858 |
-
file_path_1 (str): Path to the first dataset file (
|
859 |
file_path_2 (str): Path to the second dataset file (CSV format).
|
860 |
|
861 |
Returns:
|
@@ -893,17 +1249,15 @@ class EvaluationModel:
|
|
893 |
human_df = pd.concat([human_df, human_e5], ignore_index=True)
|
894 |
llm_df = pd.concat([llm_df, llm_e5], ignore_index=True)
|
895 |
|
896 |
-
|
897 |
### Calculate Average JS Divergence ###
|
898 |
|
899 |
-
|
900 |
# Extract the relevant columns for JS divergence calculation
|
901 |
human_responses = human_df[['Question_ID', 'Coding']]
|
902 |
llm_responses = llm_df[['Question_ID', 'Coding']]
|
903 |
|
904 |
# Remove 'Other' responses
|
905 |
-
human_responses = human_responses[human_responses['Coding'] != 'Other']
|
906 |
-
llm_responses = llm_responses[llm_responses['Coding'] != 'Other']
|
907 |
|
908 |
# Get unique Question_IDs present in both datasets
|
909 |
common_question_ids = set(human_responses['Question_ID']).intersection(set(llm_responses['Question_ID']))
|
@@ -933,6 +1287,7 @@ class EvaluationModel:
|
|
933 |
|
934 |
# Calculate the average JS divergence per experiment and the confidence interval
|
935 |
results = {}
|
|
|
936 |
for exp, divs in js_divergence.items():
|
937 |
avg_js_divergence = 1 - np.nanmean(divs)
|
938 |
ci_lower, ci_upper = bootstrap((divs,), np.nanmean, confidence_level=0.95,
|
@@ -941,14 +1296,14 @@ class EvaluationModel:
|
|
941 |
'average_js_divergence': avg_js_divergence,
|
942 |
'confidence_interval': (1 - ci_upper, 1 - ci_lower) # Adjust for 1 - score
|
943 |
}
|
|
|
944 |
|
945 |
-
# Calculate the
|
946 |
-
|
947 |
-
flattened_js_divergence = np.concatenate([np.array(divs) for divs in js_divergence.values()])
|
948 |
|
949 |
-
#
|
950 |
overall_ci_lower, overall_ci_upper = bootstrap(
|
951 |
-
(
|
952 |
np.nanmean,
|
953 |
confidence_level=0.95,
|
954 |
n_resamples=1000
|
@@ -957,8 +1312,8 @@ class EvaluationModel:
|
|
957 |
# Combine all results into one dictionary
|
958 |
all_results = {
|
959 |
'overall': {
|
960 |
-
'average_js_divergence':
|
961 |
-
'confidence_interval': (
|
962 |
},
|
963 |
'per_experiment': results
|
964 |
}
|
|
|
35 |
import subprocess
|
36 |
|
37 |
# Run the command to download the spaCy model
|
38 |
+
# subprocess.run(["python", "-m", "spacy", "download", "en_core_web_lg"], check=True)
|
39 |
# subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"], check=True)
|
40 |
# subprocess.run(["pip", "install", "spacy-transformers"], check=True)
|
41 |
# subprocess.run(["pip", "install", "curated-transformers"], check=True)
|
|
|
43 |
# Load spacy model for word tokenization
|
44 |
# nlp = spacy.load("en_core_web_sm")
|
45 |
try:
|
46 |
+
nlp1 = spacy.load("en_core_web_sm")
|
47 |
except OSError:
|
48 |
print("Can not load spacy model")
|
49 |
|
|
|
171 |
# print(ID, q_ID, prompt_value)
|
172 |
system_prompt = envs.SYSTEM_PROMPT
|
173 |
_user_prompt = prompt_value
|
174 |
+
print(_user_prompt)
|
175 |
+
for ii in range(100):
|
176 |
# user_prompt = f"{envs.USER_PROMPT}\nPassage:\n{_source}"
|
177 |
while True:
|
178 |
try:
|
|
|
180 |
print(self.model_id.lower(),'-',ID,'-',j,'-',ii)
|
181 |
|
182 |
_response = self.send_request(system_prompt, _user_prompt)
|
183 |
+
# print(_response)
|
184 |
# print(f"Finish index {index}")
|
185 |
break
|
186 |
except Exception as e:
|
|
|
207 |
time.sleep(wait_time)
|
208 |
try:
|
209 |
_response = self.send_request(system_prompt, _user_prompt)
|
210 |
+
|
211 |
break
|
212 |
except Exception as ee:
|
213 |
exceptions.append(ee)
|
|
|
515 |
self.scores = []
|
516 |
self.humanlike_score = None
|
517 |
|
518 |
+
def code_results_llm_cleaned(self, responses_df):
|
519 |
'''code results from LLM's response'''
|
520 |
output = []
|
521 |
'''database for Exp4'''
|
|
|
741 |
doc = nlp1(sentence)
|
742 |
subject = "None"
|
743 |
obj = "None"
|
744 |
+
pobj_list = [] # To collect all prepositional objects
|
745 |
+
|
746 |
+
for token in doc:
|
747 |
+
if token.dep_ == "nsubj":
|
748 |
+
subject = token.text
|
749 |
+
elif token.dep_ == "dobj":
|
750 |
+
obj = token.text
|
751 |
+
elif token.dep_ == "pobj":
|
752 |
+
pobj_list.append(token.text) # Collect prepositional objects
|
753 |
+
|
754 |
+
rs_list = rs.lower().split()
|
755 |
+
if subject in rs_list and (obj in rs_list or any(pobj == r for pobj in pobj_list for r in rs_list)):
|
756 |
+
output.append("Other")
|
757 |
+
elif subject in rs_list:
|
758 |
+
output.append("VP")
|
759 |
+
elif obj in rs_list or any(pobj == r for pobj in pobj_list for r in rs_list):
|
760 |
+
output.append("NP")
|
761 |
+
else:
|
762 |
+
output.append("Other")
|
763 |
+
|
764 |
+
'''Exp7'''
|
765 |
+
elif responses_df["Experiment"][i] == "E7":
|
766 |
+
# rs = responses_df["Response"][i].strip().lower()
|
767 |
+
rs = rs.replace(".", "").replace(",", "").lower()
|
768 |
+
#print("E7", rs)
|
769 |
+
if "yes" in rs and "no" in rs:
|
770 |
+
output.append("Other")
|
771 |
+
elif "no" in rs:
|
772 |
+
output.append("0")
|
773 |
+
elif "yes" in rs:
|
774 |
+
output.append("1")
|
775 |
+
else:
|
776 |
+
output.append("Other")
|
777 |
+
|
778 |
+
'''Exp8'''
|
779 |
+
elif responses_df["Experiment"][i] == "E8":
|
780 |
+
# rs = responses_df["Response"][i].strip()
|
781 |
+
#print("E8", rs)
|
782 |
+
if "something is wrong with the question" in rs:
|
783 |
+
output.append("1")
|
784 |
+
else:
|
785 |
+
output.append("0")
|
786 |
+
|
787 |
+
'''Exp9'''
|
788 |
+
elif responses_df["Experiment"][i] == "E9":
|
789 |
+
male, female = 0, 0
|
790 |
+
|
791 |
+
# rs = responses_df["Response"][i].strip()
|
792 |
+
if "because" in rs:
|
793 |
+
rs = rs.replace("because because", "because").split("because")[1]
|
794 |
+
else:
|
795 |
+
rs = rs
|
796 |
+
condition = responses_df["Factor 2"][i].strip()
|
797 |
+
rs = rs.split(" ")
|
798 |
+
for w in rs:
|
799 |
+
if w in male_keyword and female != 1:
|
800 |
+
male = 1
|
801 |
+
break
|
802 |
+
if w in female_keyword and male != 1:
|
803 |
+
female = 1
|
804 |
+
break
|
805 |
+
#print("E9", "condition", condition, "male", male, "female", female)
|
806 |
+
if male == 0 and female == 0:
|
807 |
+
output.append('Other')
|
808 |
+
else:
|
809 |
+
if male == 1 and female == 0:
|
810 |
+
if condition == "MF":
|
811 |
+
output.append("Subject")
|
812 |
+
elif condition == "FM":
|
813 |
+
output.append("Object")
|
814 |
+
else:
|
815 |
+
output.append("Other")
|
816 |
+
elif female == 1 and male == 0:
|
817 |
+
if condition == "MF":
|
818 |
+
output.append("Object")
|
819 |
+
elif condition == "FM":
|
820 |
+
output.append("Subject")
|
821 |
+
else:
|
822 |
+
output.append("Other")
|
823 |
+
|
824 |
+
'''Exp10'''
|
825 |
+
elif responses_df["Experiment"][i] == "E10":
|
826 |
+
# rs = responses_df["Response"][i].strip()
|
827 |
+
rs = rs.replace(".", "")
|
828 |
+
if rs == "yes":
|
829 |
+
output.append("1")
|
830 |
+
else:
|
831 |
+
output.append("0")
|
832 |
+
else:
|
833 |
+
#print("can;t find the Exp:", responses_df["Experiment"][i])
|
834 |
+
output.append("NA")
|
835 |
+
# print(output)
|
836 |
+
# exit()
|
837 |
+
'''LLM'''
|
838 |
+
print(len(output))
|
839 |
+
import re
|
840 |
+
def clean_text(text):
|
841 |
+
if isinstance(text, str):
|
842 |
+
return re.sub(r'[^\x00-\x7F]+', '', text)
|
843 |
+
return text
|
844 |
+
|
845 |
+
responses_df["Experiment"] = responses_df["Experiment"].apply(clean_text)
|
846 |
+
responses_df["Question_ID"] = responses_df["Question_ID"].apply(clean_text)
|
847 |
+
responses_df["Item"] = responses_df["Item"].apply(clean_text)
|
848 |
+
responses_df["Response"] = responses_df["Response"].apply(clean_text)
|
849 |
+
|
850 |
+
output = [str(item) for item in output]
|
851 |
+
|
852 |
+
self.data = pd.DataFrame(list(
|
853 |
+
zip(responses_df["Experiment"], responses_df["Question_ID"], responses_df["Item"], responses_df["Response"],output)),
|
854 |
+
columns=["Experiment", "Question_ID", "Item", "Response","Coding"])
|
855 |
+
|
856 |
+
return self.data
|
857 |
+
|
858 |
+
def code_results_llm(self, responses_df):
|
859 |
+
'''code results from LLM's response'''
|
860 |
+
output = []
|
861 |
+
'''database for Exp4'''
|
862 |
+
item4 = pd.read_csv(envs.ITEM_4_DATA)
|
863 |
+
wordpair2code = {}
|
864 |
+
for j in range(len(item4['Coding'])):
|
865 |
+
wordpair2code[item4['Pair'][j]] = item4['Coding'][j]
|
866 |
+
'''verb for Exp5'''
|
867 |
+
item5 = pd.read_csv(envs.ITEM_5_DATA)
|
868 |
+
# item corresponding to verb, same item id corresponding to verb pair
|
869 |
+
item2verb2 = {}
|
870 |
+
item2verb1 = {}
|
871 |
+
|
872 |
+
Stimuli1, Stimuli2 = {}, {}
|
873 |
+
for j in range(len(item5['Item'])):
|
874 |
+
item2verb1[item5['Item'][j]] = item5['Verb1'][j]
|
875 |
+
item2verb2[item5['Item'][j]] = item5['Verb2'][j]
|
876 |
+
Stimuli1[item5['ID'][j]] = item5['Stimuli-1'][j]
|
877 |
+
Stimuli2[item5['ID'][j]] = item5['Stimuli-2'][j]
|
878 |
+
|
879 |
+
male_keyword = ["he", "his", "himself"]
|
880 |
+
female_keyword = ["she", "her", "herself"]
|
881 |
+
#print(len(responses_df["Experiment"]))
|
882 |
+
for i in range(len(responses_df["Experiment"])):
|
883 |
+
|
884 |
+
|
885 |
+
print(i, "/", len(responses_df["Experiment"]))
|
886 |
+
# vote_1_1, vote_1_2, vote_1_3 = 0, 0, 0
|
887 |
+
# print()
|
888 |
+
if pd.isna(responses_df["Response"][i]):
|
889 |
+
output.append("Other")
|
890 |
+
continue
|
891 |
+
rs = responses_df["Response"][i].strip().lower()
|
892 |
+
print(rs)
|
893 |
+
rs = rs.replace('"', '').replace(" ", " ").replace('.', '')
|
894 |
+
#lines = rs.split("\n")
|
895 |
+
#filtered_lines = [line for line in lines if line and not (line.endswith(":") or line.endswith(":"))]
|
896 |
+
# filtered_lines = [r.split(':', 1)[-1].strip() if ':' in r else r for
|
897 |
+
# r in filtered_lines]
|
898 |
+
# rs = "\n".join(filtered_lines)
|
899 |
+
# rs = rs.strip()
|
900 |
+
'''Exp1'''
|
901 |
+
if responses_df["Experiment"][i] == "E1":
|
902 |
+
rs_lower = rs.lower()
|
903 |
+
if "round" in rs_lower and "spiky" in rs_lower:
|
904 |
+
output.append("Other")
|
905 |
+
elif "round" in rs_lower:
|
906 |
+
output.append("Round")
|
907 |
+
elif "spiky" in rs_lower:
|
908 |
+
output.append("Spiky")
|
909 |
+
else:
|
910 |
+
output.append("Other")
|
911 |
+
|
912 |
+
'''Exp2'''
|
913 |
+
|
914 |
+
elif responses_df["Experiment"][i] == "E2":
|
915 |
+
# rs = responses_df["Response"][i].strip()
|
916 |
+
rs = rs.split(' ')
|
917 |
+
#print("E2", rs)
|
918 |
+
male, female = 0, 0
|
919 |
+
for word in rs:
|
920 |
+
if word in female_keyword and male == 0:
|
921 |
+
female = 1
|
922 |
+
output.append("Female")
|
923 |
+
break
|
924 |
+
if word in male_keyword and female == 0:
|
925 |
+
male = 1
|
926 |
+
output.append("Male")
|
927 |
+
break
|
928 |
+
if male == 0 and female == 0:
|
929 |
+
output.append("Other")
|
930 |
|
931 |
+
'''Exp3'''
|
932 |
+
elif responses_df["Experiment"][i] == "E3":
|
933 |
+
# rs = responses_df["Response"][i].strip()
|
934 |
+
#print("E3", rs)
|
935 |
+
pair = responses_df["Factor 2"][i]
|
936 |
+
word1, word2 = pair.replace(".", "").split('_')
|
937 |
+
|
938 |
+
if responses_df["Item"][i] == 12:
|
939 |
+
output.append("Other")
|
940 |
+
else:
|
941 |
+
words = rs.split() # split the response into words
|
942 |
+
if any(word == word1 for word in words) and any(word == word2 for word in words):
|
943 |
+
output.append("Other")
|
944 |
+
else:
|
945 |
+
if any(word.lower() == word1.lower() for word in words):
|
946 |
+
if len(word1) > len(word2):
|
947 |
+
output.append("Long")
|
948 |
+
else:
|
949 |
+
output.append("Short")
|
950 |
+
elif any(word.lower() == word2.lower() for word in words):
|
951 |
+
if len(word1) > len(word2):
|
952 |
+
output.append("Short")
|
953 |
+
else:
|
954 |
+
output.append("Long")
|
955 |
+
else:
|
956 |
+
if len(words) > 1:
|
957 |
+
# join the words using " "
|
958 |
+
word = " ".join(words)
|
959 |
+
if word.lower() == word1.lower():
|
960 |
+
if len(word1) > len(word2):
|
961 |
+
output.append("Long")
|
962 |
+
else:
|
963 |
+
output.append("Short")
|
964 |
+
elif word.lower() == word2.lower():
|
965 |
+
if len(word1) > len(word2):
|
966 |
+
output.append("Short")
|
967 |
+
else:
|
968 |
+
output.append("Long")
|
969 |
+
else:
|
970 |
+
output.append("Other")
|
971 |
+
else:
|
972 |
+
output.append("Other")
|
973 |
+
|
974 |
+
|
975 |
+
'''Exp4'''
|
976 |
+
|
977 |
+
elif responses_df["Experiment"][i] == "E4":
|
978 |
+
lines = rs.split("\n")
|
979 |
+
filtered_lines = []
|
980 |
+
if len(lines) > 1:
|
981 |
+
for r in lines[1:]:
|
982 |
+
if ':' in r:
|
983 |
+
filtered_lines.append(r.split(':', 1)[-1].strip())
|
984 |
+
else:
|
985 |
+
filtered_lines.append(r)
|
986 |
+
filtered_lines.insert(0, lines[0])
|
987 |
+
else:
|
988 |
+
filtered_lines = lines
|
989 |
+
# print(filtered_lines)
|
990 |
+
|
991 |
+
#filtered_lines = [r.split('-', 1)[-1].strip() if '-' in r else r for r in filtered_lines]
|
992 |
+
#rs = "\n".join(filtered_lines)
|
993 |
+
|
994 |
+
#filtered_lines = [r.split(':', 1)[-1].strip() if ':' in r else r for r in rs.split(";")]
|
995 |
+
#filtered_lines = [r.split('-', 1)[-1].strip() if '-' in r else r for r in filtered_lines]
|
996 |
+
rs = ";".join(filtered_lines).strip()
|
997 |
+
try:
|
998 |
+
meaning_word = rs.split(";")[4].replace(" ", '')
|
999 |
+
except IndexError:
|
1000 |
+
try:
|
1001 |
+
meaning_word = rs.split("\n")[4].replace(" ", '')
|
1002 |
+
except IndexError:
|
1003 |
+
output.append("Other")
|
1004 |
+
continue
|
1005 |
+
except Exception as e:
|
1006 |
+
print(f"Unexpected error: {e}")
|
1007 |
+
output.append("Other")
|
1008 |
+
continue
|
1009 |
+
|
1010 |
+
target = responses_df["Factor 2"][i].strip().lower()
|
1011 |
+
pair = target + "_" + meaning_word
|
1012 |
+
#print("E4:", pair)
|
1013 |
+
|
1014 |
+
if pair in wordpair2code.keys():
|
1015 |
+
output.append(wordpair2code[pair])
|
1016 |
+
else:
|
1017 |
+
output.append("Other")
|
1018 |
+
|
1019 |
+
'''Exp5'''
|
1020 |
+
elif responses_df["Experiment"][i] == "E5" or responses_df["Experiment"][i] == "E51":
|
1021 |
+
# sentence = responses_df["Response"][i].strip()
|
1022 |
+
item_id = responses_df["Item"][i]
|
1023 |
+
question_id = responses_df["Question_ID"][i]
|
1024 |
+
|
1025 |
+
if responses_df["Experiment"][i] == "E51":
|
1026 |
+
sti1 = Stimuli1[question_id[0:-1]].lower().replace("...", "")
|
1027 |
+
#sti2 = Stimuli2[question_id[0:-1]].lower().replace("...", "")
|
1028 |
+
verb = item2verb1[item_id].lower()
|
1029 |
+
|
1030 |
+
sentence = sti1 + " " + rs.replace(sti1, "")
|
1031 |
+
#print("E5", verb, sentence)
|
1032 |
+
if responses_df["Experiment"][i] == "E5":
|
1033 |
+
#sti1 = Stimuli1[question_id].lower().replace("...", "")
|
1034 |
+
# print(sti1)
|
1035 |
+
sti2 = Stimuli2[question_id].lower().replace("...", "")
|
1036 |
+
|
1037 |
+
verb = item2verb2[item_id].lower()
|
1038 |
+
sentence = sti2 + " " + rs.replace(sti2, "")
|
1039 |
+
#print("E5", verb, sentence)
|
1040 |
+
|
1041 |
+
doc = nlp1(sentence.replace(" ", " "))
|
1042 |
+
# print(doc)
|
1043 |
+
# print()
|
1044 |
+
verb_token = None
|
1045 |
+
for token in doc:
|
1046 |
+
# print(token.lemma_)
|
1047 |
+
if token.lemma_ == verb:
|
1048 |
+
verb_token = token
|
1049 |
+
break
|
1050 |
+
# exit()
|
1051 |
+
pobj, dative = None, None
|
1052 |
+
# print(verb_token.children)
|
1053 |
+
# exit()
|
1054 |
+
if verb_token is not None:
|
1055 |
+
for child in verb_token.children:
|
1056 |
+
# print(child)
|
1057 |
+
if (child.dep_ == 'dative' and child.pos_ == "ADP") or (
|
1058 |
+
child.text == "to" and child.dep_ == 'prep' and child.pos_ == "ADP"):
|
1059 |
+
pobj = child.text
|
1060 |
+
if child.dep_ == 'dative':
|
1061 |
+
dative = child.text
|
1062 |
+
|
1063 |
+
# print("E5", pobj, dative)
|
1064 |
+
# exit()
|
1065 |
+
|
1066 |
+
if pobj:
|
1067 |
+
output.append("PO")
|
1068 |
+
elif dative:
|
1069 |
+
output.append("DO")
|
1070 |
+
else:
|
1071 |
+
# print("Other", sentence, pobj, dative)
|
1072 |
+
# exit()
|
1073 |
+
output.append("Other")
|
1074 |
+
|
1075 |
+
|
1076 |
+
|
1077 |
+
'''Exp6'''
|
1078 |
+
|
1079 |
+
elif responses_df["Experiment"][i] == "E6":
|
1080 |
+
sentence = responses_df["Stimuli 1"][i].strip().lower()
|
1081 |
+
#print("E6", sentence)
|
1082 |
+
doc = nlp1(sentence)
|
1083 |
+
subject = "None"
|
1084 |
+
obj = "None"
|
1085 |
+
pobj_list = [] # To collect all prepositional objects
|
1086 |
|
1087 |
for token in doc:
|
1088 |
if token.dep_ == "nsubj":
|
1089 |
subject = token.text
|
1090 |
elif token.dep_ == "dobj":
|
1091 |
obj = token.text
|
1092 |
+
elif token.dep_ == "pobj":
|
1093 |
+
pobj_list.append(token.text) # Collect prepositional objects
|
1094 |
+
|
1095 |
+
rs_list = rs.lower().split()
|
1096 |
+
if subject in rs_list and (obj in rs_list or any(pobj == r for pobj in pobj_list for r in rs_list)):
|
1097 |
output.append("Other")
|
1098 |
+
elif subject in rs_list:
|
|
|
1099 |
output.append("VP")
|
1100 |
+
elif obj in rs_list or any(pobj == r for pobj in pobj_list for r in rs_list):
|
|
|
1101 |
output.append("NP")
|
1102 |
else:
|
|
|
1103 |
output.append("Other")
|
1104 |
|
1105 |
'''Exp7'''
|
|
|
1177 |
# exit()
|
1178 |
'''LLM'''
|
1179 |
print(len(output))
|
1180 |
+
import re
|
1181 |
+
def clean_text(text):
|
1182 |
+
if isinstance(text, str):
|
1183 |
+
return re.sub(r'[^\x00-\x7F]+', '', text)
|
1184 |
+
return text
|
1185 |
+
|
1186 |
+
responses_df["Experiment"] = responses_df["Experiment"].apply(clean_text)
|
1187 |
+
responses_df["Question_ID"] = responses_df["Question_ID"].apply(clean_text)
|
1188 |
+
responses_df["Item"] = responses_df["Item"].apply(clean_text)
|
1189 |
+
responses_df["Response"] = responses_df["Response"].apply(clean_text)
|
1190 |
+
|
1191 |
+
output = [str(item) for item in output]
|
1192 |
+
|
1193 |
self.data = pd.DataFrame(list(
|
1194 |
+
zip(responses_df["Experiment"], responses_df["Question_ID"], responses_df["Item"], responses_df["Response"],output)),
|
1195 |
+
columns=["Experiment", "Question_ID", "Item", "Response","Coding"])
|
|
|
|
|
1196 |
|
1197 |
return self.data
|
1198 |
|
|
|
1202 |
|
1203 |
|
1204 |
|
1205 |
+
|
1206 |
+
|
1207 |
def calculate_js_divergence(self, file_path_1, file_path_2):
|
1208 |
"""
|
1209 |
Calculate the Jensen-Shannon divergence for response distributions between two datasets.
|
|
|
1211 |
removes the original E5 and E51, and then calculates the JS divergence between the datasets.
|
1212 |
|
1213 |
Parameters:
|
1214 |
+
file_path_1 (str): Path to the first dataset file (CSV format).
|
1215 |
file_path_2 (str): Path to the second dataset file (CSV format).
|
1216 |
|
1217 |
Returns:
|
|
|
1249 |
human_df = pd.concat([human_df, human_e5], ignore_index=True)
|
1250 |
llm_df = pd.concat([llm_df, llm_e5], ignore_index=True)
|
1251 |
|
|
|
1252 |
### Calculate Average JS Divergence ###
|
1253 |
|
|
|
1254 |
# Extract the relevant columns for JS divergence calculation
|
1255 |
human_responses = human_df[['Question_ID', 'Coding']]
|
1256 |
llm_responses = llm_df[['Question_ID', 'Coding']]
|
1257 |
|
1258 |
# NOTE: 'Other' responses are currently kept; the filters below are commented out
|
1259 |
+
#human_responses = human_responses[human_responses['Coding'] != 'Other']
|
1260 |
+
#llm_responses = llm_responses[llm_responses['Coding'] != 'Other']
|
1261 |
|
1262 |
# Get unique Question_IDs present in both datasets
|
1263 |
common_question_ids = set(human_responses['Question_ID']).intersection(set(llm_responses['Question_ID']))
|
|
|
1287 |
|
1288 |
# Calculate the average JS divergence per experiment and the confidence interval
|
1289 |
results = {}
|
1290 |
+
experiment_averages = []
|
1291 |
for exp, divs in js_divergence.items():
|
1292 |
avg_js_divergence = 1 - np.nanmean(divs)
|
1293 |
ci_lower, ci_upper = bootstrap((divs,), np.nanmean, confidence_level=0.95,
|
|
|
1296 |
'average_js_divergence': avg_js_divergence,
|
1297 |
'confidence_interval': (1 - ci_upper, 1 - ci_lower) # Adjust for 1 - score
|
1298 |
}
|
1299 |
+
experiment_averages.append(avg_js_divergence)
|
1300 |
|
1301 |
+
# Calculate the overall score as the simple (unweighted) mean of the per-experiment averages
|
1302 |
+
weighted_js_divergence = np.mean(experiment_averages) # Simple average over experiments
|
|
|
1303 |
|
1304 |
+
# Calculate the confidence interval for the overall JS divergence using bootstrap
|
1305 |
overall_ci_lower, overall_ci_upper = bootstrap(
|
1306 |
+
(experiment_averages,),
|
1307 |
np.nanmean,
|
1308 |
confidence_level=0.95,
|
1309 |
n_resamples=1000
|
|
|
1312 |
# Combine all results into one dictionary
|
1313 |
all_results = {
|
1314 |
'overall': {
|
1315 |
+
'average_js_divergence': weighted_js_divergence,
|
1316 |
+
'confidence_interval': (overall_ci_lower, overall_ci_upper)
|
1317 |
},
|
1318 |
'per_experiment': results
|
1319 |
}
|