IliaLarchenko committed
Commit cfe6073
Parent: 9dfff00

Improved analysis process

Files changed (1)
  1. tests/analysis.py +88 -25
tests/analysis.py CHANGED
@@ -16,50 +16,113 @@ from openai import OpenAI
 from tests.testing_prompts import feedback_analyzer
 from resources.prompts import prompts, base_prompts
 
-
-def complete_and_grade(interview_params, exp_name="GPT4", grader_model="gpt-4-turbo", candidate_model="gpt-3.5-turbo"):
-    interview_type, attempt_num = interview_params
-    feedback = {}
-
+criteria_list = {
+    "problem_statement",
+    "problem_statement_difficulty",
+    "problem_statement_topic",
+    "problem_statement_solvability",
+    "problem_statement_relevance",
+    "problem_statement_mistakes",
+    "problem_statement_solution",
+    "problem_statement_hints",
+    "problem_statement_answer_plan",
+    "problem_statement_instructions",
+    "problem_statement_goals_alignment",
+    "problem_statement_skill_test",
+    "interviewer_solution",
+    "interviewer_mistakes",
+    "interviewer_answers",
+    "interviewer_relevance",
+    "interviewer_support",
+    "interviewer_questions",
+    "interviewer_repeat",
+    "interviewer_found_mistakes",
+    "interviewer_hallucinations",
+    "interviewer_summary",
+    "interviewer_gaslighting",
+    "interviewer_leaks",
+    "interviewer_empty",
+    "interviewer_notes",
+    "interviewer_stuck",
+    "interviewer_end",
+    "interviewer_adaptability",
+    "interviewer_flow_control",
+    "interviewer_preparation",
+    "interviewer_responsive",
+    "interviewer_depth",
+    "feedback_quality",
+    "feedback_overview",
+    "feedback_relevance",
+    "feedback_clarity",
+    "feedback_solution",
+    "feedback_result",
+    "feedback_hallucinations",
+    "feedback_focus",
+    "feedback_completeness",
+    "feedback_examples",
+    "feedback_specificity",
+    "comments",
+}
+
+
+def grade_attempt(file_path, grader_model, attempt_index):
+    for retry in range(3):  # Retry mechanism
+        try:
+            feedback = grade(file_path, grader_model, str(attempt_index))
+            if np.mean([x in criteria_list for x in feedback.keys()]) > 0.8:  # heuristic format check: most returned keys are known criteria
+                return feedback
+        except Exception as e:
+            print(f"Grading attempt {retry + 1} with {grader_model} failed with error: {e}")
+    return None
+
+
+def complete_and_grade(interview_params, exp_name, grader_models, candidate_model):
+    interview_type, attempt_num, llm_config = interview_params
+    feedback_list = []  # initialized before the try block so the checks below cannot raise a NameError
     try:
-        file_path, _ = complete_interview(interview_type, exp_name, model=candidate_model)
-        feedback = grade(file_path, grader_model)
-
-        # Just a heuristic check of the JSON format TODO: add a proper check
-        if "problem_statement_topic" not in feedback:
-            raise Exception("Grading failed")
-
+        file_path, _ = complete_interview(interview_type, exp_name, llm_config, model=candidate_model)
         print(f"Attempt {attempt_num + 1} of {interview_type} completed successfully")
-        print(f"Overall score: {feedback['overall_score']}")
+
+        for i, grader_model in enumerate(grader_models):
+            feedback = grade_attempt(file_path, grader_model, i)
+            if feedback:
+                feedback_list.append(feedback)
+                print(f"Attempt {attempt_num + 1} of {interview_type} by {llm_config.name if llm_config else 'default'} graded by {grader_model} successfully")
+                print(f"Overall score: {feedback['overall_score']}")
 
     except Exception as e:
         print(f"Attempt {attempt_num + 1} of {interview_type} failed with error: {e}")
 
-    return feedback
+    if len(feedback_list) == 0:
+        print(f"Attempt {attempt_num + 1} of {interview_type} returned an empty list")
+
+    return feedback_list
 
 
 def run_evaluation(
-    exp_name,
-    num=5,
-    interview_types=["ml_design", "math", "ml_theory", "system_design", "sql", "coding"],
-    grader_model="gpt-4-turbo",
-    candidate_model="gpt-3.5-turbo",
-    num_workers=3,
+    exp_name, num_attempts=5, interview_types=None, grader_models=None, llm_configs=None, candidate_model="gpt-3.5-turbo", num_workers=3
 ):
+    if interview_types is None:
+        interview_types = ["ml_design", "math", "ml_theory", "system_design", "sql", "coding"]
+    if grader_models is None:
+        grader_models = ["gpt-4-turbo"]
+    if llm_configs is None:
+        llm_configs = [None]
+
     exp_name = f"{exp_name}_{pd.Timestamp.now().strftime('%Y-%m-%d_%H-%M-%S')}"
     os.makedirs(f"records/{exp_name}", exist_ok=True)
-    tasks = [(interview_type, i) for i in range(num) for interview_type in interview_types]
-    complete_f = partial(complete_and_grade, exp_name=exp_name, grader_model=grader_model, candidate_model=candidate_model)
+    tasks = [(type_, i, llm_config) for type_ in interview_types for i in range(num_attempts) for llm_config in llm_configs]
+    complete_f = partial(complete_and_grade, exp_name=exp_name, grader_models=grader_models, candidate_model=candidate_model)
+
     with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
         results = list(executor.map(complete_f, tasks))
 
-    # Filter out empty results and count them
+    # Filter out empty results
     non_empty_results = [res for res in results if res]
     empty_count = len(results) - len(non_empty_results)
-
     print(f"Number of empty results (errors or failed grading): {empty_count}")
 
-    # Store non-empty results in a DataFrame
+    non_empty_results = [f for res in non_empty_results for f in res]  # flatten per-attempt feedback lists into one row per grading
     df = pd.DataFrame(non_empty_results)
     df.to_csv(os.path.join("records", exp_name, "results.csv"), index=False)
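
A note on the acceptance check inside grade_attempt: a grading is kept only when more than 80% of the keys the grader returns appear in criteria_list; otherwise the call is retried, up to three times. Below is a minimal self-contained sketch of that heuristic; the shortened criteria set and the feedback dict are made up for illustration, not real grader output:

import numpy as np

known_criteria = {"problem_statement", "interviewer_summary", "feedback_quality"}

# Hypothetical grader output: 3 of its 4 keys are known criteria
feedback = {
    "problem_statement": "...",
    "interviewer_summary": "...",
    "feedback_quality": "...",
    "unexpected_key": "...",
}

coverage = np.mean([key in known_criteria for key in feedback])  # 3/4 = 0.75
print(coverage > 0.8)  # False: this grading would be rejected and retried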
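
For completeness, here is a sketch of how the updated entry point might be driven after this commit. The parameters and defaults come from the diff above; the grader model names are only illustrative, and llm_configs is left at its default of [None] because the structure of the config objects (beyond exposing a .name attribute) is not shown in this file:

from tests.analysis import run_evaluation

# Each (interview_type, attempt, llm_config) task runs in a thread pool and is
# graded by every model in grader_models; results are flattened to one row per
# grading and written to records/<exp_name>_<timestamp>/results.csv.
run_evaluation(
    "multi_grader_experiment",
    num_attempts=2,
    interview_types=["coding", "sql"],
    grader_models=["gpt-4-turbo", "gpt-4o"],
    candidate_model="gpt-3.5-turbo",
    num_workers=3,
)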