# This file contains some functions I use for automated analysis and evaluation.
# It is not used in the main functionality of the service.
# It is quite messy so far.
# To use the analytics tools you need to install some extra libraries:
# !pip install pandas
from tests.candidate import complete_interview
from tests.grader import grade
import pandas as pd
import numpy as np
from functools import partial
import concurrent.futures
import os
from IPython.display import Markdown, display
from openai import OpenAI
from tests.testing_prompts import feedback_analyzer
from resources.prompts import prompts, base_prompts
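
# Set of all criteria keys a grader is expected to return; grade_attempt() keeps a grading result
# only when more than 80% of the returned keys belong to this set.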
criteria_list = {
    "problem_statement",
    "problem_statement_difficulty",
    "problem_statement_topic",
    "problem_statement_solvability",
    "problem_statement_relevance",
    "problem_statement_mistakes",
    "problem_statement_solution",
    "problem_statement_hints",
    "problem_statement_answer_plan",
    "problem_statement_instructions",
    "problem_statement_goals_alignment",
    "problem_statement_skill_test",
    "interviewer_solution",
    "interviewer_mistakes",
    "interviewer_answers",
    "interviewer_relevance",
    "interviewer_support",
    "interviewer_questions",
    "interviewer_repeat",
    "interviewer_found_mistakes",
    "interviewer_hallucinations",
    "interviewer_summary",
    "interviewer_gaslighting",
    "interviewer_leaks",
    "interviewer_empty",
    "interviewer_notes",
    "interviewer_stuck",
    "interviewer_end",
    "interviewer_adaptability",
    "interviewer_flow_control",
    "interviewer_preparation",
    "interviewer_responsive",
    "interviewer_depth",
    "feedback_quality",
    "feedback_overview",
    "feedback_relevance",
    "feedback_clarity",
    "feedback_solution",
    "feedback_result",
    "feedback_hallucinations",
    "feedback_focus",
    "feedback_completeness",
    "feedback_examples",
    "feedback_specificity",
    "comments",
}

def grade_attempt(file_path, grader_model, attempt_index):
    for retry in range(3):  # Retry mechanism
        try:
            feedback = grade(file_path, grader_model, str(attempt_index))
            if np.mean([x in criteria_list for x in feedback.keys()]) > 0.8:
                return feedback
        except Exception as e:
            print(f"Grading attempt {retry + 1} using {grader_model} failed with error: {e}")
    return None

def complete_and_grade(interview_params, exp_name, grader_models, candidate_model):
    interview_type, attempt_num, llm_config = interview_params
    feedback_list = []

    attempt_successful = False
    for attempt in range(3):  # Retry up to 3 times
        try:
            file_path, _ = complete_interview(interview_type, exp_name, llm_config, model=candidate_model, pause=attempt * 5)
            print(
                f"Attempt {attempt_num + 1}, retry {attempt + 1} interview simulation of {interview_type} by {llm_config.name} completed successfully"
            )
            attempt_successful = True
            break
        except Exception as e:
            print(f"Retry {attempt + 1} for attempt {attempt_num + 1} of {interview_type} by {llm_config.name} failed with error: {e}")

    if not attempt_successful:
        print(f"All retries failed for attempt {attempt_num + 1} of {interview_type} by {llm_config.name}")
        return feedback_list

    try:
        for i, grader_model in enumerate(grader_models):
            feedback = grade_attempt(file_path, grader_model, i)
            if feedback:
                feedback_list.append(feedback)
                print(f"Attempt {attempt_num + 1} of {interview_type} by {llm_config.name} graded by {grader_model} successfully")
                print(f"Overall score: {feedback['overall_score']}")
    except Exception as e:
        print(f"Grading for attempt {attempt_num + 1} of {interview_type} by {llm_config.name} failed with error: {e}")

    if len(feedback_list) == 0:
        print(f"Attempt {attempt_num + 1} of {interview_type} by {llm_config.name} returned an empty list")

    return feedback_list

def run_evaluation(
    exp_name, num_attempts=5, interview_types=None, grader_models=None, llm_configs=None, candidate_model="gpt-3.5-turbo", num_workers=3
):
    if interview_types is None:
        interview_types = ["ml_design", "math", "ml_theory", "system_design", "sql", "coding"]
    if grader_models is None:
        grader_models = ["gpt-4o"]
    if llm_configs is None:
        llm_configs = [None]

    exp_name = f"{exp_name}_{pd.Timestamp.now().strftime('%Y-%m-%d_%H-%M-%S')}"
    os.makedirs(f"records/{exp_name}", exist_ok=True)

    tasks = [(type_, i, llm_config) for i in range(num_attempts) for type_ in interview_types for llm_config in llm_configs]
    complete_f = partial(complete_and_grade, exp_name=exp_name, grader_models=grader_models, candidate_model=candidate_model)

    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        results = list(executor.map(complete_f, tasks))

    # Filter out empty results
    non_empty_results = [res for res in results if res]
    empty_count = len(results) - len(non_empty_results)
    print(f"Number of empty results (errors or failed grading): {empty_count}")

    non_empty_results = [f for res in non_empty_results for f in res]

    df = pd.DataFrame(non_empty_results)
    df.to_csv(os.path.join("records", exp_name, "results.csv"), index=False)

    return exp_name
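
# Example usage (a sketch; the experiment name and argument values below are illustrative):
#
#   exp_name = run_evaluation("baseline", num_attempts=2, interview_types=["coding"], grader_models=["gpt-4o"])
#   df = pd.read_csv(f"records/{exp_name}/results.csv")
#
# run_evaluation() appends a timestamp to the experiment name and writes all graded feedback to
# records/<exp_name>/results.csv, so the DataFrame can be reloaded later for analysis.
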
def highlight_color(val):
    color = "red" if val < 0.7 else "orange" if val < 0.9 else "lightgreen" if val < 0.95 else "green"
    return f"color: {color}"
def generate_and_display_tables(df):
    # Grouping by prefix
    prefixes = ["problem", "interviewer", "feedback"]
    prefix_columns = [col for col in df.columns if any(col.startswith(prefix) for prefix in prefixes)]

    criteria_summary_df = pd.DataFrame(df[prefix_columns].mean(), columns=["avg score"])
    criteria_summary_df_styled = criteria_summary_df.style.map(highlight_color)
    criteria_summary_df_styled.set_caption("Aggregated Scores per Criteria")

    # Aggregated scores per stage
    grouped_scores = {}
    for prefix in prefixes:
        prefix_cols = [col for col in df.columns if col.startswith(prefix)]
        grouped_scores[prefix] = df[prefix_cols].mean(axis=1).mean()

    grouped_scores_df = pd.DataFrame([grouped_scores]).T
    grouped_scores_df.columns = ["avg score"]
    grouped_scores_styled = grouped_scores_df.style.map(highlight_color)
    grouped_scores_styled.set_caption("Aggregated Scores per Stage")

    # Grouped by unique type
    grouped_by_type = pd.DataFrame(df.groupby("type")[prefix_columns].mean().mean(axis=1), columns=["avg score"])
    grouped_by_type_styled = grouped_by_type.style.map(highlight_color)
    grouped_by_type_styled.set_caption("Scores Grouped by Unique Type")

    total_llm_scores = df.groupby("agent_llm")[prefix_columns].mean().mean(axis=1).sort_values(ascending=False)

    # Grouped by unique interviewer model and sorted by descending total score
    grouped_by_interviewer = (
        df.groupby("agent_llm")[["overall_score", "average_response_time_seconds", "number_of_messages"]]
        .mean()
        .reindex(total_llm_scores.index)
    )
    grouped_by_interviewer_styled = grouped_by_interviewer.style.map(highlight_color)
    grouped_by_interviewer_styled.set_caption("Scores Grouped by Unique Interviewer Model")

    for prefix in prefixes:
        prefix_cols = [col for col in prefix_columns if col.startswith(prefix)]
        df[prefix] = df[prefix_cols].mean(axis=1)

    # Pivot table: Agent model vs Stage
    pivot1 = pd.pivot_table(df, values=prefixes, index="agent_llm", aggfunc="mean").reindex(total_llm_scores.index)
    pivot1_styled = pivot1.style.map(highlight_color)
    pivot1_styled.set_caption("Pivot Table: Agent Model vs Stage")

    # Pivot table: Agent model vs Type (Single aggregated score per type)
    pivot2 = pd.pivot_table(df, values="overall_score", index="agent_llm", columns="type", aggfunc="mean").reindex(total_llm_scores.index)
    pivot2_styled = pivot2.style.map(highlight_color)
    pivot2_styled.set_caption("Pivot Table: Agent Model vs Type")

    # Pivot table: Type vs Stage
    pivot3 = pd.pivot_table(df, values=prefixes, index="type", aggfunc="mean")
    pivot3_styled = pivot3.style.map(highlight_color)
    pivot3_styled.set_caption("Pivot Table: Type vs Stage")

    # Pivot table: Agent Model x Stage vs Type (MultiIndex)
    multi_index_data = [(llm, stage) for llm in total_llm_scores.index for stage in prefixes]
    multi_index = pd.MultiIndex.from_tuples(multi_index_data, names=["agent_llm", "stage"])
    types = df["type"].unique()
    pivot4_df = pd.DataFrame(index=multi_index, columns=types)

    # Fill the DataFrame with the aggregated scores grouped by type
    for llm in total_llm_scores.index:
        for stage in prefixes:
            mask = df["agent_llm"] == llm
            stage_values = df.loc[mask, ["type", stage]].groupby("type").mean()[stage]
            pivot4_df.loc[(llm, stage), :] = stage_values

    pivot4_styled = pivot4_df.style.map(highlight_color)
    pivot4_styled.set_caption("Pivot Table: Agent Model x Stage vs Type")

    tables_dict = {
        "criteria_summary_df_styled": criteria_summary_df_styled,
        "grouped_scores_styled": grouped_scores_styled,
        "grouped_by_type_styled": grouped_by_type_styled,
        "grouped_by_interviewer_styled": grouped_by_interviewer_styled,
        "pivot1_styled": pivot1_styled,
        "pivot2_styled": pivot2_styled,
        "pivot3_styled": pivot3_styled,
        "pivot4_styled": pivot4_styled,
    }

    for table in tables_dict.values():
        display(table)

    return tables_dict

def filter_df(df, prefixes=["problem", "interviewer", "feedback"]):
    # Identify all columns starting with any of the prefixes
    columns_to_check = [col for col in df.columns if any(col.startswith(prefix) for prefix in prefixes)]

    # Function to check if a value is a boolean, None, or string representations of boolean types
    def is_valid_value(val):
        return isinstance(val, bool) or val is None or val is np.nan or val in {"True", "False", "None", "NaN"}

    # Function to convert string representations to actual booleans
    def to_bool(val):
        if val == "True":
            return True
        elif val == "False":
            return False
        elif val == "None":
            return None
        return val

    # Check if all values in the specified columns are valid
    def all_values_valid(row):
        return all(is_valid_value(row[col]) for col in columns_to_check)

    # Apply filtering to keep only rows with valid values
    valid_df = df[df.apply(all_values_valid, axis=1)].copy()

    # Convert string representations to booleans
    for col in columns_to_check:
        valid_df[col] = valid_df[col].apply(to_bool)

    # Identify removed rows
    removed_rows = df[~df.index.isin(valid_df.index)]

    # Print the number of rows removed
    num_removed = len(removed_rows)
    print(f"Number of rows removed: {num_removed}")

    # Print the value from the "file_name" column for each removed row, or `None` if not present
    if "file_name" in removed_rows.columns:
        for value in removed_rows["file_name"].tolist():
            print(f"Removed row file_name: {value}")
    else:
        print("Removed row file_name: None")

    return valid_df
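
# Typical analysis flow in a notebook (a sketch; "my_experiment_2024-01-01_00-00-00" is a placeholder
# for whatever folder name run_evaluation() returned):
#
#   df = pd.read_csv("records/my_experiment_2024-01-01_00-00-00/results.csv")
#   df = filter_df(df)  # drop rows with malformed grader output
#   tables = generate_and_display_tables(df)  # render the styled summary tables
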
def generate_analysis_report(df, folder, focus=None, model="gpt-4o"):
    client = OpenAI(base_url="https://api.openai.com/v1")

    all_comments = "\n\n".join([f"Interview type: {t}. Feedback: {str(f)}" for t, f in zip(df["type"].values, df["comments"].values)])

    messages = [
        {"role": "system", "content": feedback_analyzer},
        {"role": "user", "content": f"Interview feedback: {all_comments}"},
    ]

    if focus:
        messages.append({"role": "user", "content": f"Focus only on comments about the {focus} part of the interview"})

    response = client.chat.completions.create(model=model, messages=messages, temperature=1)

    comments_analysis = response.choices[0].message.content
    display(Markdown(comments_analysis))

    if folder is not None:
        with open(os.path.join(folder, "analysis.md"), "w") as f:
            f.write(comments_analysis)
            f.write("\n\n")
            for t in np.unique(df["type"]):
                f.write(f"Type: {t}\n")
                f.write(df[[c for c in df.columns if c != "comments"]][df["type"] == t].T.to_markdown())
                f.write("\n\n")
            f.write("Type: all\n")
            f.write("\n\n")
            f.write("Feedback:\n")
            f.write(all_comments)

    return comments_analysis
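
# Example (a sketch; the folder argument is wherever analysis.md should be written,
# e.g. the same records/<exp_name> folder the evaluation run created):
#
#   analysis = generate_analysis_report(df, f"records/{exp_name}", focus="interviewer")
#
# With folder=None the report is only displayed in the notebook and returned as a string.
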
def analyze_and_improve_segment(df, segment_to_improve=None):
    sorted_stages = df[["problem", "interviewer", "feedback"]].mean().sort_values()
    if not segment_to_improve:
        segment_to_improve = sorted_stages.index[0]
    th_score = sorted_stages.iloc[0] + 0.1

    print(f"Let's try to improve {segment_to_improve}")
    print(f"Quality threshold {th_score}")

    # Identifying types that need improvement
    type_stage_scores = df.groupby("type")[segment_to_improve].mean()
    types_to_improve = []
    for t, s in type_stage_scores.items():
        if s < th_score:
            types_to_improve.append(t)

    print(f"We will focus on {types_to_improve}")

    # Filtering DataFrame based on identified types and scoring criteria
    filtered_df = df[df["type"].apply(lambda x: x in types_to_improve)]
    prefix_columns = [col for col in df.columns if col.startswith(segment_to_improve)]
    filtered_df = filtered_df[filtered_df[prefix_columns].mean(axis=1) < th_score]

    # Generating an analysis report
    comments_analysis = generate_analysis_report(filtered_df, None, focus=segment_to_improve, model="gpt-4o")

    # Constructing improvement prompt
    improvement_prompt = """You want to improve the prompts for the LLM interviewer.
Below you will see some of the prompts that are used right now.
As well as a summary of the mistakes the interviewer makes.
You can add 1-3 lines to each of the prompts if needed, but you can't change or remove anything.
"""

    # Selecting the base prompt for the segment to improve
    base_prompt = base_prompts.get(f"base_{segment_to_improve}", "Base prompt not found for the segment")

    # Constructing the current prompts display
    current_prompts = "The current prompts are below. \n"
    current_prompts += "BASE PROMPT (applied to all interview types): \n"
    current_prompts += base_prompt + "\n"

    for k, v in prompts.items():
        if segment_to_improve in k:
            current_prompts += f"{k}: {v[len(base_prompt):]} \n\n"

    # Making API call to OpenAI
    client = OpenAI(base_url="https://api.openai.com/v1")
    model = "gpt-4o"
    messages = [
        {"role": "system", "content": improvement_prompt},
        {"role": "user", "content": current_prompts},
        {"role": "user", "content": f"Interview feedback: {comments_analysis}"},
        {"role": "user", "content": "Please return any additional instructions you would like to add to any of the prompts."},
    ]

    response = client.chat.completions.create(model=model, messages=messages, temperature=1).choices[0].message.content
    print(response)
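
# Example of the improvement loop (a sketch; assumes df already went through filter_df() and that
# generate_and_display_tables() has added the aggregated "problem"/"interviewer"/"feedback" columns):
#
#   analyze_and_improve_segment(df)  # picks the weakest stage automatically
#   analyze_and_improve_segment(df, segment_to_improve="feedback")  # or target a specific stage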