# This file contains helper functions for automated analysis and evaluation.
# They are not used in the main functionality of the service and are still fairly rough.
# To use the analytics tools you need to install some extra libraries:
# !pip install pandas

from tests.candidate import complete_interview
from tests.grader import grade
import pandas as pd
import numpy as np
from functools import partial
import concurrent.futures
import os
from IPython.display import Markdown, display
from openai import OpenAI
from tests.testing_prompts import feedback_analyzer
from resources.prompts import prompts, base_prompts

criteria_list = {
    "problem_statement",
    "problem_statement_difficulty",
    "problem_statement_topic",
    "problem_statement_solvability",
    "problem_statement_relevance",
    "problem_statement_mistakes",
    "problem_statement_solution",
    "problem_statement_hints",
    "problem_statement_answer_plan",
    "problem_statement_instructions",
    "problem_statement_goals_alignment",
    "problem_statement_skill_test",
    "interviewer_solution",
    "interviewer_mistakes",
    "interviewer_answers",
    "interviewer_relevance",
    "interviewer_support",
    "interviewer_questions",
    "interviewer_repeat",
    "interviewer_found_mistakes",
    "interviewer_hallucinations",
    "interviewer_summary",
    "interviewer_gaslighting",
    "interviewer_leaks",
    "interviewer_empty",
    "interviewer_notes",
    "interviewer_stuck",
    "interviewer_end",
    "interviewer_adaptability",
    "interviewer_flow_control",
    "interviewer_preparation",
    "interviewer_responsive",
    "interviewer_depth",
    "feedback_quality",
    "feedback_overview",
    "feedback_relevance",
    "feedback_clarity",
    "feedback_solution",
    "feedback_result",
    "feedback_hallucinations",
    "feedback_focus",
    "feedback_completeness",
    "feedback_examples",
    "feedback_specificity",
    "comments",
}


def grade_attempt(file_path, grader_model, attempt_index):
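    """Grade a single saved interview with `grader_model`, retrying up to three times.

    The feedback is accepted only if more than 80% of its keys are known criteria
    (see `criteria_list`); if that never happens, or every retry raises, None is returned.
    """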
    for retry in range(3):  # Retry mechanism
        try:
            feedback = grade(file_path, grader_model, str(attempt_index))
            if np.mean([x in criteria_list for x in feedback.keys()]) > 0.8:
                return feedback
        except Exception as e:
            print(f"The {retry+1} attempt to grade using {grader_model} failed with error {e}")
    return None


def complete_and_grade(interview_params, exp_name, grader_models, candidate_model):
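    """Simulate one interview (retrying up to three times) and grade it with every grader model.

    Returns a list of feedback dicts; the list is empty if the simulation or all grading failed.
    """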
    interview_type, attempt_num, llm_config = interview_params
    # llm_config can be None (the default in run_evaluation), so guard the name used in log messages
    llm_name = llm_config.name if llm_config is not None else "default config"

    feedback_list = []
    attempt_successful = False
    for attempt in range(3):  # Retry the interview simulation up to 3 times
        try:
            file_path, _ = complete_interview(interview_type, exp_name, llm_config, model=candidate_model, pause=attempt * 5)
            print(
                f"Attempt {attempt_num + 1}, retry {attempt + 1}: interview simulation of {interview_type} by {llm_name} completed successfully"
            )
            attempt_successful = True
            break
        except Exception as e:
            print(f"Retry {attempt + 1} for attempt {attempt_num + 1} of {interview_type} by {llm_name} failed with error: {e}")

    if not attempt_successful:
        print(f"All retries failed for attempt {attempt_num + 1} of {interview_type} by {llm_name}")
        return feedback_list

    try:
        for i, grader_model in enumerate(grader_models):
            feedback = grade_attempt(file_path, grader_model, i)
            if feedback:
                feedback_list.append(feedback)
                print(f"Attempt {attempt_num + 1} of {interview_type} by {llm_name} graded by {grader_model} successfully")
                print(f"Overall score: {feedback['overall_score']}")

    except Exception as e:
        print(f"Grading for attempt {attempt_num + 1} of {interview_type} by {llm_name} failed with error: {e}")

    if len(feedback_list) == 0:
        print(f"Attempt {attempt_num + 1} of {interview_type} by {llm_name} returned an empty list")

    return feedback_list


def run_evaluation(
    exp_name, num_attempts=5, interview_types=None, grader_models=None, llm_configs=None, candidate_model="gpt-3.5-turbo", num_workers=3
):
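    """Run `num_attempts` simulated interviews per interview type and LLM config in parallel,
    grade each one with the grader models, and write all collected feedback rows to
    records/<exp_name>_<timestamp>/results.csv.

    Returns the timestamped experiment name.
    """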
    if interview_types is None:
        interview_types = ["ml_design", "math", "ml_theory", "system_design", "sql", "coding"]
    if grader_models is None:
        grader_models = ["gpt-4o"]
    if llm_configs is None:
        llm_configs = [None]

    exp_name = f"{exp_name}_{pd.Timestamp.now().strftime('%Y-%m-%d_%H-%M-%S')}"
    os.makedirs(f"records/{exp_name}", exist_ok=True)
    tasks = [(type_, i, llm_config) for i in range(num_attempts) for type_ in interview_types for llm_config in llm_configs]
    complete_f = partial(complete_and_grade, exp_name=exp_name, grader_models=grader_models, candidate_model=candidate_model)

    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        results = list(executor.map(complete_f, tasks))

    # Filter out empty results
    non_empty_results = [res for res in results if res]
    empty_count = len(results) - len(non_empty_results)
    print(f"Number of empty results (errors or failed grading): {empty_count}")

    non_empty_results = [f for res in non_empty_results for f in res]
    df = pd.DataFrame(non_empty_results)
    df.to_csv(os.path.join("records", exp_name, "results.csv"), index=False)

    return exp_name


def highlight_color(val):
    color = "red" if val < 0.7 else "orange" if val < 0.9 else "lightgreen" if val < 0.95 else "green"
    return f"color: {color}"


def generate_and_display_tables(df):
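    """Build, display, and return a dict of styled summary tables (per criterion, per stage,
    per interview type, per interviewer model, plus several pivot tables).

    Note: this adds per-stage mean columns ("problem", "interviewer", "feedback") to `df` in place.
    """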
    # Grouping by prefix
    prefixes = ["problem", "interviewer", "feedback"]
    prefix_columns = [col for col in df.columns if any(col.startswith(prefix) for prefix in prefixes)]

    criteria_summary_df = pd.DataFrame(df[prefix_columns].mean(), columns=["avg score"])
    criteria_summary_df_styled = criteria_summary_df.style.map(highlight_color)
    criteria_summary_df_styled.set_caption("Aggregated Scores per Criteria")

    # Aggregated scores per stage
    grouped_scores = {}
    for prefix in prefixes:
        prefix_cols = [col for col in df.columns if col.startswith(prefix)]
        grouped_scores[prefix] = df[prefix_cols].mean(axis=1).mean()

    grouped_scores_df = pd.DataFrame([grouped_scores]).T
    grouped_scores_df.columns = ["avg score"]
    grouped_scores_styled = grouped_scores_df.style.map(highlight_color)
    grouped_scores_styled.set_caption("Aggregated Scores per Stage")

    # Grouped by unique type
    grouped_by_type = pd.DataFrame(df.groupby("type")[prefix_columns].mean().mean(axis=1), columns=["avg score"])
    grouped_by_type_styled = grouped_by_type.style.map(highlight_color)
    grouped_by_type_styled.set_caption("Scores Grouped by Unique Type")

    total_llm_scores = df.groupby("agent_llm")[prefix_columns].mean().mean(axis=1).sort_values(ascending=False)
    # Grouped by unique interviewer model and sorted by descending total score
    grouped_by_interviewer = (
        df.groupby("agent_llm")[["overall_score", "average_response_time_seconds", "number_of_messages"]]
        .mean()
        .reindex(total_llm_scores.index)
    )
    grouped_by_interviewer_styled = grouped_by_interviewer.style.map(highlight_color)
    grouped_by_interviewer_styled.set_caption("Scores Grouped by Unique Interviewer Model")

    for prefix in prefixes:
        prefix_cols = [col for col in prefix_columns if col.startswith(prefix)]
        df[prefix] = df[prefix_cols].mean(axis=1)

    # Pivot table: Agent model vs Stage
    pivot1 = pd.pivot_table(df, values=prefixes, index="agent_llm", aggfunc="mean").reindex(total_llm_scores.index)
    pivot1_styled = pivot1.style.map(highlight_color)
    pivot1_styled.set_caption("Pivot Table: Agent Model vs Stage")

    # Pivot table: Agent model vs Type (Single aggregated score per type)
    pivot2 = pd.pivot_table(df, values="overall_score", index="agent_llm", columns="type", aggfunc="mean").reindex(total_llm_scores.index)
    pivot2_styled = pivot2.style.map(highlight_color)
    pivot2_styled.set_caption("Pivot Table: Agent Model vs Type")

    # Pivot table: Type vs Stage
    pivot3 = pd.pivot_table(df, values=prefixes, index="type", aggfunc="mean")
    pivot3_styled = pivot3.style.map(highlight_color)
    pivot3_styled.set_caption("Pivot Table: Type vs Stage")

    # Pivot table: Agent Model x Stage vs Type (MultiIndex)
    multi_index_data = [(llm, stage) for llm in total_llm_scores.index for stage in prefixes]
    multi_index = pd.MultiIndex.from_tuples(multi_index_data, names=["agent_llm", "stage"])
    types = df["type"].unique()
    pivot4_df = pd.DataFrame(index=multi_index, columns=types)

    # Fill the DataFrame with the aggregated scores grouped by type
    for llm in total_llm_scores.index:
        for stage in prefixes:
            mask = df["agent_llm"] == llm
            stage_values = df.loc[mask, ["type", stage]].groupby("type").mean()[stage]
            pivot4_df.loc[(llm, stage), :] = stage_values

    pivot4_styled = pivot4_df.style.map(highlight_color)
    pivot4_styled.set_caption("Pivot Table: Agent Model x Stage vs Type")

    tables_dict = {
        "criteria_summary_df_styled": criteria_summary_df_styled,
        "grouped_scores_styled": grouped_scores_styled,
        "grouped_by_type_styled": grouped_by_type_styled,
        "grouped_by_interviewer_styled": grouped_by_interviewer_styled,
        "pivot1_styled": pivot1_styled,
        "pivot2_styled": pivot2_styled,
        "pivot3_styled": pivot3_styled,
        "pivot4_styled": pivot4_styled,
    }

    for table in tables_dict.values():
        display(table)

    return tables_dict


def filter_df(df, prefixes=("problem", "interviewer", "feedback")):
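    """Keep only the rows whose criteria columns contain booleans, None, NaN, or their string
    forms, convert those string forms to real values, and report which rows were dropped.
    """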
    # Identify all columns starting with any of the prefixes
    columns_to_check = [col for col in df.columns if any(col.startswith(prefix) for prefix in prefixes)]

    # A value is valid if it is a boolean, None, NaN, or a string form of one of those
    def is_valid_value(val):
        return isinstance(val, bool) or val is None or (isinstance(val, float) and np.isnan(val)) or val in {"True", "False", "None", "NaN"}

    # Function to convert string representations to actual booleans
    def to_bool(val):
        if val == "True":
            return True
        elif val == "False":
            return False
        elif val == "None":
            return None
        return val

    # Check if all values in the specified columns are valid
    def all_values_valid(row):
        return all(is_valid_value(row[col]) for col in columns_to_check)

    # Apply filtering to keep only rows with valid values
    valid_df = df[df.apply(all_values_valid, axis=1)].copy()

    # Convert string representations to booleans
    for col in columns_to_check:
        valid_df[col] = valid_df[col].apply(to_bool)

    # Identify removed rows
    removed_rows = df[~df.index.isin(valid_df.index)]

    # Print the number of rows removed
    num_removed = len(removed_rows)
    print(f"Number of rows removed: {num_removed}")

    # Print the value from the "file_name" column for each removed row, or `None` if not present
    if "file_name" in removed_rows.columns:
        for value in removed_rows["file_name"].tolist():
            print(f"Removed row file_name: {value}")
    else:
        print("Removed row file_name: None")

    return valid_df


def generate_analysis_report(df, folder, focus=None, model="gpt-4o"):
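    """Summarize the grader comments with an LLM and display the summary as Markdown.

    If `folder` is given, the summary, per-type result tables, and the raw comments are
    also written to analysis.md in that folder. Returns the summary text.
    """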

    client = OpenAI(base_url="https://api.openai.com/v1")

    all_comments = "\n\n".join([f"Interview type: {t}. Feedback: {str(f)}" for t, f in zip(df["type"].values, df["comments"].values)])

    messages = [
        {"role": "system", "content": feedback_analyzer},
        {"role": "user", "content": f"Interview feedback: {all_comments}"},
    ]

    if focus:
        messages.append({"role": "user", "content": f"Focus only on comments about {focus} part of the interview"})

    response = client.chat.completions.create(model=model, messages=messages, temperature=1)

    comments_analysis = response.choices[0].message.content
    display(Markdown(comments_analysis))

    if folder is not None:
        with open(os.path.join(folder, "analysis.md"), "w") as f:
            f.write(comments_analysis)
            f.write("\n\n")
            for t in np.unique(df["type"]):
                f.write(f"Type: {t}\n")
                f.write(df[[c for c in df.columns if c != "comments"]][df["type"] == t].T.to_markdown())
                f.write("\n\n")
            f.write(f"Type: all\n")
            f.write("\n\n")
            f.write("Feedback:\n")
            f.write(all_comments)

    return comments_analysis


def analyze_and_improve_segment(df, segment_to_improve=None):
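    """Pick the weakest interview stage (unless `segment_to_improve` is given), analyze the
    related low-scoring comments, and ask an LLM to suggest extra lines for the matching prompts.
    """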

    sorted_stages = df[["problem", "interviewer", "feedback"]].mean().sort_values()
    if not segment_to_improve:
        segment_to_improve = sorted_stages.index[0]
    th_score = sorted_stages.iloc[0] + 0.1

    print(f"Let's try to improve {segment_to_improve}")
    print(f"Quality threshold {th_score}")

    # Identifying types that need improvement
    type_stage_scores = df.groupby("type")[segment_to_improve].mean()
    types_to_improve = []
    for t, s in type_stage_scores.items():
        if s < th_score:
            types_to_improve.append(t)

    print(f"We will focus on {types_to_improve}")

    # Filtering DataFrame based on identified types and scoring criteria
    filtered_df = df[df["type"].apply(lambda x: x in types_to_improve)]
    prefix_columns = [col for col in df.columns if col.startswith(segment_to_improve)]
    filtered_df = filtered_df[filtered_df[prefix_columns].mean(axis=1) < th_score]

    # Generating an analysis report
    comments_analysis = generate_analysis_report(filtered_df, None, focus=segment_to_improve, model="gpt-4o")

    # Constructing improvement prompt
    improvement_prompt = """You want to improve the prompts for LLM interviewer.
Below you will see some of the prompts that are used right now. 
As well as a summary of mistakes that interviewer make.
You can add 1-3 lines to each of prompts if needed, but you can't change or remove anything.
"""

    # Selecting the base prompt for the segment to improve
    base_prompt = base_prompts.get(f"base_{segment_to_improve}", "Base prompt not found for the segment")

    # Constructing the current prompts display
    current_prompts = "The current prompts are below. \n"
    current_prompts += "BASE PROMPT (applied to all interview types): \n"
    current_prompts += base_prompt + "\n"

    for k, v in prompts.items():
        if segment_to_improve in k:
            current_prompts += f"{k}: {v[len(base_prompt):]} \n\n"

    # Making API call to OpenAI
    client = OpenAI(base_url="https://api.openai.com/v1")
    model = "gpt-4o"
    messages = [
        {"role": "system", "content": improvement_prompt},
        {"role": "user", "content": current_prompts},
        {"role": "user", "content": f"Interview feedback: {comments_analysis}"},
        {"role": "user", "content": "Please return any additional instructions you would like to add to any of the prompts."},
    ]

    response = client.chat.completions.create(model=model, messages=messages, temperature=1).choices[0].message.content
    print(response)
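

# Example end-to-end usage: a minimal sketch, meant to be run from a notebook since the
# display helpers rely on IPython. The experiment name and attempt counts are illustrative,
# and the results.csv path follows the layout written by run_evaluation above; it assumes
# the grader feedback includes the "type", "comments", and "overall_score" fields.
if __name__ == "__main__":
    exp_name = run_evaluation("baseline_eval", num_attempts=2, num_workers=2)
    results_df = pd.read_csv(os.path.join("records", exp_name, "results.csv"))
    results_df = filter_df(results_df)
    generate_and_display_tables(results_df)
    generate_analysis_report(results_df, os.path.join("records", exp_name))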