Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
IliaLarchenko
committed on
Commit
•
fb03edc
1
Parent(s):
e0a8291
Added e2e testing
Browse files- .gitignore +1 -0
- tests/candidate.py +108 -0
- tests/tessting_prompts.py +56 -0
- tests/test_e2e.py +6 -0
.gitignore
CHANGED
@@ -160,3 +160,4 @@ cython_debug/
|
|
160 |
#.idea/
|
161 |
|
162 |
.vscode/
|
|
|
|
160 |
#.idea/
|
161 |
|
162 |
.vscode/
|
163 |
+
records/
|
tests/candidate.py
ADDED
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
import random
|
4 |
+
import string
|
5 |
+
import time
|
6 |
+
|
7 |
+
from collections import defaultdict
|
8 |
+
|
9 |
+
from dotenv import load_dotenv
|
10 |
+
from openai import OpenAI
|
11 |
+
|
12 |
+
from api.llm import LLMManager
|
13 |
+
from config import config
|
14 |
+
from resources.data import fixed_messages, topic_lists
|
15 |
+
from resources.prompts import prompts
|
16 |
+
from tests.tessting_prompts import candidate_prompt
|
17 |
+
|
18 |
+
load_dotenv()
|
19 |
+
|
20 |
+
|
21 |
+
def complete_interview(interview_type, exp_name, requirements="", difficulty="", topic="", model="gpt-3.5-turbo"):
    """Run a full simulated interview between the LLM interviewer and an LLM candidate.

    Args:
        interview_type: Interview kind (e.g. "coding"); selects the topic list and prompts.
        exp_name: Experiment name; the interview record is saved under records/<exp_name>/.
        requirements: Optional extra requirements passed to the problem generator.
        difficulty: Problem difficulty; a random one of easy/medium/hard is chosen if empty.
        topic: Problem topic; a random topic for the interview type is chosen if empty.
        model: OpenAI model name used to play the candidate.

    Returns:
        A (file_path, interview_data) tuple: the path of the saved JSON record and the
        in-memory record itself.
    """
    client = OpenAI()
    llm = LLMManager(config, prompts)
    llm_name = config.llm.name

    # Select a random topic or difficulty if not provided.
    topic = topic or random.choice(topic_lists[interview_type])
    difficulty = difficulty or random.choice(["easy", "medium", "hard"])

    problem_statement_text = llm.get_problem_full(requirements, difficulty, topic, interview_type)

    # defaultdict(lambda: None) lets downstream consumers read missing keys without KeyError.
    interview_data = defaultdict(
        lambda: None,
        {
            "interviewer_llm": llm_name,
            "candidate_llm": model,
            "inputs": {
                "interview_type": interview_type,
                "difficulty": difficulty,
                "topic": topic,
                "requirements": requirements,
            },
            "problem_statement": problem_statement_text,
            "transcript": [],
            "feedback": None,
            "average_response_time_seconds": 0,
        },
    )

    # Initialize interviewer and candidate messages.
    messages_interviewer = llm.init_bot(problem_statement_text, interview_type)
    chat_display = [[None, fixed_messages["start"]]]

    messages_candidate = [
        {"role": "system", "content": candidate_prompt},
        {"role": "user", "content": f"Your problem: {problem_statement_text}"},
        {"role": "user", "content": chat_display[-1][1]},
    ]

    response_times = []
    previous_code = ""

    # Cap the interview at 30 candidate turns so a confused model cannot loop forever.
    for _ in range(30):
        response = client.chat.completions.create(
            model=model, messages=messages_candidate, temperature=1, response_format={"type": "json_object"}
        )
        try:
            # The candidate is instructed to reply with JSON, but a model can still emit
            # malformed output; skip the turn instead of crashing the whole interview.
            response_json = json.loads(response.choices[0].message.content)
        except json.JSONDecodeError:
            print("Candidate response is not valid JSON")
            continue

        code = response_json.get("code", "")
        candidate_message = response_json.get("message", "")

        if not code and not candidate_message:
            print("No message or code in response")
            continue

        messages_candidate.append({"role": "assistant", "content": response.choices[0].message.content})

        if code:
            interview_data["transcript"].append(f"CANDIDATE CODE: {code}")
        elif candidate_message:
            interview_data["transcript"].append(f"CANDIDATE MESSAGE: {candidate_message}")

        chat_display.append([candidate_message, None])

        # Check if the interview should finish.
        if response_json.get("finished") and not response_json.get("question"):
            break

        # perf_counter is monotonic, unlike time.time(), so wall-clock adjustments
        # cannot corrupt the measured interviewer response times.
        send_time = time.perf_counter()
        messages_interviewer, chat_display, previous_code = llm.send_request_full(code, previous_code, messages_interviewer, chat_display)
        response_times.append(time.perf_counter() - send_time)

        messages_candidate.append({"role": "user", "content": chat_display[-1][1]})
        interview_data["transcript"].append(f"INTERVIEWER MESSAGE: {chat_display[-1][1]}")

    interview_data["feedback"] = llm.end_interview_full(problem_statement_text, messages_interviewer, interview_type)
    interview_data["average_response_time_seconds"] = round(sum(response_times) / len(response_times), 2) if response_times else 0

    # Unique record name: timestamp plus a random suffix so parallel runs cannot collide.
    current_time = time.strftime("%Y%m%d-%H%M%S")
    random_suffix = "".join(random.choices(string.ascii_letters + string.digits, k=10))
    file_path = os.path.join("records", exp_name, f"{current_time}-{random_suffix}.json")

    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    with open(file_path, "w") as file:
        json.dump(interview_data, file, indent=4)

    return file_path, interview_data
tests/tessting_prompts.py
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# System prompt for the simulated candidate: a deliberately weak developer whose
# replies must always be a JSON object with the four keys described below.
candidate_prompt = """You are completing an interview. You are a very bad developer who makes a lot of mistakes.
You write code in Python.
You should solve the problem in very small and incremental steps and explain your thought process.
Don't rush to give the whole solution at once.
Sometimes make mistakes in logic, computations, or syntax. Pretend you are stuck and ask for help.
Follow interviewer (user) instructions and answer their questions.
You can ask for clarification if you don't understand something.
Each of your answers should be a JSON with 4 keys: "finished", "question", "message" and "code".
"finished" is a boolean, it is True if the user told you that the interview is finished, otherwise False.
"question" is a boolean, it is True if the last user message contains a question, otherwise False.
"message" is a string, it is a message you want to tell the interviewer. Message should never be empty.
"code" is a string, it is the current version of your notes (it can be code, query, pseudocode, answer structure, formulas, calculations, schemas, examples, test cases, etc.), if it didn't change from the last message return an empty string. Try to actively use this field, it is very important.
"""
# System prompt for the automatic grader that evaluates the INTERVIEWER's (not the
# candidate's) performance from a saved interview transcript. The grader returns a JSON
# of boolean/None verdicts per criterion plus a free-text "comments" field.
grader_prompt = """
You are reviewing an interview. Your goal is to evaluate the performance of the interviewer, not the candidate.
Be extremely critical and strict, you have the highest quality standards.
Even a slight mistake should lead to a negative evaluation. If in doubt about any criteria, give a negative evaluation.
Analyze the JSON file with the interview transcript and provide your feedback.

You should evaluate the following aspects and return a JSON with these keys:

"problem_statement": "The problem statement was clear and easily comprehensible.",
"problem_statement_difficulty": "The difficulty level of the problem was suitable for the interview level.",
"problem_statement_topic": "The problem was relevant to the stated topic of the interview.",
"problem_statement_solvability": "The problem could be solved within the allotted 30-minute time frame.",
"problem_statement_relevance": "The problem was pertinent to the specific type of interview.",
"problem_statement_mistakes": "The problem statement contained no errors or inaccuracies (e.g., in provided examples).",

"interviewer_solution": "The interviewer didn't provide the solutions and avoided offering unnecessary hints during the interview.",
"interviewer_mistakes": "The interviewer didn't make any errors in code, computation, or logical reasoning.",
"interviewer_answers": "The interviewer refrained from answering candidate questions unnecessarily.",
"interviewer_relevance": "The interviewer asked questions pertinent to the problem and interview objectives.",
"interviewer_support": "The interviewer effectively aided candidates when they were stuck without directly revealing answers.",
"interviewer_questions": "The interviewer didn't ask unnecessary questions.",
"interviewer_repeat": "The interviewer did not repeat questions that the candidate had already answered.",
"interviewer_found_mistakes": "The interviewer accurately identified any mistakes made by the candidate.",
"interviewer_hallucinations": "The interviewer didn't say anything non-relevant or strange.",
"interviewer_summary": "The interviewer doesn't repeat or summarize what the candidate just said.",
"interviewer_gaslighting": "The interviewer refrained from gaslighting the candidate: didn't claim any candidate errors or missed facts that they didn't make.",
"interviewer_leaks": "The interviewer didn't leak any hidden notes to the candidate.",
"interviewer_empty": "The interviewer didn't send any empty messages.",

"feedback_quality": "The feedback was constructive and offered actionable insights.",
"feedback_overview": "The feedback contains the recap of main mistakes and good ideas of the candidate.",
"feedback_relevance": "The feedback was directly related to the interview problem.",
"feedback_clarity": "The feedback was straightforward and understandable.",
"feedback_solution": "The feedback included the correct solution if the candidate was unable to solve the problem.",
"feedback_result": "The feedback accurately reflected the candidate's performance.",
"feedback_hallucinations": "The feedback didn't contain any non-relevant information.",

"comments": "Provide examples of mistakes made by the interviewer or areas for improvement, if there are any. List only bad things, don't list good."

Return just True, False, or None (if no info was provided) for each key except "comments", "comments" is string.
"""
tests/test_e2e.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from tests.candidate import complete_interview
|
2 |
+
|
3 |
+
|
4 |
+
def test_complete_interview():
    """End-to-end smoke test: run one full simulated interview and check its record.

    Requires a valid OpenAI API key in the environment; asserts that the interview
    record file was written and the returned data actually contains a transcript.
    """
    import os

    file_path, interview_data = complete_interview("coding", "test", model="gpt-3.5-turbo")

    # The original test only asserted True; verify the real outputs instead.
    assert os.path.isfile(file_path)
    assert interview_data["transcript"]
    assert interview_data["problem_statement"]