math-olympiad-solver

Runtime error

App Files Files Community

nisten committed on Sep 27, 2024

Commit

d25ef7d

verified ·

1 Parent(s): 8e396f7

Update app.py

Browse files

Files changed (1) hide show

app.py +116 -151

app.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import gradio as gr
 from dataclasses import dataclass
 from concurrent.futures import ThreadPoolExecutor, TimeoutError
@@ -14,16 +13,13 @@ import time
 from typing import Tuple, Dict, Any, List
 from sympy import N, simplify
 from sympy.parsing.latex import parse_latex
-from openai import OpenAI
 import base64
-client = OpenAI(
-    base_url=os.environ.get("SERVER_URL"),
-    api_key=os.environ.get("HF_TOKEN"),
-)
 @dataclass
 class Config:
@@ -59,7 +55,6 @@ class Config:
     # Push solutions to the Hub
     push_to_hub: bool = False
 class PythonREPL:
     def __init__(self, timeout=5):
         self.timeout = timeout
@@ -79,13 +74,16 @@ class PythonREPL:
             with open(temp_file_path, "w") as f:
                 f.write(query)
-            result = subprocess.run(
-                ["python3", temp_file_path],
-                capture_output=True,
-                check=False,
-                text=True,
-                timeout=self.timeout,
-            )
             if result.returncode == 0:
                 output = result.stdout
@@ -121,17 +119,16 @@ class PythonREPL:
             except TimeoutError:
                 return False, f"Timed out after {self.timeout} seconds."
 def execute_completion(
     executor: PythonREPL,
     completion: str,
     return_status: bool = False,
     last_code_block: bool = False,
 ) -> str | Tuple[str, bool]:
-    # executions = ["!" + code for code in re.findall(r"```bash(.*?)```", completion, re.DOTALL) if "!" not in code]
     executions = re.findall(r"```python(.*?)```", completion, re.DOTALL)
-    if len(executions) == 0:  # directly return cot result
         return completion, False if return_status else completion
     else:
         if last_code_block:
@@ -159,7 +156,7 @@ def execute_completion(
                 success, output = executor(code)
             except TimeoutError as e:
                 print("time out")
-                output = e
             if not success and not return_status:
                 output = ""
@@ -175,7 +172,6 @@ def execute_completion(
         else:
             return output
 def postprocess_completion(
     text: str, return_status: bool = False, last_code_block=False, timeout=5
 ) -> str | Tuple[str, bool]:
@@ -186,11 +182,9 @@ def postprocess_completion(
     return result
 def apply_template(example: Dict[str, Any], prompt: str) -> Dict[str, Any]:
     return prompt.format(example["prompt"], "{}")
 def last_boxed_only_string(string):
     """
     Extracts the last LaTeX boxed or framed expression from a string.
@@ -227,7 +221,6 @@ def last_boxed_only_string(string):
     return retval
 def remove_boxed(s):
     """
     Removes the LaTeX boxed command, returning the content inside the braces.
@@ -247,7 +240,6 @@ def remove_boxed(s):
     except Exception:
         return None
 def extract_boxed_answer(pred_str, strip_double_curly_brace=False):
     """
     Extracts the answer from a LaTeX boxed expression within
@@ -268,12 +260,11 @@ def extract_boxed_answer(pred_str, strip_double_curly_brace=False):
     if answer is None:
         return None
     if strip_double_curly_brace:
-        match = re.match("^\{(.*)\}$", answer)  # noqa: W605
         if match:
             answer = match.group(1)
     return answer
 def normalize_final_answer(final_answer: str) -> str:
     """
     Normalizes a final answer string by removing or replacing various LaTeX
@@ -286,9 +277,9 @@ def normalize_final_answer(final_answer: str) -> str:
     match = re.search(r"(.*?)Problem:", final_answer, flags=re.S)
     if match:
-        final_answer = match.group(1)  # 返回匹配的第一部分，即"Problem"之前的所有文本
     """Normalize a final answer to a quantitative reasoning question."""
-    # final_answer = final_answer.split('=')[-1]
     SUBSTITUTIONS = [
         ("an ", ""),
         ("a ", ""),
@@ -379,8 +370,8 @@ def normalize_final_answer(final_answer: str) -> str:
     if "rac" in final_answer and "\\frac" not in final_answer:
         final_answer = final_answer.replace("rac", "\\frac")
-    final_answer = re.sub(r"(frac)([^{])(.)", "frac{\\2}{\\3}", final_answer)
-    final_answer = re.sub(r"(sqrt)([^{])", "sqrt{\\2}", final_answer)
     final_answer = final_answer.replace("$", "")
     if final_answer.replace(",", "").isdigit():
@@ -388,18 +379,14 @@ def normalize_final_answer(final_answer: str) -> str:
     return final_answer
 def naive_parse(answer: str) -> str:
     """
     Extracts and returns the numeric digits from the input string, processing them in reverse order
     until a non-numeric character is encountered after encountering the first numeric character.
     Args:
         answer (str): The input string to parse.
     Returns:
         str: A string consisting of the numeric digits extracted from the input, in their original order.
     Example:
         >>> naive_parse("abc123def")
         '123'
@@ -422,7 +409,6 @@ def naive_parse(answer: str) -> str:
     out = reversed(out)
     return "".join(out)
 def validate_answer_is_numeric(x: str | int | float) -> int:
     FLOAT_TOLERANCE = 0.2
     try:
@@ -434,7 +420,6 @@ def validate_answer_is_numeric(x: str | int | float) -> int:
         x = -1
     return x
 def filter_answers(answers: List[str]) -> List[int]:
     formatted_answers = [validate_answer_is_numeric(a) for a in answers]
@@ -446,13 +431,12 @@ def filter_answers(answers: List[str]) -> List[int]:
     formatted_answers = [a for a in formatted_answers if a <= 999]
     return formatted_answers
 def check_sympy_equivalence(ref_answer: str, model_answer: str) -> bool:
     def do_answers_match(ref_answer: str, model_answer: str) -> bool:
         ref_sympy = parse_latex(ref_answer)
         model_sympy = parse_latex(model_answer)
         diff = simplify(ref_sympy - model_sympy)
-        return True if -1e-12 < N(diff) < 1e-12 or diff.is_zero else False
     try:
         result = do_answers_match(ref_answer, model_answer)
@@ -461,7 +445,6 @@ def check_sympy_equivalence(ref_answer: str, model_answer: str) -> bool:
         print(e)
         return False
 def check_string_match(ref_answer: str, model_answer: str) -> bool:
     try:
         return ref_answer == model_answer
@@ -469,7 +452,6 @@ def check_string_match(ref_answer: str, model_answer: str) -> bool:
         print(e)
     return False
 def check_answer(ref_answer: str, model_answer: str) -> bool:
     # check if strings are the same
     correct = check_string_match(ref_answer, model_answer)
@@ -483,9 +465,9 @@ def check_answer(ref_answer: str, model_answer: str) -> bool:
     return False
 debug = False
-model_id = "Numina-Math-7B"
 revision = "main"
 system_prompt = "{}"
 validation_set = "kaggle-validation-set-medium"
@@ -522,52 +504,43 @@ config = Config(
 )
 print(f"=== Running submission with config ===\n\n{config}")
-def generate(message, temperature):
     """
     Generates a chat completion response by streaming data from the client chat model.
     This function streams the response from the client chat model and yields the content
     of the response chunk by chunk. If an error occurs, it yields the error message.
     Parameters:
-    message (str): The input message to be sent to the chat model.
-    temperature (float): The sampling temperature to use. Higher values mean the model will take more risks.
     Yields:
     tuple: A tuple containing the content of the response and a boolean flag indicating if an error occurred.
            If no error occurred, the boolean flag will be False and the content will be the response text.
            If an error occurred, the boolean flag will be True and the content will be the error message.
     """
-    stream = client.chat.completions.create(
-        model="tgi",
-        messages=message,
-        stream=True,
-        max_tokens=1024,
-        stop=["```output\n"],
-        temperature=temperature,
-        timeout=30,
-    )
-    response = stream.response
-    # The reason why the library method is not used here is that if an error occurs,
-    #    the returned data will not be a stream, and using the official library will result in an error.
-    for chunk in response.iter_bytes():
-        chunk = chunk.decode("utf-8")
-        chune_json = json.loads(chunk.replace("data:", ""))
-        try:
-            if "error" in chune_json and chune_json["error"]:
-                yield chune_json["error"], True
                 break
-            content = chune_json["choices"][0]["delta"]["content"]
-            if content is not None:
-                yield content, False
-        except Exception as e:
-            print(f"func: generate error occurred\njson:{chune_json}\nerror:{e}")
-            yield "", True
 def get_majority_text(data):
     from collections import Counter
@@ -584,7 +557,6 @@ def get_majority_text(data):
     # Return the corresponding text in gen_texts
     return data["gen_texts"][majority_index]
 def extract_solution(text):
     # Split the text at "### Solution:"
     parts = text.split("### Solution:", 1)
@@ -595,7 +567,6 @@ def extract_solution(text):
         # Return an empty string if "### Solution:" is not found
         return ""
 def process_code(
     example: Dict[str, Any],
     config: Config,
@@ -607,19 +578,19 @@ def process_code(
     if num_python_blocks == 0:
         if restart_on_fail:
-            print("no code has ever been generated, RESTARTING")
-            # reset the text to the original
-            example["gen_texts"] = example["text"]
         else:
-            print("no code has ever been generated, STOP")
             example["should_prune"] = True
             example["has_code"] = False
         return example
-    if gen_text[-10:] != "```output\n" and ("answer is" in gen_text[-100:] or "\\boxed" in gen_text[-100:]):
         num_output_blocks = len(re.findall(r"```output(.*?)```", gen_text, re.DOTALL))
         if num_output_blocks == 0:
-            print("the model hallucinated the code answer")
             example["should_prune"] = True
             return example
@@ -639,70 +610,69 @@ def process_code(
         return example
     if last_step:
-        # no point in continuing if we are at the last step
         return example
-    if gen_text[-10:] != "```output\n":
-        # something else has gone wrong with the generation
-        print("warning: output block not found: ", gen_text[-40:])
         if restart_on_fail:
-            example["gen_texts"] = example["text"]
         else:
             example["should_prune"] = True
         return example
     code_result, status = postprocess_completion(gen_text, return_status=True, last_code_block=True)
-    # add the code result for the next round of generation
     TRUNCATION_LIMIT = 200
     if len(code_result) > TRUNCATION_LIMIT:
         code_result = code_result[:TRUNCATION_LIMIT] + " ... (output truncated)"
-    example["gen_texts"] = gen_text + f"{code_result}\n```"
     return example
 def solve_problem(problem, temperature, progress=gr.Progress()):
     """
     yield token: string, stop: bool
     """
-    problem = apply_template({"prompt": problem}, prompt=config.system_prompt)
-    print(f"Problem: {problem}")
     sample = {
-        "problem": problem,  # not used for the submission TODO Remove
-        "ground_truth": "unknown",  # not used for the submission TODO Remove
         "text": "## Solution:\n",
-        "gen_texts": "## Solution:\n",  # used to store all the generated text
         "should_prune": False,
-        "problem_index": -1,  # not used for the submission TODO Remove
         "model_answers": "-1",
         "has_code": True,
-        "corrects": False,  # not used for the submission TODO Remove
     }
     for step in progress.tqdm(
         range(config.num_generations), desc="Generating candidates"
-    ):  # Depth of the tree (e.g. 6 steps = 5 code blocks)
-        step_reponse = sample["gen_texts"]
         messages = [
-            {"role": "user", "content": sample["problem"]},
-            {"role": "assistant", "content": sample["gen_texts"]},
         ]
-        for reponse_message, error in generate(messages, temperature):
-            if reponse_message is not None:
-                step_reponse += reponse_message
-                yield step_reponse, False
                 if error:
-                    yield step_reponse, True
                     return
-        sample["gen_texts"] = step_reponse
-        # TODO: Maybe it should just return the result of running the code
         sample = process_code(
             sample,
             config=config,
@@ -711,55 +681,52 @@ def solve_problem(problem, temperature, progress=gr.Progress()):
         )
         sample["gen_texts"] = sample["gen_texts"] + "\n"
-        run_code_reponse = sample["gen_texts"].replace(step_reponse, "")
-        for output_mseeage in run_code_reponse:
-            if output_mseeage is not None:
-                step_reponse += output_mseeage
-                yield step_reponse, False
         if sample["should_prune"]:
             break
     yield sample["gen_texts"], True
 example_data = datasets.load_dataset(
     "AI-MO/kaggle-validation-set-medium-extended",
     split="train",
     use_auth_token=os.environ.get("HF_DATASET_TOKEN", None),
 )
-with open("app.css", "r") as f:
-    css = f.read()
 latex_delimiters = [
     {"left": "[", "right": "]", "display": True},
 ]
 def get_random_problem():
     example = random.choice(list(example_data))
     problem = example["problem"]
     return problem
 def update_example_problem():
     problem_example_text = get_random_problem()
     return problem_example_text, problem_example_text
 def clear():
     problem_example_text = get_random_problem()
     return "", 0.1, "", problem_example_text, problem_example_text
 def preprocess_output(text):
     return text.replace(r"\(", r"\\(").replace(r"\)", r"\\)")
 with gr.Blocks(css=css, title="Math Olympiad Solver") as demo:
     running_done = False
     btn_list = []
@@ -772,31 +739,31 @@ with gr.Blocks(css=css, title="Math Olympiad Solver") as demo:
     with gr.Row(elem_classes="sub-title"):
         gr.HTML(
-            "<div>Demo of the <a href='https://huggingface.co/AI-MO/NuminaMath-7B-TIR'>Numina-Math-7B-TIR</a>. Example data are drawn randomly from AMC12, year 2022-2023.</div>",
             elem_classes="sub-title-content",
         )
     with gr.Row(elem_classes="main-area"):
         with gr.Column(scale=1, elem_classes="left"):
-            with gr.Row(elem_classes="probelm-example-container"):
-                with gr.Blocks(elem_classes="probelm-example-title"):
-                    gr.HTML("Problem example", elem_classes="probelm-example-title-content")
                 with gr.Blocks(elem_classes="action-container"):
                     another_btn = gr.Button(
-                        "",
-                        elem_classes="probelm-example-another",
-                        icon="./static/images/reset.png",
                     )
-                    copy_btn = gr.Button("Copy", elem_classes="probelm-example-copy")
                 problem_example = gr.HTML(
                     problem_example_text,
-                    elem_classes="probelm-example-content",
                 )
-            with gr.Row(elem_classes="probelm-input-container"):
-                inp = gr.Textbox(placeholder="Problem", label="Problem input", lines=5, visible=True)
                 problem_markdown = gr.Markdown(
                     visible=False,
                     latex_delimiters=[
@@ -807,17 +774,15 @@ with gr.Blocks(css=css, title="Math Olympiad Solver") as demo:
                 )
                 inp.change(fn=lambda text: text, inputs=[inp], outputs=[problem_markdown])
-                problem_input_ele_list.append(inp)
-                problem_input_ele_list.append(problem_markdown)
             with gr.Accordion("Advanced Options", open=False):
-                temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.1, step=0.1, label="Temperature")
             with gr.Row() as btn_area:
                 btn_clear = gr.Button("Clear", elem_classes="clear-btn")
                 btn_run = gr.Button("Run", elem_classes="run-btn")
-                btn_list.append(btn_clear)
-                btn_list.append(btn_run)
         with gr.Column(scale=1, elem_classes="right"):
             gr.HTML("Solution", elem_classes="solution-title-content")
@@ -842,15 +807,15 @@ with gr.Blocks(css=css, title="Math Olympiad Solver") as demo:
                         running_done = True
             except Exception as e:
                 running_done = True
-                raise e
         def mount_run_btn(btn):
-            btn.click(fn=solve_problem_wrapper, inputs=[inp, temperature], outputs=out)
             btn.click(get_running_btns, None, outputs=btn_list)
             btn.click(get_run_after_problem_input, None, outputs=problem_input_ele_list)
         def get_run_after_problem_input():
-            return gr.Textbox(placeholder="Problem", label="Problem input", lines=5, visible=False), gr.Markdown(
                 visible=True,
                 latex_delimiters=[
                     {"left": "[", "right": "]", "display": True},
@@ -860,7 +825,7 @@ with gr.Blocks(css=css, title="Math Olympiad Solver") as demo:
             )
         def get_init_problem_input():
-            return gr.Textbox(placeholder="Problem", label="Problem input", lines=5, visible=True), gr.Markdown(
                 visible=False,
                 latex_delimiters=[
                     {"left": "[", "right": "]", "display": True},
@@ -890,14 +855,14 @@ with gr.Blocks(css=css, title="Math Olympiad Solver") as demo:
                 time.sleep(1)
-        copy_btn.click(fn=lambda example: example, inputs=[problem_example_text_hidden], outputs=[inp])
         btn_clear.click(
             fn=clear,
             inputs=[],
             outputs=[
                 inp,
-                temperature,
                 out,
                 problem_example,
                 problem_example_text_hidden,
@@ -927,4 +892,4 @@ with gr.Blocks(css=css, title="Math Olympiad Solver") as demo:
         )
 if __name__ == "__main__":
-    demo.queue(default_concurrency_limit=5).launch()

 import gradio as gr
 from dataclasses import dataclass
 from concurrent.futures import ThreadPoolExecutor, TimeoutError
 from typing import Tuple, Dict, Any, List
 from sympy import N, simplify
 from sympy.parsing.latex import parse_latex
+import openai
 import base64
+# Initialize OpenAI client to use local API
+openai.api_base = os.environ.get("SERVER_URL", "http://0.0.0.0:6061")
+openai.api_key = os.environ.get("HF_TOKEN", "")  # If no key needed, set empty string
 @dataclass
 class Config:
     # Push solutions to the Hub
     push_to_hub: bool = False
 class PythonREPL:
     def __init__(self, timeout=5):
         self.timeout = timeout
             with open(temp_file_path, "w") as f:
                 f.write(query)
+            try:
+                result = subprocess.run(
+                    ["python3", temp_file_path],
+                    capture_output=True,
+                    check=False,
+                    text=True,
+                    timeout=self.timeout,
+                )
+            except subprocess.TimeoutExpired:
+                return False, f"Timed out after {self.timeout} seconds."
             if result.returncode == 0:
                 output = result.stdout
             except TimeoutError:
                 return False, f"Timed out after {self.timeout} seconds."
 def execute_completion(
     executor: PythonREPL,
     completion: str,
     return_status: bool = False,
     last_code_block: bool = False,
 ) -> str | Tuple[str, bool]:
+    # Extract python code blocks enclosed in triple backticks with language 'python'
     executions = re.findall(r"```python(.*?)```", completion, re.DOTALL)
+    if len(executions) == 0:  # directly return COT result
         return completion, False if return_status else completion
     else:
         if last_code_block:
                 success, output = executor(code)
             except TimeoutError as e:
                 print("time out")
+                output = str(e)
             if not success and not return_status:
                 output = ""
         else:
             return output
 def postprocess_completion(
     text: str, return_status: bool = False, last_code_block=False, timeout=5
 ) -> str | Tuple[str, bool]:
     return result
 def apply_template(example: Dict[str, Any], prompt: str) -> str:
     """Fill a prompt template with the example's problem statement.

     Args:
         example: Mapping that must contain a ``"prompt"`` key; its value is
             substituted into the first positional field of ``prompt``.
         prompt: A ``str.format`` template. A second positional field, if
             present, receives the literal ``"{}"`` so that a placeholder
             survives this formatting pass.

     Returns:
         The formatted prompt string (the original annotation claimed a dict,
         but ``str.format`` always returns ``str``).

     Raises:
         KeyError: If ``example`` has no ``"prompt"`` key.
     """
     return prompt.format(example["prompt"], "{}")
 def last_boxed_only_string(string):
     """
     Extracts the last LaTeX boxed or framed expression from a string.
     return retval
 def remove_boxed(s):
     """
     Removes the LaTeX boxed command, returning the content inside the braces.
     except Exception:
         return None
 def extract_boxed_answer(pred_str, strip_double_curly_brace=False):
     """
     Extracts the answer from a LaTeX boxed expression within
     if answer is None:
         return None
     if strip_double_curly_brace:
+        match = re.match(r"^\{(.*)\}$", answer)  # noqa: W605
         if match:
             answer = match.group(1)
     return answer
 def normalize_final_answer(final_answer: str) -> str:
     """
     Normalizes a final answer string by removing or replacing various LaTeX
     match = re.search(r"(.*?)Problem:", final_answer, flags=re.S)
     if match:
+        final_answer = match.group(1)  # Return all text before 'Problem'
     """Normalize a final answer to a quantitative reasoning question."""
     SUBSTITUTIONS = [
         ("an ", ""),
         ("a ", ""),
     if "rac" in final_answer and "\\frac" not in final_answer:
         final_answer = final_answer.replace("rac", "\\frac")
+    final_answer = re.sub(r"(frac)([^{])(.)", r"frac{\2}{\3}", final_answer)
+    final_answer = re.sub(r"(sqrt)([^{])", r"sqrt{\2}", final_answer)
     final_answer = final_answer.replace("$", "")
     if final_answer.replace(",", "").isdigit():
     return final_answer
 def naive_parse(answer: str) -> str:
     """
     Extracts and returns the numeric digits from the input string, processing them in reverse order
     until a non-numeric character is encountered after encountering the first numeric character.
     Args:
         answer (str): The input string to parse.
     Returns:
         str: A string consisting of the numeric digits extracted from the input, in their original order.
     Example:
         >>> naive_parse("abc123def")
         '123'
     out = reversed(out)
     return "".join(out)
 def validate_answer_is_numeric(x: str | int | float) -> int:
     FLOAT_TOLERANCE = 0.2
     try:
         x = -1
     return x
 def filter_answers(answers: List[str]) -> List[int]:
     formatted_answers = [validate_answer_is_numeric(a) for a in answers]
     formatted_answers = [a for a in formatted_answers if a <= 999]
     return formatted_answers
 def check_sympy_equivalence(ref_answer: str, model_answer: str) -> bool:
     def do_answers_match(ref_answer: str, model_answer: str) -> bool:
         ref_sympy = parse_latex(ref_answer)
         model_sympy = parse_latex(model_answer)
         diff = simplify(ref_sympy - model_sympy)
+        return True if (-1e-12 < N(diff) < 1e-12) or diff.is_zero else False
     try:
         result = do_answers_match(ref_answer, model_answer)
         print(e)
         return False
 def check_string_match(ref_answer: str, model_answer: str) -> bool:
     try:
         return ref_answer == model_answer
         print(e)
     return False
 def check_answer(ref_answer: str, model_answer: str) -> bool:
     # check if strings are the same
     correct = check_string_match(ref_answer, model_answer)
     return False
+# Configuration Parameters
 debug = False
+model_id = "qwen2-7b-math-q8_0"  # Update model ID
 revision = "main"
 system_prompt = "{}"
 validation_set = "kaggle-validation-set-medium"
 )
 print(f"=== Running submission with config ===\n\n{config}")
+def generate(messages, temperature):
     """
     Generates a chat completion response by streaming data from the client chat model.
     This function streams the response from the client chat model and yields the content
     of the response chunk by chunk. If an error occurs, it yields the error message.
     Parameters:
+    messages (list of dict): The list of message dicts for the chat model.
+    temperature (float): The sampling temperature to use.
     Yields:
     tuple: A tuple containing the content of the response and a boolean flag indicating if an error occurred.
            If no error occurred, the boolean flag will be False and the content will be the response text.
            If an error occurred, the boolean flag will be True and the content will be the error message.
     """
+    try:
+        response = openai.ChatCompletion.create(
+            model=config.model_id,
+            messages=messages,
+            stream=True,
+            max_tokens=1024,
+            temperature=temperature,
+        )
+    except Exception as e:
+        yield str(e), True
+        return
+    for chunk in response:
+        if 'choices' in chunk:
+            choice = chunk['choices'][0]
+            if 'delta' in choice:
+                content = choice['delta'].get('content', '')
+                if content:
+                    yield content, False
+            if choice.get('finish_reason') is not None:
                 break
+        elif 'error' in chunk:
+            yield chunk['error']['message'], True
+            break
 def get_majority_text(data):
     from collections import Counter
     # Return the corresponding text in gen_texts
     return data["gen_texts"][majority_index]
 def extract_solution(text):
     # Split the text at "### Solution:"
     parts = text.split("### Solution:", 1)
         # Return an empty string if "### Solution:" is not found
         return ""
 def process_code(
     example: Dict[str, Any],
     config: Config,
     if num_python_blocks == 0:
         if restart_on_fail:
+            print("No code has been generated. Restarting generation.")
+            # Reset the text to the original
+            example["gen_texts"] = "## Solution:\n"
         else:
+            print("No code has been generated. Stopping.")
             example["should_prune"] = True
             example["has_code"] = False
         return example
+    if not gen_text.endswith("```output\n") and ("answer is" in gen_text[-100:] or "\\boxed" in gen_text[-100:]):
         num_output_blocks = len(re.findall(r"```output(.*?)```", gen_text, re.DOTALL))
         if num_output_blocks == 0:
+            print("The model hallucinated the code answer.")
             example["should_prune"] = True
             return example
         return example
     if last_step:
+        # No point in continuing if we are at the last step
         return example
+    if not gen_text.endswith("```output\n"):
+        # Something else has gone wrong with the generation
+        print("Warning: Output block not found: ", gen_text[-40:])
         if restart_on_fail:
+            example["gen_texts"] = "## Solution:\n"
         else:
             example["should_prune"] = True
         return example
     code_result, status = postprocess_completion(gen_text, return_status=True, last_code_block=True)
+    # Add the code result for the next round of generation
     TRUNCATION_LIMIT = 200
     if len(code_result) > TRUNCATION_LIMIT:
         code_result = code_result[:TRUNCATION_LIMIT] + " ... (output truncated)"
+    example["gen_texts"] = gen_text + f"```\n{code_result}\n```\n"
     return example
 def solve_problem(problem, temperature, progress=gr.Progress()):
     """
     yield token: string, stop: bool
     """
+    # Apply the system prompt template
+    problem_formatted = config.system_prompt.format(problem)
+    print(f"Problem: {problem_formatted}")
     sample = {
+        "problem": problem_formatted,
+        "ground_truth": "unknown",
         "text": "## Solution:\n",
+        "gen_texts": "## Solution:\n",
         "should_prune": False,
+        "problem_index": -1,
         "model_answers": "-1",
         "has_code": True,
+        "corrects": False,
     }
     for step in progress.tqdm(
         range(config.num_generations), desc="Generating candidates"
+    ):
+        step_response = sample["gen_texts"]
         messages = [
+            {"role": "system", "content": config.system_prompt.format(problem)},
+            {"role": "user", "content": sample["gen_texts"]},
         ]
+        for response_message, error in generate(messages, temperature):
+            if response_message:
+                step_response += response_message
+                yield preprocess_output(step_response)
                 if error:
+                    yield step_response, True
                     return
+        sample["gen_texts"] = step_response
+        # Process the generated code
         sample = process_code(
             sample,
             config=config,
         )
         sample["gen_texts"] = sample["gen_texts"] + "\n"
+        # Extract any run code response
+        run_code_response = sample["gen_texts"].replace(step_response, "")
+        # Append the run code response if it exists
+        if run_code_response.strip():
+            step_response += run_code_response
+            yield preprocess_output(run_code_response)
         if sample["should_prune"]:
             break
     yield sample["gen_texts"], True
+# Load the dataset
 example_data = datasets.load_dataset(
     "AI-MO/kaggle-validation-set-medium-extended",
     split="train",
     use_auth_token=os.environ.get("HF_DATASET_TOKEN", None),
 )
+# Load CSS if available
+css = ""
+if os.path.exists("app.css"):
+    with open("app.css", "r") as f:
+        css = f.read()
 latex_delimiters = [
     {"left": "[", "right": "]", "display": True},
 ]
 def get_random_problem():
     """Return the ``"problem"`` field of one randomly chosen example.

     A random row index is drawn and only that row is read, instead of
     copying every row of ``example_data`` into a list on each call.

     Returns:
         The value stored under ``"problem"`` in the selected row.

     Raises:
         IndexError: If ``example_data`` is empty (same as the previous
             ``random.choice`` on an empty list).
     """
     # NOTE(review): assumes example_data supports len() and integer indexing,
     # as a non-streaming datasets.Dataset split does — confirm.
     row_index = random.choice(range(len(example_data)))
     return example_data[row_index]["problem"]
 def update_example_problem():
     """Pick a new random problem and return it twice as a 2-tuple.

     Both elements are the same string obtained from a single call to
     ``get_random_problem``.
     """
     # NOTE(review): presumably one copy feeds the visible example and the
     # other a hidden text holder — the event wiring is outside this view.
     fresh_problem = get_random_problem()
     return (fresh_problem,) * 2
 def clear():
     """Reset the form to its initial state and load a new example problem.

     Returns:
         A 5-tuple matching the ``btn_clear.click`` outputs, in order:
         empty problem input, default temperature, empty solution output,
         and the new example problem twice (visible and hidden copies).
     """
     # Must match the Temperature slider's initial value (0.8); the previous
     # 0.1 was left over from the old slider default, so "Clear" did not
     # restore the state shown on first load.
     default_temperature = 0.8
     new_example = get_random_problem()
     return "", default_temperature, "", new_example, new_example
 def preprocess_output(text):
     """Double the backslash in front of every inline-math delimiter.

     Each ``\\(`` becomes ``\\\\(`` and each ``\\)`` becomes ``\\\\)``; all
     other text is returned unchanged.
     """
     escaped = text
     for delimiter in (r"\(", r"\)"):
         # Prefix one extra backslash to the existing delimiter.
         escaped = escaped.replace(delimiter, "\\" + delimiter)
     return escaped
 with gr.Blocks(css=css, title="Math Olympiad Solver") as demo:
     running_done = False
     btn_list = []
     with gr.Row(elem_classes="sub-title"):
         gr.HTML(
+            "<div>Demo of the <a href='https://huggingface.co/AI-MO/qwen2-7b-math-q8_0'>qwen2-7b-math-q8_0</a>. Example data are drawn randomly from AMC12, year 2022-2023.</div>",
             elem_classes="sub-title-content",
         )
     with gr.Row(elem_classes="main-area"):
         with gr.Column(scale=1, elem_classes="left"):
+            with gr.Row(elem_classes="problem-example-container"):
+                with gr.Blocks(elem_classes="problem-example-title"):
+                    gr.HTML("Problem Example", elem_classes="problem-example-title-content")
                 with gr.Blocks(elem_classes="action-container"):
                     another_btn = gr.Button(
+                        "Another Problem",
+                        elem_classes="problem-example-another",
+                        # Removed icon path to prevent errors
                     )
+                    copy_btn = gr.Button("Copy", elem_classes="problem-example-copy")
                 problem_example = gr.HTML(
                     problem_example_text,
+                    elem_classes="problem-example-content",
                 )
+            with gr.Row(elem_classes="problem-input-container"):
+                inp = gr.Textbox(placeholder="Enter your problem here...", label="Problem Input", lines=5)
                 problem_markdown = gr.Markdown(
                     visible=False,
                     latex_delimiters=[
                 )
                 inp.change(fn=lambda text: text, inputs=[inp], outputs=[problem_markdown])
+                problem_input_ele_list.extend([inp, problem_markdown])
             with gr.Accordion("Advanced Options", open=False):
+                temperature_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.8, step=0.1, label="Temperature")
             with gr.Row() as btn_area:
                 btn_clear = gr.Button("Clear", elem_classes="clear-btn")
                 btn_run = gr.Button("Run", elem_classes="run-btn")
+                btn_list.extend([btn_clear, btn_run])
         with gr.Column(scale=1, elem_classes="right"):
             gr.HTML("Solution", elem_classes="solution-title-content")
                         running_done = True
             except Exception as e:
                 running_done = True
+                yield str(e)
         def mount_run_btn(btn):
+            btn.click(fn=solve_problem_wrapper, inputs=[inp, temperature_slider], outputs=out)
             btn.click(get_running_btns, None, outputs=btn_list)
             btn.click(get_run_after_problem_input, None, outputs=problem_input_ele_list)
         def get_run_after_problem_input():
+            return gr.Textbox(placeholder="Enter your problem here...", label="Problem Input", lines=5, visible=False), gr.Markdown(
                 visible=True,
                 latex_delimiters=[
                     {"left": "[", "right": "]", "display": True},
             )
         def get_init_problem_input():
+            return gr.Textbox(placeholder="Enter your problem here...", label="Problem Input", lines=5, visible=True), gr.Markdown(
                 visible=False,
                 latex_delimiters=[
                     {"left": "[", "right": "]", "display": True},
                 time.sleep(1)
+        copy_btn.click(fn=lambda _: gr.update(value=problem_example_text, interactive=True), inputs=None, outputs=inp)
         btn_clear.click(
             fn=clear,
             inputs=[],
             outputs=[
                 inp,
+                temperature_slider,
                 out,
                 problem_example,
                 problem_example_text_hidden,
         )
 if __name__ == "__main__":
+    demo.queue(default_concurrency_limit=5).launch(share=True)