From 84109ec32a5840fdb10375de92395fba39f4dc01 Mon Sep 17 00:00:00 2001
From: Thomas Capelle <tcapelle@pm.me>
Date: Tue, 17 Sep 2024 18:31:06 +0200
Subject: [PATCH 1/5] Add simple eval script

---
 eval.py | 167 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 167 insertions(+)
 create mode 100644 eval.py

diff --git a/eval.py b/eval.py
new file mode 100644
index 0000000..b545396
--- /dev/null
+++ b/eval.py
@@ -0,0 +1,167 @@
+import asyncio
+import os
+import sys
+import logging
+from dataclasses import dataclass
+from pathlib import Path
+
+import weave
+import simple_parsing
+from rich.logging import RichHandler
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(message)s",
+    datefmt="[%X]",
+    handlers=[RichHandler(rich_tracebacks=True)]
+)
+
+async def run_python(program: Path, input_file: Path, output_file: Path, timeout: float = 10):
+    """
+    Run a Python program with the given input file and output file.
+    """
+    try:
+        process = await asyncio.create_subprocess_exec(
+            sys.executable, program,
+            stdin=input_file.open('rb'),
+            stdout=output_file.open('wb'),
+            stderr=asyncio.subprocess.PIPE
+        )
+        
+        try:
+            _, stderr = await asyncio.wait_for(process.communicate(), timeout=timeout)
+        except asyncio.TimeoutError:
+            process.kill()
+            raise TimeoutError(f"Program execution timed out after {timeout} seconds")
+        
+        if process.returncode != 0:
+            raise RuntimeError(f"Program execution failed: {stderr.decode()}")
+        
+        logging.info(f"Output saved to {output_file}")
+    except Exception as e:
+        raise RuntimeError(f"Error running Python program: {str(e)}")
+
+async def run_cpp(cpp_file: Path, input_file: Path, output_file: Path, timeout: float = 10):
+    """
+    Run a C++ program with the given input file and output file.
+    """
+    # Get the base name of the cpp file (without extension)
+    base_name = os.path.splitext(cpp_file.name)[0]
+    
+    # Compile the C++ program
+    compile_command = f"g++ {cpp_file} -o {base_name}"
+    process = await asyncio.create_subprocess_shell(
+        compile_command,
+        stdout=asyncio.subprocess.PIPE,
+        stderr=asyncio.subprocess.PIPE
+    )
+    stdout, stderr = await process.communicate()
+    
+    if process.returncode != 0:
+        raise RuntimeError(f"Compilation failed: {stderr.decode()}")
+    
+    try:
+        # Run the compiled program with input from file
+        with open(input_file, 'r') as infile:
+            process = await asyncio.create_subprocess_exec(
+                f"./{base_name}",
+                stdin=infile,
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.PIPE
+            )
+            
+            try:
+                stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=timeout)
+            except asyncio.TimeoutError:
+                process.kill()
+                raise TimeoutError(f"Program execution timed out after {timeout} seconds")
+            
+            if process.returncode != 0:
+                raise RuntimeError(f"Program execution failed: {stderr.decode()}")
+            
+            output_file.write_text(stdout.decode())
+        
+        logging.info(f"Output saved to {output_file}")
+    
+    finally:
+        # Clean up the compiled file
+        if os.path.exists(base_name):
+            os.remove(base_name)
+
+@weave.op
+async def run_program(program: Path, input: Path, output: Path, timeout: float = 10):
+    if program.suffix == ".cpp":
+        logging.info(f"Running C++ program: {program}")
+        await run_cpp(program, input, output, timeout)
+    elif program.suffix == ".py":
+        logging.info(f"Running Python program: {program}")
+        await run_python(program, input, output, timeout)
+    else:
+        raise ValueError(f"Unsupported file type: {program.suffix}")
+    return "success"
+
+@weave.op
+def check_solution(model_output: str, expected_output: str):
+    print(f"In Check Solution: {model_output}, {expected_output}")
+    output = Path(model_output["generated_output"]).read_text() # these may be big!
+    expected_output = Path(expected_output).read_text()
+    return {"solved": output.strip() == expected_output.strip()}
+
+@dataclass
+class Args(simple_parsing.Serializable):
+    program: str = "dataset/2023/practice/cheeseburger_corollary_ch1.cpp"
+    input: str = "dataset/2023/practice/cheeseburger_corollary_ch1.in"
+    output: str = "dataset/2023/practice/cheeseburger_corollary_ch1.out"
+    eval_name: str = "super_dupper_model"
+    weave_project: str = "hackercup-eval-solution"
+    timeout: float = 10
+
+if __name__=="__main__":
+    args = simple_parsing.parse(Args)
+    weave.init(args.weave_project)
+    
+    generated_output_file = Path("generated_output.txt")
+
+    asyncio.run(run_program(
+        Path(args.program),
+        Path(args.input),
+        generated_output_file,
+        timeout=args.timeout
+    ))
+
+    passed = check_solution({"generated_output": generated_output_file}, Path(args.output))
+    logging.info(f"Program passed: {passed}")
+
+    class Runner(weave.Model):
+        timeout: float = 10
+
+        @weave.op
+        async def predict(self, program: str, input: str):
+            program, input = Path(program), Path(input)
+            print(program, input)
+            generated_output = input.parent / (input.stem + "_generated_output.txt")
+            print(f"Saving generated output to {generated_output}")
+            status = await run_program(program, input, generated_output, timeout=self.timeout)
+            predict_output =  {"generated_output": generated_output, "status": status}
+            print(f"Predict output: {predict_output}")
+            return predict_output
+
+
+
+    dataset = [{
+        "program": args.program,
+        "input": args.input,
+        "expected_output": args.output,
+    }]
+
+
+    model = Runner(timeout=args.timeout)
+
+    # out = asyncio.run(model.predict(args.program, args.input))
+    
+    # passed = check_solution(out["model_output"], args.output)
+    # logging.info(f"Program passed: {passed}")
+    print("="*100)
+    # run the eval
+    evaluation = weave.Evaluation(dataset=dataset, scorers=[check_solution])
+    asyncio.run(evaluation.evaluate(model))
\ No newline at end of file

From ddcebc84cafa190b5a0bcfb6800480a34de66277 Mon Sep 17 00:00:00 2001
From: Thomas Capelle <tcapelle@pm.me>
Date: Tue, 17 Sep 2024 19:56:27 +0200
Subject: [PATCH 2/5] add more options

---
 eval.py | 200 +++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 133 insertions(+), 67 deletions(-)

diff --git a/eval.py b/eval.py
index b545396..ce369ba 100644
--- a/eval.py
+++ b/eval.py
@@ -7,14 +7,70 @@
 
 import weave
 import simple_parsing
+from pydantic import BaseModel, Field
 from rich.logging import RichHandler
 
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(message)s",
-    datefmt="[%X]",
-    handlers=[RichHandler(rich_tracebacks=True)]
-)
+def setup_logger(debug=False):
+    level = "DEBUG" if debug else "INFO"
+    logging.basicConfig(
+        level=level, format="%(message)s", datefmt="[%X]", handlers=[RichHandler()]
+    )
+
+
+class Problem(BaseModel):
+    problem_dir: Path = Field(
+        ..., description="The path to the problem directory"
+    )
+    problem_name: str = Field(..., description="The name of the problem")
+    problem_description: str = Field(..., description="The description of the problem")
+    sample_input: str = Field(..., description="The path to the sample input of the problem")
+    sample_output: str = Field(..., description="The path to the sample output of the problem")
+    code: str = Field(..., description="The path to the code file")
+    input: str = Field(..., description="The path to the input file")
+    output: str = Field(..., description="The path to the output file")
+
+def guess_code_file(problem_name: str, problem_dir: Path) -> Path:
+    if os.path.exists(problem_dir / f"{problem_name}.cpp"):
+        return problem_dir / f"{problem_name}.cpp"
+    elif os.path.exists(problem_dir / f"{problem_name}.py"):
+        return problem_dir / f"{problem_name}.py"
+    else:
+        raise ValueError(f"No code file found for problem {problem_name}")
+
+def load_problem(problem_name: str, problem_dir: Path) -> Problem:
+    input = problem_dir / f"{problem_name}.in"
+    output = problem_dir / f"{problem_name}.out"
+    sample_input = problem_dir / f"{problem_name}_sample_input.txt"
+    sample_output = problem_dir / f"{problem_name}_sample_output.txt"
+    code = guess_code_file(problem_name, problem_dir)
+    problem_description = problem_dir / f"{problem_name}.md"
+    return Problem(
+        problem_dir=problem_dir,
+        problem_name=problem_name,
+        problem_description=problem_description.read_text(),
+        sample_input=str(sample_input),
+        sample_output=str(sample_output),
+        input=str(input),
+        output=str(output),
+        code=str(code),
+    )
+
+def find_problems(folder: Path) -> list[dict]:
+    """
+    Find all the problems in the given folder.
+    """
+    problems = []
+    # The glob is recursive, so load each problem from the directory that
+    # actually holds its .in file rather than from the top-level folder.
+    input_files = sorted(folder.glob("**/*.in"))
+    for input_file in input_files:
+        try:
+            problems.append(load_problem(input_file.stem, input_file.parent))
+        except Exception as e:
+            logging.error(f"Error loading problem {input_file.stem}: {e}")
+    logging.info(f"Found {len(problems)} problems")
+    return problems
+
 
 async def run_python(program: Path, input_file: Path, output_file: Path, timeout: float = 10):
     """
@@ -49,7 +105,7 @@ async def run_cpp(cpp_file: Path, input_file: Path, output_file: Path, timeout:
     base_name = os.path.splitext(cpp_file.name)[0]
     
     # Compile the C++ program
-    compile_command = f"g++ {cpp_file} -o {base_name}"
+    compile_command = f"g++ {cpp_file} -std=c++11 -o {base_name}"
     process = await asyncio.create_subprocess_shell(
         compile_command,
         stdout=asyncio.subprocess.PIPE,
@@ -89,79 +145,89 @@ async def run_cpp(cpp_file: Path, input_file: Path, output_file: Path, timeout:
             os.remove(base_name)
 
 @weave.op
-async def run_program(program: Path, input: Path, output: Path, timeout: float = 10):
-    if program.suffix == ".cpp":
-        logging.info(f"Running C++ program: {program}")
-        await run_cpp(program, input, output, timeout)
-    elif program.suffix == ".py":
-        logging.info(f"Running Python program: {program}")
-        await run_python(program, input, output, timeout)
-    else:
-        raise ValueError(f"Unsupported file type: {program.suffix}")
-    return "success"
+async def run_program(code: Path, input: Path, output: Path, timeout: float = 10):
+    try:
+        if code.suffix == ".cpp":
+            logging.info(f"Running C++ program: {code}")
+            await run_cpp(code, input, output, timeout)
+        elif code.suffix == ".py":
+            logging.info(f"Running Python program: {code}")
+            await run_python(code, input, output, timeout)
+        else:
+            raise ValueError(f"Unsupported file type: {code}")
+    except Exception as e:
+        raise e
+    return
 
 @weave.op
-def check_solution(model_output: str, expected_output: str):
-    print(f"In Check Solution: {model_output}, {expected_output}")
-    output = Path(model_output["generated_output"]).read_text() # these may be big!
-    expected_output = Path(expected_output).read_text()
-    return {"solved": output.strip() == expected_output.strip()}
+def check_solution(model_output: dict, output: str):
+    "A simple check to see if the output is correct"
+    # these may be big!
+    generated_output = Path(model_output["generated_output"]).read_text() 
+    output = Path(output).read_text()
+    return {"solved": generated_output.strip() == output.strip(), "runnable": model_output["runnable"]}
 
 @dataclass
 class Args(simple_parsing.Serializable):
-    program: str = "dataset/2023/practice/cheeseburger_corollary_ch1.cpp"
+    code: str = "dataset/2023/practice/cheeseburger_corollary_ch1.cpp"
     input: str = "dataset/2023/practice/cheeseburger_corollary_ch1.in"
     output: str = "dataset/2023/practice/cheeseburger_corollary_ch1.out"
     eval_name: str = "super_dupper_model"
     weave_project: str = "hackercup-eval-solution"
     timeout: float = 10
+    suffix: str = "_generated_output.txt"
+    debug: bool = False
+    folder: str = None
+    run_samples: bool = False
 
 if __name__=="__main__":
     args = simple_parsing.parse(Args)
+    setup_logger(args.debug)
+
     weave.init(args.weave_project)
     
-    generated_output_file = Path("generated_output.txt")
-
-    asyncio.run(run_program(
-        Path(args.program),
-        Path(args.input),
-        generated_output_file,
-        timeout=args.timeout
-    ))
-
-    passed = check_solution({"generated_output": generated_output_file}, Path(args.output))
-    logging.info(f"Program passed: {passed}")
-
-    class Runner(weave.Model):
-        timeout: float = 10
-
-        @weave.op
-        async def predict(self, program: str, input: str):
-            program, input = Path(program), Path(input)
-            print(program, input)
-            generated_output = input.parent / (input.stem + "_generated_output.txt")
-            print(f"Saving generated output to {generated_output}")
-            status = await run_program(program, input, generated_output, timeout=self.timeout)
-            predict_output =  {"generated_output": generated_output, "status": status}
-            print(f"Predict output: {predict_output}")
-            return predict_output
-
 
-
-    dataset = [{
-        "program": args.program,
-        "input": args.input,
-        "expected_output": args.output,
-    }]
-
-
-    model = Runner(timeout=args.timeout)
-
-    # out = asyncio.run(model.predict(args.program, args.input))
-    
-    # passed = check_solution(out["model_output"], args.output)
-    # logging.info(f"Program passed: {passed}")
-    print("="*100)
-    # run the eval
-    evaluation = weave.Evaluation(dataset=dataset, scorers=[check_solution])
-    asyncio.run(evaluation.evaluate(model))
\ No newline at end of file
+    # run one file
+    @weave.op
+    async def run_and_save(code: str, input: str, suffix: str, timeout: float):
+        code, input = Path(code), Path(input)
+        generated_output = input.parent / (input.stem + suffix)
+        try:
+            await run_program(code, input, generated_output, timeout=timeout)
+        except Exception as e:
+            generated_output.write_text(str(e))
+            return {"generated_output": generated_output, "runnable": False, "error": str(e)}
+        return {"generated_output": generated_output, "runnable": True, "error": None}
+
+    if not args.folder:
+        logging.info(f"Running file: {args.code}")
+        out = asyncio.run(run_and_save(
+            args.code,
+            args.input,
+            args.suffix,
+            args.timeout
+        ))
+
+        passed = check_solution(out, args.output)
+        logging.info(f"Program passed: {passed}")
+    else:
+        logging.info(f"Running folder: {args.folder}")
+        logging.info("="*60)
+        problems = find_problems(Path(args.folder))
+        class Runner(weave.Model):
+            timeout: float = 10
+            suffix: str = "_generated_output.txt"
+
+            @weave.op
+            async def predict(self, code: str, input: str):
+                return await run_and_save(code, input, self.suffix, self.timeout)
+
+        dataset = [{"input": problem.sample_input if args.run_samples else problem.input, 
+                    "output": problem.sample_output if args.run_samples else problem.output,
+                    "code": problem.code,
+                    "problem_name": problem.problem_name} for problem in problems]
+
+
+        model = Runner(timeout=args.timeout)
+        evaluation = weave.Evaluation(dataset=dataset, scorers=[check_solution])
+        asyncio.run(evaluation.evaluate(model))
\ No newline at end of file

From 224f8206540ea213051a91678f49198fca289ac2 Mon Sep 17 00:00:00 2001
From: Thomas Capelle <tcapelle@pm.me>
Date: Tue, 17 Sep 2024 19:57:46 +0200
Subject: [PATCH 3/5] Format eval.py with black

---
 eval.py | 120 ++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 73 insertions(+), 47 deletions(-)

diff --git a/eval.py b/eval.py
index ce369ba..cb0b420 100644
--- a/eval.py
+++ b/eval.py
@@ -10,6 +10,7 @@
 from pydantic import BaseModel, Field
 from rich.logging import RichHandler
 
+
 def setup_logger(debug=False):
     level = "DEBUG" if debug else "INFO"
     logging.basicConfig(
@@ -18,17 +19,20 @@ def setup_logger(debug=False):
 
 
 class Problem(BaseModel):
-    problem_dir: Path = Field(
-        ..., description="The path to the problem directory"
-    )
+    problem_dir: Path = Field(..., description="The path to the problem directory")
     problem_name: str = Field(..., description="The name of the problem")
     problem_description: str = Field(..., description="The description of the problem")
-    sample_input: str = Field(..., description="The path to the sample input of the problem")
-    sample_output: str = Field(..., description="The path to the sample output of the problem")
+    sample_input: str = Field(
+        ..., description="The path to the sample input of the problem"
+    )
+    sample_output: str = Field(
+        ..., description="The path to the sample output of the problem"
+    )
     code: str = Field(..., description="The path to the code file")
     input: str = Field(..., description="The path to the input file")
     output: str = Field(..., description="The path to the output file")
 
+
 def guess_code_file(problem_name: str, problem_dir: Path) -> Path:
     if os.path.exists(problem_dir / f"{problem_name}.cpp"):
         return problem_dir / f"{problem_name}.cpp"
@@ -37,6 +41,7 @@ def guess_code_file(problem_name: str, problem_dir: Path) -> Path:
     else:
         raise ValueError(f"No code file found for problem {problem_name}")
 
+
 def load_problem(problem_name: str, problem_dir: Path) -> Problem:
     input = problem_dir / f"{problem_name}.in"
     output = problem_dir / f"{problem_name}.out"
@@ -55,6 +60,7 @@ def load_problem(problem_name: str, problem_dir: Path) -> Problem:
         code=str(code),
     )
 
+
 def find_problems(folder: Path) -> list[dict]:
     """
     Find all the problems in the given folder.
@@ -72,78 +78,87 @@ def find_problems(folder: Path) -> list[dict]:
     return problems
 
 
-async def run_python(program: Path, input_file: Path, output_file: Path, timeout: float = 10):
+async def run_python(
+    program: Path, input_file: Path, output_file: Path, timeout: float = 10
+):
     """
     Run a Python program with the given input file and output file.
     """
     try:
         process = await asyncio.create_subprocess_exec(
-            sys.executable, program,
-            stdin=input_file.open('rb'),
-            stdout=output_file.open('wb'),
-            stderr=asyncio.subprocess.PIPE
+            sys.executable,
+            program,
+            stdin=input_file.open("rb"),
+            stdout=output_file.open("wb"),
+            stderr=asyncio.subprocess.PIPE,
         )
-        
+
         try:
             _, stderr = await asyncio.wait_for(process.communicate(), timeout=timeout)
         except asyncio.TimeoutError:
             process.kill()
             raise TimeoutError(f"Program execution timed out after {timeout} seconds")
-        
+
         if process.returncode != 0:
             raise RuntimeError(f"Program execution failed: {stderr.decode()}")
-        
+
         logging.info(f"Output saved to {output_file}")
     except Exception as e:
         raise RuntimeError(f"Error running Python program: {str(e)}")
 
-async def run_cpp(cpp_file: Path, input_file: Path, output_file: Path, timeout: float = 10):
+
+async def run_cpp(
+    cpp_file: Path, input_file: Path, output_file: Path, timeout: float = 10
+):
     """
     Run a C++ program with the given input file and output file.
     """
     # Get the base name of the cpp file (without extension)
     base_name = os.path.splitext(cpp_file.name)[0]
-    
+
     # Compile the C++ program
     compile_command = f"g++ {cpp_file} -std=c++11 -o {base_name}"
     process = await asyncio.create_subprocess_shell(
-        compile_command,
-        stdout=asyncio.subprocess.PIPE,
-        stderr=asyncio.subprocess.PIPE
+        compile_command, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
     )
     stdout, stderr = await process.communicate()
-    
+
     if process.returncode != 0:
         raise RuntimeError(f"Compilation failed: {stderr.decode()}")
-    
+
     try:
         # Run the compiled program with input from file
-        with open(input_file, 'r') as infile:
+        with open(input_file, "r") as infile:
             process = await asyncio.create_subprocess_exec(
                 f"./{base_name}",
                 stdin=infile,
                 stdout=asyncio.subprocess.PIPE,
-                stderr=asyncio.subprocess.PIPE
+                stderr=asyncio.subprocess.PIPE,
             )
-            
+
             try:
-                stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=timeout)
+                stdout, stderr = await asyncio.wait_for(
+                    process.communicate(), timeout=timeout
+                )
             except asyncio.TimeoutError:
                 process.kill()
-                raise TimeoutError(f"Program execution timed out after {timeout} seconds")
-            
+                raise TimeoutError(
+                    f"Program execution timed out after {timeout} seconds"
+                )
+
             if process.returncode != 0:
                 raise RuntimeError(f"Program execution failed: {stderr.decode()}")
-            
+
             output_file.write_text(stdout.decode())
-        
+
         logging.info(f"Output saved to {output_file}")
-    
+
     finally:
         # Clean up the compiled file
         if os.path.exists(base_name):
             os.remove(base_name)
 
+
 @weave.op
 async def run_program(code: Path, input: Path, output: Path, timeout: float = 10):
     try:
@@ -159,13 +174,18 @@ async def run_program(code: Path, input: Path, output: Path, timeout: float = 10
         raise e
     return
 
+
 @weave.op
 def check_solution(model_output: dict, output: str):
     "A simple check to see if the output is correct"
     # these may be big!
-    generated_output = Path(model_output["generated_output"]).read_text() 
+    generated_output = Path(model_output["generated_output"]).read_text()
     output = Path(output).read_text()
-    return {"solved": generated_output.strip() == output.strip(), "runnable": model_output["runnable"]}
+    return {
+        "solved": generated_output.strip() == output.strip(),
+        "runnable": model_output["runnable"],
+    }
+
 
 @dataclass
 class Args(simple_parsing.Serializable):
@@ -180,12 +200,12 @@ class Args(simple_parsing.Serializable):
     folder: str = None
     run_samples: bool = False
 
-if __name__=="__main__":
+
+if __name__ == "__main__":
     args = simple_parsing.parse(Args)
     setup_logger(args.debug)
 
     weave.init(args.weave_project)
-    
 
     # run one file
     @weave.op
@@ -196,24 +216,26 @@ async def run_and_save(code: str, input: str, suffix: str, timeout: float):
             await run_program(code, input, generated_output, timeout=timeout)
         except Exception as e:
             generated_output.write_text(str(e))
-            return {"generated_output": generated_output, "runnable": False, "error": str(e)}
+            return {
+                "generated_output": generated_output,
+                "runnable": False,
+                "error": str(e),
+            }
         return {"generated_output": generated_output, "runnable": True, "error": None}
 
     if not args.folder:
         logging.info(f"Running file: {args.code}")
-        out = asyncio.run(run_and_save(
-            args.code,
-            args.input,
-            args.suffix,
-            args.timeout
-        ))
+        out = asyncio.run(
+            run_and_save(args.code, args.input, args.suffix, args.timeout)
+        )
 
         passed = check_solution(out, args.output)
         logging.info(f"Program passed: {passed}")
     else:
         logging.info(f"Running folder: {args.folder}")
-        logging.info("="*60)
+        logging.info("=" * 60)
         problems = find_problems(Path(args.folder))
+
         class Runner(weave.Model):
             timeout: float = 10
             suffix: str = "_generated_output.txt"
@@ -222,12 +244,16 @@ class Runner(weave.Model):
             async def predict(self, code: str, input: str):
                 return await run_and_save(code, input, self.suffix, self.timeout)
 
-        dataset = [{"input": problem.sample_input if args.run_samples else problem.input, 
-                    "output": problem.sample_output if args.run_samples else problem.output,
-                    "code": problem.code,
-                    "problem_name": problem.problem_name} for problem in problems]
-
+        dataset = [
+            {
+                "input": problem.sample_input if args.run_samples else problem.input,
+                "output": problem.sample_output if args.run_samples else problem.output,
+                "code": problem.code,
+                "problem_name": problem.problem_name,
+            }
+            for problem in problems
+        ]
 
         model = Runner(timeout=args.timeout)
         evaluation = weave.Evaluation(dataset=dataset, scorers=[check_solution])
-        asyncio.run(evaluation.evaluate(model))
\ No newline at end of file
+        asyncio.run(evaluation.evaluate(model))

From c29bc82dfb835cf3e95f8ffc37be4bb9dd0b1b30 Mon Sep 17 00:00:00 2001
From: Thomas Capelle <tcapelle@pm.me>
Date: Tue, 17 Sep 2024 20:13:20 +0200
Subject: [PATCH 4/5] add cpp version param

---
 eval.py | 53 ++++++++++++++++++++++++++++++-----------------------
 1 file changed, 30 insertions(+), 23 deletions(-)

diff --git a/eval.py b/eval.py
index cb0b420..afe79a2 100644
--- a/eval.py
+++ b/eval.py
@@ -108,7 +108,8 @@ async def run_python(
 
 
 async def run_cpp(
-    cpp_file: Path, input_file: Path, output_file: Path, timeout: float = 10
+    cpp_file: Path, input_file: Path, output_file: Path, timeout: float = 10,
+    cpp_version: int = 11,
 ):
     """
     Run a C++ program with the given input file and output file.
@@ -117,7 +118,7 @@ async def run_cpp(
     base_name = os.path.splitext(cpp_file.name)[0]
 
     # Compile the C++ program
-    compile_command = f"g++ {cpp_file} -std=c++11 -o {base_name}"
+    compile_command = f"g++ {cpp_file} -std=c++{cpp_version} -o {base_name}"
     process = await asyncio.create_subprocess_shell(
         compile_command, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
     )
@@ -160,11 +161,11 @@ async def run_cpp(
 
 
 @weave.op
-async def run_program(code: Path, input: Path, output: Path, timeout: float = 10):
+async def run_program(code: Path, input: Path, output: Path, timeout: float = 10, cpp_version: int = 11):
     try:
         if code.suffix == ".cpp":
             logging.info(f"Running C++ program: {code}")
-            await run_cpp(code, input, output, timeout)
+            await run_cpp(code, input, output, timeout, cpp_version)
         elif code.suffix == ".py":
             logging.info(f"Running Python program: {code}")
             await run_python(code, input, output, timeout)
@@ -187,6 +188,24 @@ def check_solution(model_output: dict, output: str):
     }
 
 
+@weave.op
+async def run_and_save_output(code: str, input: str, suffix: str, timeout: float, cpp_version: int = 11) -> dict:
+    """
+    Run the program and save the output to a file.
+    """
+    code, input = Path(code), Path(input)
+    generated_output = input.parent / (input.stem + suffix)
+    try:
+        await run_program(code, input, generated_output, timeout=timeout, cpp_version=cpp_version)
+    except Exception as e:
+        generated_output.write_text(str(e))
+        return {
+            "generated_output": generated_output,
+            "runnable": False,
+            "error": str(e),
+        }
+    return {"generated_output": generated_output, "runnable": True, "error": None}
+
 @dataclass
 class Args(simple_parsing.Serializable):
     code: str = "dataset/2023/practice/cheeseburger_corollary_ch1.cpp"
@@ -194,11 +213,12 @@ class Args(simple_parsing.Serializable):
     output: str = "dataset/2023/practice/cheeseburger_corollary_ch1.out"
     eval_name: str = "super_dupper_model"
     weave_project: str = "hackercup-eval-solution"
-    timeout: float = 10
+    timeout: float = 30
     suffix: str = "_generated_output.txt"
     debug: bool = False
     folder: str = None
     run_samples: bool = False
+    cpp_version: int = 11
 
 
 if __name__ == "__main__":
@@ -208,29 +228,15 @@ class Args(simple_parsing.Serializable):
     weave.init(args.weave_project)
 
     # run one file
-    @weave.op
-    async def run_and_save(code: str, input: str, suffix: str, timeout: float):
-        code, input = Path(code), Path(input)
-        generated_output = input.parent / (input.stem + suffix)
-        try:
-            await run_program(code, input, generated_output, timeout=timeout)
-        except Exception as e:
-            generated_output.write_text(str(e))
-            return {
-                "generated_output": generated_output,
-                "runnable": False,
-                "error": str(e),
-            }
-        return {"generated_output": generated_output, "runnable": True, "error": None}
-
     if not args.folder:
         logging.info(f"Running file: {args.code}")
         out = asyncio.run(
-            run_and_save(args.code, args.input, args.suffix, args.timeout)
+            run_and_save_output(args.code, args.input, args.suffix, args.timeout, args.cpp_version)
         )
 
         passed = check_solution(out, args.output)
         logging.info(f"Program passed: {passed}")
+    
     else:
         logging.info(f"Running folder: {args.folder}")
         logging.info("=" * 60)
@@ -239,10 +245,11 @@ async def run_and_save(code: str, input: str, suffix: str, timeout: float):
         class Runner(weave.Model):
             timeout: float = 10
             suffix: str = "_generated_output.txt"
+            cpp_version: int = 11
 
             @weave.op
             async def predict(self, code: str, input: str):
-                return await run_and_save(code, input, self.suffix, self.timeout)
+                return await run_and_save_output(code, input, self.suffix, self.timeout, self.cpp_version)
 
         dataset = [
             {
@@ -254,6 +261,6 @@ async def predict(self, code: str, input: str):
             for problem in problems
         ]
 
-        model = Runner(timeout=args.timeout)
+        model = Runner(timeout=args.timeout, suffix=args.suffix, cpp_version=args.cpp_version)
         evaluation = weave.Evaluation(dataset=dataset, scorers=[check_solution])
         asyncio.run(evaluation.evaluate(model))

From 8631036e60c0de4d1f2d7d366f5b9444c60a8525 Mon Sep 17 00:00:00 2001
From: Thomas Capelle <tcapelle@pm.me>
Date: Tue, 17 Sep 2024 20:15:14 +0200
Subject: [PATCH 5/5] document Args fields, rename debug to verbose, pass eval_name

---
 eval.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/eval.py b/eval.py
index afe79a2..0890aa8 100644
--- a/eval.py
+++ b/eval.py
@@ -208,22 +208,22 @@ async def run_and_save_output(code: str, input: str, suffix: str, timeout: float
 
 @dataclass
 class Args(simple_parsing.Serializable):
-    code: str = "dataset/2023/practice/cheeseburger_corollary_ch1.cpp"
-    input: str = "dataset/2023/practice/cheeseburger_corollary_ch1.in"
-    output: str = "dataset/2023/practice/cheeseburger_corollary_ch1.out"
-    eval_name: str = "super_dupper_model"
-    weave_project: str = "hackercup-eval-solution"
-    timeout: float = 30
-    suffix: str = "_generated_output.txt"
-    debug: bool = False
-    folder: str = None
-    run_samples: bool = False
-    cpp_version: int = 11
+    code: str = "dataset/2023/practice/cheeseburger_corollary_ch1.cpp" # The file to run
+    input: str = "dataset/2023/practice/cheeseburger_corollary_ch1.in" # The input to run the program on
+    output: str = "dataset/2023/practice/cheeseburger_corollary_ch1.out" # The output to compare against
+    eval_name: str = "super_dupper_model" # The name of the evaluation
+    weave_project: str = "hackercup-eval-solution" # The name of the weave project
+    timeout: float = 30 # The timeout for the program execution (per problem)
+    suffix: str = "_generated_output.txt" # The suffix for the generated output file
+    verbose: bool = False # Whether to print verbose output
+    folder: str = None # Run all problems in this folder
+    run_samples: bool = False # Whether to run on the sample input/output pairs
+    cpp_version: int = 11 # The C++ version to use for the program execution
 
 
 if __name__ == "__main__":
     args = simple_parsing.parse(Args)
-    setup_logger(args.debug)
+    setup_logger(args.verbose)
 
     weave.init(args.weave_project)
 
@@ -262,5 +262,5 @@ async def predict(self, code: str, input: str):
         ]
 
         model = Runner(timeout=args.timeout, suffix=args.suffix, cpp_version=args.cpp_version)
-        evaluation = weave.Evaluation(dataset=dataset, scorers=[check_solution])
+        evaluation = weave.Evaluation(name=args.eval_name, dataset=dataset, scorers=[check_solution])
         asyncio.run(evaluation.evaluate(model))