From 84109ec32a5840fdb10375de92395fba39f4dc01 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Tue, 17 Sep 2024 18:31:06 +0200 Subject: [PATCH 1/5] simple eval script ouf! --- eval.py | 167 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100644 eval.py diff --git a/eval.py b/eval.py new file mode 100644 index 0000000..b545396 --- /dev/null +++ b/eval.py @@ -0,0 +1,167 @@ +import asyncio +import os +import sys +import logging +from dataclasses import dataclass +from pathlib import Path + +import weave +import simple_parsing +from rich.logging import RichHandler + +logging.basicConfig( + level=logging.INFO, + format="%(message)s", + datefmt="[%X]", + handlers=[RichHandler(rich_tracebacks=True)] +) + +async def run_python(program: Path, input_file: Path, output_file: Path, timeout: float = 10): + """ + Run a Python program with the given input file and output file. + """ + try: + process = await asyncio.create_subprocess_exec( + sys.executable, program, + stdin=input_file.open('rb'), + stdout=output_file.open('wb'), + stderr=asyncio.subprocess.PIPE + ) + + try: + _, stderr = await asyncio.wait_for(process.communicate(), timeout=timeout) + except asyncio.TimeoutError: + process.kill() + raise TimeoutError(f"Program execution timed out after {timeout} seconds") + + if process.returncode != 0: + raise RuntimeError(f"Program execution failed: {stderr.decode()}") + + logging.info(f"Output saved to {output_file}") + except Exception as e: + raise RuntimeError(f"Error running Python program: {str(e)}") + +async def run_cpp(cpp_file: Path, input_file: Path, output_file: Path, timeout: float = 10): + """ + Run a C++ program with the given input file and output file. + """ + # Get the base name of the cpp file (without extension) + base_name = os.path.splitext(cpp_file.name)[0] + + # Compile the C++ program + compile_command = f"g++ {cpp_file} -o {base_name}" + process = await asyncio.create_subprocess_shell( + compile_command, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE + ) + stdout, stderr = await process.communicate() + + if process.returncode != 0: + raise RuntimeError(f"Compilation failed: {stderr.decode()}") + + try: + # Run the compiled program with input from file + with open(input_file, 'r') as infile: + process = await asyncio.create_subprocess_exec( + f"./{base_name}", + stdin=infile, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE + ) + + try: + stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=timeout) + except asyncio.TimeoutError: + process.kill() + raise TimeoutError(f"Program execution timed out after {timeout} seconds") + + if process.returncode != 0: + raise RuntimeError(f"Program execution failed: {stderr.decode()}") + + output_file.write_text(stdout.decode()) + + logging.info(f"Output saved to {output_file}") + + finally: + # Clean up the compiled file + if os.path.exists(base_name): + os.remove(base_name) + +@weave.op +async def run_program(program: Path, input: Path, output: Path, timeout: float = 10): + if program.suffix == ".cpp": + logging.info(f"Running C++ program: {program}") + await run_cpp(program, input, output, timeout) + elif program.suffix == ".py": + logging.info(f"Running Python program: {program}") + await run_python(program, input, output, timeout) + else: + raise ValueError(f"Unsupported file type: {program.suffix}") + return "success" + +@weave.op +def check_solution(model_output: str, expected_output: str): + print(f"In Check Solution: {model_output}, {expected_output}") + output = Path(model_output["generated_output"]).read_text() # these may be big! + expected_output = Path(expected_output).read_text() + return {"solved": output.strip() == expected_output.strip()} + +@dataclass +class Args(simple_parsing.Serializable): + program: str = "dataset/2023/practice/cheeseburger_corollary_ch1.cpp" + input: str = "dataset/2023/practice/cheeseburger_corollary_ch1.in" + output: str = "dataset/2023/practice/cheeseburger_corollary_ch1.out" + eval_name: str = "super_dupper_model" + weave_project: str = "hackercup-eval-solution" + timeout: float = 10 + +if __name__=="__main__": + args = simple_parsing.parse(Args) + weave.init(args.weave_project) + + generated_output_file = Path("generated_output.txt") + + asyncio.run(run_program( + Path(args.program), + Path(args.input), + generated_output_file, + timeout=args.timeout + )) + + passed = check_solution({"generated_output": generated_output_file}, Path(args.output)) + logging.info(f"Program passed: {passed}") + + class Runner(weave.Model): + timeout: float = 10 + + @weave.op + async def predict(self, program: str, input: str): + program, input = Path(program), Path(input) + print(program, input) + generated_output = input.parent / (input.stem + "_generated_output.txt") + print(f"Saving generated output to {generated_output}") + status = await run_program(program, input, generated_output, timeout=self.timeout) + predict_output = {"generated_output": generated_output, "status": status} + print(f"Predict output: {predict_output}") + return predict_output + + + + dataset = [{ + "program": args.program, + "input": args.input, + "expected_output": args.output, + }] + + + model = Runner(timeout=args.timeout) + + # out = asyncio.run(model.predict(args.program, args.input)) + + # passed = check_solution(out["model_output"], args.output) + # logging.info(f"Program passed: {passed}") + print("="*100) + # run the eval + evaluation = weave.Evaluation(dataset=dataset, scorers=[check_solution]) + asyncio.run(evaluation.evaluate(model)) \ No newline at end of file From ddcebc84cafa190b5a0bcfb6800480a34de66277 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Tue, 17 Sep 2024 19:56:27 +0200 Subject: [PATCH 2/5] add more options --- eval.py | 200 +++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 133 insertions(+), 67 deletions(-) diff --git a/eval.py b/eval.py index b545396..ce369ba 100644 --- a/eval.py +++ b/eval.py @@ -7,14 +7,70 @@ import weave import simple_parsing +from pydantic import BaseModel, Field from rich.logging import RichHandler -logging.basicConfig( - level=logging.INFO, - format="%(message)s", - datefmt="[%X]", - handlers=[RichHandler(rich_tracebacks=True)] -) +def setup_logger(debug=False): + level = "DEBUG" if debug else "INFO" + logging.basicConfig( + level=level, format="%(message)s", datefmt="[%X]", handlers=[RichHandler()] + ) + + +class Problem(BaseModel): + problem_dir: Path = Field( + ..., description="The path to the problem directory" + ) + problem_name: str = Field(..., description="The name of the problem") + problem_description: str = Field(..., description="The description of the problem") + sample_input: str = Field(..., description="The path to the sample input of the problem") + sample_output: str = Field(..., description="The path to the sample output of the problem") + code: str = Field(..., description="The path to the code file") + input: str = Field(..., description="The path to the input file") + output: str = Field(..., description="The path to the output file") + +def guess_code_file(problem_name: str, problem_dir: Path) -> Path: + if os.path.exists(problem_dir / f"{problem_name}.cpp"): + return problem_dir / f"{problem_name}.cpp" + elif os.path.exists(problem_dir / f"{problem_name}.py"): + return problem_dir / f"{problem_name}.py" + else: + raise ValueError(f"No code file found for problem {problem_name}") + +def load_problem(problem_name: str, problem_dir: Path) -> Problem: + input = problem_dir / f"{problem_name}.in" + output = problem_dir / f"{problem_name}.out" + sample_input = problem_dir / f"{problem_name}_sample_input.txt" + sample_output = problem_dir / f"{problem_name}_sample_output.txt" + code = guess_code_file(problem_name, problem_dir) + problem_description = problem_dir / f"{problem_name}.md" + return Problem( + problem_dir=problem_dir, + problem_name=problem_name, + problem_description=problem_description.read_text(), + sample_input=str(sample_input), + sample_output=str(sample_output), + input=str(input), + output=str(output), + code=str(code), + ) + +def find_problems(folder: Path) -> list[dict]: + """ + Find all the problems in the given folder. + """ + problems = [] + + # search for all files ending in .in + problem_names = [file.stem for file in folder.glob("**/*.in")] + for problem_name in problem_names: + try: + problems.append(load_problem(problem_name, folder)) + except Exception as e: + logging.error(f"Error loading problem {problem_name}: {e}") + logging.info(f"Found {len(problems)} problems") + return problems + async def run_python(program: Path, input_file: Path, output_file: Path, timeout: float = 10): """ @@ -49,7 +105,7 @@ async def run_cpp(cpp_file: Path, input_file: Path, output_file: Path, timeout: base_name = os.path.splitext(cpp_file.name)[0] # Compile the C++ program - compile_command = f"g++ {cpp_file} -o {base_name}" + compile_command = f"g++ {cpp_file} -std=c++11 -o {base_name}" process = await asyncio.create_subprocess_shell( compile_command, stdout=asyncio.subprocess.PIPE, @@ -89,79 +145,89 @@ async def run_cpp(cpp_file: Path, input_file: Path, output_file: Path, timeout: os.remove(base_name) @weave.op -async def run_program(program: Path, input: Path, output: Path, timeout: float = 10): - if program.suffix == ".cpp": - logging.info(f"Running C++ program: {program}") - await run_cpp(program, input, output, timeout) - elif program.suffix == ".py": - logging.info(f"Running Python program: {program}") - await run_python(program, input, output, timeout) - else: - raise ValueError(f"Unsupported file type: {program.suffix}") - return "success" +async def run_program(code: Path, input: Path, output: Path, timeout: float = 10): + try: + if code.suffix == ".cpp": + logging.info(f"Running C++ program: {code}") + await run_cpp(code, input, output, timeout) + elif code.suffix == ".py": + logging.info(f"Running Python program: {code}") + await run_python(code, input, output, timeout) + else: + raise ValueError(f"Unsupported file type: {code}") + except Exception as e: + raise e + return @weave.op -def check_solution(model_output: str, expected_output: str): - print(f"In Check Solution: {model_output}, {expected_output}") - output = Path(model_output["generated_output"]).read_text() # these may be big! - expected_output = Path(expected_output).read_text() - return {"solved": output.strip() == expected_output.strip()} +def check_solution(model_output: dict, output: str): + "A simple check to see if the output is correct" + # these may be big! + generated_output = Path(model_output["generated_output"]).read_text() + output = Path(output).read_text() + return {"solved": generated_output.strip() == output.strip(), "runnable": model_output["runnable"]} @dataclass class Args(simple_parsing.Serializable): - program: str = "dataset/2023/practice/cheeseburger_corollary_ch1.cpp" + code: str = "dataset/2023/practice/cheeseburger_corollary_ch1.cpp" input: str = "dataset/2023/practice/cheeseburger_corollary_ch1.in" output: str = "dataset/2023/practice/cheeseburger_corollary_ch1.out" eval_name: str = "super_dupper_model" weave_project: str = "hackercup-eval-solution" timeout: float = 10 + suffix: str = "_generated_output.txt" + debug: bool = False + folder: str = None + run_samples: bool = False if __name__=="__main__": args = simple_parsing.parse(Args) + setup_logger(args.debug) + weave.init(args.weave_project) - generated_output_file = Path("generated_output.txt") - - asyncio.run(run_program( - Path(args.program), - Path(args.input), - generated_output_file, - timeout=args.timeout - )) - - passed = check_solution({"generated_output": generated_output_file}, Path(args.output)) - logging.info(f"Program passed: {passed}") - - class Runner(weave.Model): - timeout: float = 10 - - @weave.op - async def predict(self, program: str, input: str): - program, input = Path(program), Path(input) - print(program, input) - generated_output = input.parent / (input.stem + "_generated_output.txt") - print(f"Saving generated output to {generated_output}") - status = await run_program(program, input, generated_output, timeout=self.timeout) - predict_output = {"generated_output": generated_output, "status": status} - print(f"Predict output: {predict_output}") - return predict_output - - - dataset = [{ - "program": args.program, - "input": args.input, - "expected_output": args.output, - }] - - - model = Runner(timeout=args.timeout) - - # out = asyncio.run(model.predict(args.program, args.input)) - - # passed = check_solution(out["model_output"], args.output) - # logging.info(f"Program passed: {passed}") - print("="*100) - # run the eval - evaluation = weave.Evaluation(dataset=dataset, scorers=[check_solution]) - asyncio.run(evaluation.evaluate(model)) \ No newline at end of file + # run one file + @weave.op + async def run_and_save(code: str, input: str, suffix: str, timeout: float): + code, input = Path(code), Path(input) + generated_output = input.parent / (input.stem + suffix) + try: + await run_program(code, input, generated_output, timeout=timeout) + except Exception as e: + generated_output.write_text(str(e)) + return {"generated_output": generated_output, "runnable": False, "error": str(e)} + return {"generated_output": generated_output, "runnable": True, "error": None} + + if not args.folder: + logging.info(f"Running file: {args.code}") + out = asyncio.run(run_and_save( + args.code, + args.input, + args.suffix, + args.timeout + )) + + passed = check_solution(out, args.output) + logging.info(f"Program passed: {passed}") + else: + logging.info(f"Running folder: {args.folder}") + logging.info("="*60) + problems = find_problems(Path(args.folder)) + class Runner(weave.Model): + timeout: float = 10 + suffix: str = "_generated_output.txt" + + @weave.op + async def predict(self, code: str, input: str): + return await run_and_save(code, input, self.suffix, self.timeout) + + dataset = [{"input": problem.sample_input if args.run_samples else problem.input, + "output": problem.sample_output if args.run_samples else problem.output, + "code": problem.code, + "problem_name": problem.problem_name} for problem in problems] + + + model = Runner(timeout=args.timeout) + evaluation = weave.Evaluation(dataset=dataset, scorers=[check_solution]) + asyncio.run(evaluation.evaluate(model)) \ No newline at end of file From 224f8206540ea213051a91678f49198fca289ac2 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Tue, 17 Sep 2024 19:57:46 +0200 Subject: [PATCH 3/5] black --- eval.py | 120 ++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 73 insertions(+), 47 deletions(-) diff --git a/eval.py b/eval.py index ce369ba..cb0b420 100644 --- a/eval.py +++ b/eval.py @@ -10,6 +10,7 @@ from pydantic import BaseModel, Field from rich.logging import RichHandler + def setup_logger(debug=False): level = "DEBUG" if debug else "INFO" logging.basicConfig( @@ -18,17 +19,20 @@ def setup_logger(debug=False): class Problem(BaseModel): - problem_dir: Path = Field( - ..., description="The path to the problem directory" - ) + problem_dir: Path = Field(..., description="The path to the problem directory") problem_name: str = Field(..., description="The name of the problem") problem_description: str = Field(..., description="The description of the problem") - sample_input: str = Field(..., description="The path to the sample input of the problem") - sample_output: str = Field(..., description="The path to the sample output of the problem") + sample_input: str = Field( + ..., description="The path to the sample input of the problem" + ) + sample_output: str = Field( + ..., description="The path to the sample output of the problem" + ) code: str = Field(..., description="The path to the code file") input: str = Field(..., description="The path to the input file") output: str = Field(..., description="The path to the output file") + def guess_code_file(problem_name: str, problem_dir: Path) -> Path: if os.path.exists(problem_dir / f"{problem_name}.cpp"): return problem_dir / f"{problem_name}.cpp" @@ -37,6 +41,7 @@ def guess_code_file(problem_name: str, problem_dir: Path) -> Path: else: raise ValueError(f"No code file found for problem {problem_name}") + def load_problem(problem_name: str, problem_dir: Path) -> Problem: input = problem_dir / f"{problem_name}.in" output = problem_dir / f"{problem_name}.out" @@ -55,6 +60,7 @@ def load_problem(problem_name: str, problem_dir: Path) -> Problem: code=str(code), ) + def find_problems(folder: Path) -> list[dict]: """ Find all the problems in the given folder. @@ -72,78 +78,87 @@ def find_problems(folder: Path) -> list[dict]: return problems -async def run_python(program: Path, input_file: Path, output_file: Path, timeout: float = 10): +async def run_python( + program: Path, input_file: Path, output_file: Path, timeout: float = 10 +): """ Run a Python program with the given input file and output file. """ try: process = await asyncio.create_subprocess_exec( - sys.executable, program, - stdin=input_file.open('rb'), - stdout=output_file.open('wb'), - stderr=asyncio.subprocess.PIPE + sys.executable, + program, + stdin=input_file.open("rb"), + stdout=output_file.open("wb"), + stderr=asyncio.subprocess.PIPE, ) - + try: _, stderr = await asyncio.wait_for(process.communicate(), timeout=timeout) except asyncio.TimeoutError: process.kill() raise TimeoutError(f"Program execution timed out after {timeout} seconds") - + if process.returncode != 0: raise RuntimeError(f"Program execution failed: {stderr.decode()}") - + logging.info(f"Output saved to {output_file}") except Exception as e: raise RuntimeError(f"Error running Python program: {str(e)}") -async def run_cpp(cpp_file: Path, input_file: Path, output_file: Path, timeout: float = 10): + +async def run_cpp( + cpp_file: Path, input_file: Path, output_file: Path, timeout: float = 10 +): """ Run a C++ program with the given input file and output file. """ # Get the base name of the cpp file (without extension) base_name = os.path.splitext(cpp_file.name)[0] - + # Compile the C++ program compile_command = f"g++ {cpp_file} -std=c++11 -o {base_name}" process = await asyncio.create_subprocess_shell( - compile_command, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE + compile_command, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE ) stdout, stderr = await process.communicate() - + if process.returncode != 0: raise RuntimeError(f"Compilation failed: {stderr.decode()}") - + try: # Run the compiled program with input from file - with open(input_file, 'r') as infile: + with open(input_file, "r") as infile: process = await asyncio.create_subprocess_exec( f"./{base_name}", stdin=infile, stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE + stderr=asyncio.subprocess.PIPE, ) - + try: - stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=timeout) + stdout, stderr = await asyncio.wait_for( + process.communicate(), timeout=timeout + ) except asyncio.TimeoutError: process.kill() - raise TimeoutError(f"Program execution timed out after {timeout} seconds") - + raise TimeoutError( + f"Program execution timed out after {timeout} seconds" + ) + if process.returncode != 0: raise RuntimeError(f"Program execution failed: {stderr.decode()}") - + output_file.write_text(stdout.decode()) - + logging.info(f"Output saved to {output_file}") - + finally: # Clean up the compiled file if os.path.exists(base_name): os.remove(base_name) + @weave.op async def run_program(code: Path, input: Path, output: Path, timeout: float = 10): try: @@ -159,13 +174,18 @@ async def run_program(code: Path, input: Path, output: Path, timeout: float = 10 raise e return + @weave.op def check_solution(model_output: dict, output: str): "A simple check to see if the output is correct" # these may be big! - generated_output = Path(model_output["generated_output"]).read_text() + generated_output = Path(model_output["generated_output"]).read_text() output = Path(output).read_text() - return {"solved": generated_output.strip() == output.strip(), "runnable": model_output["runnable"]} + return { + "solved": generated_output.strip() == output.strip(), + "runnable": model_output["runnable"], + } + @dataclass class Args(simple_parsing.Serializable): @@ -180,12 +200,12 @@ class Args(simple_parsing.Serializable): folder: str = None run_samples: bool = False -if __name__=="__main__": + +if __name__ == "__main__": args = simple_parsing.parse(Args) setup_logger(args.debug) weave.init(args.weave_project) - # run one file @weave.op @@ -196,24 +216,26 @@ async def run_and_save(code: str, input: str, suffix: str, timeout: float): await run_program(code, input, generated_output, timeout=timeout) except Exception as e: generated_output.write_text(str(e)) - return {"generated_output": generated_output, "runnable": False, "error": str(e)} + return { + "generated_output": generated_output, + "runnable": False, + "error": str(e), + } return {"generated_output": generated_output, "runnable": True, "error": None} if not args.folder: logging.info(f"Running file: {args.code}") - out = asyncio.run(run_and_save( - args.code, - args.input, - args.suffix, - args.timeout - )) + out = asyncio.run( + run_and_save(args.code, args.input, args.suffix, args.timeout) + ) passed = check_solution(out, args.output) logging.info(f"Program passed: {passed}") else: logging.info(f"Running folder: {args.folder}") - logging.info("="*60) + logging.info("=" * 60) problems = find_problems(Path(args.folder)) + class Runner(weave.Model): timeout: float = 10 suffix: str = "_generated_output.txt" @@ -222,12 +244,16 @@ class Runner(weave.Model): async def predict(self, code: str, input: str): return await run_and_save(code, input, self.suffix, self.timeout) - dataset = [{"input": problem.sample_input if args.run_samples else problem.input, - "output": problem.sample_output if args.run_samples else problem.output, - "code": problem.code, - "problem_name": problem.problem_name} for problem in problems] - + dataset = [ + { + "input": problem.sample_input if args.run_samples else problem.input, + "output": problem.sample_output if args.run_samples else problem.output, + "code": problem.code, + "problem_name": problem.problem_name, + } + for problem in problems + ] model = Runner(timeout=args.timeout) evaluation = weave.Evaluation(dataset=dataset, scorers=[check_solution]) - asyncio.run(evaluation.evaluate(model)) \ No newline at end of file + asyncio.run(evaluation.evaluate(model)) From c29bc82dfb835cf3e95f8ffc37be4bb9dd0b1b30 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Tue, 17 Sep 2024 20:13:20 +0200 Subject: [PATCH 4/5] add cpp version param --- eval.py | 53 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/eval.py b/eval.py index cb0b420..afe79a2 100644 --- a/eval.py +++ b/eval.py @@ -108,7 +108,8 @@ async def run_python( async def run_cpp( - cpp_file: Path, input_file: Path, output_file: Path, timeout: float = 10 + cpp_file: Path, input_file: Path, output_file: Path, timeout: float = 10, + cpp_version: int = 11, ): """ Run a C++ program with the given input file and output file. @@ -117,7 +118,7 @@ async def run_cpp( base_name = os.path.splitext(cpp_file.name)[0] # Compile the C++ program - compile_command = f"g++ {cpp_file} -std=c++11 -o {base_name}" + compile_command = f"g++ {cpp_file} -std=c++{cpp_version} -o {base_name}" process = await asyncio.create_subprocess_shell( compile_command, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE ) @@ -160,11 +161,11 @@ async def run_cpp( @weave.op -async def run_program(code: Path, input: Path, output: Path, timeout: float = 10): +async def run_program(code: Path, input: Path, output: Path, timeout: float = 10, cpp_version: int = 11): try: if code.suffix == ".cpp": logging.info(f"Running C++ program: {code}") - await run_cpp(code, input, output, timeout) + await run_cpp(code, input, output, timeout, cpp_version) elif code.suffix == ".py": logging.info(f"Running Python program: {code}") await run_python(code, input, output, timeout) @@ -187,6 +188,24 @@ def check_solution(model_output: dict, output: str): } +@weave.op +async def run_and_save_output(code: str, input: str, suffix: str, timeout: float, cpp_version: int = 11) -> dict: + """ + Run the program and save the output to a file. + """ + code, input = Path(code), Path(input) + generated_output = input.parent / (input.stem + suffix) + try: + await run_program(code, input, generated_output, timeout=timeout, cpp_version=cpp_version) + except Exception as e: + generated_output.write_text(str(e)) + return { + "generated_output": generated_output, + "runnable": False, + "error": str(e), + } + return {"generated_output": generated_output, "runnable": True, "error": None} + @dataclass class Args(simple_parsing.Serializable): code: str = "dataset/2023/practice/cheeseburger_corollary_ch1.cpp" @@ -194,11 +213,12 @@ class Args(simple_parsing.Serializable): output: str = "dataset/2023/practice/cheeseburger_corollary_ch1.out" eval_name: str = "super_dupper_model" weave_project: str = "hackercup-eval-solution" - timeout: float = 10 + timeout: float = 30 suffix: str = "_generated_output.txt" debug: bool = False folder: str = None run_samples: bool = False + cpp_version: int = 11 if __name__ == "__main__": @@ -208,29 +228,15 @@ class Args(simple_parsing.Serializable): weave.init(args.weave_project) # run one file - @weave.op - async def run_and_save(code: str, input: str, suffix: str, timeout: float): - code, input = Path(code), Path(input) - generated_output = input.parent / (input.stem + suffix) - try: - await run_program(code, input, generated_output, timeout=timeout) - except Exception as e: - generated_output.write_text(str(e)) - return { - "generated_output": generated_output, - "runnable": False, - "error": str(e), - } - return {"generated_output": generated_output, "runnable": True, "error": None} - if not args.folder: logging.info(f"Running file: {args.code}") out = asyncio.run( - run_and_save(args.code, args.input, args.suffix, args.timeout) + run_and_save_output(args.code, args.input, args.suffix, args.timeout, args.cpp_version) ) passed = check_solution(out, args.output) logging.info(f"Program passed: {passed}") + else: logging.info(f"Running folder: {args.folder}") logging.info("=" * 60) @@ -239,10 +245,11 @@ async def run_and_save(code: str, input: str, suffix: str, timeout: float): class Runner(weave.Model): timeout: float = 10 suffix: str = "_generated_output.txt" + cpp_version: int = 11 @weave.op async def predict(self, code: str, input: str): - return await run_and_save(code, input, self.suffix, self.timeout) + return await run_and_save_output(code, input, self.suffix, self.timeout, self.cpp_version) dataset = [ { @@ -254,6 +261,6 @@ async def predict(self, code: str, input: str): for problem in problems ] - model = Runner(timeout=args.timeout) + model = Runner(timeout=args.timeout, suffix=args.suffix, cpp_version=args.cpp_version) evaluation = weave.Evaluation(dataset=dataset, scorers=[check_solution]) asyncio.run(evaluation.evaluate(model)) From 8631036e60c0de4d1f2d7d366f5b9444c60a8525 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Tue, 17 Sep 2024 20:15:14 +0200 Subject: [PATCH 5/5] add docstrings --- eval.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/eval.py b/eval.py index afe79a2..0890aa8 100644 --- a/eval.py +++ b/eval.py @@ -208,22 +208,22 @@ async def run_and_save_output(code: str, input: str, suffix: str, timeout: float @dataclass class Args(simple_parsing.Serializable): - code: str = "dataset/2023/practice/cheeseburger_corollary_ch1.cpp" - input: str = "dataset/2023/practice/cheeseburger_corollary_ch1.in" - output: str = "dataset/2023/practice/cheeseburger_corollary_ch1.out" - eval_name: str = "super_dupper_model" - weave_project: str = "hackercup-eval-solution" - timeout: float = 30 - suffix: str = "_generated_output.txt" - debug: bool = False - folder: str = None - run_samples: bool = False - cpp_version: int = 11 + code: str = "dataset/2023/practice/cheeseburger_corollary_ch1.cpp" # The file to run + input: str = "dataset/2023/practice/cheeseburger_corollary_ch1.in" # The input to run the program on + output: str = "dataset/2023/practice/cheeseburger_corollary_ch1.out" # The output to compare against + eval_name: str = "super_dupper_model" # The name of the evaluation + weave_project: str = "hackercup-eval-solution" # The name of the weave project + timeout: float = 30 # The timeout for the program execution (per problem) + suffix: str = "_generated_output.txt" # The suffix for the generated output file + verbose: bool = False # Whether to print verbose output + folder: str = None # Run all problems in this folder + run_samples: bool = False # Whether to run on the sample input/output pairs + cpp_version: int = 11 # The C++ version to use for the program execution if __name__ == "__main__": args = simple_parsing.parse(Args) - setup_logger(args.debug) + setup_logger(args.verbose) weave.init(args.weave_project) @@ -262,5 +262,5 @@ async def predict(self, code: str, input: str): ] model = Runner(timeout=args.timeout, suffix=args.suffix, cpp_version=args.cpp_version) - evaluation = weave.Evaluation(dataset=dataset, scorers=[check_solution]) + evaluation = weave.Evaluation(name=args.eval_name, dataset=dataset, scorers=[check_solution]) asyncio.run(evaluation.evaluate(model))