From a3e2d356d98d56617fed9814ada04f573196c3eb Mon Sep 17 00:00:00 2001 From: ganler Date: Sat, 9 Aug 2025 21:10:44 +0000 Subject: [PATCH] fix: correct org and dataset paths --- README.md | 19 ++++++++++++++++++- eval/compile_xscode/cwe2ovrf.py | 4 ++-- sft/ctxdistill_qwen14b.yaml | 2 +- sft/ctxdistill_qwen32b.yaml | 2 +- 4 files changed, 22 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 04b916e..328b948 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,19 @@ We will go through the example based on `Qwen/Qwen2.5-14B-Instruct-1M`: ### Rejection Sampling +This step aims to generate SFT data for later use. +Note that we already have pre-generated datasets: + +* [`Qwen2.5-14B-Instruct-1M`](https://huggingface.co/datasets/purpcode/ctxdistill-verified-Qwen2.5-14B-Instruct-1M-57k) via best of 8 +* [`Qwen2.5-32B-Instruct`](https://huggingface.co/datasets/purpcode/ctxdistill-verified-Qwen2.5-32B-Instruct-55k) via best of 4 + +To generate data from scratch or for other models, follow the steps below: + +
<details><summary>Rejection Sampling from Scratch :: click to expand ::</summary> +<div>
+ +The instructions are exemplified for `Qwen/Qwen2.5-14B-Instruct-1M`. Please change the model names and the later SFT script accordingly for other models. + ```bash # --- TMUX SESSION "sgl" --- conda create -n sgl python=3.12 -y @@ -102,6 +115,10 @@ python datagen/ctxdistill/post.py --generation-path Qwen2.5-14B-Instruct-1M.dist # ---------------------------- ``` +
</div> +</details>
+ + ### Running SFT ```bash @@ -116,7 +133,7 @@ pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]' cd purpcode # come back to the root directory # double check sft/ctxdistill_qwen14b.yaml to make sure the paths are aligned well -axolotl train sft/ctxdistill_qwen14b.yaml --deepspeed deepspeed_configs/zero3.json +axolotl train sft/ctxdistill_qwen14b.yaml --deepspeed deepspeed_configs/zero3.json # default to pre-generated datasets # --> outputs/purpcode-14b-ctxdistill ``` diff --git a/eval/compile_xscode/cwe2ovrf.py b/eval/compile_xscode/cwe2ovrf.py index 95a218c..ef258d4 100644 --- a/eval/compile_xscode/cwe2ovrf.py +++ b/eval/compile_xscode/cwe2ovrf.py @@ -224,7 +224,7 @@ def load_codeguru_vulnerabilities(file_path): return vulnerabilities -def create_codeguru_information(dataset_path: str = "purpcorn/codeguru-rules"): +def create_codeguru_information(dataset_path: str = "purpcode/codeguru-rules"): collection = {} ds = load_dataset(dataset_path, split="scraped") @@ -574,7 +574,7 @@ def cwe2ovrf_main( collection = create_cwe_information() if vuln_rules_type == "codeguru": - collection = create_codeguru_information("purpcorn/codeguru-rules") + collection = create_codeguru_information("purpcode/codeguru-rules") if os.path.exists(init_filepath): cprint(f"Found existing init messages at {init_filepath}", "yellow") diff --git a/sft/ctxdistill_qwen14b.yaml b/sft/ctxdistill_qwen14b.yaml index 88d1c75..9e689da 100644 --- a/sft/ctxdistill_qwen14b.yaml +++ b/sft/ctxdistill_qwen14b.yaml @@ -61,7 +61,7 @@ bf16: true # dataset datasets: - - path: purpcode/ctxdistill-verified-Qwen2.5-14B-Instruct-1M.jsonl + - path: purpcode/ctxdistill-verified-Qwen2.5-14B-Instruct-1M-57k type: chat_template field_messages: messages message_field_training: train diff --git a/sft/ctxdistill_qwen32b.yaml b/sft/ctxdistill_qwen32b.yaml index c8a8a33..aa42d53 100644 --- a/sft/ctxdistill_qwen32b.yaml +++ b/sft/ctxdistill_qwen32b.yaml @@ -61,7 +61,7 @@ bf16: true # dataset datasets: - - path: 
purpcode/ctxdistill-verified-Qwen-2.5-32B-Instruct.jsonl + - path: purpcode/ctxdistill-verified-Qwen2.5-32B-Instruct-55k type: chat_template field_messages: messages message_field_training: train