From a3e2d356d98d56617fed9814ada04f573196c3eb Mon Sep 17 00:00:00 2001
From: ganler <jaway.liu@gmail.com>
Date: Sat, 9 Aug 2025 21:10:44 +0000
Subject: [PATCH] fix: correct org and dataset paths

---
 README.md                       | 19 ++++++++++++++++++-
 eval/compile_xscode/cwe2ovrf.py |  4 ++--
 sft/ctxdistill_qwen14b.yaml     |  2 +-
 sft/ctxdistill_qwen32b.yaml     |  2 +-
 4 files changed, 22 insertions(+), 5 deletions(-)
diff --git a/README.md b/README.md
index 04b916e..328b948 100644
--- a/README.md
+++ b/README.md
@@ -62,6 +62,19 @@ We will go through the example based on `Qwen/Qwen2.5-14B-Instruct-1M`:
 
 ### Rejection Sampling
 
+This step generates the SFT data used later in the "Running SFT" step.
+Note that pre-generated datasets are already available:
+
+* [`Qwen2.5-14B-Instruct-1M`](https://huggingface.co/datasets/purpcode/ctxdistill-verified-Qwen2.5-14B-Instruct-1M-57k) via best-of-8 sampling
+* [`Qwen2.5-32B-Instruct`](https://huggingface.co/datasets/purpcode/ctxdistill-verified-Qwen2.5-32B-Instruct-55k) via best-of-4 sampling
+
+To generate data from scratch or for other models, follow the steps below:
+
+<details><summary><b>Rejection Sampling from Scratch</b> <i>:: click to expand ::</i></summary>
+<div>
+
+The instructions below use `Qwen/Qwen2.5-14B-Instruct-1M` as the example. For other models, change the model names and the SFT script used later accordingly.
+
 ```bash
 # --- TMUX SESSION "sgl" ---
 conda create -n sgl python=3.12 -y
@@ -102,6 +115,10 @@ python datagen/ctxdistill/post.py --generation-path Qwen2.5-14B-Instruct-1M.dist
 # ----------------------------
 ```
 
+</div>
+</details>
+
+
 ### Running SFT
 
 ```bash
@@ -116,7 +133,7 @@ pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
 
 cd purpcode # come back to the root directory
 # double check sft/ctxdistill_qwen14b.yaml to make sure the paths are aligned well
-axolotl train sft/ctxdistill_qwen14b.yaml --deepspeed deepspeed_configs/zero3.json
+axolotl train sft/ctxdistill_qwen14b.yaml --deepspeed deepspeed_configs/zero3.json # uses the pre-generated dataset by default
 # --> outputs/purpcode-14b-ctxdistill
 ```
 
diff --git a/eval/compile_xscode/cwe2ovrf.py b/eval/compile_xscode/cwe2ovrf.py
index 95a218c..ef258d4 100644
--- a/eval/compile_xscode/cwe2ovrf.py
+++ b/eval/compile_xscode/cwe2ovrf.py
@@ -224,7 +224,7 @@ def load_codeguru_vulnerabilities(file_path):
     return vulnerabilities
 
 
-def create_codeguru_information(dataset_path: str = "purpcorn/codeguru-rules"):
+def create_codeguru_information(dataset_path: str = "purpcode/codeguru-rules"):
     collection = {}
     ds = load_dataset(dataset_path, split="scraped")
 
@@ -574,7 +574,7 @@ def cwe2ovrf_main(
 
     collection = create_cwe_information()
     if vuln_rules_type == "codeguru":
-        collection = create_codeguru_information("purpcorn/codeguru-rules")
+        collection = create_codeguru_information("purpcode/codeguru-rules")
 
     if os.path.exists(init_filepath):
         cprint(f"Found existing init messages at {init_filepath}", "yellow")
diff --git a/sft/ctxdistill_qwen14b.yaml b/sft/ctxdistill_qwen14b.yaml
index 88d1c75..9e689da 100644
--- a/sft/ctxdistill_qwen14b.yaml
+++ b/sft/ctxdistill_qwen14b.yaml
@@ -61,7 +61,7 @@ bf16: true
 
 # dataset
 datasets:
-  - path: purpcode/ctxdistill-verified-Qwen2.5-14B-Instruct-1M.jsonl
+  - path: purpcode/ctxdistill-verified-Qwen2.5-14B-Instruct-1M-57k
     type: chat_template
     field_messages: messages
     message_field_training: train
diff --git a/sft/ctxdistill_qwen32b.yaml b/sft/ctxdistill_qwen32b.yaml
index c8a8a33..aa42d53 100644
--- a/sft/ctxdistill_qwen32b.yaml
+++ b/sft/ctxdistill_qwen32b.yaml
@@ -61,7 +61,7 @@ bf16: true
 
 # dataset
 datasets:
-  - path: purpcode/ctxdistill-verified-Qwen-2.5-32B-Instruct.jsonl
+  - path: purpcode/ctxdistill-verified-Qwen2.5-32B-Instruct-55k
     type: chat_template
     field_messages: messages
     message_field_training: train