From 3f9f1ba5da7ea785b4b35d65a9bd22219340b44b Mon Sep 17 00:00:00 2001 From: Herbie Bradley Date: Wed, 5 Oct 2022 21:09:56 +0100 Subject: [PATCH 01/16] Fix bugs in scraping code and add dask --- codepile/ghdiffs/ghdiffs_scrape.py | 113 +++++++++++++++++++++++++++++ codepile/ghdiffs/test_file.json | 20 +++++ 2 files changed, 133 insertions(+) create mode 100644 codepile/ghdiffs/ghdiffs_scrape.py create mode 100644 codepile/ghdiffs/test_file.json diff --git a/codepile/ghdiffs/ghdiffs_scrape.py b/codepile/ghdiffs/ghdiffs_scrape.py new file mode 100644 index 0000000..332bdee --- /dev/null +++ b/codepile/ghdiffs/ghdiffs_scrape.py @@ -0,0 +1,113 @@ +import json +import urllib.request +from pathlib import Path +from pprint import pprint + +import dask.bag as db +import dask.dataframe as dd +import pyarrow as pa +from codepile.dataset import (Analyser, Dataset, DatasetInfo, DatasetSources, + Processor, RawDataset, Scraper) +from unidiff import PatchSet + + +def process_ind_patch(patch_diff) -> dict: + """Process patch to get diff data.""" + patch_parsed_diff: dict = { + "hunks": [], + } + + patch_parsed_diff["addition_count"] = patch_diff.added + patch_parsed_diff["deletion_count"] = patch_diff.removed + patch_parsed_diff["src_file"] = patch_diff.source_file + patch_parsed_diff["tgt_file"] = patch_diff.target_file + # patch_parsed_diff["patch_info"] = patch_diff.patch_info + patch_diff_list = list(patch_diff) + for patch_diff_ind in patch_diff_list: + patch_diff_ind = str(patch_diff_ind) + patch_diff_split = patch_diff_ind.split("@@") + patch_diff_line = patch_diff_split[2].split("\n") + patch_diff_line_numbers = [list(map(int, hunk.strip("-+").split(","))) + for hunk in patch_diff_split[1].strip().split(" ")] + patch_parsed_diff["hunks"].append(patch_diff_line_numbers + patch_diff_line[:-1]) + return patch_parsed_diff + + +def patch_parse(commit_hash: str, repo_name: str) -> list: + """Parse a commit to get diff data.""" + diff_url = (f"https://github.com/{repo_name}/commit/{commit_hash}.diff") + diff = urllib.request.urlopen(diff_url) + encoding = diff.headers.get_charsets()[0] + patch = PatchSet(diff, encoding=encoding) + diff_list: list = [] + for patch_ind in patch: + # Skip if the file is not a python file. + # if not patch_ind.target_file.endswith(".py"): + # continue + diff_list.append(process_ind_patch(patch_ind)) + return diff_list + + +def apply_reverse_patch(diff_list: list, commit_hash: str, repo_name: str, length_threshold: int = 4096) -> list: + """Apply reverse patch to get before files. Returns list of modified files.""" + files_list: list = [] + repo_owner, repo_name = repo_name.split("/") + for diff in diff_list: + if diff["src_file"] == "/dev/null": + files_list.append([]) + continue + # Get raw after file. + file_raw_url = (f"https://raw.githubusercontent.com/{repo_owner}/" + f"{repo_name}/{commit_hash}/{diff['tgt_file'][2:]}") + raw_file = urllib.request.urlopen(file_raw_url) + raw_file_encoding = raw_file.headers.get_charsets()[0] + raw_file = [line.decode(raw_file_encoding) for line in raw_file.readlines()] + # file_length = sum(1 for _ in raw_file) + # if file_length < length_threshold: + files_list.append(raw_file) + # Iterate over hunks for this file and apply the reverse patch. 
+ for hunk in diff["hunks"]: + hunk_list = [] + for line in hunk[3:]: + if line.startswith("-") or line.startswith(" "): + hunk_list.append(line[1:] + "\n") + files_list[-1][hunk[0][0] - 1:hunk[0][0] + hunk[1][1] - 1] = hunk_list + + return files_list + + +def process_commit(commit_data: dict) -> dict: + """Process a commit hash and repo name to get the before files and diff dict.""" + # Get dict containing diff data. + diff_list = patch_parse(commit_data["commit"], commit_data["repo_name"]) + # Get list of files, each of which is a list of strings, one for each line. + files_list = apply_reverse_patch(diff_list, commit_data["commit"], commit_data["repo_name"]) + commit_data["before_files"] = files_list + commit_data["diff"] = [json.dumps(diff) for diff in diff_list] + return commit_data + + +class GitHubDiffDataset(Dataset): + def __init__(self, config): + pass + + +class GitHubDiffScraper(Scraper): + def __init__(self, config): + pass + + def scrape(self): + pass + + +if __name__ == "__main__": + read_path = Path(__file__).parent / "test_file.json" + schema = pa.schema([ + (pa.field("commit", pa.string())), + (pa.field("message", pa.string())), + (pa.field("repo_name", pa.string())), + (pa.field("before_files", pa.list_(pa.list_(pa.string())))), + (pa.field("diff", pa.list_(pa.string()))), + ]) + db.read_text(read_path).map(json.loads).map(process_commit).to_dataframe().to_parquet("test.parquet", + schema=schema) diff --git a/codepile/ghdiffs/test_file.json b/codepile/ghdiffs/test_file.json new file mode 100644 index 0000000..a515a81 --- /dev/null +++ b/codepile/ghdiffs/test_file.json @@ -0,0 +1,20 @@ +{"commit":"f96ce6590c9bad17df325fe965222fadc8a5d485","message":"Merge branch 'master' into assignments_small_fixes","repo_name":"tensorflow/tensorflow"} +{"commit":"0ec0a9bc0437f069ea0b08c46f1734ab837d2bf6","message":"Unroll partial merge from #1472\n","repo_name":"tensorflow/tensorflow"} +{"commit":"c12b21ca8fb6f38b89fe769176079d8530c8683b","message":"In regular DataFeeder for classification convert y into int64 to not raise numpy warnings on itemset\n","repo_name":"tensorflow/tensorflow"} +{"commit":"038ccd6ab030222d6041421d949b71891809de6c","message":"Merge pull request #48 from frol/master\n\nFixed travis scripts to be POSIX sh complaint and proof against whitespace issues","repo_name":"tensorflow/tensorflow"} +{"commit":"e52003b01cd4e7e0bc7a698bcfec331f6d5638f1","message":"Switch to enumerate in seq2seq ops\n","repo_name":"tensorflow/tensorflow"} +{"commit":"a64f3d7eaeceea5999b0530d55e32b6d2ab7d1dc","message":"Merge branch 'master' of github.com:google/skflow\n","repo_name":"tensorflow/tensorflow"} +{"commit":"bd443b52367cc64358d4247a535439e7d42c6641","message":"Merge pull request #24 from terrytangyuan/patch-1\n\nAdded missing CV import in multiple_gpu example","repo_name":"tensorflow/tensorflow"} +{"commit":"abdfb10f2887210d18ff3b33596085f104029d7d","message":"Up the required version of Tensorflow - due to recent RNN changes, skflow will now only support 0.7+\n","repo_name":"tensorflow/tensorflow"} +{"commit":"43cd5c01318e4c51d65eb53f77a2b27ef72c557d","message":"Fixes #22: Added cross_validation to iris.py and iris_custom_model.py\n","repo_name":"tensorflow/tensorflow"} +{"commit":"b75c2a316558cf26cf1c81644dad43a7c2edd332","message":"Added apache headers\n","repo_name":"tensorflow/tensorflow"} +{"commit":"21df765ae4a35f655b068ae7d2440da98944cb7e","message":"Adding rnn_decoder with sampling sub-graph. 
Adding rnn seq2seq that will work with it.\n","repo_name":"tensorflow/tensorflow"} +{"commit":"67e33815af883113216001db33a8d4d14a674823","message":"Refactored out Trainer and ops from main file\n","repo_name":"tensorflow/tensorflow"} +{"commit":"881265ff80209710b2f064a98e2b7bc4608bbf64","message":"Fixes #42: Added a test for GridSearchCV to check that DNN works correctly with it\n","repo_name":"tensorflow/tensorflow"} +{"commit":"135018dbe63d7d72201862d0c5e5878d1fa4a923","message":"Added more coverage in tests\n","repo_name":"tensorflow/tensorflow"} +{"commit":"9cfbd7bcc1b9fe89fc22261ba316d6d5336985b4","message":"Merge pull request #36 from terrytangyuan/master\n\nAllow pandas.DataFrame data types","repo_name":"tensorflow/tensorflow"} +{"commit":"e27aa15ed7fd4f3d735bf7a8a29b97fdede710bd","message":"Add usage of batch norm in conv test and fix usage of is_training collection\n","repo_name":"tensorflow/tensorflow"} +{"commit":"05cae1f96f7ac2dfb6368fd87e6254194a04281a","message":"Merge pull request #89 from lopuhin/fix-dropout\n\nFix dropout management in TensorFlowEstimator._predict","repo_name":"tensorflow/tensorflow"} +{"commit":"a9a488de5754936a7e67245033f0d6b5fe7b4c88","message":"Fix #130: Added explicit save/restore to VocabularyProcessor\n","repo_name":"tensorflow/tensorflow"} +{"commit":"edc98e6269dc448f8e55be9af6018e4ee768b223","message":"Merge branch 'master' of github.com:google/skflow\n","repo_name":"tensorflow/tensorflow"} +{"commit":"845b91dd680a15e37e516d80fc7bff498e827f94","message":"[tf.learn] Allow Attention in RNNClassifier and Regressor (#2820)\n\n* Added attention cell wrappers in rnn_model\r\n\r\n* refactor and add attn_rnn tests\r\n\r\n* revised according to comments on lint\r\n","repo_name":"tensorflow/tensorflow"} From 2b679397ed5d9e4f4d1fdfe12664d3a656a99f16 Mon Sep 17 00:00:00 2001 From: Herbie Bradley Date: Wed, 5 Oct 2022 21:11:36 +0100 Subject: [PATCH 02/16] Add dask client --- codepile/ghdiffs/ghdiffs_scrape.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/codepile/ghdiffs/ghdiffs_scrape.py b/codepile/ghdiffs/ghdiffs_scrape.py index 332bdee..a1c66b7 100644 --- a/codepile/ghdiffs/ghdiffs_scrape.py +++ b/codepile/ghdiffs/ghdiffs_scrape.py @@ -4,10 +4,10 @@ from pprint import pprint import dask.bag as db -import dask.dataframe as dd import pyarrow as pa from codepile.dataset import (Analyser, Dataset, DatasetInfo, DatasetSources, Processor, RawDataset, Scraper) +from dask.distributed import Client from unidiff import PatchSet @@ -102,6 +102,7 @@ def scrape(self): if __name__ == "__main__": read_path = Path(__file__).parent / "test_file.json" + client = Client(n_workers=8, threads_per_worker=2) schema = pa.schema([ (pa.field("commit", pa.string())), (pa.field("message", pa.string())), From f448fc4c2250c678d88e646707c6036c72359b15 Mon Sep 17 00:00:00 2001 From: Herbie Bradley Date: Thu, 6 Oct 2022 01:16:27 +0100 Subject: [PATCH 03/16] Fix parquet saving --- codepile/ghdiffs/ghdiffs_scrape.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/codepile/ghdiffs/ghdiffs_scrape.py b/codepile/ghdiffs/ghdiffs_scrape.py index a1c66b7..8f6e0e3 100644 --- a/codepile/ghdiffs/ghdiffs_scrape.py +++ b/codepile/ghdiffs/ghdiffs_scrape.py @@ -4,6 +4,7 @@ from pprint import pprint import dask.bag as db +import numpy as np import pyarrow as pa from codepile.dataset import (Analyser, Dataset, DatasetInfo, DatasetSources, Processor, RawDataset, Scraper) @@ -15,6 +16,7 @@ def process_ind_patch(patch_diff) -> dict: """Process patch 
to get diff data.""" patch_parsed_diff: dict = { "hunks": [], + "hunks_process": [], } patch_parsed_diff["addition_count"] = patch_diff.added @@ -29,7 +31,8 @@ def process_ind_patch(patch_diff) -> dict: patch_diff_line = patch_diff_split[2].split("\n") patch_diff_line_numbers = [list(map(int, hunk.strip("-+").split(","))) for hunk in patch_diff_split[1].strip().split(" ")] - patch_parsed_diff["hunks"].append(patch_diff_line_numbers + patch_diff_line[:-1]) + patch_parsed_diff["hunks_process"].append(patch_diff_line_numbers + patch_diff_line[:-1]) + patch_parsed_diff["hunks"].append(patch_diff_ind) return patch_parsed_diff @@ -66,12 +69,13 @@ def apply_reverse_patch(diff_list: list, commit_hash: str, repo_name: str, lengt # if file_length < length_threshold: files_list.append(raw_file) # Iterate over hunks for this file and apply the reverse patch. - for hunk in diff["hunks"]: + for hunk in diff["hunks_process"]: hunk_list = [] for line in hunk[3:]: if line.startswith("-") or line.startswith(" "): hunk_list.append(line[1:] + "\n") files_list[-1][hunk[0][0] - 1:hunk[0][0] + hunk[1][1] - 1] = hunk_list + del diff["hunks_process"] return files_list @@ -83,7 +87,7 @@ def process_commit(commit_data: dict) -> dict: # Get list of files, each of which is a list of strings, one for each line. files_list = apply_reverse_patch(diff_list, commit_data["commit"], commit_data["repo_name"]) commit_data["before_files"] = files_list - commit_data["diff"] = [json.dumps(diff) for diff in diff_list] + commit_data["diff"] = diff_list return commit_data @@ -103,12 +107,19 @@ def scrape(self): if __name__ == "__main__": read_path = Path(__file__).parent / "test_file.json" client = Client(n_workers=8, threads_per_worker=2) + diff_struct = pa.struct([ + ("addition_count", pa.int32()), + ("deletion_count", pa.int32()), + ("src_file", pa.string()), + ("tgt_file", pa.string()), + ("hunks", pa.list_(pa.string())), + ]) schema = pa.schema([ (pa.field("commit", pa.string())), (pa.field("message", pa.string())), (pa.field("repo_name", pa.string())), (pa.field("before_files", pa.list_(pa.list_(pa.string())))), - (pa.field("diff", pa.list_(pa.string()))), + (pa.field("diff", pa.list_(diff_struct))), ]) db.read_text(read_path).map(json.loads).map(process_commit).to_dataframe().to_parquet("test.parquet", schema=schema) From d421d6a0d0d689ba1f38bc90ef029b700d0e08fd Mon Sep 17 00:00:00 2001 From: Herbie Bradley Date: Thu, 6 Oct 2022 17:55:34 +0100 Subject: [PATCH 04/16] Implement GitHubDiffDataset, fix dask loading --- codepile/ghdiffs/ghdiffs_scrape.py | 134 +++++++++++++++-------------- 1 file changed, 68 insertions(+), 66 deletions(-) diff --git a/codepile/ghdiffs/ghdiffs_scrape.py b/codepile/ghdiffs/ghdiffs_scrape.py index 8f6e0e3..73c4cb4 100644 --- a/codepile/ghdiffs/ghdiffs_scrape.py +++ b/codepile/ghdiffs/ghdiffs_scrape.py @@ -2,6 +2,7 @@ import urllib.request from pathlib import Path from pprint import pprint +from xml.dom.minidom import ReadOnlySequentialNamedNodeMap import dask.bag as db import numpy as np @@ -33,93 +34,94 @@ def process_ind_patch(patch_diff) -> dict: for hunk in patch_diff_split[1].strip().split(" ")] patch_parsed_diff["hunks_process"].append(patch_diff_line_numbers + patch_diff_line[:-1]) patch_parsed_diff["hunks"].append(patch_diff_ind) + patch_parsed_diff["hunks"] = "".join(patch_parsed_diff["hunks"]) return patch_parsed_diff -def patch_parse(commit_hash: str, repo_name: str) -> list: - """Parse a commit to get diff data.""" - diff_url = (f"https://github.com/{repo_name}/commit/{commit_hash}.diff") 
+def get_before_file(file_diff: dict, commit_hash: str, repo_name: str) -> str: + repo_owner, repo_name = repo_name.split("/") + if file_diff["src_file"] == "/dev/null": + return "" + # Get raw after file. + file_raw_url = (f"https://raw.githubusercontent.com/{repo_owner}/" + f"{repo_name}/{commit_hash}/{file_diff['tgt_file'][2:]}") + raw_file = urllib.request.urlopen(file_raw_url) + raw_file_encoding = raw_file.headers.get_charsets()[0] + raw_file = [line.decode(raw_file_encoding) for line in raw_file.readlines()] + # file_length = sum(1 for _ in raw_file) + # if file_length < length_threshold: + # files_list.append(raw_file) + # Iterate over hunks for this file and apply the reverse patch. + for hunk in file_diff["hunks_process"]: + hunk_list = [] + for line in hunk[3:]: + if line.startswith("-") or line.startswith(" "): + hunk_list.append(line[1:] + "\n") + raw_file[hunk[0][0] - 1:hunk[0][0] + hunk[1][1] - 1] = hunk_list + del file_diff["hunks_process"] # Deletes this item from the dict in parent functions + return "".join(raw_file) + + +def process_commit(commit_data: dict) -> list[dict]: + """ + Process a commit dictionary to get the before files and diff dict. + + Args: + commit_data (dict): Dictionary containing commit hash, repo name, and + commit message. + + Returns: + list[dict]: A list of dicts, where each dict contains the data for a + change to a single file. + """ + # Scrape a commit's diff file. + diff_url = f"https://github.com/{commit_data['repo_name']}/commit/{commit_data['commit']}.diff" diff = urllib.request.urlopen(diff_url) encoding = diff.headers.get_charsets()[0] patch = PatchSet(diff, encoding=encoding) - diff_list: list = [] + commit_list: list[dict] = [] for patch_ind in patch: # Skip if the file is not a python file. # if not patch_ind.target_file.endswith(".py"): # continue - diff_list.append(process_ind_patch(patch_ind)) - return diff_list - - -def apply_reverse_patch(diff_list: list, commit_hash: str, repo_name: str, length_threshold: int = 4096) -> list: - """Apply reverse patch to get before files. Returns list of modified files.""" - files_list: list = [] - repo_owner, repo_name = repo_name.split("/") - for diff in diff_list: - if diff["src_file"] == "/dev/null": - files_list.append([]) - continue - # Get raw after file. - file_raw_url = (f"https://raw.githubusercontent.com/{repo_owner}/" - f"{repo_name}/{commit_hash}/{diff['tgt_file'][2:]}") - raw_file = urllib.request.urlopen(file_raw_url) - raw_file_encoding = raw_file.headers.get_charsets()[0] - raw_file = [line.decode(raw_file_encoding) for line in raw_file.readlines()] - # file_length = sum(1 for _ in raw_file) - # if file_length < length_threshold: - files_list.append(raw_file) - # Iterate over hunks for this file and apply the reverse patch. - for hunk in diff["hunks_process"]: - hunk_list = [] - for line in hunk[3:]: - if line.startswith("-") or line.startswith(" "): - hunk_list.append(line[1:] + "\n") - files_list[-1][hunk[0][0] - 1:hunk[0][0] + hunk[1][1] - 1] = hunk_list - del diff["hunks_process"] - - return files_list - - -def process_commit(commit_data: dict) -> dict: - """Process a commit hash and repo name to get the before files and diff dict.""" - # Get dict containing diff data. - diff_list = patch_parse(commit_data["commit"], commit_data["repo_name"]) - # Get list of files, each of which is a list of strings, one for each line. 
- files_list = apply_reverse_patch(diff_list, commit_data["commit"], commit_data["repo_name"]) - commit_data["before_files"] = files_list - commit_data["diff"] = diff_list - return commit_data + diff_dict: dict = process_ind_patch(patch_ind) + diff_dict.update(commit_data) + diff_dict["before_file"] = get_before_file(diff_dict, commit_data["commit"], commit_data["repo_name"]) + commit_list.append(diff_dict) + return commit_list class GitHubDiffDataset(Dataset): def __init__(self, config): - pass + self.config = config + self.scraper = GitHubDiffScraper(self.config) + + def download(self, *args, **kwargs) -> RawDataset: + return self.scraper.scrape() + + def process(self): + raise NotImplementedError class GitHubDiffScraper(Scraper): def __init__(self, config): - pass + # TODO: Dask multi-node scheduling here + client = Client(n_workers=config.n_workers, threads_per_worker=config.threads_per_worker) + self.read_path = Path(config.read_path) + self.save_path = Path(config.save_path) - def scrape(self): - pass + def scrape(self) -> RawDataset: + _ = ( + db.read_text(self.read_path).map(json.loads) + .map(process_commit).flatten().to_dataframe() + .to_parquet(self.save_path) + ) + dataset = RawDataset(storage_uris=self.save_path, complete=True) + return dataset if __name__ == "__main__": read_path = Path(__file__).parent / "test_file.json" + save_path = Path(__file__).parent / "test.parquet" client = Client(n_workers=8, threads_per_worker=2) - diff_struct = pa.struct([ - ("addition_count", pa.int32()), - ("deletion_count", pa.int32()), - ("src_file", pa.string()), - ("tgt_file", pa.string()), - ("hunks", pa.list_(pa.string())), - ]) - schema = pa.schema([ - (pa.field("commit", pa.string())), - (pa.field("message", pa.string())), - (pa.field("repo_name", pa.string())), - (pa.field("before_files", pa.list_(pa.list_(pa.string())))), - (pa.field("diff", pa.list_(diff_struct))), - ]) - db.read_text(read_path).map(json.loads).map(process_commit).to_dataframe().to_parquet("test.parquet", - schema=schema) + db.read_text(read_path).map(json.loads).map(process_commit).flatten().to_dataframe().to_parquet(save_path) From 0177806e00b2aab3e948657aa966b43995488ca9 Mon Sep 17 00:00:00 2001 From: Herbie Bradley Date: Fri, 7 Oct 2022 00:41:26 +0100 Subject: [PATCH 05/16] Fix file deletion bug --- codepile/ghdiffs/ghdiffs_scrape.py | 57 +++++++++++++++++++----------- 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/codepile/ghdiffs/ghdiffs_scrape.py b/codepile/ghdiffs/ghdiffs_scrape.py index 73c4cb4..835c437 100644 --- a/codepile/ghdiffs/ghdiffs_scrape.py +++ b/codepile/ghdiffs/ghdiffs_scrape.py @@ -1,14 +1,9 @@ import json import urllib.request from pathlib import Path -from pprint import pprint -from xml.dom.minidom import ReadOnlySequentialNamedNodeMap - +import datetime import dask.bag as db -import numpy as np -import pyarrow as pa -from codepile.dataset import (Analyser, Dataset, DatasetInfo, DatasetSources, - Processor, RawDataset, Scraper) +from codepile.dataset import (Dataset, DatasetInfo, RawDataset, Scraper) from dask.distributed import Client from unidiff import PatchSet @@ -24,6 +19,7 @@ def process_ind_patch(patch_diff) -> dict: patch_parsed_diff["deletion_count"] = patch_diff.removed patch_parsed_diff["src_file"] = patch_diff.source_file patch_parsed_diff["tgt_file"] = patch_diff.target_file + patch_parsed_diff["file_extension"] = Path(patch_diff.target_file).suffix # patch_parsed_diff["patch_info"] = patch_diff.patch_info patch_diff_list = list(patch_diff) for patch_diff_ind in 
patch_diff_list: @@ -40,14 +36,19 @@ def process_ind_patch(patch_diff) -> dict: def get_before_file(file_diff: dict, commit_hash: str, repo_name: str) -> str: repo_owner, repo_name = repo_name.split("/") - if file_diff["src_file"] == "/dev/null": + if file_diff["src_file"] == "/dev/null" or file_diff["tgt_file"] == "/dev/null": return "" # Get raw after file. file_raw_url = (f"https://raw.githubusercontent.com/{repo_owner}/" f"{repo_name}/{commit_hash}/{file_diff['tgt_file'][2:]}") - raw_file = urllib.request.urlopen(file_raw_url) - raw_file_encoding = raw_file.headers.get_charsets()[0] - raw_file = [line.decode(raw_file_encoding) for line in raw_file.readlines()] + try: + raw_file = urllib.request.urlopen(file_raw_url) + raw_file_encoding = raw_file.headers.get_charsets()[0] + raw_file = [line.decode(raw_file_encoding) for line in raw_file.readlines()] + except Exception as e: + print(e) + print(file_raw_url) + return "" # file_length = sum(1 for _ in raw_file) # if file_length < length_threshold: # files_list.append(raw_file) @@ -76,7 +77,10 @@ def process_commit(commit_data: dict) -> list[dict]: """ # Scrape a commit's diff file. diff_url = f"https://github.com/{commit_data['repo_name']}/commit/{commit_data['commit']}.diff" - diff = urllib.request.urlopen(diff_url) + try: + diff = urllib.request.urlopen(diff_url) + except Exception as e: + return [] encoding = diff.headers.get_charsets()[0] patch = PatchSet(diff, encoding=encoding) commit_list: list[dict] = [] @@ -102,13 +106,23 @@ def download(self, *args, **kwargs) -> RawDataset: def process(self): raise NotImplementedError + @property + def info(self) -> DatasetInfo: + return DatasetInfo( + id="GitHubDiffDataset", + description="Dataset of diffs from GitHub") + + @property + def id(self) -> str: + return "" + class GitHubDiffScraper(Scraper): def __init__(self, config): # TODO: Dask multi-node scheduling here - client = Client(n_workers=config.n_workers, threads_per_worker=config.threads_per_worker) - self.read_path = Path(config.read_path) - self.save_path = Path(config.save_path) + client = Client(n_workers=16, threads_per_worker=1) + self.read_path = Path(config["read_path"]) + self.save_path = Path(config["save_path"]) def scrape(self) -> RawDataset: _ = ( @@ -116,12 +130,15 @@ def scrape(self) -> RawDataset: .map(process_commit).flatten().to_dataframe() .to_parquet(self.save_path) ) - dataset = RawDataset(storage_uris=self.save_path, complete=True) + dataset = RawDataset(storage_uris=["https://github.com/CarperAI/Code-Pile"], complete=True) return dataset if __name__ == "__main__": - read_path = Path(__file__).parent / "test_file.json" - save_path = Path(__file__).parent / "test.parquet" - client = Client(n_workers=8, threads_per_worker=2) - db.read_text(read_path).map(json.loads).map(process_commit).flatten().to_dataframe().to_parquet(save_path) + read_path = Path(__file__).parent / "full_test.json" + save_path = Path(__file__).parent / "full_test.parquet" + config = {"read_path": read_path, "save_path": save_path} + ghdiff_dataset = GitHubDiffDataset(config) + ghdiff_dataset.download() + # client = Client(n_workers=16, threads_per_worker=1) + # db.read_text(read_path).map(json.loads).map(process_commit).flatten().to_dataframe().to_parquet(save_path) From ac781b3b38375249740a054a4d7bf84bace21500 Mon Sep 17 00:00:00 2001 From: Herbie Bradley Date: Fri, 7 Oct 2022 13:24:44 +0100 Subject: [PATCH 06/16] Fix deleted file bugs, add argparse --- .gitignore | 1 + codepile/ghdiffs/ghdiffs_scrape.py | 86 +++++++++++++++++------------- 2 
files changed, 49 insertions(+), 38 deletions(-) diff --git a/.gitignore b/.gitignore index 68bc17f..0561ce6 100644 --- a/.gitignore +++ b/.gitignore @@ -158,3 +158,4 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ +.vscode/ diff --git a/codepile/ghdiffs/ghdiffs_scrape.py b/codepile/ghdiffs/ghdiffs_scrape.py index 835c437..3f879b9 100644 --- a/codepile/ghdiffs/ghdiffs_scrape.py +++ b/codepile/ghdiffs/ghdiffs_scrape.py @@ -1,10 +1,10 @@ import json import urllib.request +import argparse from pathlib import Path -import datetime import dask.bag as db from codepile.dataset import (Dataset, DatasetInfo, RawDataset, Scraper) -from dask.distributed import Client +from dask.distributed import Client, progress from unidiff import PatchSet @@ -19,10 +19,12 @@ def process_ind_patch(patch_diff) -> dict: patch_parsed_diff["deletion_count"] = patch_diff.removed patch_parsed_diff["src_file"] = patch_diff.source_file patch_parsed_diff["tgt_file"] = patch_diff.target_file - patch_parsed_diff["file_extension"] = Path(patch_diff.target_file).suffix + if patch_parsed_diff["tgt_file"] == "/dev/null": + patch_parsed_diff["file_extension"] = Path(patch_diff.source_file).suffix + else: + patch_parsed_diff["file_extension"] = Path(patch_diff.target_file).suffix # patch_parsed_diff["patch_info"] = patch_diff.patch_info - patch_diff_list = list(patch_diff) - for patch_diff_ind in patch_diff_list: + for patch_diff_ind in patch_diff: patch_diff_ind = str(patch_diff_ind) patch_diff_split = patch_diff_ind.split("@@") patch_diff_line = patch_diff_split[2].split("\n") @@ -36,29 +38,33 @@ def process_ind_patch(patch_diff) -> dict: def get_before_file(file_diff: dict, commit_hash: str, repo_name: str) -> str: repo_owner, repo_name = repo_name.split("/") - if file_diff["src_file"] == "/dev/null" or file_diff["tgt_file"] == "/dev/null": - return "" - # Get raw after file. - file_raw_url = (f"https://raw.githubusercontent.com/{repo_owner}/" - f"{repo_name}/{commit_hash}/{file_diff['tgt_file'][2:]}") - try: - raw_file = urllib.request.urlopen(file_raw_url) - raw_file_encoding = raw_file.headers.get_charsets()[0] - raw_file = [line.decode(raw_file_encoding) for line in raw_file.readlines()] - except Exception as e: - print(e) - print(file_raw_url) - return "" - # file_length = sum(1 for _ in raw_file) - # if file_length < length_threshold: - # files_list.append(raw_file) - # Iterate over hunks for this file and apply the reverse patch. - for hunk in file_diff["hunks_process"]: - hunk_list = [] - for line in hunk[3:]: - if line.startswith("-") or line.startswith(" "): - hunk_list.append(line[1:] + "\n") - raw_file[hunk[0][0] - 1:hunk[0][0] + hunk[1][1] - 1] = hunk_list + if file_diff["src_file"] == "/dev/null": + raw_file = ["Add"] + elif file_diff["tgt_file"] == "/dev/null": + # If file is deleted, get before file from the raw diff, which will be the full file. + raw_file = [line[1:] + "\n" for line in file_diff["hunks_process"][0][3:]] + else: + # Get raw after file. 
+ file_raw_url = (f"https://raw.githubusercontent.com/{repo_owner}/" + f"{repo_name}/{commit_hash}/{file_diff['tgt_file'][2:]}") + try: + raw_file = urllib.request.urlopen(file_raw_url) + raw_file_encoding = raw_file.headers.get_charsets()[0] + raw_file = [line.decode(raw_file_encoding) for line in raw_file.readlines()] + except Exception as e: + print(e) + print(file_raw_url) + return "" + # file_length = sum(1 for _ in raw_file) + # if file_length < length_threshold: + # files_list.append(raw_file) + # Iterate over hunks for this file and apply the reverse patch. + for hunk in file_diff["hunks_process"]: + hunk_list = [] + for line in hunk[3:]: + if line.startswith("-") or line.startswith(" "): + hunk_list.append(line[1:] + "\n") + raw_file[hunk[0][0] - 1:hunk[0][0] + hunk[1][1] - 1] = hunk_list del file_diff["hunks_process"] # Deletes this item from the dict in parent functions return "".join(raw_file) @@ -80,13 +86,14 @@ def process_commit(commit_data: dict) -> list[dict]: try: diff = urllib.request.urlopen(diff_url) except Exception as e: + print(e) return [] encoding = diff.headers.get_charsets()[0] patch = PatchSet(diff, encoding=encoding) commit_list: list[dict] = [] for patch_ind in patch: # Skip if the file is not a python file. - # if not patch_ind.target_file.endswith(".py"): + # if not Path(patch_ind.target_file).suffix == ".py": # continue diff_dict: dict = process_ind_patch(patch_ind) diff_dict.update(commit_data) @@ -120,25 +127,28 @@ def id(self) -> str: class GitHubDiffScraper(Scraper): def __init__(self, config): # TODO: Dask multi-node scheduling here - client = Client(n_workers=16, threads_per_worker=1) - self.read_path = Path(config["read_path"]) - self.save_path = Path(config["save_path"]) + client = Client(n_workers=config.n_workers, threads_per_worker=config.threads_per_worker) + self.read_path = Path(config.read_path) + self.save_path = Path(config.save_path) def scrape(self) -> RawDataset: - _ = ( + result = ( db.read_text(self.read_path).map(json.loads) .map(process_commit).flatten().to_dataframe() .to_parquet(self.save_path) ) + progress(result) dataset = RawDataset(storage_uris=["https://github.com/CarperAI/Code-Pile"], complete=True) return dataset if __name__ == "__main__": - read_path = Path(__file__).parent / "full_test.json" - save_path = Path(__file__).parent / "full_test.parquet" - config = {"read_path": read_path, "save_path": save_path} + parser = argparse.ArgumentParser('codepile dataset tool') + + parser.add_argument('--read_path', type=str) + parser.add_argument('--save_path', type=str) + parser.add_argument('--n_workers', type=int, default=8) + parser.add_argument('--threads_per_worker', type=int, default=2) + config = parser.parse_args() ghdiff_dataset = GitHubDiffDataset(config) ghdiff_dataset.download() - # client = Client(n_workers=16, threads_per_worker=1) - # db.read_text(read_path).map(json.loads).map(process_commit).flatten().to_dataframe().to_parquet(save_path) From 91b091eff92f3094657951b791633b655d362f3c Mon Sep 17 00:00:00 2001 From: Herbie Bradley Date: Fri, 7 Oct 2022 13:27:10 +0100 Subject: [PATCH 07/16] Delete test json --- codepile/ghdiffs/test_file.json | 20 -------------------- 1 file changed, 20 deletions(-) delete mode 100644 codepile/ghdiffs/test_file.json diff --git a/codepile/ghdiffs/test_file.json b/codepile/ghdiffs/test_file.json deleted file mode 100644 index a515a81..0000000 --- a/codepile/ghdiffs/test_file.json +++ /dev/null @@ -1,20 +0,0 @@ -{"commit":"f96ce6590c9bad17df325fe965222fadc8a5d485","message":"Merge branch 
'master' into assignments_small_fixes","repo_name":"tensorflow/tensorflow"} -{"commit":"0ec0a9bc0437f069ea0b08c46f1734ab837d2bf6","message":"Unroll partial merge from #1472\n","repo_name":"tensorflow/tensorflow"} -{"commit":"c12b21ca8fb6f38b89fe769176079d8530c8683b","message":"In regular DataFeeder for classification convert y into int64 to not raise numpy warnings on itemset\n","repo_name":"tensorflow/tensorflow"} -{"commit":"038ccd6ab030222d6041421d949b71891809de6c","message":"Merge pull request #48 from frol/master\n\nFixed travis scripts to be POSIX sh complaint and proof against whitespace issues","repo_name":"tensorflow/tensorflow"} -{"commit":"e52003b01cd4e7e0bc7a698bcfec331f6d5638f1","message":"Switch to enumerate in seq2seq ops\n","repo_name":"tensorflow/tensorflow"} -{"commit":"a64f3d7eaeceea5999b0530d55e32b6d2ab7d1dc","message":"Merge branch 'master' of github.com:google/skflow\n","repo_name":"tensorflow/tensorflow"} -{"commit":"bd443b52367cc64358d4247a535439e7d42c6641","message":"Merge pull request #24 from terrytangyuan/patch-1\n\nAdded missing CV import in multiple_gpu example","repo_name":"tensorflow/tensorflow"} -{"commit":"abdfb10f2887210d18ff3b33596085f104029d7d","message":"Up the required version of Tensorflow - due to recent RNN changes, skflow will now only support 0.7+\n","repo_name":"tensorflow/tensorflow"} -{"commit":"43cd5c01318e4c51d65eb53f77a2b27ef72c557d","message":"Fixes #22: Added cross_validation to iris.py and iris_custom_model.py\n","repo_name":"tensorflow/tensorflow"} -{"commit":"b75c2a316558cf26cf1c81644dad43a7c2edd332","message":"Added apache headers\n","repo_name":"tensorflow/tensorflow"} -{"commit":"21df765ae4a35f655b068ae7d2440da98944cb7e","message":"Adding rnn_decoder with sampling sub-graph. Adding rnn seq2seq that will work with it.\n","repo_name":"tensorflow/tensorflow"} -{"commit":"67e33815af883113216001db33a8d4d14a674823","message":"Refactored out Trainer and ops from main file\n","repo_name":"tensorflow/tensorflow"} -{"commit":"881265ff80209710b2f064a98e2b7bc4608bbf64","message":"Fixes #42: Added a test for GridSearchCV to check that DNN works correctly with it\n","repo_name":"tensorflow/tensorflow"} -{"commit":"135018dbe63d7d72201862d0c5e5878d1fa4a923","message":"Added more coverage in tests\n","repo_name":"tensorflow/tensorflow"} -{"commit":"9cfbd7bcc1b9fe89fc22261ba316d6d5336985b4","message":"Merge pull request #36 from terrytangyuan/master\n\nAllow pandas.DataFrame data types","repo_name":"tensorflow/tensorflow"} -{"commit":"e27aa15ed7fd4f3d735bf7a8a29b97fdede710bd","message":"Add usage of batch norm in conv test and fix usage of is_training collection\n","repo_name":"tensorflow/tensorflow"} -{"commit":"05cae1f96f7ac2dfb6368fd87e6254194a04281a","message":"Merge pull request #89 from lopuhin/fix-dropout\n\nFix dropout management in TensorFlowEstimator._predict","repo_name":"tensorflow/tensorflow"} -{"commit":"a9a488de5754936a7e67245033f0d6b5fe7b4c88","message":"Fix #130: Added explicit save/restore to VocabularyProcessor\n","repo_name":"tensorflow/tensorflow"} -{"commit":"edc98e6269dc448f8e55be9af6018e4ee768b223","message":"Merge branch 'master' of github.com:google/skflow\n","repo_name":"tensorflow/tensorflow"} -{"commit":"845b91dd680a15e37e516d80fc7bff498e827f94","message":"[tf.learn] Allow Attention in RNNClassifier and Regressor (#2820)\n\n* Added attention cell wrappers in rnn_model\r\n\r\n* refactor and add attn_rnn tests\r\n\r\n* revised according to comments on lint\r\n","repo_name":"tensorflow/tensorflow"} From 
b60ca7d95a51bf3f6902a0d0c341c545b38a6b42 Mon Sep 17 00:00:00 2001 From: Herbie Bradley Date: Fri, 7 Oct 2022 13:49:46 +0100 Subject: [PATCH 08/16] Fix self.client --- codepile/ghdiffs/ghdiffs_scrape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codepile/ghdiffs/ghdiffs_scrape.py b/codepile/ghdiffs/ghdiffs_scrape.py index 3f879b9..010b497 100644 --- a/codepile/ghdiffs/ghdiffs_scrape.py +++ b/codepile/ghdiffs/ghdiffs_scrape.py @@ -127,7 +127,7 @@ def id(self) -> str: class GitHubDiffScraper(Scraper): def __init__(self, config): # TODO: Dask multi-node scheduling here - client = Client(n_workers=config.n_workers, threads_per_worker=config.threads_per_worker) + self.client = Client(n_workers=config.n_workers, threads_per_worker=config.threads_per_worker) self.read_path = Path(config.read_path) self.save_path = Path(config.save_path) From ef100741a22fec7635f56de6982bbb93cf6725d2 Mon Sep 17 00:00:00 2001 From: Herbie Bradley Date: Mon, 17 Oct 2022 13:32:30 +0100 Subject: [PATCH 09/16] Add filtering --- codepile/ghdiffs/ghdiffs_scrape.py | 99 ++++++++++++++++++------------ 1 file changed, 59 insertions(+), 40 deletions(-) diff --git a/codepile/ghdiffs/ghdiffs_scrape.py b/codepile/ghdiffs/ghdiffs_scrape.py index 010b497..cf06bea 100644 --- a/codepile/ghdiffs/ghdiffs_scrape.py +++ b/codepile/ghdiffs/ghdiffs_scrape.py @@ -1,4 +1,5 @@ import json +from typing import Any import urllib.request import argparse from pathlib import Path @@ -36,10 +37,10 @@ def process_ind_patch(patch_diff) -> dict: return patch_parsed_diff -def get_before_file(file_diff: dict, commit_hash: str, repo_name: str) -> str: +def get_before_file(file_diff: dict, commit_hash: str, repo_name: str, length_threshold: int) -> str: repo_owner, repo_name = repo_name.split("/") if file_diff["src_file"] == "/dev/null": - raw_file = ["Add"] + raw_file: Any = ["ADDFILE"] elif file_diff["tgt_file"] == "/dev/null": # If file is deleted, get before file from the raw diff, which will be the full file. raw_file = [line[1:] + "\n" for line in file_diff["hunks_process"][0][3:]] @@ -51,13 +52,12 @@ def get_before_file(file_diff: dict, commit_hash: str, repo_name: str) -> str: raw_file = urllib.request.urlopen(file_raw_url) raw_file_encoding = raw_file.headers.get_charsets()[0] raw_file = [line.decode(raw_file_encoding) for line in raw_file.readlines()] + if length_threshold > 0 and len(raw_file) > length_threshold: + return "" except Exception as e: print(e) print(file_raw_url) return "" - # file_length = sum(1 for _ in raw_file) - # if file_length < length_threshold: - # files_list.append(raw_file) # Iterate over hunks for this file and apply the reverse patch. for hunk in file_diff["hunks_process"]: hunk_list = [] @@ -69,39 +69,6 @@ def get_before_file(file_diff: dict, commit_hash: str, repo_name: str) -> str: return "".join(raw_file) -def process_commit(commit_data: dict) -> list[dict]: - """ - Process a commit dictionary to get the before files and diff dict. - - Args: - commit_data (dict): Dictionary containing commit hash, repo name, and - commit message. - - Returns: - list[dict]: A list of dicts, where each dict contains the data for a - change to a single file. - """ - # Scrape a commit's diff file. 
- diff_url = f"https://github.com/{commit_data['repo_name']}/commit/{commit_data['commit']}.diff" - try: - diff = urllib.request.urlopen(diff_url) - except Exception as e: - print(e) - return [] - encoding = diff.headers.get_charsets()[0] - patch = PatchSet(diff, encoding=encoding) - commit_list: list[dict] = [] - for patch_ind in patch: - # Skip if the file is not a python file. - # if not Path(patch_ind.target_file).suffix == ".py": - # continue - diff_dict: dict = process_ind_patch(patch_ind) - diff_dict.update(commit_data) - diff_dict["before_file"] = get_before_file(diff_dict, commit_data["commit"], commit_data["repo_name"]) - commit_list.append(diff_dict) - return commit_list - - class GitHubDiffDataset(Dataset): def __init__(self, config): self.config = config @@ -127,20 +94,65 @@ def id(self) -> str: class GitHubDiffScraper(Scraper): def __init__(self, config): # TODO: Dask multi-node scheduling here - self.client = Client(n_workers=config.n_workers, threads_per_worker=config.threads_per_worker) + # self.client = Client(n_workers=config.n_workers, threads_per_worker=config.threads_per_worker) self.read_path = Path(config.read_path) self.save_path = Path(config.save_path) + self.python_only = config.python_only + self.diff_length_threshold = config.diff_length_threshold + self.code_length_threshold = config.code_length_threshold + self.ignore_deletions = config.ignore_deletions def scrape(self) -> RawDataset: + # TODO: pass meta dataframe result = ( db.read_text(self.read_path).map(json.loads) - .map(process_commit).flatten().to_dataframe() + .map(self.process_commit).flatten().to_dataframe() .to_parquet(self.save_path) ) progress(result) dataset = RawDataset(storage_uris=["https://github.com/CarperAI/Code-Pile"], complete=True) return dataset + def process_commit(self, commit_data: dict) -> list[dict]: + """ + Process a commit dictionary to get the before files and diff dict. + + Args: + commit_data (dict): Dictionary containing commit hash, repo name, and + commit message. + + Returns: + list[dict]: A list of dicts, where each dict contains the data for a + change to a single file. + """ + if self.python_only and commit_data["language_name"] != "Python": + return [] + # Scrape a commit's diff file. + diff_url = f"https://github.com/{commit_data['repo_name']}/commit/{commit_data['commit']}.diff" + try: + diff = urllib.request.urlopen(diff_url) + encoding = diff.headers.get_charsets()[0] + patch = PatchSet(diff, encoding=encoding) + except Exception as e: + print(e) + return [] + commit_list: list[dict] = [] + # Iterate over files within the diff. + for patch_ind in patch: + if self.ignore_deletions and patch_ind.target_file == "/dev/null": + continue + if self.diff_length_threshold > 0 and sum(len(hunk) for hunk in patch_ind) > self.diff_length_threshold: + continue + diff_dict: dict = process_ind_patch(patch_ind) + diff_dict["before_file"] = get_before_file(diff_dict, commit_data["commit"], commit_data["repo_name"], + length_threshold=self.code_length_threshold) + if not diff_dict["before_file"]: + # Happens if exception is thrown or file is too long. 
+ continue + diff_dict.update(commit_data) + commit_list.append(diff_dict) + return commit_list + if __name__ == "__main__": parser = argparse.ArgumentParser('codepile dataset tool') @@ -149,6 +161,13 @@ def scrape(self) -> RawDataset: parser.add_argument('--save_path', type=str) parser.add_argument('--n_workers', type=int, default=8) parser.add_argument('--threads_per_worker', type=int, default=2) + parser.add_argument('--python_only', type=bool, default=False) + parser.add_argument('--diff_length_threshold', type=int, default=1000, + help="Maximum number of lines in the diff for a *single* file. Set to 0 for no limit.") + parser.add_argument('--code_length_threshold', type=int, default=1000, + help="Maximum number of lines in code files") + parser.add_argument('--ignore_deletions', type=bool, default=True, + help="Ignore file deletion diffs.") config = parser.parse_args() ghdiff_dataset = GitHubDiffDataset(config) ghdiff_dataset.download() From 4f28074e6ad1889ea08b26911c89a775018487b4 Mon Sep 17 00:00:00 2001 From: Herbie Bradley Date: Mon, 17 Oct 2022 15:27:44 +0100 Subject: [PATCH 10/16] Fix saving with metadata --- codepile/ghdiffs/ghdiffs_scrape.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/codepile/ghdiffs/ghdiffs_scrape.py b/codepile/ghdiffs/ghdiffs_scrape.py index cf06bea..8044048 100644 --- a/codepile/ghdiffs/ghdiffs_scrape.py +++ b/codepile/ghdiffs/ghdiffs_scrape.py @@ -24,7 +24,6 @@ def process_ind_patch(patch_diff) -> dict: patch_parsed_diff["file_extension"] = Path(patch_diff.source_file).suffix else: patch_parsed_diff["file_extension"] = Path(patch_diff.target_file).suffix - # patch_parsed_diff["patch_info"] = patch_diff.patch_info for patch_diff_ind in patch_diff: patch_diff_ind = str(patch_diff_ind) patch_diff_split = patch_diff_ind.split("@@") @@ -55,8 +54,7 @@ def get_before_file(file_diff: dict, commit_hash: str, repo_name: str, length_th if length_threshold > 0 and len(raw_file) > length_threshold: return "" except Exception as e: - print(e) - print(file_raw_url) + # print(e, file_raw_url) return "" # Iterate over hunks for this file and apply the reverse patch. 
for hunk in file_diff["hunks_process"]: @@ -94,7 +92,7 @@ def id(self) -> str: class GitHubDiffScraper(Scraper): def __init__(self, config): # TODO: Dask multi-node scheduling here - # self.client = Client(n_workers=config.n_workers, threads_per_worker=config.threads_per_worker) + self.client = Client(n_workers=config.n_workers, threads_per_worker=config.threads_per_worker) self.read_path = Path(config.read_path) self.save_path = Path(config.save_path) self.python_only = config.python_only @@ -103,10 +101,14 @@ def __init__(self, config): self.ignore_deletions = config.ignore_deletions def scrape(self) -> RawDataset: - # TODO: pass meta dataframe + meta_spec = {'hunks': str, 'addition_count': int, 'deletion_count': int, + 'src_file': str, 'tgt_file': str, 'file_extension': str, + 'before_file': str, 'commit': str, 'message': str, + 'repo_name': str, 'language_name': str, 'author_name': str, + 'license': str} result = ( db.read_text(self.read_path).map(json.loads) - .map(self.process_commit).flatten().to_dataframe() + .map(self.process_commit).flatten().to_dataframe(meta=meta_spec) .to_parquet(self.save_path) ) progress(result) @@ -133,8 +135,10 @@ def process_commit(self, commit_data: dict) -> list[dict]: diff = urllib.request.urlopen(diff_url) encoding = diff.headers.get_charsets()[0] patch = PatchSet(diff, encoding=encoding) + if len(patch) == 0: + return [] except Exception as e: - print(e) + # print(e, diff_url) return [] commit_list: list[dict] = [] # Iterate over files within the diff. @@ -149,7 +153,12 @@ def process_commit(self, commit_data: dict) -> list[dict]: if not diff_dict["before_file"]: # Happens if exception is thrown or file is too long. continue - diff_dict.update(commit_data) + diff_dict["commit"] = commit_data["commit"] + diff_dict["message"] = commit_data["message"] + diff_dict["repo_name"] = commit_data["repo_name"] + diff_dict["language_name"] = commit_data["language_name"] + diff_dict["author_name"] = commit_data["author"]["name"] + diff_dict["license"] = commit_data["license"] commit_list.append(diff_dict) return commit_list @@ -165,7 +174,7 @@ def process_commit(self, commit_data: dict) -> list[dict]: parser.add_argument('--diff_length_threshold', type=int, default=1000, help="Maximum number of lines in the diff for a *single* file. Set to 0 for no limit.") parser.add_argument('--code_length_threshold', type=int, default=1000, - help="Maximum number of lines in code files") + help="Maximum number of lines in code files. Set to 0 for no limit.") parser.add_argument('--ignore_deletions', type=bool, default=True, help="Ignore file deletion diffs.") config = parser.parse_args() From b50ca0bc8092d7fe35ca93234fce37e8479943f2 Mon Sep 17 00:00:00 2001 From: Herbie Bradley Date: Mon, 17 Oct 2022 18:06:29 +0100 Subject: [PATCH 11/16] Fix dask bug with map --- codepile/ghdiffs/ghdiffs_scrape.py | 103 ++++++++++++++--------------- 1 file changed, 51 insertions(+), 52 deletions(-) diff --git a/codepile/ghdiffs/ghdiffs_scrape.py b/codepile/ghdiffs/ghdiffs_scrape.py index 8044048..d2085bc 100644 --- a/codepile/ghdiffs/ghdiffs_scrape.py +++ b/codepile/ghdiffs/ghdiffs_scrape.py @@ -67,6 +67,54 @@ def get_before_file(file_diff: dict, commit_hash: str, repo_name: str, length_th return "".join(raw_file) +def process_commit(commit_data: dict, config: argparse.Namespace) -> list[dict]: + """ + Process a commit dictionary to get the before files and diff dict. + + Args: + commit_data (dict): Dictionary containing commit hash, repo name, and + commit message. 
+ + Returns: + list[dict]: A list of dicts, where each dict contains the data for a + change to a single file. + """ + if config.python_only and commit_data["language_name"] != "Python": + return [] + # Scrape a commit's diff file. + diff_url = f"https://github.com/{commit_data['repo_name']}/commit/{commit_data['commit']}.diff" + try: + diff = urllib.request.urlopen(diff_url) + encoding = diff.headers.get_charsets()[0] + patch = PatchSet(diff, encoding=encoding) + if len(patch) == 0: + return [] + except Exception as e: + # print(e, diff_url) + return [] + commit_list: list[dict] = [] + # Iterate over files within the diff. + for patch_ind in patch: + if config.ignore_deletions and patch_ind.target_file == "/dev/null": + continue + if config.diff_length_threshold > 0 and sum(len(hunk) for hunk in patch_ind) > config.diff_length_threshold: + continue + diff_dict: dict = process_ind_patch(patch_ind) + diff_dict["before_file"] = get_before_file(diff_dict, commit_data["commit"], commit_data["repo_name"], + length_threshold=config.code_length_threshold) + if not diff_dict["before_file"]: + # Happens if exception is thrown or file is too long. + continue + diff_dict["commit"] = commit_data["commit"] + diff_dict["message"] = commit_data["message"] + diff_dict["repo_name"] = commit_data["repo_name"] + diff_dict["language_name"] = commit_data["language_name"] + diff_dict["author_name"] = commit_data["author"]["name"] + diff_dict["license"] = commit_data["license"] + commit_list.append(diff_dict) + return commit_list + + class GitHubDiffDataset(Dataset): def __init__(self, config): self.config = config @@ -92,13 +140,11 @@ def id(self) -> str: class GitHubDiffScraper(Scraper): def __init__(self, config): # TODO: Dask multi-node scheduling here + # TODO: Release the GIL inside process_commit + self.config = config self.client = Client(n_workers=config.n_workers, threads_per_worker=config.threads_per_worker) self.read_path = Path(config.read_path) self.save_path = Path(config.save_path) - self.python_only = config.python_only - self.diff_length_threshold = config.diff_length_threshold - self.code_length_threshold = config.code_length_threshold - self.ignore_deletions = config.ignore_deletions def scrape(self) -> RawDataset: meta_spec = {'hunks': str, 'addition_count': int, 'deletion_count': int, @@ -108,60 +154,13 @@ def scrape(self) -> RawDataset: 'license': str} result = ( db.read_text(self.read_path).map(json.loads) - .map(self.process_commit).flatten().to_dataframe(meta=meta_spec) + .map(process_commit, config=self.config).flatten().to_dataframe(meta=meta_spec) .to_parquet(self.save_path) ) progress(result) dataset = RawDataset(storage_uris=["https://github.com/CarperAI/Code-Pile"], complete=True) return dataset - def process_commit(self, commit_data: dict) -> list[dict]: - """ - Process a commit dictionary to get the before files and diff dict. - - Args: - commit_data (dict): Dictionary containing commit hash, repo name, and - commit message. - - Returns: - list[dict]: A list of dicts, where each dict contains the data for a - change to a single file. - """ - if self.python_only and commit_data["language_name"] != "Python": - return [] - # Scrape a commit's diff file. 
- diff_url = f"https://github.com/{commit_data['repo_name']}/commit/{commit_data['commit']}.diff" - try: - diff = urllib.request.urlopen(diff_url) - encoding = diff.headers.get_charsets()[0] - patch = PatchSet(diff, encoding=encoding) - if len(patch) == 0: - return [] - except Exception as e: - # print(e, diff_url) - return [] - commit_list: list[dict] = [] - # Iterate over files within the diff. - for patch_ind in patch: - if self.ignore_deletions and patch_ind.target_file == "/dev/null": - continue - if self.diff_length_threshold > 0 and sum(len(hunk) for hunk in patch_ind) > self.diff_length_threshold: - continue - diff_dict: dict = process_ind_patch(patch_ind) - diff_dict["before_file"] = get_before_file(diff_dict, commit_data["commit"], commit_data["repo_name"], - length_threshold=self.code_length_threshold) - if not diff_dict["before_file"]: - # Happens if exception is thrown or file is too long. - continue - diff_dict["commit"] = commit_data["commit"] - diff_dict["message"] = commit_data["message"] - diff_dict["repo_name"] = commit_data["repo_name"] - diff_dict["language_name"] = commit_data["language_name"] - diff_dict["author_name"] = commit_data["author"]["name"] - diff_dict["license"] = commit_data["license"] - commit_list.append(diff_dict) - return commit_list - if __name__ == "__main__": parser = argparse.ArgumentParser('codepile dataset tool') From b5f4b92a3a4e6068486b96ebe1e6dc141348400b Mon Sep 17 00:00:00 2001 From: Reshinth Adithyan <36307201+reshinthadithyan@users.noreply.github.com> Date: Tue, 18 Oct 2022 18:00:05 +0530 Subject: [PATCH 12/16] GitHub Diff Repo Filter --- codepile/ghdiffs/ghdiffs_repo_filter.py | 96 +++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 codepile/ghdiffs/ghdiffs_repo_filter.py diff --git a/codepile/ghdiffs/ghdiffs_repo_filter.py b/codepile/ghdiffs/ghdiffs_repo_filter.py new file mode 100644 index 0000000..024b8d9 --- /dev/null +++ b/codepile/ghdiffs/ghdiffs_repo_filter.py @@ -0,0 +1,96 @@ +import re +from turtle import position +import pandas as pd +import requests +import logging +from io import StringIO +import os +from tqdm import tqdm +import json + +logging.basicConfig( + level = logging.INFO, + format= '%(asctime)s %(levelname)-8s %(message)s', + datefmt='%Y-%m-%d %H:%M:%S') +logger = logging.getLogger(__name__) + + +def read_json_file(json_path:str): + with open(json_path,"r") as f: + return [json.loads(line) for line in f] + +def write_json_file(json_path:str,obj:dict): + with open(json_path, "w") as jsonFile: + json.dump(obj, jsonFile) + +class GitHubDiffFilter: + def __init__(self) -> None: + repo_file_url = "https://raw.githubusercontent.com/EleutherAI/github-downloader/master/github_repositories.csv" + repo_file_string = StringIO(requests.get(repo_file_url).text) + self.single_file_count_threshold = 10_000 + self.save_every = 1000 + if not os.path.isdir("tmp"): + logger.info(f"Sucessfully added `tmp/` path") + os.mkdir("tmp") + + self.ckpt_path = os.path.join("tmp","ckpt.json") + self.repos_df = pd.read_csv(repo_file_string) + self.top_repos_list = self.get_top_repo_list() + self.checkpoint_list = [] #Would contain the curr_index of the last saved file. + logger.info(f"Size of top repos : {len(self.top_repos_list)}") + + def get_top_repo_list(self)->list: + return self.repos_df.iloc[:,0].values.tolist() + + def get_diff_path_list(self,github_diff_path:str)->list: + """ + Get all the files in the given path + returns : + github_diff_list (list) : GitHub Diff path. 
+ """ + return os.listdir(github_diff_path) + + def __call__(self,github_diff_path,output_file_path): + github_diff_files = self.get_diff_path_list(github_diff_path) + logger.info(f"Starting to process {len(github_diff_files)} files..") + #index and stuffs. + content_output_count = 0 + file_index = 0 + output_list = [] + output_file_index = 0 + for github_diff_file in tqdm(github_diff_files,total=len(github_diff_files)): + file_index += 1 # File index. + + if file_index == self.save_every: + write_json_file(self.ckpt_path,{"index":file_index}) + github_diff_file_path = os.path.join(github_diff_path,github_diff_file) + github_diff_content_list = read_json_file(github_diff_file_path) + for ind_content in tqdm(github_diff_content_list,total=len(github_diff_content_list),leave=False): + if ind_content["repo_name"] not in self.top_repos_list: + output_list.append(ind_content) + content_output_count += 1 + self.checkpoint_list.append(file_index) + + if content_output_count == self.single_file_count_threshold: + #If the number of elements in the list is + content_df = pd.DataFrame.from_dict(output_list,orient="columns") + output_inter_file_path = os.path.join(output_file_path,str(output_file_index)+".parquet") + content_df.to_parquet(output_inter_file_path) + output_file_index += 1 + + #Final Chunk goes here.. + write_json_file(self.ckpt_path,{"index":file_index}) + content_df = pd.DataFrame.from_dict(output_list,orient="columns") + output_final_file_path = os.path.join(output_file_path,str(output_file_index)+".parquet") + content_df.to_parquet(output_final_file_path) + + + + + + + + + +if __name__ == "__main__": + gh_diff_filter = GitHubDiffFilter() \ No newline at end of file From 84dcb2cc890de49f2415d1fb44e499d0aa33bf68 Mon Sep 17 00:00:00 2001 From: Reshinth Adithyan <36307201+reshinthadithyan@users.noreply.github.com> Date: Tue, 18 Oct 2022 18:08:16 +0530 Subject: [PATCH 13/16] Add resume checkpoint util --- codepile/ghdiffs/ghdiffs_repo_filter.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/codepile/ghdiffs/ghdiffs_repo_filter.py b/codepile/ghdiffs/ghdiffs_repo_filter.py index 024b8d9..9a14ea0 100644 --- a/codepile/ghdiffs/ghdiffs_repo_filter.py +++ b/codepile/ghdiffs/ghdiffs_repo_filter.py @@ -32,8 +32,12 @@ def __init__(self) -> None: if not os.path.isdir("tmp"): logger.info(f"Sucessfully added `tmp/` path") os.mkdir("tmp") - self.ckpt_path = os.path.join("tmp","ckpt.json") + + if os.path.exists(self.ckpt_path): + self.to_start = read_json_file(self.ckpt_path)["index"] + else: + self.to_start = 0 self.repos_df = pd.read_csv(repo_file_string) self.top_repos_list = self.get_top_repo_list() self.checkpoint_list = [] #Would contain the curr_index of the last saved file. @@ -58,6 +62,9 @@ def __call__(self,github_diff_path,output_file_path): file_index = 0 output_list = [] output_file_index = 0 + + #Start_ind + github_diff_files = github_diff_files[self.to_start:] for github_diff_file in tqdm(github_diff_files,total=len(github_diff_files)): file_index += 1 # File index. 
From 9ac9ece57996df445a6165273b1c6b706762a294 Mon Sep 17 00:00:00 2001 From: Herbie Bradley Date: Wed, 19 Oct 2022 02:21:58 +0100 Subject: [PATCH 14/16] Filter out zero diffs --- codepile/ghdiffs/ghdiffs_scrape.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/codepile/ghdiffs/ghdiffs_scrape.py b/codepile/ghdiffs/ghdiffs_scrape.py index d2085bc..753446f 100644 --- a/codepile/ghdiffs/ghdiffs_scrape.py +++ b/codepile/ghdiffs/ghdiffs_scrape.py @@ -5,7 +5,7 @@ from pathlib import Path import dask.bag as db from codepile.dataset import (Dataset, DatasetInfo, RawDataset, Scraper) -from dask.distributed import Client, progress +from dask.distributed import Client, progress, LocalCluster from unidiff import PatchSet @@ -54,7 +54,6 @@ def get_before_file(file_diff: dict, commit_hash: str, repo_name: str, length_th if length_threshold > 0 and len(raw_file) > length_threshold: return "" except Exception as e: - # print(e, file_raw_url) return "" # Iterate over hunks for this file and apply the reverse patch. for hunk in file_diff["hunks_process"]: @@ -99,6 +98,9 @@ def process_commit(commit_data: dict, config: argparse.Namespace) -> list[dict]: continue if config.diff_length_threshold > 0 and sum(len(hunk) for hunk in patch_ind) > config.diff_length_threshold: continue + # Filter non-text files. + if patch_ind.added == 0 and patch_ind.removed == 0: + continue diff_dict: dict = process_ind_patch(patch_ind) diff_dict["before_file"] = get_before_file(diff_dict, commit_data["commit"], commit_data["repo_name"], length_threshold=config.code_length_threshold) @@ -140,9 +142,9 @@ def id(self) -> str: class GitHubDiffScraper(Scraper): def __init__(self, config): # TODO: Dask multi-node scheduling here - # TODO: Release the GIL inside process_commit self.config = config - self.client = Client(n_workers=config.n_workers, threads_per_worker=config.threads_per_worker) + cluster = LocalCluster(n_workers=config.n_workers, threads_per_worker=config.threads_per_worker) + self.client = Client(cluster) self.read_path = Path(config.read_path) self.save_path = Path(config.save_path) @@ -168,7 +170,7 @@ def scrape(self) -> RawDataset: parser.add_argument('--read_path', type=str) parser.add_argument('--save_path', type=str) parser.add_argument('--n_workers', type=int, default=8) - parser.add_argument('--threads_per_worker', type=int, default=2) + parser.add_argument('--threads_per_worker', type=int, default=1) parser.add_argument('--python_only', type=bool, default=False) parser.add_argument('--diff_length_threshold', type=int, default=1000, help="Maximum number of lines in the diff for a *single* file. 
Set to 0 for no limit.") From 6633763fed9d7c6a975381b3d20ba5ebba9a2de0 Mon Sep 17 00:00:00 2001 From: Herbie Bradley Date: Wed, 19 Oct 2022 02:22:09 +0100 Subject: [PATCH 15/16] Add dummy parquet --- codepile/ghdiffs/test/ghdiffs_dummy.parquet | Bin 0 -> 7735 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 codepile/ghdiffs/test/ghdiffs_dummy.parquet diff --git a/codepile/ghdiffs/test/ghdiffs_dummy.parquet b/codepile/ghdiffs/test/ghdiffs_dummy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..e30733b2bf10784f09093357928000723f8010ee GIT binary patch literal 7735 zcmcgx&2QV-5$EQ^NjBcyqOA?PaN9t@3MlfS@2s`AM>(RY+OngpNlHZ_@JkY9lF~|~ zWjP2?6g}kFL(xM{d+4cuK>wf~dRXkKhn{li%p)Z}iLy4z3T7foeDm>}H#2W$2%nN# zX6w!tw{`0XZrsl~j!P!@wl;SQKjSubckXt4_k6f@Yn$8N+x|_GOWyz8Uce5F^KyUx z1Zbh)=c-{?p4D@kde3(~M#JS24exF-8Vu7m7ixI_$r2Uc-HxokKOE@ImSvj^{GV?x z5q@`v5%${Nyzu*F!hgFP34e~LP4m(--61%E!T;Z#W%wJBiieun>J7}833r#t-;3lo z^q%8btmofvED`{oX?UpmUZ*#hm;e6LW%8Sm{5{*!A!sw!KU^aFr_Z94{YA6s`nKIP z)Zuy4at-sc*1UD|06aP8}}J z9*#hEn(h;ERCs@q#_Zx;da@)o?b!7&o!F&Gny_n8_}SIju(A|3TxM#m+y!c%2Aph2FD$#~miVe)NaL zf=ZJ3KsCK_9~l$AYw111OaQm*JN@yL z(9^o6?%}#)da9v%DzLv!m@cqeZX0FKz9~Q5PrP{(88d4|f=&#mV#bhY`6to0XuyOhvzU6w)^Knm1ml;RIz-Nna96U~6Blu*tiNms#Il$E-AKUPxm&7qy z%6c(pC#g7coW5RsW(K8<6~5!Z_xPPTDWhOl15MO`e`8LnZPvt(M3KcHjLxtDVSS2SfYLcfU^2 zi~ODt@)ACH!S6ib+oViLo0Qsgjpt|RBV2qjrJs<);2G{z3#E&Mj0niVH&p`PR0!^M z`0%5;TsW^Q$AH)adHyTWvHgZ1<&|vpLOYZ^%{{hGEM61sVoP(Sp;j&ee|4Z_L%z(u zrMw!oMJMF9i+#}iQQgX&J8WiDSM4q;&V|bWHezk6iXFHm8D^$T=eU5#zsD> zy99b%z60_dJyT4Z3QuXGEBUHiy=Vw$)=`OA!0YLUcFQ=lM^taAa#45sc0KnppvzLa zF4sDS@B(dzJxP8&TRd0gf{R$9EuGb@bY65b#Tnbfy_)07)GpEKWHiUS5-r?gqu-D( z&y{SUqseEWn^@{0*p@q{xVlgV-<7YFT&%ERt#1hS zg=QDKT2AsqoG|WH)`xx2ldT4JEU?q|je~kR@I%Hngfiqotuu|?VR|SYGO;dI&cRo} z7sHOlUW9zn1Sze`MIZAHd=}s=$_2>J$PUOSg)gxFh8U---GzAod%7I9AXjJN>sH70 zOc89lw6BY?o~v|~Y@@G*`FB%sHUuNh;x349x{<53V0{?o*P^&W{@PtFTMPFW_nDTh zeA|$#tHhT2OUX)kLy&wu3o*86{w|0u`b}yzluo)K7p^eI)X$4z`_s<(;`llhyWtea z4QY9d>yG4Txr)vD`=;UyXV}wZJM@3N#>~)f%x@W6&s95(Y*2F-Q!Afg1ovZ zyWEPc3($*#p0co)h;g2qIu}Fer>-HFUC6hfhg{P)H0%{Q3H&Rud$}n)JBCv0LB6xt z3(jsiV+frK-DdW3Q*m}Kp>JYsxioSRPI+wNY*DI=Ou0PPonkJqqhafA z6=sS&om);`NgX8cze=StbMpLT-bW6+&f&w48w!6h z!G-7J)I}=QL3w0 z>J0IKZ@GVFdn*}J`wq~(2yu;nFrWDNoB?J%M)|8b`IvvuemZ7lI-}%82(^M0r$9I&_sfiRLSPBq#UYyz?!=mMtd0j`5_(;$ZVFuz}VK!o5uClU!9 whR@v2Z~N-tTi^7a>izyx`ri8}e6f8*-*|U_fIl^T;D7F4zv8%4_>=Mf0pKy7h5!Hn literal 0 HcmV?d00001 From 6b61fbe4ea23af9543dd45e945a9f7b7726dc5c8 Mon Sep 17 00:00:00 2001 From: Herbie Bradley Date: Wed, 19 Oct 2022 02:22:40 +0100 Subject: [PATCH 16/16] Fix bug & change output to json for easy scraping --- codepile/ghdiffs/ghdiffs_repo_filter.py | 77 ++++++++++++------------- 1 file changed, 36 insertions(+), 41 deletions(-) diff --git a/codepile/ghdiffs/ghdiffs_repo_filter.py b/codepile/ghdiffs/ghdiffs_repo_filter.py index 9a14ea0..0082cfe 100644 --- a/codepile/ghdiffs/ghdiffs_repo_filter.py +++ b/codepile/ghdiffs/ghdiffs_repo_filter.py @@ -9,20 +9,22 @@ import json logging.basicConfig( - level = logging.INFO, - format= '%(asctime)s %(levelname)-8s %(message)s', + level=logging.INFO, + format='%(asctime)s %(levelname)-8s %(message)s', datefmt='%Y-%m-%d %H:%M:%S') logger = logging.getLogger(__name__) -def read_json_file(json_path:str): - with open(json_path,"r") as f: +def read_json_file(json_path: str): + with open(json_path, "r") as f: return [json.loads(line) for line in f] -def write_json_file(json_path:str,obj:dict): + +def write_json_file(json_path: str, obj: dict): with open(json_path, "w") 
as jsonFile: json.dump(obj, jsonFile) + class GitHubDiffFilter: def __init__(self) -> None: repo_file_url = "https://raw.githubusercontent.com/EleutherAI/github-downloader/master/github_repositories.csv" @@ -32,72 +34,65 @@ def __init__(self) -> None: if not os.path.isdir("tmp"): logger.info(f"Sucessfully added `tmp/` path") os.mkdir("tmp") - self.ckpt_path = os.path.join("tmp","ckpt.json") - + self.ckpt_path = os.path.join("tmp", "ckpt.json") + if os.path.exists(self.ckpt_path): self.to_start = read_json_file(self.ckpt_path)["index"] else: self.to_start = 0 - self.repos_df = pd.read_csv(repo_file_string) + self.repos_df = pd.read_csv(repo_file_string) self.top_repos_list = self.get_top_repo_list() - self.checkpoint_list = [] #Would contain the curr_index of the last saved file. + self.checkpoint_list = [] # Would contain the curr_index of the last saved file. logger.info(f"Size of top repos : {len(self.top_repos_list)}") - def get_top_repo_list(self)->list: - return self.repos_df.iloc[:,0].values.tolist() + def get_top_repo_list(self) -> list: + return self.repos_df.iloc[:, 0].values.tolist() - def get_diff_path_list(self,github_diff_path:str)->list: + def get_diff_path_list(self, github_diff_path: str) -> list: """ Get all the files in the given path returns : github_diff_list (list) : GitHub Diff path. """ return os.listdir(github_diff_path) - - def __call__(self,github_diff_path,output_file_path): + + def __call__(self, github_diff_path, output_file_path): github_diff_files = self.get_diff_path_list(github_diff_path) logger.info(f"Starting to process {len(github_diff_files)} files..") - #index and stuffs. + # index and stuffs. content_output_count = 0 file_index = 0 output_list = [] output_file_index = 0 - - #Start_ind + + # Start_ind github_diff_files = github_diff_files[self.to_start:] - for github_diff_file in tqdm(github_diff_files,total=len(github_diff_files)): - file_index += 1 # File index. + for github_diff_file in tqdm(github_diff_files, total=len(github_diff_files)): + file_index += 1 # File index. if file_index == self.save_every: - write_json_file(self.ckpt_path,{"index":file_index}) - github_diff_file_path = os.path.join(github_diff_path,github_diff_file) + write_json_file(self.ckpt_path, {"index": file_index}) + github_diff_file_path = os.path.join(github_diff_path, github_diff_file) github_diff_content_list = read_json_file(github_diff_file_path) - for ind_content in tqdm(github_diff_content_list,total=len(github_diff_content_list),leave=False): - if ind_content["repo_name"] not in self.top_repos_list: + for ind_content in tqdm(github_diff_content_list, total=len(github_diff_content_list), leave=False): + if ind_content["repo_name"] in self.top_repos_list: output_list.append(ind_content) content_output_count += 1 self.checkpoint_list.append(file_index) - + if content_output_count == self.single_file_count_threshold: - #If the number of elements in the list is - content_df = pd.DataFrame.from_dict(output_list,orient="columns") - output_inter_file_path = os.path.join(output_file_path,str(output_file_index)+".parquet") - content_df.to_parquet(output_inter_file_path) + # If the number of elements in the list is + content_df = pd.DataFrame.from_dict(output_list, orient="columns") + output_inter_file_path = os.path.join(output_file_path, str(output_file_index) + ".json") + content_df.to_json(output_inter_file_path, orient="records", lines=True) output_file_index += 1 - - #Final Chunk goes here.. 
- write_json_file(self.ckpt_path,{"index":file_index}) - content_df = pd.DataFrame.from_dict(output_list,orient="columns") - output_final_file_path = os.path.join(output_file_path,str(output_file_index)+".parquet") - content_df.to_parquet(output_final_file_path) - - - - - - + # Final Chunk goes here.. + write_json_file(self.ckpt_path, {"index": file_index}) + content_df = pd.DataFrame.from_dict(output_list, orient="columns") + output_final_file_path = os.path.join(output_file_path, str(output_file_index) + ".json") + content_df.to_json(output_final_file_path, orient="records", lines=True) if __name__ == "__main__": - gh_diff_filter = GitHubDiffFilter() \ No newline at end of file + gh_diff_filter = GitHubDiffFilter()
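The final patch flips the filter to keep commits whose repo_name is in the top-repositories list and writes each output chunk as newline-delimited JSON instead of parquet, so the scraping stage can consume it line by line. A rough sketch of that handoff with dask follows; the output directory and worker counts are assumptions for illustration, not the repo's defaults.

import json
from pathlib import Path

import dask.bag as db
from dask.distributed import Client, LocalCluster


def load_filtered_commits(filtered_dir: str) -> db.Bag:
    # Each chunk written by GitHubDiffFilter is JSON-lines: one commit record
    # (containing at least a repo_name field) per line.
    files = sorted(str(p) for p in Path(filtered_dir).glob("*.json"))
    return db.read_text(files).map(json.loads)


if __name__ == "__main__":
    cluster = LocalCluster(n_workers=4, threads_per_worker=1)  # illustrative sizing
    client = Client(cluster)
    commits = load_filtered_commits("filtered_output")  # hypothetical directory
    print(commits.take(2))  # peek at a couple of records before scraping them

Line-delimited JSON lets dask.bag split each file by line rather than loading whole parquet row groups, which is presumably the "easy scraping" the commit message refers to.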