From 3f9f1ba5da7ea785b4b35d65a9bd22219340b44b Mon Sep 17 00:00:00 2001 From: Herbie Bradley Date: Wed, 5 Oct 2022 21:09:56 +0100 Subject: [PATCH 01/16] Fix bugs in scraping code and add dask --- codepile/ghdiffs/ghdiffs_scrape.py | 113 +++++++++++++++++++++++++++++ codepile/ghdiffs/test_file.json | 20 +++++ 2 files changed, 133 insertions(+) create mode 100644 codepile/ghdiffs/ghdiffs_scrape.py create mode 100644 codepile/ghdiffs/test_file.json diff --git a/codepile/ghdiffs/ghdiffs_scrape.py b/codepile/ghdiffs/ghdiffs_scrape.py new file mode 100644 index 0000000..332bdee --- /dev/null +++ b/codepile/ghdiffs/ghdiffs_scrape.py @@ -0,0 +1,113 @@ +import json +import urllib.request +from pathlib import Path +from pprint import pprint + +import dask.bag as db +import dask.dataframe as dd +import pyarrow as pa +from codepile.dataset import (Analyser, Dataset, DatasetInfo, DatasetSources, + Processor, RawDataset, Scraper) +from unidiff import PatchSet + + +def process_ind_patch(patch_diff) -> dict: + """Process patch to get diff data.""" + patch_parsed_diff: dict = { + "hunks": [], + } + + patch_parsed_diff["addition_count"] = patch_diff.added + patch_parsed_diff["deletion_count"] = patch_diff.removed + patch_parsed_diff["src_file"] = patch_diff.source_file + patch_parsed_diff["tgt_file"] = patch_diff.target_file + # patch_parsed_diff["patch_info"] = patch_diff.patch_info + patch_diff_list = list(patch_diff) + for patch_diff_ind in patch_diff_list: + patch_diff_ind = str(patch_diff_ind) + patch_diff_split = patch_diff_ind.split("@@") + patch_diff_line = patch_diff_split[2].split("\n") + patch_diff_line_numbers = [list(map(int, hunk.strip("-+").split(","))) + for hunk in patch_diff_split[1].strip().split(" ")] + patch_parsed_diff["hunks"].append(patch_diff_line_numbers + patch_diff_line[:-1]) + return patch_parsed_diff + + +def patch_parse(commit_hash: str, repo_name: str) -> list: + """Parse a commit to get diff data.""" + diff_url = (f"https://github.com/{repo_name}/commit/{commit_hash}.diff") + diff = urllib.request.urlopen(diff_url) + encoding = diff.headers.get_charsets()[0] + patch = PatchSet(diff, encoding=encoding) + diff_list: list = [] + for patch_ind in patch: + # Skip if the file is not a python file. + # if not patch_ind.target_file.endswith(".py"): + # continue + diff_list.append(process_ind_patch(patch_ind)) + return diff_list + + +def apply_reverse_patch(diff_list: list, commit_hash: str, repo_name: str, length_threshold: int = 4096) -> list: + """Apply reverse patch to get before files. Returns list of modified files.""" + files_list: list = [] + repo_owner, repo_name = repo_name.split("/") + for diff in diff_list: + if diff["src_file"] == "/dev/null": + files_list.append([]) + continue + # Get raw after file. + file_raw_url = (f"https://raw.githubusercontent.com/{repo_owner}/" + f"{repo_name}/{commit_hash}/{diff['tgt_file'][2:]}") + raw_file = urllib.request.urlopen(file_raw_url) + raw_file_encoding = raw_file.headers.get_charsets()[0] + raw_file = [line.decode(raw_file_encoding) for line in raw_file.readlines()] + # file_length = sum(1 for _ in raw_file) + # if file_length < length_threshold: + files_list.append(raw_file) + # Iterate over hunks for this file and apply the reverse patch. 
+ for hunk in diff["hunks"]: + hunk_list = [] + for line in hunk[3:]: + if line.startswith("-") or line.startswith(" "): + hunk_list.append(line[1:] + "\n") + files_list[-1][hunk[0][0] - 1:hunk[0][0] + hunk[1][1] - 1] = hunk_list + + return files_list + + +def process_commit(commit_data: dict) -> dict: + """Process a commit hash and repo name to get the before files and diff dict.""" + # Get dict containing diff data. + diff_list = patch_parse(commit_data["commit"], commit_data["repo_name"]) + # Get list of files, each of which is a list of strings, one for each line. + files_list = apply_reverse_patch(diff_list, commit_data["commit"], commit_data["repo_name"]) + commit_data["before_files"] = files_list + commit_data["diff"] = [json.dumps(diff) for diff in diff_list] + return commit_data + + +class GitHubDiffDataset(Dataset): + def __init__(self, config): + pass + + +class GitHubDiffScraper(Scraper): + def __init__(self, config): + pass + + def scrape(self): + pass + + +if __name__ == "__main__": + read_path = Path(__file__).parent / "test_file.json" + schema = pa.schema([ + (pa.field("commit", pa.string())), + (pa.field("message", pa.string())), + (pa.field("repo_name", pa.string())), + (pa.field("before_files", pa.list_(pa.list_(pa.string())))), + (pa.field("diff", pa.list_(pa.string()))), + ]) + db.read_text(read_path).map(json.loads).map(process_commit).to_dataframe().to_parquet("test.parquet", + schema=schema) diff --git a/codepile/ghdiffs/test_file.json b/codepile/ghdiffs/test_file.json new file mode 100644 index 0000000..a515a81 --- /dev/null +++ b/codepile/ghdiffs/test_file.json @@ -0,0 +1,20 @@ +{"commit":"f96ce6590c9bad17df325fe965222fadc8a5d485","message":"Merge branch 'master' into assignments_small_fixes","repo_name":"tensorflow/tensorflow"} +{"commit":"0ec0a9bc0437f069ea0b08c46f1734ab837d2bf6","message":"Unroll partial merge from #1472\n","repo_name":"tensorflow/tensorflow"} +{"commit":"c12b21ca8fb6f38b89fe769176079d8530c8683b","message":"In regular DataFeeder for classification convert y into int64 to not raise numpy warnings on itemset\n","repo_name":"tensorflow/tensorflow"} +{"commit":"038ccd6ab030222d6041421d949b71891809de6c","message":"Merge pull request #48 from frol/master\n\nFixed travis scripts to be POSIX sh complaint and proof against whitespace issues","repo_name":"tensorflow/tensorflow"} +{"commit":"e52003b01cd4e7e0bc7a698bcfec331f6d5638f1","message":"Switch to enumerate in seq2seq ops\n","repo_name":"tensorflow/tensorflow"} +{"commit":"a64f3d7eaeceea5999b0530d55e32b6d2ab7d1dc","message":"Merge branch 'master' of github.com:google/skflow\n","repo_name":"tensorflow/tensorflow"} +{"commit":"bd443b52367cc64358d4247a535439e7d42c6641","message":"Merge pull request #24 from terrytangyuan/patch-1\n\nAdded missing CV import in multiple_gpu example","repo_name":"tensorflow/tensorflow"} +{"commit":"abdfb10f2887210d18ff3b33596085f104029d7d","message":"Up the required version of Tensorflow - due to recent RNN changes, skflow will now only support 0.7+\n","repo_name":"tensorflow/tensorflow"} +{"commit":"43cd5c01318e4c51d65eb53f77a2b27ef72c557d","message":"Fixes #22: Added cross_validation to iris.py and iris_custom_model.py\n","repo_name":"tensorflow/tensorflow"} +{"commit":"b75c2a316558cf26cf1c81644dad43a7c2edd332","message":"Added apache headers\n","repo_name":"tensorflow/tensorflow"} +{"commit":"21df765ae4a35f655b068ae7d2440da98944cb7e","message":"Adding rnn_decoder with sampling sub-graph. 
Adding rnn seq2seq that will work with it.\n","repo_name":"tensorflow/tensorflow"} +{"commit":"67e33815af883113216001db33a8d4d14a674823","message":"Refactored out Trainer and ops from main file\n","repo_name":"tensorflow/tensorflow"} +{"commit":"881265ff80209710b2f064a98e2b7bc4608bbf64","message":"Fixes #42: Added a test for GridSearchCV to check that DNN works correctly with it\n","repo_name":"tensorflow/tensorflow"} +{"commit":"135018dbe63d7d72201862d0c5e5878d1fa4a923","message":"Added more coverage in tests\n","repo_name":"tensorflow/tensorflow"} +{"commit":"9cfbd7bcc1b9fe89fc22261ba316d6d5336985b4","message":"Merge pull request #36 from terrytangyuan/master\n\nAllow pandas.DataFrame data types","repo_name":"tensorflow/tensorflow"} +{"commit":"e27aa15ed7fd4f3d735bf7a8a29b97fdede710bd","message":"Add usage of batch norm in conv test and fix usage of is_training collection\n","repo_name":"tensorflow/tensorflow"} +{"commit":"05cae1f96f7ac2dfb6368fd87e6254194a04281a","message":"Merge pull request #89 from lopuhin/fix-dropout\n\nFix dropout management in TensorFlowEstimator._predict","repo_name":"tensorflow/tensorflow"} +{"commit":"a9a488de5754936a7e67245033f0d6b5fe7b4c88","message":"Fix #130: Added explicit save/restore to VocabularyProcessor\n","repo_name":"tensorflow/tensorflow"} +{"commit":"edc98e6269dc448f8e55be9af6018e4ee768b223","message":"Merge branch 'master' of github.com:google/skflow\n","repo_name":"tensorflow/tensorflow"} +{"commit":"845b91dd680a15e37e516d80fc7bff498e827f94","message":"[tf.learn] Allow Attention in RNNClassifier and Regressor (#2820)\n\n* Added attention cell wrappers in rnn_model\r\n\r\n* refactor and add attn_rnn tests\r\n\r\n* revised according to comments on lint\r\n","repo_name":"tensorflow/tensorflow"} From 2b679397ed5d9e4f4d1fdfe12664d3a656a99f16 Mon Sep 17 00:00:00 2001 From: Herbie Bradley Date: Wed, 5 Oct 2022 21:11:36 +0100 Subject: [PATCH 02/16] Add dask client --- codepile/ghdiffs/ghdiffs_scrape.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/codepile/ghdiffs/ghdiffs_scrape.py b/codepile/ghdiffs/ghdiffs_scrape.py index 332bdee..a1c66b7 100644 --- a/codepile/ghdiffs/ghdiffs_scrape.py +++ b/codepile/ghdiffs/ghdiffs_scrape.py @@ -4,10 +4,10 @@ from pprint import pprint import dask.bag as db -import dask.dataframe as dd import pyarrow as pa from codepile.dataset import (Analyser, Dataset, DatasetInfo, DatasetSources, Processor, RawDataset, Scraper) +from dask.distributed import Client from unidiff import PatchSet @@ -102,6 +102,7 @@ def scrape(self): if __name__ == "__main__": read_path = Path(__file__).parent / "test_file.json" + client = Client(n_workers=8, threads_per_worker=2) schema = pa.schema([ (pa.field("commit", pa.string())), (pa.field("message", pa.string())), From f448fc4c2250c678d88e646707c6036c72359b15 Mon Sep 17 00:00:00 2001 From: Herbie Bradley Date: Thu, 6 Oct 2022 01:16:27 +0100 Subject: [PATCH 03/16] Fix parquet saving --- codepile/ghdiffs/ghdiffs_scrape.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/codepile/ghdiffs/ghdiffs_scrape.py b/codepile/ghdiffs/ghdiffs_scrape.py index a1c66b7..8f6e0e3 100644 --- a/codepile/ghdiffs/ghdiffs_scrape.py +++ b/codepile/ghdiffs/ghdiffs_scrape.py @@ -4,6 +4,7 @@ from pprint import pprint import dask.bag as db +import numpy as np import pyarrow as pa from codepile.dataset import (Analyser, Dataset, DatasetInfo, DatasetSources, Processor, RawDataset, Scraper) @@ -15,6 +16,7 @@ def process_ind_patch(patch_diff) -> dict: """Process patch 
to get diff data.""" patch_parsed_diff: dict = { "hunks": [], + "hunks_process": [], } patch_parsed_diff["addition_count"] = patch_diff.added @@ -29,7 +31,8 @@ def process_ind_patch(patch_diff) -> dict: patch_diff_line = patch_diff_split[2].split("\n") patch_diff_line_numbers = [list(map(int, hunk.strip("-+").split(","))) for hunk in patch_diff_split[1].strip().split(" ")] - patch_parsed_diff["hunks"].append(patch_diff_line_numbers + patch_diff_line[:-1]) + patch_parsed_diff["hunks_process"].append(patch_diff_line_numbers + patch_diff_line[:-1]) + patch_parsed_diff["hunks"].append(patch_diff_ind) return patch_parsed_diff @@ -66,12 +69,13 @@ def apply_reverse_patch(diff_list: list, commit_hash: str, repo_name: str, lengt # if file_length < length_threshold: files_list.append(raw_file) # Iterate over hunks for this file and apply the reverse patch. - for hunk in diff["hunks"]: + for hunk in diff["hunks_process"]: hunk_list = [] for line in hunk[3:]: if line.startswith("-") or line.startswith(" "): hunk_list.append(line[1:] + "\n") files_list[-1][hunk[0][0] - 1:hunk[0][0] + hunk[1][1] - 1] = hunk_list + del diff["hunks_process"] return files_list @@ -83,7 +87,7 @@ def process_commit(commit_data: dict) -> dict: # Get list of files, each of which is a list of strings, one for each line. files_list = apply_reverse_patch(diff_list, commit_data["commit"], commit_data["repo_name"]) commit_data["before_files"] = files_list - commit_data["diff"] = [json.dumps(diff) for diff in diff_list] + commit_data["diff"] = diff_list return commit_data @@ -103,12 +107,19 @@ def scrape(self): if __name__ == "__main__": read_path = Path(__file__).parent / "test_file.json" client = Client(n_workers=8, threads_per_worker=2) + diff_struct = pa.struct([ + ("addition_count", pa.int32()), + ("deletion_count", pa.int32()), + ("src_file", pa.string()), + ("tgt_file", pa.string()), + ("hunks", pa.list_(pa.string())), + ]) schema = pa.schema([ (pa.field("commit", pa.string())), (pa.field("message", pa.string())), (pa.field("repo_name", pa.string())), (pa.field("before_files", pa.list_(pa.list_(pa.string())))), - (pa.field("diff", pa.list_(pa.string()))), + (pa.field("diff", pa.list_(diff_struct))), ]) db.read_text(read_path).map(json.loads).map(process_commit).to_dataframe().to_parquet("test.parquet", schema=schema) From d421d6a0d0d689ba1f38bc90ef029b700d0e08fd Mon Sep 17 00:00:00 2001 From: Herbie Bradley Date: Thu, 6 Oct 2022 17:55:34 +0100 Subject: [PATCH 04/16] Implement GitHubDiffDataset, fix dask loading --- codepile/ghdiffs/ghdiffs_scrape.py | 134 +++++++++++++++-------------- 1 file changed, 68 insertions(+), 66 deletions(-) diff --git a/codepile/ghdiffs/ghdiffs_scrape.py b/codepile/ghdiffs/ghdiffs_scrape.py index 8f6e0e3..73c4cb4 100644 --- a/codepile/ghdiffs/ghdiffs_scrape.py +++ b/codepile/ghdiffs/ghdiffs_scrape.py @@ -2,6 +2,7 @@ import urllib.request from pathlib import Path from pprint import pprint +from xml.dom.minidom import ReadOnlySequentialNamedNodeMap import dask.bag as db import numpy as np @@ -33,93 +34,94 @@ def process_ind_patch(patch_diff) -> dict: for hunk in patch_diff_split[1].strip().split(" ")] patch_parsed_diff["hunks_process"].append(patch_diff_line_numbers + patch_diff_line[:-1]) patch_parsed_diff["hunks"].append(patch_diff_ind) + patch_parsed_diff["hunks"] = "".join(patch_parsed_diff["hunks"]) return patch_parsed_diff -def patch_parse(commit_hash: str, repo_name: str) -> list: - """Parse a commit to get diff data.""" - diff_url = (f"https://github.com/{repo_name}/commit/{commit_hash}.diff") 
+def get_before_file(file_diff: dict, commit_hash: str, repo_name: str) -> str: + repo_owner, repo_name = repo_name.split("/") + if file_diff["src_file"] == "/dev/null": + return "" + # Get raw after file. + file_raw_url = (f"https://raw.githubusercontent.com/{repo_owner}/" + f"{repo_name}/{commit_hash}/{file_diff['tgt_file'][2:]}") + raw_file = urllib.request.urlopen(file_raw_url) + raw_file_encoding = raw_file.headers.get_charsets()[0] + raw_file = [line.decode(raw_file_encoding) for line in raw_file.readlines()] + # file_length = sum(1 for _ in raw_file) + # if file_length < length_threshold: + # files_list.append(raw_file) + # Iterate over hunks for this file and apply the reverse patch. + for hunk in file_diff["hunks_process"]: + hunk_list = [] + for line in hunk[3:]: + if line.startswith("-") or line.startswith(" "): + hunk_list.append(line[1:] + "\n") + raw_file[hunk[0][0] - 1:hunk[0][0] + hunk[1][1] - 1] = hunk_list + del file_diff["hunks_process"] # Deletes this item from the dict in parent functions + return "".join(raw_file) + + +def process_commit(commit_data: dict) -> list[dict]: + """ + Process a commit dictionary to get the before files and diff dict. + + Args: + commit_data (dict): Dictionary containing commit hash, repo name, and + commit message. + + Returns: + list[dict]: A list of dicts, where each dict contains the data for a + change to a single file. + """ + # Scrape a commit's diff file. + diff_url = f"https://github.com/{commit_data['repo_name']}/commit/{commit_data['commit']}.diff" diff = urllib.request.urlopen(diff_url) encoding = diff.headers.get_charsets()[0] patch = PatchSet(diff, encoding=encoding) - diff_list: list = [] + commit_list: list[dict] = [] for patch_ind in patch: # Skip if the file is not a python file. # if not patch_ind.target_file.endswith(".py"): # continue - diff_list.append(process_ind_patch(patch_ind)) - return diff_list - - -def apply_reverse_patch(diff_list: list, commit_hash: str, repo_name: str, length_threshold: int = 4096) -> list: - """Apply reverse patch to get before files. Returns list of modified files.""" - files_list: list = [] - repo_owner, repo_name = repo_name.split("/") - for diff in diff_list: - if diff["src_file"] == "/dev/null": - files_list.append([]) - continue - # Get raw after file. - file_raw_url = (f"https://raw.githubusercontent.com/{repo_owner}/" - f"{repo_name}/{commit_hash}/{diff['tgt_file'][2:]}") - raw_file = urllib.request.urlopen(file_raw_url) - raw_file_encoding = raw_file.headers.get_charsets()[0] - raw_file = [line.decode(raw_file_encoding) for line in raw_file.readlines()] - # file_length = sum(1 for _ in raw_file) - # if file_length < length_threshold: - files_list.append(raw_file) - # Iterate over hunks for this file and apply the reverse patch. - for hunk in diff["hunks_process"]: - hunk_list = [] - for line in hunk[3:]: - if line.startswith("-") or line.startswith(" "): - hunk_list.append(line[1:] + "\n") - files_list[-1][hunk[0][0] - 1:hunk[0][0] + hunk[1][1] - 1] = hunk_list - del diff["hunks_process"] - - return files_list - - -def process_commit(commit_data: dict) -> dict: - """Process a commit hash and repo name to get the before files and diff dict.""" - # Get dict containing diff data. - diff_list = patch_parse(commit_data["commit"], commit_data["repo_name"]) - # Get list of files, each of which is a list of strings, one for each line. 
- files_list = apply_reverse_patch(diff_list, commit_data["commit"], commit_data["repo_name"]) - commit_data["before_files"] = files_list - commit_data["diff"] = diff_list - return commit_data + diff_dict: dict = process_ind_patch(patch_ind) + diff_dict.update(commit_data) + diff_dict["before_file"] = get_before_file(diff_dict, commit_data["commit"], commit_data["repo_name"]) + commit_list.append(diff_dict) + return commit_list class GitHubDiffDataset(Dataset): def __init__(self, config): - pass + self.config = config + self.scraper = GitHubDiffScraper(self.config) + + def download(self, *args, **kwargs) -> RawDataset: + return self.scraper.scrape() + + def process(self): + raise NotImplementedError class GitHubDiffScraper(Scraper): def __init__(self, config): - pass + # TODO: Dask multi-node scheduling here + client = Client(n_workers=config.n_workers, threads_per_worker=config.threads_per_worker) + self.read_path = Path(config.read_path) + self.save_path = Path(config.save_path) - def scrape(self): - pass + def scrape(self) -> RawDataset: + _ = ( + db.read_text(self.read_path).map(json.loads) + .map(process_commit).flatten().to_dataframe() + .to_parquet(self.save_path) + ) + dataset = RawDataset(storage_uris=self.save_path, complete=True) + return dataset if __name__ == "__main__": read_path = Path(__file__).parent / "test_file.json" + save_path = Path(__file__).parent / "test.parquet" client = Client(n_workers=8, threads_per_worker=2) - diff_struct = pa.struct([ - ("addition_count", pa.int32()), - ("deletion_count", pa.int32()), - ("src_file", pa.string()), - ("tgt_file", pa.string()), - ("hunks", pa.list_(pa.string())), - ]) - schema = pa.schema([ - (pa.field("commit", pa.string())), - (pa.field("message", pa.string())), - (pa.field("repo_name", pa.string())), - (pa.field("before_files", pa.list_(pa.list_(pa.string())))), - (pa.field("diff", pa.list_(diff_struct))), - ]) - db.read_text(read_path).map(json.loads).map(process_commit).to_dataframe().to_parquet("test.parquet", - schema=schema) + db.read_text(read_path).map(json.loads).map(process_commit).flatten().to_dataframe().to_parquet(save_path) From 0177806e00b2aab3e948657aa966b43995488ca9 Mon Sep 17 00:00:00 2001 From: Herbie Bradley Date: Fri, 7 Oct 2022 00:41:26 +0100 Subject: [PATCH 05/16] Fix file deletion bug --- codepile/ghdiffs/ghdiffs_scrape.py | 57 +++++++++++++++++++----------- 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/codepile/ghdiffs/ghdiffs_scrape.py b/codepile/ghdiffs/ghdiffs_scrape.py index 73c4cb4..835c437 100644 --- a/codepile/ghdiffs/ghdiffs_scrape.py +++ b/codepile/ghdiffs/ghdiffs_scrape.py @@ -1,14 +1,9 @@ import json import urllib.request from pathlib import Path -from pprint import pprint -from xml.dom.minidom import ReadOnlySequentialNamedNodeMap - +import datetime import dask.bag as db -import numpy as np -import pyarrow as pa -from codepile.dataset import (Analyser, Dataset, DatasetInfo, DatasetSources, - Processor, RawDataset, Scraper) +from codepile.dataset import (Dataset, DatasetInfo, RawDataset, Scraper) from dask.distributed import Client from unidiff import PatchSet @@ -24,6 +19,7 @@ def process_ind_patch(patch_diff) -> dict: patch_parsed_diff["deletion_count"] = patch_diff.removed patch_parsed_diff["src_file"] = patch_diff.source_file patch_parsed_diff["tgt_file"] = patch_diff.target_file + patch_parsed_diff["file_extension"] = Path(patch_diff.target_file).suffix # patch_parsed_diff["patch_info"] = patch_diff.patch_info patch_diff_list = list(patch_diff) for patch_diff_ind in 
patch_diff_list: @@ -40,14 +36,19 @@ def process_ind_patch(patch_diff) -> dict: def get_before_file(file_diff: dict, commit_hash: str, repo_name: str) -> str: repo_owner, repo_name = repo_name.split("/") - if file_diff["src_file"] == "/dev/null": + if file_diff["src_file"] == "/dev/null" or file_diff["tgt_file"] == "/dev/null": return "" # Get raw after file. file_raw_url = (f"https://raw.githubusercontent.com/{repo_owner}/" f"{repo_name}/{commit_hash}/{file_diff['tgt_file'][2:]}") - raw_file = urllib.request.urlopen(file_raw_url) - raw_file_encoding = raw_file.headers.get_charsets()[0] - raw_file = [line.decode(raw_file_encoding) for line in raw_file.readlines()] + try: + raw_file = urllib.request.urlopen(file_raw_url) + raw_file_encoding = raw_file.headers.get_charsets()[0] + raw_file = [line.decode(raw_file_encoding) for line in raw_file.readlines()] + except Exception as e: + print(e) + print(file_raw_url) + return "" # file_length = sum(1 for _ in raw_file) # if file_length < length_threshold: # files_list.append(raw_file) @@ -76,7 +77,10 @@ def process_commit(commit_data: dict) -> list[dict]: """ # Scrape a commit's diff file. diff_url = f"https://github.com/{commit_data['repo_name']}/commit/{commit_data['commit']}.diff" - diff = urllib.request.urlopen(diff_url) + try: + diff = urllib.request.urlopen(diff_url) + except Exception as e: + return [] encoding = diff.headers.get_charsets()[0] patch = PatchSet(diff, encoding=encoding) commit_list: list[dict] = [] @@ -102,13 +106,23 @@ def download(self, *args, **kwargs) -> RawDataset: def process(self): raise NotImplementedError + @property + def info(self) -> DatasetInfo: + return DatasetInfo( + id="GitHubDiffDataset", + description="Dataset of diffs from GitHub") + + @property + def id(self) -> str: + return "" + class GitHubDiffScraper(Scraper): def __init__(self, config): # TODO: Dask multi-node scheduling here - client = Client(n_workers=config.n_workers, threads_per_worker=config.threads_per_worker) - self.read_path = Path(config.read_path) - self.save_path = Path(config.save_path) + client = Client(n_workers=16, threads_per_worker=1) + self.read_path = Path(config["read_path"]) + self.save_path = Path(config["save_path"]) def scrape(self) -> RawDataset: _ = ( @@ -116,12 +130,15 @@ def scrape(self) -> RawDataset: .map(process_commit).flatten().to_dataframe() .to_parquet(self.save_path) ) - dataset = RawDataset(storage_uris=self.save_path, complete=True) + dataset = RawDataset(storage_uris=["https://github.com/CarperAI/Code-Pile"], complete=True) return dataset if __name__ == "__main__": - read_path = Path(__file__).parent / "test_file.json" - save_path = Path(__file__).parent / "test.parquet" - client = Client(n_workers=8, threads_per_worker=2) - db.read_text(read_path).map(json.loads).map(process_commit).flatten().to_dataframe().to_parquet(save_path) + read_path = Path(__file__).parent / "full_test.json" + save_path = Path(__file__).parent / "full_test.parquet" + config = {"read_path": read_path, "save_path": save_path} + ghdiff_dataset = GitHubDiffDataset(config) + ghdiff_dataset.download() + # client = Client(n_workers=16, threads_per_worker=1) + # db.read_text(read_path).map(json.loads).map(process_commit).flatten().to_dataframe().to_parquet(save_path) From ac781b3b38375249740a054a4d7bf84bace21500 Mon Sep 17 00:00:00 2001 From: Herbie Bradley Date: Fri, 7 Oct 2022 13:24:44 +0100 Subject: [PATCH 06/16] Fix deleted file bugs, add argparse --- .gitignore | 1 + codepile/ghdiffs/ghdiffs_scrape.py | 86 +++++++++++++++++------------- 2 
files changed, 49 insertions(+), 38 deletions(-) diff --git a/.gitignore b/.gitignore index 68bc17f..0561ce6 100644 --- a/.gitignore +++ b/.gitignore @@ -158,3 +158,4 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ +.vscode/ diff --git a/codepile/ghdiffs/ghdiffs_scrape.py b/codepile/ghdiffs/ghdiffs_scrape.py index 835c437..3f879b9 100644 --- a/codepile/ghdiffs/ghdiffs_scrape.py +++ b/codepile/ghdiffs/ghdiffs_scrape.py @@ -1,10 +1,10 @@ import json import urllib.request +import argparse from pathlib import Path -import datetime import dask.bag as db from codepile.dataset import (Dataset, DatasetInfo, RawDataset, Scraper) -from dask.distributed import Client +from dask.distributed import Client, progress from unidiff import PatchSet @@ -19,10 +19,12 @@ def process_ind_patch(patch_diff) -> dict: patch_parsed_diff["deletion_count"] = patch_diff.removed patch_parsed_diff["src_file"] = patch_diff.source_file patch_parsed_diff["tgt_file"] = patch_diff.target_file - patch_parsed_diff["file_extension"] = Path(patch_diff.target_file).suffix + if patch_parsed_diff["tgt_file"] == "/dev/null": + patch_parsed_diff["file_extension"] = Path(patch_diff.source_file).suffix + else: + patch_parsed_diff["file_extension"] = Path(patch_diff.target_file).suffix # patch_parsed_diff["patch_info"] = patch_diff.patch_info - patch_diff_list = list(patch_diff) - for patch_diff_ind in patch_diff_list: + for patch_diff_ind in patch_diff: patch_diff_ind = str(patch_diff_ind) patch_diff_split = patch_diff_ind.split("@@") patch_diff_line = patch_diff_split[2].split("\n") @@ -36,29 +38,33 @@ def process_ind_patch(patch_diff) -> dict: def get_before_file(file_diff: dict, commit_hash: str, repo_name: str) -> str: repo_owner, repo_name = repo_name.split("/") - if file_diff["src_file"] == "/dev/null" or file_diff["tgt_file"] == "/dev/null": - return "" - # Get raw after file. - file_raw_url = (f"https://raw.githubusercontent.com/{repo_owner}/" - f"{repo_name}/{commit_hash}/{file_diff['tgt_file'][2:]}") - try: - raw_file = urllib.request.urlopen(file_raw_url) - raw_file_encoding = raw_file.headers.get_charsets()[0] - raw_file = [line.decode(raw_file_encoding) for line in raw_file.readlines()] - except Exception as e: - print(e) - print(file_raw_url) - return "" - # file_length = sum(1 for _ in raw_file) - # if file_length < length_threshold: - # files_list.append(raw_file) - # Iterate over hunks for this file and apply the reverse patch. - for hunk in file_diff["hunks_process"]: - hunk_list = [] - for line in hunk[3:]: - if line.startswith("-") or line.startswith(" "): - hunk_list.append(line[1:] + "\n") - raw_file[hunk[0][0] - 1:hunk[0][0] + hunk[1][1] - 1] = hunk_list + if file_diff["src_file"] == "/dev/null": + raw_file = ["Add"] + elif file_diff["tgt_file"] == "/dev/null": + # If file is deleted, get before file from the raw diff, which will be the full file. + raw_file = [line[1:] + "\n" for line in file_diff["hunks_process"][0][3:]] + else: + # Get raw after file. 
+ file_raw_url = (f"https://raw.githubusercontent.com/{repo_owner}/" + f"{repo_name}/{commit_hash}/{file_diff['tgt_file'][2:]}") + try: + raw_file = urllib.request.urlopen(file_raw_url) + raw_file_encoding = raw_file.headers.get_charsets()[0] + raw_file = [line.decode(raw_file_encoding) for line in raw_file.readlines()] + except Exception as e: + print(e) + print(file_raw_url) + return "" + # file_length = sum(1 for _ in raw_file) + # if file_length < length_threshold: + # files_list.append(raw_file) + # Iterate over hunks for this file and apply the reverse patch. + for hunk in file_diff["hunks_process"]: + hunk_list = [] + for line in hunk[3:]: + if line.startswith("-") or line.startswith(" "): + hunk_list.append(line[1:] + "\n") + raw_file[hunk[0][0] - 1:hunk[0][0] + hunk[1][1] - 1] = hunk_list del file_diff["hunks_process"] # Deletes this item from the dict in parent functions return "".join(raw_file) @@ -80,13 +86,14 @@ def process_commit(commit_data: dict) -> list[dict]: try: diff = urllib.request.urlopen(diff_url) except Exception as e: + print(e) return [] encoding = diff.headers.get_charsets()[0] patch = PatchSet(diff, encoding=encoding) commit_list: list[dict] = [] for patch_ind in patch: # Skip if the file is not a python file. - # if not patch_ind.target_file.endswith(".py"): + # if not Path(patch_ind.target_file).suffix == ".py": # continue diff_dict: dict = process_ind_patch(patch_ind) diff_dict.update(commit_data) @@ -120,25 +127,28 @@ def id(self) -> str: class GitHubDiffScraper(Scraper): def __init__(self, config): # TODO: Dask multi-node scheduling here - client = Client(n_workers=16, threads_per_worker=1) - self.read_path = Path(config["read_path"]) - self.save_path = Path(config["save_path"]) + client = Client(n_workers=config.n_workers, threads_per_worker=config.threads_per_worker) + self.read_path = Path(config.read_path) + self.save_path = Path(config.save_path) def scrape(self) -> RawDataset: - _ = ( + result = ( db.read_text(self.read_path).map(json.loads) .map(process_commit).flatten().to_dataframe() .to_parquet(self.save_path) ) + progress(result) dataset = RawDataset(storage_uris=["https://github.com/CarperAI/Code-Pile"], complete=True) return dataset if __name__ == "__main__": - read_path = Path(__file__).parent / "full_test.json" - save_path = Path(__file__).parent / "full_test.parquet" - config = {"read_path": read_path, "save_path": save_path} + parser = argparse.ArgumentParser('codepile dataset tool') + + parser.add_argument('--read_path', type=str) + parser.add_argument('--save_path', type=str) + parser.add_argument('--n_workers', type=int, default=8) + parser.add_argument('--threads_per_worker', type=int, default=2) + config = parser.parse_args() ghdiff_dataset = GitHubDiffDataset(config) ghdiff_dataset.download() - # client = Client(n_workers=16, threads_per_worker=1) - # db.read_text(read_path).map(json.loads).map(process_commit).flatten().to_dataframe().to_parquet(save_path) From 91b091eff92f3094657951b791633b655d362f3c Mon Sep 17 00:00:00 2001 From: Herbie Bradley Date: Fri, 7 Oct 2022 13:27:10 +0100 Subject: [PATCH 07/16] Delete test json --- codepile/ghdiffs/test_file.json | 20 -------------------- 1 file changed, 20 deletions(-) delete mode 100644 codepile/ghdiffs/test_file.json diff --git a/codepile/ghdiffs/test_file.json b/codepile/ghdiffs/test_file.json deleted file mode 100644 index a515a81..0000000 --- a/codepile/ghdiffs/test_file.json +++ /dev/null @@ -1,20 +0,0 @@ -{"commit":"f96ce6590c9bad17df325fe965222fadc8a5d485","message":"Merge branch 
'master' into assignments_small_fixes","repo_name":"tensorflow/tensorflow"} -{"commit":"0ec0a9bc0437f069ea0b08c46f1734ab837d2bf6","message":"Unroll partial merge from #1472\n","repo_name":"tensorflow/tensorflow"} -{"commit":"c12b21ca8fb6f38b89fe769176079d8530c8683b","message":"In regular DataFeeder for classification convert y into int64 to not raise numpy warnings on itemset\n","repo_name":"tensorflow/tensorflow"} -{"commit":"038ccd6ab030222d6041421d949b71891809de6c","message":"Merge pull request #48 from frol/master\n\nFixed travis scripts to be POSIX sh complaint and proof against whitespace issues","repo_name":"tensorflow/tensorflow"} -{"commit":"e52003b01cd4e7e0bc7a698bcfec331f6d5638f1","message":"Switch to enumerate in seq2seq ops\n","repo_name":"tensorflow/tensorflow"} -{"commit":"a64f3d7eaeceea5999b0530d55e32b6d2ab7d1dc","message":"Merge branch 'master' of github.com:google/skflow\n","repo_name":"tensorflow/tensorflow"} -{"commit":"bd443b52367cc64358d4247a535439e7d42c6641","message":"Merge pull request #24 from terrytangyuan/patch-1\n\nAdded missing CV import in multiple_gpu example","repo_name":"tensorflow/tensorflow"} -{"commit":"abdfb10f2887210d18ff3b33596085f104029d7d","message":"Up the required version of Tensorflow - due to recent RNN changes, skflow will now only support 0.7+\n","repo_name":"tensorflow/tensorflow"} -{"commit":"43cd5c01318e4c51d65eb53f77a2b27ef72c557d","message":"Fixes #22: Added cross_validation to iris.py and iris_custom_model.py\n","repo_name":"tensorflow/tensorflow"} -{"commit":"b75c2a316558cf26cf1c81644dad43a7c2edd332","message":"Added apache headers\n","repo_name":"tensorflow/tensorflow"} -{"commit":"21df765ae4a35f655b068ae7d2440da98944cb7e","message":"Adding rnn_decoder with sampling sub-graph. Adding rnn seq2seq that will work with it.\n","repo_name":"tensorflow/tensorflow"} -{"commit":"67e33815af883113216001db33a8d4d14a674823","message":"Refactored out Trainer and ops from main file\n","repo_name":"tensorflow/tensorflow"} -{"commit":"881265ff80209710b2f064a98e2b7bc4608bbf64","message":"Fixes #42: Added a test for GridSearchCV to check that DNN works correctly with it\n","repo_name":"tensorflow/tensorflow"} -{"commit":"135018dbe63d7d72201862d0c5e5878d1fa4a923","message":"Added more coverage in tests\n","repo_name":"tensorflow/tensorflow"} -{"commit":"9cfbd7bcc1b9fe89fc22261ba316d6d5336985b4","message":"Merge pull request #36 from terrytangyuan/master\n\nAllow pandas.DataFrame data types","repo_name":"tensorflow/tensorflow"} -{"commit":"e27aa15ed7fd4f3d735bf7a8a29b97fdede710bd","message":"Add usage of batch norm in conv test and fix usage of is_training collection\n","repo_name":"tensorflow/tensorflow"} -{"commit":"05cae1f96f7ac2dfb6368fd87e6254194a04281a","message":"Merge pull request #89 from lopuhin/fix-dropout\n\nFix dropout management in TensorFlowEstimator._predict","repo_name":"tensorflow/tensorflow"} -{"commit":"a9a488de5754936a7e67245033f0d6b5fe7b4c88","message":"Fix #130: Added explicit save/restore to VocabularyProcessor\n","repo_name":"tensorflow/tensorflow"} -{"commit":"edc98e6269dc448f8e55be9af6018e4ee768b223","message":"Merge branch 'master' of github.com:google/skflow\n","repo_name":"tensorflow/tensorflow"} -{"commit":"845b91dd680a15e37e516d80fc7bff498e827f94","message":"[tf.learn] Allow Attention in RNNClassifier and Regressor (#2820)\n\n* Added attention cell wrappers in rnn_model\r\n\r\n* refactor and add attn_rnn tests\r\n\r\n* revised according to comments on lint\r\n","repo_name":"tensorflow/tensorflow"} From 
b60ca7d95a51bf3f6902a0d0c341c545b38a6b42 Mon Sep 17 00:00:00 2001 From: Herbie Bradley Date: Fri, 7 Oct 2022 13:49:46 +0100 Subject: [PATCH 08/16] Fix self.client --- codepile/ghdiffs/ghdiffs_scrape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codepile/ghdiffs/ghdiffs_scrape.py b/codepile/ghdiffs/ghdiffs_scrape.py index 3f879b9..010b497 100644 --- a/codepile/ghdiffs/ghdiffs_scrape.py +++ b/codepile/ghdiffs/ghdiffs_scrape.py @@ -127,7 +127,7 @@ def id(self) -> str: class GitHubDiffScraper(Scraper): def __init__(self, config): # TODO: Dask multi-node scheduling here - client = Client(n_workers=config.n_workers, threads_per_worker=config.threads_per_worker) + self.client = Client(n_workers=config.n_workers, threads_per_worker=config.threads_per_worker) self.read_path = Path(config.read_path) self.save_path = Path(config.save_path) From ef100741a22fec7635f56de6982bbb93cf6725d2 Mon Sep 17 00:00:00 2001 From: Herbie Bradley Date: Mon, 17 Oct 2022 13:32:30 +0100 Subject: [PATCH 09/16] Add filtering --- codepile/ghdiffs/ghdiffs_scrape.py | 99 ++++++++++++++++++------------ 1 file changed, 59 insertions(+), 40 deletions(-) diff --git a/codepile/ghdiffs/ghdiffs_scrape.py b/codepile/ghdiffs/ghdiffs_scrape.py index 010b497..cf06bea 100644 --- a/codepile/ghdiffs/ghdiffs_scrape.py +++ b/codepile/ghdiffs/ghdiffs_scrape.py @@ -1,4 +1,5 @@ import json +from typing import Any import urllib.request import argparse from pathlib import Path @@ -36,10 +37,10 @@ def process_ind_patch(patch_diff) -> dict: return patch_parsed_diff -def get_before_file(file_diff: dict, commit_hash: str, repo_name: str) -> str: +def get_before_file(file_diff: dict, commit_hash: str, repo_name: str, length_threshold: int) -> str: repo_owner, repo_name = repo_name.split("/") if file_diff["src_file"] == "/dev/null": - raw_file = ["Add"] + raw_file: Any = ["ADDFILE"] elif file_diff["tgt_file"] == "/dev/null": # If file is deleted, get before file from the raw diff, which will be the full file. raw_file = [line[1:] + "\n" for line in file_diff["hunks_process"][0][3:]] @@ -51,13 +52,12 @@ def get_before_file(file_diff: dict, commit_hash: str, repo_name: str) -> str: raw_file = urllib.request.urlopen(file_raw_url) raw_file_encoding = raw_file.headers.get_charsets()[0] raw_file = [line.decode(raw_file_encoding) for line in raw_file.readlines()] + if length_threshold > 0 and len(raw_file) > length_threshold: + return "" except Exception as e: print(e) print(file_raw_url) return "" - # file_length = sum(1 for _ in raw_file) - # if file_length < length_threshold: - # files_list.append(raw_file) # Iterate over hunks for this file and apply the reverse patch. for hunk in file_diff["hunks_process"]: hunk_list = [] @@ -69,39 +69,6 @@ def get_before_file(file_diff: dict, commit_hash: str, repo_name: str) -> str: return "".join(raw_file) -def process_commit(commit_data: dict) -> list[dict]: - """ - Process a commit dictionary to get the before files and diff dict. - - Args: - commit_data (dict): Dictionary containing commit hash, repo name, and - commit message. - - Returns: - list[dict]: A list of dicts, where each dict contains the data for a - change to a single file. - """ - # Scrape a commit's diff file. 
- diff_url = f"https://github.com/{commit_data['repo_name']}/commit/{commit_data['commit']}.diff" - try: - diff = urllib.request.urlopen(diff_url) - except Exception as e: - print(e) - return [] - encoding = diff.headers.get_charsets()[0] - patch = PatchSet(diff, encoding=encoding) - commit_list: list[dict] = [] - for patch_ind in patch: - # Skip if the file is not a python file. - # if not Path(patch_ind.target_file).suffix == ".py": - # continue - diff_dict: dict = process_ind_patch(patch_ind) - diff_dict.update(commit_data) - diff_dict["before_file"] = get_before_file(diff_dict, commit_data["commit"], commit_data["repo_name"]) - commit_list.append(diff_dict) - return commit_list - - class GitHubDiffDataset(Dataset): def __init__(self, config): self.config = config @@ -127,20 +94,65 @@ def id(self) -> str: class GitHubDiffScraper(Scraper): def __init__(self, config): # TODO: Dask multi-node scheduling here - self.client = Client(n_workers=config.n_workers, threads_per_worker=config.threads_per_worker) + # self.client = Client(n_workers=config.n_workers, threads_per_worker=config.threads_per_worker) self.read_path = Path(config.read_path) self.save_path = Path(config.save_path) + self.python_only = config.python_only + self.diff_length_threshold = config.diff_length_threshold + self.code_length_threshold = config.code_length_threshold + self.ignore_deletions = config.ignore_deletions def scrape(self) -> RawDataset: + # TODO: pass meta dataframe result = ( db.read_text(self.read_path).map(json.loads) - .map(process_commit).flatten().to_dataframe() + .map(self.process_commit).flatten().to_dataframe() .to_parquet(self.save_path) ) progress(result) dataset = RawDataset(storage_uris=["https://github.com/CarperAI/Code-Pile"], complete=True) return dataset + def process_commit(self, commit_data: dict) -> list[dict]: + """ + Process a commit dictionary to get the before files and diff dict. + + Args: + commit_data (dict): Dictionary containing commit hash, repo name, and + commit message. + + Returns: + list[dict]: A list of dicts, where each dict contains the data for a + change to a single file. + """ + if self.python_only and commit_data["language_name"] != "Python": + return [] + # Scrape a commit's diff file. + diff_url = f"https://github.com/{commit_data['repo_name']}/commit/{commit_data['commit']}.diff" + try: + diff = urllib.request.urlopen(diff_url) + encoding = diff.headers.get_charsets()[0] + patch = PatchSet(diff, encoding=encoding) + except Exception as e: + print(e) + return [] + commit_list: list[dict] = [] + # Iterate over files within the diff. + for patch_ind in patch: + if self.ignore_deletions and patch_ind.target_file == "/dev/null": + continue + if self.diff_length_threshold > 0 and sum(len(hunk) for hunk in patch_ind) > self.diff_length_threshold: + continue + diff_dict: dict = process_ind_patch(patch_ind) + diff_dict["before_file"] = get_before_file(diff_dict, commit_data["commit"], commit_data["repo_name"], + length_threshold=self.code_length_threshold) + if not diff_dict["before_file"]: + # Happens if exception is thrown or file is too long. 
+ continue + diff_dict.update(commit_data) + commit_list.append(diff_dict) + return commit_list + if __name__ == "__main__": parser = argparse.ArgumentParser('codepile dataset tool') @@ -149,6 +161,13 @@ def scrape(self) -> RawDataset: parser.add_argument('--save_path', type=str) parser.add_argument('--n_workers', type=int, default=8) parser.add_argument('--threads_per_worker', type=int, default=2) + parser.add_argument('--python_only', type=bool, default=False) + parser.add_argument('--diff_length_threshold', type=int, default=1000, + help="Maximum number of lines in the diff for a *single* file. Set to 0 for no limit.") + parser.add_argument('--code_length_threshold', type=int, default=1000, + help="Maximum number of lines in code files") + parser.add_argument('--ignore_deletions', type=bool, default=True, + help="Ignore file deletion diffs.") config = parser.parse_args() ghdiff_dataset = GitHubDiffDataset(config) ghdiff_dataset.download() From 4f28074e6ad1889ea08b26911c89a775018487b4 Mon Sep 17 00:00:00 2001 From: Herbie Bradley Date: Mon, 17 Oct 2022 15:27:44 +0100 Subject: [PATCH 10/16] Fix saving with metadata --- codepile/ghdiffs/ghdiffs_scrape.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/codepile/ghdiffs/ghdiffs_scrape.py b/codepile/ghdiffs/ghdiffs_scrape.py index cf06bea..8044048 100644 --- a/codepile/ghdiffs/ghdiffs_scrape.py +++ b/codepile/ghdiffs/ghdiffs_scrape.py @@ -24,7 +24,6 @@ def process_ind_patch(patch_diff) -> dict: patch_parsed_diff["file_extension"] = Path(patch_diff.source_file).suffix else: patch_parsed_diff["file_extension"] = Path(patch_diff.target_file).suffix - # patch_parsed_diff["patch_info"] = patch_diff.patch_info for patch_diff_ind in patch_diff: patch_diff_ind = str(patch_diff_ind) patch_diff_split = patch_diff_ind.split("@@") @@ -55,8 +54,7 @@ def get_before_file(file_diff: dict, commit_hash: str, repo_name: str, length_th if length_threshold > 0 and len(raw_file) > length_threshold: return "" except Exception as e: - print(e) - print(file_raw_url) + # print(e, file_raw_url) return "" # Iterate over hunks for this file and apply the reverse patch. 
for hunk in file_diff["hunks_process"]: @@ -94,7 +92,7 @@ def id(self) -> str: class GitHubDiffScraper(Scraper): def __init__(self, config): # TODO: Dask multi-node scheduling here - # self.client = Client(n_workers=config.n_workers, threads_per_worker=config.threads_per_worker) + self.client = Client(n_workers=config.n_workers, threads_per_worker=config.threads_per_worker) self.read_path = Path(config.read_path) self.save_path = Path(config.save_path) self.python_only = config.python_only @@ -103,10 +101,14 @@ def __init__(self, config): self.ignore_deletions = config.ignore_deletions def scrape(self) -> RawDataset: - # TODO: pass meta dataframe + meta_spec = {'hunks': str, 'addition_count': int, 'deletion_count': int, + 'src_file': str, 'tgt_file': str, 'file_extension': str, + 'before_file': str, 'commit': str, 'message': str, + 'repo_name': str, 'language_name': str, 'author_name': str, + 'license': str} result = ( db.read_text(self.read_path).map(json.loads) - .map(self.process_commit).flatten().to_dataframe() + .map(self.process_commit).flatten().to_dataframe(meta=meta_spec) .to_parquet(self.save_path) ) progress(result) @@ -133,8 +135,10 @@ def process_commit(self, commit_data: dict) -> list[dict]: diff = urllib.request.urlopen(diff_url) encoding = diff.headers.get_charsets()[0] patch = PatchSet(diff, encoding=encoding) + if len(patch) == 0: + return [] except Exception as e: - print(e) + # print(e, diff_url) return [] commit_list: list[dict] = [] # Iterate over files within the diff. @@ -149,7 +153,12 @@ def process_commit(self, commit_data: dict) -> list[dict]: if not diff_dict["before_file"]: # Happens if exception is thrown or file is too long. continue - diff_dict.update(commit_data) + diff_dict["commit"] = commit_data["commit"] + diff_dict["message"] = commit_data["message"] + diff_dict["repo_name"] = commit_data["repo_name"] + diff_dict["language_name"] = commit_data["language_name"] + diff_dict["author_name"] = commit_data["author"]["name"] + diff_dict["license"] = commit_data["license"] commit_list.append(diff_dict) return commit_list @@ -165,7 +174,7 @@ def process_commit(self, commit_data: dict) -> list[dict]: parser.add_argument('--diff_length_threshold', type=int, default=1000, help="Maximum number of lines in the diff for a *single* file. Set to 0 for no limit.") parser.add_argument('--code_length_threshold', type=int, default=1000, - help="Maximum number of lines in code files") + help="Maximum number of lines in code files. Set to 0 for no limit.") parser.add_argument('--ignore_deletions', type=bool, default=True, help="Ignore file deletion diffs.") config = parser.parse_args() From b50ca0bc8092d7fe35ca93234fce37e8479943f2 Mon Sep 17 00:00:00 2001 From: Herbie Bradley Date: Mon, 17 Oct 2022 18:06:29 +0100 Subject: [PATCH 11/16] Fix dask bug with map --- codepile/ghdiffs/ghdiffs_scrape.py | 103 ++++++++++++++--------------- 1 file changed, 51 insertions(+), 52 deletions(-) diff --git a/codepile/ghdiffs/ghdiffs_scrape.py b/codepile/ghdiffs/ghdiffs_scrape.py index 8044048..d2085bc 100644 --- a/codepile/ghdiffs/ghdiffs_scrape.py +++ b/codepile/ghdiffs/ghdiffs_scrape.py @@ -67,6 +67,54 @@ def get_before_file(file_diff: dict, commit_hash: str, repo_name: str, length_th return "".join(raw_file) +def process_commit(commit_data: dict, config: argparse.Namespace) -> list[dict]: + """ + Process a commit dictionary to get the before files and diff dict. + + Args: + commit_data (dict): Dictionary containing commit hash, repo name, and + commit message. 
+ + Returns: + list[dict]: A list of dicts, where each dict contains the data for a + change to a single file. + """ + if config.python_only and commit_data["language_name"] != "Python": + return [] + # Scrape a commit's diff file. + diff_url = f"https://github.com/{commit_data['repo_name']}/commit/{commit_data['commit']}.diff" + try: + diff = urllib.request.urlopen(diff_url) + encoding = diff.headers.get_charsets()[0] + patch = PatchSet(diff, encoding=encoding) + if len(patch) == 0: + return [] + except Exception as e: + # print(e, diff_url) + return [] + commit_list: list[dict] = [] + # Iterate over files within the diff. + for patch_ind in patch: + if config.ignore_deletions and patch_ind.target_file == "/dev/null": + continue + if config.diff_length_threshold > 0 and sum(len(hunk) for hunk in patch_ind) > config.diff_length_threshold: + continue + diff_dict: dict = process_ind_patch(patch_ind) + diff_dict["before_file"] = get_before_file(diff_dict, commit_data["commit"], commit_data["repo_name"], + length_threshold=config.code_length_threshold) + if not diff_dict["before_file"]: + # Happens if exception is thrown or file is too long. + continue + diff_dict["commit"] = commit_data["commit"] + diff_dict["message"] = commit_data["message"] + diff_dict["repo_name"] = commit_data["repo_name"] + diff_dict["language_name"] = commit_data["language_name"] + diff_dict["author_name"] = commit_data["author"]["name"] + diff_dict["license"] = commit_data["license"] + commit_list.append(diff_dict) + return commit_list + + class GitHubDiffDataset(Dataset): def __init__(self, config): self.config = config @@ -92,13 +140,11 @@ def id(self) -> str: class GitHubDiffScraper(Scraper): def __init__(self, config): # TODO: Dask multi-node scheduling here + # TODO: Release the GIL inside process_commit + self.config = config self.client = Client(n_workers=config.n_workers, threads_per_worker=config.threads_per_worker) self.read_path = Path(config.read_path) self.save_path = Path(config.save_path) - self.python_only = config.python_only - self.diff_length_threshold = config.diff_length_threshold - self.code_length_threshold = config.code_length_threshold - self.ignore_deletions = config.ignore_deletions def scrape(self) -> RawDataset: meta_spec = {'hunks': str, 'addition_count': int, 'deletion_count': int, @@ -108,60 +154,13 @@ def scrape(self) -> RawDataset: 'license': str} result = ( db.read_text(self.read_path).map(json.loads) - .map(self.process_commit).flatten().to_dataframe(meta=meta_spec) + .map(process_commit, config=self.config).flatten().to_dataframe(meta=meta_spec) .to_parquet(self.save_path) ) progress(result) dataset = RawDataset(storage_uris=["https://github.com/CarperAI/Code-Pile"], complete=True) return dataset - def process_commit(self, commit_data: dict) -> list[dict]: - """ - Process a commit dictionary to get the before files and diff dict. - - Args: - commit_data (dict): Dictionary containing commit hash, repo name, and - commit message. - - Returns: - list[dict]: A list of dicts, where each dict contains the data for a - change to a single file. - """ - if self.python_only and commit_data["language_name"] != "Python": - return [] - # Scrape a commit's diff file. 
- diff_url = f"https://github.com/{commit_data['repo_name']}/commit/{commit_data['commit']}.diff" - try: - diff = urllib.request.urlopen(diff_url) - encoding = diff.headers.get_charsets()[0] - patch = PatchSet(diff, encoding=encoding) - if len(patch) == 0: - return [] - except Exception as e: - # print(e, diff_url) - return [] - commit_list: list[dict] = [] - # Iterate over files within the diff. - for patch_ind in patch: - if self.ignore_deletions and patch_ind.target_file == "/dev/null": - continue - if self.diff_length_threshold > 0 and sum(len(hunk) for hunk in patch_ind) > self.diff_length_threshold: - continue - diff_dict: dict = process_ind_patch(patch_ind) - diff_dict["before_file"] = get_before_file(diff_dict, commit_data["commit"], commit_data["repo_name"], - length_threshold=self.code_length_threshold) - if not diff_dict["before_file"]: - # Happens if exception is thrown or file is too long. - continue - diff_dict["commit"] = commit_data["commit"] - diff_dict["message"] = commit_data["message"] - diff_dict["repo_name"] = commit_data["repo_name"] - diff_dict["language_name"] = commit_data["language_name"] - diff_dict["author_name"] = commit_data["author"]["name"] - diff_dict["license"] = commit_data["license"] - commit_list.append(diff_dict) - return commit_list - if __name__ == "__main__": parser = argparse.ArgumentParser('codepile dataset tool') From b5f4b92a3a4e6068486b96ebe1e6dc141348400b Mon Sep 17 00:00:00 2001 From: Reshinth Adithyan <36307201+reshinthadithyan@users.noreply.github.com> Date: Tue, 18 Oct 2022 18:00:05 +0530 Subject: [PATCH 12/16] GitHub Diff Repo Filter --- codepile/ghdiffs/ghdiffs_repo_filter.py | 96 +++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 codepile/ghdiffs/ghdiffs_repo_filter.py diff --git a/codepile/ghdiffs/ghdiffs_repo_filter.py b/codepile/ghdiffs/ghdiffs_repo_filter.py new file mode 100644 index 0000000..024b8d9 --- /dev/null +++ b/codepile/ghdiffs/ghdiffs_repo_filter.py @@ -0,0 +1,96 @@ +import re +from turtle import position +import pandas as pd +import requests +import logging +from io import StringIO +import os +from tqdm import tqdm +import json + +logging.basicConfig( + level = logging.INFO, + format= '%(asctime)s %(levelname)-8s %(message)s', + datefmt='%Y-%m-%d %H:%M:%S') +logger = logging.getLogger(__name__) + + +def read_json_file(json_path:str): + with open(json_path,"r") as f: + return [json.loads(line) for line in f] + +def write_json_file(json_path:str,obj:dict): + with open(json_path, "w") as jsonFile: + json.dump(obj, jsonFile) + +class GitHubDiffFilter: + def __init__(self) -> None: + repo_file_url = "https://raw.githubusercontent.com/EleutherAI/github-downloader/master/github_repositories.csv" + repo_file_string = StringIO(requests.get(repo_file_url).text) + self.single_file_count_threshold = 10_000 + self.save_every = 1000 + if not os.path.isdir("tmp"): + logger.info(f"Sucessfully added `tmp/` path") + os.mkdir("tmp") + + self.ckpt_path = os.path.join("tmp","ckpt.json") + self.repos_df = pd.read_csv(repo_file_string) + self.top_repos_list = self.get_top_repo_list() + self.checkpoint_list = [] #Would contain the curr_index of the last saved file. + logger.info(f"Size of top repos : {len(self.top_repos_list)}") + + def get_top_repo_list(self)->list: + return self.repos_df.iloc[:,0].values.tolist() + + def get_diff_path_list(self,github_diff_path:str)->list: + """ + Get all the files in the given path + returns : + github_diff_list (list) : GitHub Diff path. 
+ """ + return os.listdir(github_diff_path) + + def __call__(self,github_diff_path,output_file_path): + github_diff_files = self.get_diff_path_list(github_diff_path) + logger.info(f"Starting to process {len(github_diff_files)} files..") + #index and stuffs. + content_output_count = 0 + file_index = 0 + output_list = [] + output_file_index = 0 + for github_diff_file in tqdm(github_diff_files,total=len(github_diff_files)): + file_index += 1 # File index. + + if file_index == self.save_every: + write_json_file(self.ckpt_path,{"index":file_index}) + github_diff_file_path = os.path.join(github_diff_path,github_diff_file) + github_diff_content_list = read_json_file(github_diff_file_path) + for ind_content in tqdm(github_diff_content_list,total=len(github_diff_content_list),leave=False): + if ind_content["repo_name"] not in self.top_repos_list: + output_list.append(ind_content) + content_output_count += 1 + self.checkpoint_list.append(file_index) + + if content_output_count == self.single_file_count_threshold: + #If the number of elements in the list is + content_df = pd.DataFrame.from_dict(output_list,orient="columns") + output_inter_file_path = os.path.join(output_file_path,str(output_file_index)+".parquet") + content_df.to_parquet(output_inter_file_path) + output_file_index += 1 + + #Final Chunk goes here.. + write_json_file(self.ckpt_path,{"index":file_index}) + content_df = pd.DataFrame.from_dict(output_list,orient="columns") + output_final_file_path = os.path.join(output_file_path,str(output_file_index)+".parquet") + content_df.to_parquet(output_final_file_path) + + + + + + + + + +if __name__ == "__main__": + gh_diff_filter = GitHubDiffFilter() \ No newline at end of file From 84dcb2cc890de49f2415d1fb44e499d0aa33bf68 Mon Sep 17 00:00:00 2001 From: Reshinth Adithyan <36307201+reshinthadithyan@users.noreply.github.com> Date: Tue, 18 Oct 2022 18:08:16 +0530 Subject: [PATCH 13/16] Add resume checkpoint util --- codepile/ghdiffs/ghdiffs_repo_filter.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/codepile/ghdiffs/ghdiffs_repo_filter.py b/codepile/ghdiffs/ghdiffs_repo_filter.py index 024b8d9..9a14ea0 100644 --- a/codepile/ghdiffs/ghdiffs_repo_filter.py +++ b/codepile/ghdiffs/ghdiffs_repo_filter.py @@ -32,8 +32,12 @@ def __init__(self) -> None: if not os.path.isdir("tmp"): logger.info(f"Sucessfully added `tmp/` path") os.mkdir("tmp") - self.ckpt_path = os.path.join("tmp","ckpt.json") + + if os.path.exists(self.ckpt_path): + self.to_start = read_json_file(self.ckpt_path)["index"] + else: + self.to_start = 0 self.repos_df = pd.read_csv(repo_file_string) self.top_repos_list = self.get_top_repo_list() self.checkpoint_list = [] #Would contain the curr_index of the last saved file. @@ -58,6 +62,9 @@ def __call__(self,github_diff_path,output_file_path): file_index = 0 output_list = [] output_file_index = 0 + + #Start_ind + github_diff_files = github_diff_files[self.to_start:] for github_diff_file in tqdm(github_diff_files,total=len(github_diff_files)): file_index += 1 # File index. 
From 9ac9ece57996df445a6165273b1c6b706762a294 Mon Sep 17 00:00:00 2001 From: Herbie Bradley Date: Wed, 19 Oct 2022 02:21:58 +0100 Subject: [PATCH 14/16] Filter out zero diffs --- codepile/ghdiffs/ghdiffs_scrape.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/codepile/ghdiffs/ghdiffs_scrape.py b/codepile/ghdiffs/ghdiffs_scrape.py index d2085bc..753446f 100644 --- a/codepile/ghdiffs/ghdiffs_scrape.py +++ b/codepile/ghdiffs/ghdiffs_scrape.py @@ -5,7 +5,7 @@ from pathlib import Path import dask.bag as db from codepile.dataset import (Dataset, DatasetInfo, RawDataset, Scraper) -from dask.distributed import Client, progress +from dask.distributed import Client, progress, LocalCluster from unidiff import PatchSet @@ -54,7 +54,6 @@ def get_before_file(file_diff: dict, commit_hash: str, repo_name: str, length_th if length_threshold > 0 and len(raw_file) > length_threshold: return "" except Exception as e: - # print(e, file_raw_url) return "" # Iterate over hunks for this file and apply the reverse patch. for hunk in file_diff["hunks_process"]: @@ -99,6 +98,9 @@ def process_commit(commit_data: dict, config: argparse.Namespace) -> list[dict]: continue if config.diff_length_threshold > 0 and sum(len(hunk) for hunk in patch_ind) > config.diff_length_threshold: continue + # Filter non-text files. + if patch_ind.added == 0 and patch_ind.removed == 0: + continue diff_dict: dict = process_ind_patch(patch_ind) diff_dict["before_file"] = get_before_file(diff_dict, commit_data["commit"], commit_data["repo_name"], length_threshold=config.code_length_threshold) @@ -140,9 +142,9 @@ def id(self) -> str: class GitHubDiffScraper(Scraper): def __init__(self, config): # TODO: Dask multi-node scheduling here - # TODO: Release the GIL inside process_commit self.config = config - self.client = Client(n_workers=config.n_workers, threads_per_worker=config.threads_per_worker) + cluster = LocalCluster(n_workers=config.n_workers, threads_per_worker=config.threads_per_worker) + self.client = Client(cluster) self.read_path = Path(config.read_path) self.save_path = Path(config.save_path) @@ -168,7 +170,7 @@ def scrape(self) -> RawDataset: parser.add_argument('--read_path', type=str) parser.add_argument('--save_path', type=str) parser.add_argument('--n_workers', type=int, default=8) - parser.add_argument('--threads_per_worker', type=int, default=2) + parser.add_argument('--threads_per_worker', type=int, default=1) parser.add_argument('--python_only', type=bool, default=False) parser.add_argument('--diff_length_threshold', type=int, default=1000, help="Maximum number of lines in the diff for a *single* file. 
Set to 0 for no limit.") From 6633763fed9d7c6a975381b3d20ba5ebba9a2de0 Mon Sep 17 00:00:00 2001 From: Herbie Bradley Date: Wed, 19 Oct 2022 02:22:09 +0100 Subject: [PATCH 15/16] Add dummy parquet --- codepile/ghdiffs/test/ghdiffs_dummy.parquet | Bin 0 -> 7735 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 codepile/ghdiffs/test/ghdiffs_dummy.parquet diff --git a/codepile/ghdiffs/test/ghdiffs_dummy.parquet b/codepile/ghdiffs/test/ghdiffs_dummy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..e30733b2bf10784f09093357928000723f8010ee GIT binary patch literal 7735 zcmcgx&2QV-5$EQ^NjBcyqOA?PaN9t@3MlfS@2s`AM>(RY+OngpNlHZ_@JkY9lF~|~ zWjP2?6g}kFL(xM{d+4cuK>wf~dRXkKhn{li%p)Z}iLy4z3T7foeDm>}H#2W$2%nN# zX6w!tw{`0XZrsl~j!P!@wl;SQKjSubckXt4_k6f@Yn$8N+x|_GOWyz8Uce5F^KyUx z1Zbh)=c-{?p4D@kde3(~M#JS24exF-8Vu7m7ixI_$r2Uc-HxokKOE@ImSvj^{GV?x z5q@`v5%${Nyzu*F!hgFP34e~LP4m(--61%E!T;Z#W%wJBiieun>J7}833r#t-;3lo z^q%8btmofvED`{oX?UpmUZ*#hm;e6LW%8Sm{5{*!A!sw!KU^aFr_Z94{YA6s`nKIP z)Zuy4at-sc*1UD|06aP8}}J z9*#hEn(h;ERCs@q#_Zx;da@)o?b!7&o!F&Gny_n8_}SIju(A|3TxM#m+y!c%2Aph2FD$#~miVe)NaL zf=ZJ3KsCK_9~l$AYw111OaQm*JN@yL z(9^o6?%}#)da9v%DzLv!m@cqeZX0FKz9~Q5PrP{(88d4|f=&#mV#bhY`6to0XuyOhvzU6w)^Knm1ml;RIz-Nna96U~6Blu*tiNms#Il$E-AKUPxm&7qy z%6c(pC#g7coW5RsW(K8<6~5!Z_xPPTDWhOl15MO`e`8LnZPvt(M3KcHjLxtDVSS2SfYLcfU^2 zi~ODt@)ACH!S6ib+oViLo0Qsgjpt|RBV2qjrJs<);2G{z3#E&Mj0niVH&p`PR0!^M z`0%5;TsW^Q$AH)adHyTWvHgZ1<&|vpLOYZ^%{{hGEM61sVoP(Sp;j&ee|4Z_L%z(u zrMw!oMJMF9i+#}iQQgX&J8WiDSM4q;&V|bWHezk6iXFHm8D^$T=eU5#zsD> zy99b%z60_dJyT4Z3QuXGEBUHiy=Vw$)=`OA!0YLUcFQ=lM^taAa#45sc0KnppvzLa zF4sDS@B(dzJxP8&TRd0gf{R$9EuGb@bY65b#Tnbfy_)07)GpEKWHiUS5-r?gqu-D( z&y{SUqseEWn^@{0*p@q{xVlgV-<7YFT&%ERt#1hS zg=QDKT2AsqoG|WH)`xx2ldT4JEU?q|je~kR@I%Hngfiqotuu|?VR|SYGO;dI&cRo} z7sHOlUW9zn1Sze`MIZAHd=}s=$_2>J$PUOSg)gxFh8U---GzAod%7I9AXjJN>sH70 zOc89lw6BY?o~v|~Y@@G*`FB%sHUuNh;x349x{<53V0{?o*P^&W{@PtFTMPFW_nDTh zeA|$#tHhT2OUX)kLy&wu3o*86{w|0u`b}yzluo)K7p^eI)X$4z`_s<(;`llhyWtea z4QY9d>yG4Txr)vD`=;UyXV}wZJM@3N#>~)f%x@W6&s95(Y*2F-Q!Afg1ovZ zyWEPc3($*#p0co)h;g2qIu}Fer>-HFUC6hfhg{P)H0%{Q3H&Rud$}n)JBCv0LB6xt z3(jsiV+frK-DdW3Q*m}Kp>JYsxioSRPI+wNY*DI=Ou0PPonkJqqhafA z6=sS&om);`NgX8cze=StbMpLT-bW6+&f&w48w!6h z!G-7J)I}=QL3w0 z>J0IKZ@GVFdn*}J`wq~(2yu;nFrWDNoB?J%M)|8b`IvvuemZ7lI-}%82(^M0r$9I&_sfiRLSPBq#UYyz?!=mMtd0j`5_(;$ZVFuz}VK!o5uClU!9 whR@v2Z~N-tTi^7a>izyx`ri8}e6f8*-*|U_fIl^T;D7F4zv8%4_>=Mf0pKy7h5!Hn literal 0 HcmV?d00001 From 6b61fbe4ea23af9543dd45e945a9f7b7726dc5c8 Mon Sep 17 00:00:00 2001 From: Herbie Bradley Date: Wed, 19 Oct 2022 02:22:40 +0100 Subject: [PATCH 16/16] Fix bug & change output to json for easy scraping --- codepile/ghdiffs/ghdiffs_repo_filter.py | 77 ++++++++++++------------- 1 file changed, 36 insertions(+), 41 deletions(-) diff --git a/codepile/ghdiffs/ghdiffs_repo_filter.py b/codepile/ghdiffs/ghdiffs_repo_filter.py index 9a14ea0..0082cfe 100644 --- a/codepile/ghdiffs/ghdiffs_repo_filter.py +++ b/codepile/ghdiffs/ghdiffs_repo_filter.py @@ -9,20 +9,22 @@ import json logging.basicConfig( - level = logging.INFO, - format= '%(asctime)s %(levelname)-8s %(message)s', + level=logging.INFO, + format='%(asctime)s %(levelname)-8s %(message)s', datefmt='%Y-%m-%d %H:%M:%S') logger = logging.getLogger(__name__) -def read_json_file(json_path:str): - with open(json_path,"r") as f: +def read_json_file(json_path: str): + with open(json_path, "r") as f: return [json.loads(line) for line in f] -def write_json_file(json_path:str,obj:dict): + +def write_json_file(json_path: str, obj: dict): with open(json_path, "w") 
as jsonFile: json.dump(obj, jsonFile) + class GitHubDiffFilter: def __init__(self) -> None: repo_file_url = "https://raw.githubusercontent.com/EleutherAI/github-downloader/master/github_repositories.csv" @@ -32,72 +34,65 @@ def __init__(self) -> None: if not os.path.isdir("tmp"): logger.info(f"Sucessfully added `tmp/` path") os.mkdir("tmp") - self.ckpt_path = os.path.join("tmp","ckpt.json") - + self.ckpt_path = os.path.join("tmp", "ckpt.json") + if os.path.exists(self.ckpt_path): self.to_start = read_json_file(self.ckpt_path)["index"] else: self.to_start = 0 - self.repos_df = pd.read_csv(repo_file_string) + self.repos_df = pd.read_csv(repo_file_string) self.top_repos_list = self.get_top_repo_list() - self.checkpoint_list = [] #Would contain the curr_index of the last saved file. + self.checkpoint_list = [] # Would contain the curr_index of the last saved file. logger.info(f"Size of top repos : {len(self.top_repos_list)}") - def get_top_repo_list(self)->list: - return self.repos_df.iloc[:,0].values.tolist() + def get_top_repo_list(self) -> list: + return self.repos_df.iloc[:, 0].values.tolist() - def get_diff_path_list(self,github_diff_path:str)->list: + def get_diff_path_list(self, github_diff_path: str) -> list: """ Get all the files in the given path returns : github_diff_list (list) : GitHub Diff path. """ return os.listdir(github_diff_path) - - def __call__(self,github_diff_path,output_file_path): + + def __call__(self, github_diff_path, output_file_path): github_diff_files = self.get_diff_path_list(github_diff_path) logger.info(f"Starting to process {len(github_diff_files)} files..") - #index and stuffs. + # index and stuffs. content_output_count = 0 file_index = 0 output_list = [] output_file_index = 0 - - #Start_ind + + # Start_ind github_diff_files = github_diff_files[self.to_start:] - for github_diff_file in tqdm(github_diff_files,total=len(github_diff_files)): - file_index += 1 # File index. + for github_diff_file in tqdm(github_diff_files, total=len(github_diff_files)): + file_index += 1 # File index. if file_index == self.save_every: - write_json_file(self.ckpt_path,{"index":file_index}) - github_diff_file_path = os.path.join(github_diff_path,github_diff_file) + write_json_file(self.ckpt_path, {"index": file_index}) + github_diff_file_path = os.path.join(github_diff_path, github_diff_file) github_diff_content_list = read_json_file(github_diff_file_path) - for ind_content in tqdm(github_diff_content_list,total=len(github_diff_content_list),leave=False): - if ind_content["repo_name"] not in self.top_repos_list: + for ind_content in tqdm(github_diff_content_list, total=len(github_diff_content_list), leave=False): + if ind_content["repo_name"] in self.top_repos_list: output_list.append(ind_content) content_output_count += 1 self.checkpoint_list.append(file_index) - + if content_output_count == self.single_file_count_threshold: - #If the number of elements in the list is - content_df = pd.DataFrame.from_dict(output_list,orient="columns") - output_inter_file_path = os.path.join(output_file_path,str(output_file_index)+".parquet") - content_df.to_parquet(output_inter_file_path) + # If the number of elements in the list is + content_df = pd.DataFrame.from_dict(output_list, orient="columns") + output_inter_file_path = os.path.join(output_file_path, str(output_file_index) + ".json") + content_df.to_json(output_inter_file_path, orient="records", lines=True) output_file_index += 1 - - #Final Chunk goes here.. 
- write_json_file(self.ckpt_path,{"index":file_index}) - content_df = pd.DataFrame.from_dict(output_list,orient="columns") - output_final_file_path = os.path.join(output_file_path,str(output_file_index)+".parquet") - content_df.to_parquet(output_final_file_path) - - - - - - + # Final Chunk goes here.. + write_json_file(self.ckpt_path, {"index": file_index}) + content_df = pd.DataFrame.from_dict(output_list, orient="columns") + output_final_file_path = os.path.join(output_file_path, str(output_file_index) + ".json") + content_df.to_json(output_final_file_path, orient="records", lines=True) if __name__ == "__main__": - gh_diff_filter = GitHubDiffFilter() \ No newline at end of file + gh_diff_filter = GitHubDiffFilter()
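The final patch flips the filter to keep commits whose repo_name is in the top-repositories list and writes each output chunk as newline-delimited JSON instead of parquet, so the scraping stage can consume it line by line. A rough sketch of that handoff with dask follows; the output directory and worker counts are assumptions for illustration, not the repo's defaults.

import json
from pathlib import Path

import dask.bag as db
from dask.distributed import Client, LocalCluster


def load_filtered_commits(filtered_dir: str) -> db.Bag:
    # Each chunk written by GitHubDiffFilter is JSON-lines: one commit record
    # (containing at least a repo_name field) per line.
    files = sorted(str(p) for p in Path(filtered_dir).glob("*.json"))
    return db.read_text(files).map(json.loads)


if __name__ == "__main__":
    cluster = LocalCluster(n_workers=4, threads_per_worker=1)  # illustrative sizing
    client = Client(cluster)
    commits = load_filtered_commits("filtered_output")  # hypothetical directory
    print(commits.take(2))  # peek at a couple of records before scraping them

Line-delimited JSON lets dask.bag split each file by line rather than loading whole parquet row groups, which is presumably the "easy scraping" the commit message refers to.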