diff --git a/mtdata/__init__.py b/mtdata/__init__.py index 4e7cf93..1e9c09b 100644 --- a/mtdata/__init__.py +++ b/mtdata/__init__.py @@ -4,7 +4,7 @@ # Created: 4/4/20 -__version__ = '0.4.3' +__version__ = '0.4.4-dev' __description__ = 'mtdata is a tool to download datasets for machine translation' __author__ = 'Thamme Gowda' diff --git a/mtdata/index/other.py b/mtdata/index/other.py index 006906b..12042ce 100644 --- a/mtdata/index/other.py +++ b/mtdata/index/other.py @@ -144,3 +144,23 @@ def load_all(index: Index): l2_ext = l2.replace('_', "-") index += Entry(did=f"Microsoft-ntrex-128-{l1}-{l2}", url=_url, filename="NTREX-52b9c57c.tar.gz", in_ext='txt', in_paths=[f"*/NTREX-128/newstest2019-ref.{l1_ext}.txt", f"*/NTREX-128/newstest2019-ref.{l2_ext}.txt"]) + + ### English - Bhojpuri ### + url="https://github.com/shashwatup9k/BHLTR/archive/2d2550033222.zip" + filename = url.split('/')[-1] # this will force to share the file across the entries + cite = ("ojha2019english",) + # Parallel corpora + for split, splitname in [ # shortname, fullname-pattern + ("train", "training"), + ("dev", "development"), + ("test", "test.*")]: + f1 = f"*/parallel-corpora/eng--bho.{splitname}.eng" + f2 = f"*/parallel-corpora/eng--bho.{splitname}.bho" + index += Entry(did=DatasetId(group='BHLTR', name=split, version='1', langs=('eng', 'bho')), + url=url, filename=filename, ext='zip', in_ext='txt', in_paths=[f1, f2], cite=cite) + # monolingual corpora + for version, f1 in [ + ("1", "*/mono-bho-corpus/monolingual.bho"), + ("2", "*/mono-bho-corpus/monolingual-v0.2.bho")]: + index += Entry(did=DatasetId(group='BHLTR', name='mono', version=version, langs=('bho',)), + url=url, filename=filename, ext='zip', in_ext='txt', in_paths=[f1], cite=cite) diff --git a/mtdata/map.py b/mtdata/map.py index 3297a64..25d5dc5 100644 --- a/mtdata/map.py +++ b/mtdata/map.py @@ -22,7 +22,8 @@ from mtdata.utils import IO -DELIM = '\t' +#DELIM = '\t' +DELIM = None SENTINEL = None @@ -49,11 +50,11 @@ def
read_paths(paths: Iterator[List[Path]]) -> Iterator[Union[dict,list]]: for rec in zip_longest(*streams): if len(inps) > 1 and any(x is None for x in rec): raise ValueError(f"Unequal number of lines detected in {inps} @ count: {counter}") - rec = '\t'.join(x.strip().replace('\t', ' ') for x in rec) + rec = '\t'.join(x.strip() for x in rec) yield rec counter += 1 n_data += counter - log.info(f"Producer: end of {inps}; count: {counter}") + log.info(f"Producer: End of {','.join(str(x) for x in inps)}; count: {counter}") except Exception as e: log.exception(f"Producer: error in {inps}: {e}") log.info(f"Producer: finishing... n_ctrls: {n_ctrls}; n_data: {n_data:,}") @@ -188,7 +189,11 @@ def read_stream(cls, paths: Iterator[List[Path]]) -> Iterator[Union[dict,list]]: def read_input_paths(input, delim=DELIM): for line in input: - parts = line.rstrip("\n").split(delim) + parts = line.rstrip("\n") + if delim: + parts = parts.split(delim) + else: + parts = parts.split() # white spaces parts = [Path(p) for p in parts] yield parts @@ -219,14 +224,29 @@ def main(): stream = trim_stream(stream, skip=n_skip, limit=n_limit) mapper = SubprocMapper(cmdline=args['cmdline']) + out = None try: out_stream = mapper(stream) for rec in out_stream: - print(rec) + if isinstance(rec, dict): + if out is not None: + log.info(f"[[closing]] {out.name}") + out.close() + log.info(f"[[opening]] {rec['output']}") + out_path = Path(rec['output']) + if args['make_parents']: + out_path.parent.mkdir(parents=True, exist_ok=True) + out = out_path.open('w', encoding='utf-8', errors='replace') + else: + assert out is not None, f"Output file is not opened yet" + out.write(rec + '\n') except Exception as e: mapper.close() raise - + finally: + if out is not None: + log.info(f"((closing)) {out.name}") + out.close() def parse_args(): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) @@ -234,11 +254,14 @@ def parse_args(): help="Mapper command that maps line-by-line, maintains 
1:1 mapping and the input order. For example: 'cat'") parser.add_argument('-i', '--input', type=argparse.FileType('r'), default=sys.stdin, help="Listing file containing file paths. Atleast two paths per line is expected first one is input and last one is output") - parser.add_argument('-d', '--delim', type=str, default=DELIM, help="delimiter for paths in input") + parser.add_argument('-d', '--delim', type=str, default=DELIM, help="delimiter for paths in input. default=None => split by all whitespaces (space, tab etc.)") parser.add_argument('-l', '--limit', type=int, default=0, help="Limit data stream to these many lines. Score: for debugging and testing") parser.add_argument('-s', '--skip', type=int, default=0, help="Skip the first n records. Scope: for debugging and testing") + parser.add_argument('-p', '--parents', action='store_true', dest='make_parents', + help="Create parent directories for output files if they do not exist") + # return the parsed arguments return parser.parse_args() if __name__ == '__main__': diff --git a/mtdata/resource/refs.bib b/mtdata/resource/refs.bib index eed9dc7..58d4568 100644 --- a/mtdata/resource/refs.bib +++ b/mtdata/resource/refs.bib @@ -720,3 +720,10 @@ @misc{nagata2024japanesechinese archivePrefix={arXiv}, primaryClass={cs.CL}, } + +@article{ojha2019english, + title={English-Bhojpuri SMT System: Insights from the Karaka Model}, + author={Ojha, Atul Kr}, + journal={arXiv preprint arXiv:1905.02239}, + year={2019} +} \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 118fe87..f872014 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,6 +61,7 @@ test = [ "pytest", "pytest-cov[all]", "black", "isort", "mypy"] mtdata = "mtdata.__main__:main" mtdata-iso = "mtdata.iso.__main__:main" mtdata-bcp47 = "mtdata.iso.bcp47:main" +mtdata-map = "mtdata.map:main" [tool.setuptools.dynamic] diff --git a/tests/test_map_cli.py b/tests/test_map_cli.py new file mode 100644 index 0000000..1d87e6a --- /dev/null +++ 
"""Tests for the `mtdata.map` CLI (invoked as `python -m mtdata.map`).

Each test runs the CLI as a subprocess with `cat` as the identity mapper and
verifies the files it writes. All three tests go through the shared
`run_map_cmd` helper (the original duplicated the subprocess invocation in two
of them even though the helper already supports skip/limit/parents).
"""
import sys
import subprocess
from pathlib import Path

# Repository root, so `python -m mtdata.map` resolves against the working tree.
# Hoisted to module level; previously recomputed inside every test.
REPO_ROOT = Path(__file__).resolve().parents[1]


def run_map_cmd(cmd_args, cwd):
    """Run `python -m mtdata.map -c cat` on the listing file and return the
    `subprocess.CompletedProcess`.

    cmd_args keys: 'list_file' (required, path to the listing file);
    optional 'make_parents' (bool -> -p), 'skip' (int -> -s), 'limit' (int -> -l).
    """
    cmd = [sys.executable, '-m', 'mtdata.map', '-c', 'cat',
           '-i', str(cmd_args['list_file'])]
    if cmd_args.get('make_parents'):
        cmd.append('-p')
    if cmd_args.get('skip'):
        cmd.extend(['-s', str(cmd_args['skip'])])
    if cmd_args.get('limit'):
        cmd.extend(['-l', str(cmd_args['limit'])])
    return subprocess.run(cmd, capture_output=True, text=True, cwd=cwd)


def test_map_cli_basic(tmp_path):
    """Single input column: lines copied verbatim; -p creates output parents."""
    inp = tmp_path / 'in1.txt'
    inp.write_text('line1\nline2\n')

    outp = tmp_path / 'out' / 'o1.txt'  # parent dir does not exist yet; -p must create it
    listing = tmp_path / 'list.txt'
    listing.write_text(f"{inp}\t{outp}\n")

    res = run_map_cmd({'list_file': listing, 'make_parents': True}, cwd=REPO_ROOT)
    assert res.returncode == 0, f"STDERR:\n{res.stderr}"

    assert outp.exists()
    assert outp.read_text().splitlines() == ['line1', 'line2']


def test_map_cli_skip_limit(tmp_path):
    """--skip drops leading data records; --limit caps the stream length."""
    inp = tmp_path / 'in2.txt'
    inp.write_text('L1\nL2\nL3\n')

    outp = tmp_path / 'out2.txt'
    listing = tmp_path / 'list2.txt'
    listing.write_text(f"{inp}\t{outp}\n")

    # skip=1 should skip the first data line; limit=3 yields ctrl + two data lines
    res = run_map_cmd({'list_file': listing, 'make_parents': True,
                       'skip': 1, 'limit': 3}, cwd=REPO_ROOT)
    assert res.returncode == 0, f"STDERR:\n{res.stderr}"

    assert outp.exists()
    assert outp.read_text().splitlines() == ['L2', 'L3']


def test_map_cli_multi_columns(tmp_path):
    """Multiple input columns per listing line are tab-joined in the output."""

    def make_group(prefix, contents):
        # One input file per (suffix, text) pair; returns the created paths.
        paths = []
        for suffix, text in contents:
            p = tmp_path / f'{prefix}_{suffix}.txt'
            p.write_text(text)
            paths.append(p)
        return paths

    g1 = make_group('g1', [('a', 'a1\na2\n'), ('b', 'b1\nb2\n'), ('c', 'c1\nc2\n')])
    out1 = tmp_path / 'out_g1.txt'
    g2 = make_group('g2', [('a', 'x1\nx2\n'), ('b', 'y1\ny2\n'), ('c', 'z1\nz2\n')])
    out2 = tmp_path / 'out_g2.txt'

    listing = tmp_path / 'list_multi.txt'
    listing.write_text(
        '\t'.join(str(p) for p in g1 + [out1]) + '\n'
        + '\t'.join(str(p) for p in g2 + [out2]) + '\n')

    res = run_map_cmd({'list_file': listing, 'make_parents': True}, cwd=REPO_ROOT)
    assert res.returncode == 0, f"STDERR:\n{res.stderr}"

    assert out1.exists()
    assert out1.read_text().splitlines() == ['a1\tb1\tc1', 'a2\tb2\tc2']

    assert out2.exists()
    assert out2.read_text().splitlines() == ['x1\ty1\tz1', 'x2\ty2\tz2']