Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion mtdata/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# Created: 4/4/20


__version__ = '0.4.3'
__version__ = '0.4.4-dev'
__description__ = 'mtdata is a tool to download datasets for machine translation'
__author__ = 'Thamme Gowda'

Expand Down
20 changes: 20 additions & 0 deletions mtdata/index/other.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,3 +144,23 @@ def load_all(index: Index):
l2_ext = l2.replace('_', "-")
index += Entry(did=f"Microsoft-ntrex-128-{l1}-{l2}", url=_url, filename="NTREX-52b9c57c.tar.gz",
in_ext='txt', in_paths=[f"*/NTREX-128/newstest2019-ref.{l1_ext}.txt", f"*/NTREX-128/newstest2019-ref.{l2_ext}.txt"])

### English - Bhojpuri ###
url="https://github.com/shashwatup9k/BHLTR/archive/2d2550033222.zip"
filename = url.split('/')[-1] # this will force to share the file across the entries
cite = ("ojha2019english",)
# Parallel corpora
for split, splitname in [ # shortname, fullname, suffix
("train", "training"),
("dev", "development"),
("test", "test.*")]:
f1 = f"*/parallel-corpora/eng--bho.{splitname}.eng"
f2 = f"*/parallel-corpora/eng--bho.{splitname}.bho"
index += Entry(did=DatasetId(group='BHLTR', name=split, version='1', langs=('eng', 'bho')),
url=url, filename=filename, ext='zip', in_ext='txt', in_paths=[f1, f2], cite=cite)
# monolingual corpora
for version, f1 in [
("1", "*/mono-bho-corpus/monolingual.bho"),
("2", "*/mono-bho-corpus/monolingual-v0.2.bho")]:
index += Entry(did=DatasetId(group='BHLTR', name=f'mono', version=version, langs=('bho',)),
url=url, filename=filename, ext='zip', in_ext='txt', in_paths=[f1], cite=cite)
37 changes: 30 additions & 7 deletions mtdata/map.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@
from mtdata.utils import IO


DELIM = '\t'
#DELIM = '\t'
DELIM = None
SENTINEL = None


Expand All @@ -49,11 +50,11 @@ def read_paths(paths: Iterator[List[Path]]) -> Iterator[Union[dict,list]]:
for rec in zip_longest(*streams):
if len(inps) > 1 and any(x is None for x in rec):
raise ValueError(f"Unequal number of lines detected in {inps} @ count: {counter}")
rec = '\t'.join(x.strip().replace('\t', ' ') for x in rec)
rec = '\t'.join(x.strip() for x in rec)
yield rec
counter += 1
n_data += counter
log.info(f"Producer: end of {inps}; count: {counter}")
log.info(f"Producer: End of {','.join(str(x) for x in inps)}; count: {counter}")
except Exception as e:
log.exception(f"Producer: error in {inps}: {e}")
log.info(f"Producer: finishing... n_ctrls: {n_ctrls}; n_data: {n_data:,}")
Expand Down Expand Up @@ -188,7 +189,11 @@ def read_stream(cls, paths: Iterator[List[Path]]) -> Iterator[Union[dict,list]]:

def read_input_paths(input, delim=DELIM):
for line in input:
parts = line.rstrip("\n").split(delim)
parts = line.rstrip("\n")
if delim:
parts = parts.split(delim)
else:
parts = parts.split() # white spaces
parts = [Path(p) for p in parts]
yield parts

Expand Down Expand Up @@ -219,26 +224,44 @@ def main():
stream = trim_stream(stream, skip=n_skip, limit=n_limit)

mapper = SubprocMapper(cmdline=args['cmdline'])
out = None
try:
out_stream = mapper(stream)
for rec in out_stream:
print(rec)
if isinstance(rec, dict):
if out is not None:
log.info(f"[[closing]] {out.name}")
out.close()
log.info(f"[[opening]] {rec['output']}")
out_path = Path(rec['output'])
if args['make_parents']:
out_path.parent.mkdir(parents=True, exist_ok=True)
out = out_path.open('w', encoding='utf-8', errors='replace')
else:
assert out is not None, f"Output file is not opened yet"
out.write(rec + '\n')
except Exception as e:
mapper.close()
raise

finally:
if out is not None:
log.info(f"((closing)) {out.name}")
out.close()

def parse_args():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-c', '--cmd', dest='cmdline', type=str, required=True,
help="Mapper command that maps line-by-line, maintains 1:1 mapping and the input order. For example: 'cat'")
parser.add_argument('-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
help="Listing file containing file paths. Atleast two paths per line is expected first one is input and last one is output")
parser.add_argument('-d', '--delim', type=str, default=DELIM, help="delimiter for paths in input")
parser.add_argument('-d', '--delim', type=str, default=DELIM, help="delimiter for paths in input. default=None => split by all whitespaces (space, tab etc.)")
parser.add_argument('-l', '--limit', type=int, default=0,
help="Limit data stream to these many lines. Score: for debugging and testing")
parser.add_argument('-s', '--skip', type=int, default=0,
help="Skip the first n records. Scope: for debugging and testing")
parser.add_argument('-p', '--parents', action='store_true', dest='make_parents',
help="Create parent directories for output files if they do not exist")
# return the parsed arguments
return parser.parse_args()

if __name__ == '__main__':
Expand Down
7 changes: 7 additions & 0 deletions mtdata/resource/refs.bib
Original file line number Diff line number Diff line change
Expand Up @@ -720,3 +720,10 @@ @misc{nagata2024japanesechinese
archivePrefix={arXiv},
primaryClass={cs.CL},
}

@article{ojha2019english,
title={English-Bhojpuri SMT System: Insights from the Karaka Model},
author={Ojha, Atul Kr},
journal={arXiv preprint arXiv:1905.02239},
year={2019}
}
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ test = [ "pytest", "pytest-cov[all]", "black", "isort", "mypy"]
mtdata = "mtdata.__main__:main"
mtdata-iso = "mtdata.iso.__main__:main"
mtdata-bcp47 = "mtdata.iso.bcp47:main"
mtdata-map = "mtdata.map:main"


[tool.setuptools.dynamic]
Expand Down
89 changes: 89 additions & 0 deletions tests/test_map_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import sys
import subprocess
from pathlib import Path


def run_map_cmd(cmd_args, cwd):
cmd = [sys.executable, '-m', 'mtdata.map', '-c', 'cat', '-i', str(cmd_args['list_file'])]
if cmd_args.get('make_parents'):
cmd.append('-p')
if cmd_args.get('skip'):
cmd.extend(['-s', str(cmd_args['skip'])])
if cmd_args.get('limit'):
cmd.extend(['-l', str(cmd_args['limit'])])
return subprocess.run(cmd, capture_output=True, text=True, cwd=cwd)


def test_map_cli_basic(tmp_path):
repo_root = Path(__file__).resolve().parents[1]
# create a simple input file
inp = tmp_path / 'in1.txt'
inp.write_text('line1\nline2\n')

outp = tmp_path / 'out' / 'o1.txt'
listing = tmp_path / 'list.txt'
listing.write_text(f"{inp}\t{outp}\n")

res = run_map_cmd({'list_file': listing, 'make_parents': True}, cwd=repo_root)
assert res.returncode == 0, f"STDERR:\n{res.stderr}"

assert outp.exists()
assert outp.read_text().splitlines() == ['line1', 'line2']


def test_map_cli_skip_limit(tmp_path):
repo_root = Path(__file__).resolve().parents[1]
inp = tmp_path / 'in2.txt'
inp.write_text('L1\nL2\nL3\n')

outp = tmp_path / 'out2.txt'
listing = tmp_path / 'list2.txt'
listing.write_text(f"{inp}\t{outp}\n")

# skip=1 should skip the first data line; limit=3 yields ctrl + two data lines
res = subprocess.run([sys.executable, '-m', 'mtdata.map', '-c', 'cat', '-i', str(listing), '-p', '-s', '1', '-l', '3'],
capture_output=True, text=True, cwd=repo_root)
assert res.returncode == 0, f"STDERR:\n{res.stderr}"

assert outp.exists()
assert outp.read_text().splitlines() == ['L2', 'L3']


def test_map_cli_multi_columns(tmp_path):
repo_root = Path(__file__).resolve().parents[1]

# first group of three input files
in1a = tmp_path / 'g1_a.txt'
in1b = tmp_path / 'g1_b.txt'
in1c = tmp_path / 'g1_c.txt'
in1a.write_text('a1\na2\n')
in1b.write_text('b1\nb2\n')
in1c.write_text('c1\nc2\n')

out1 = tmp_path / 'out_g1.txt'

# second group of three input files
in2a = tmp_path / 'g2_a.txt'
in2b = tmp_path / 'g2_b.txt'
in2c = tmp_path / 'g2_c.txt'
in2a.write_text('x1\nx2\n')
in2b.write_text('y1\ny2\n')
in2c.write_text('z1\nz2\n')

out2 = tmp_path / 'out_g2.txt'

listing = tmp_path / 'list_multi.txt'
listing.write_text(
f"{in1a}\t{in1b}\t{in1c}\t{out1}\n"
f"{in2a}\t{in2b}\t{in2c}\t{out2}\n"
)

res = subprocess.run([sys.executable, '-m', 'mtdata.map', '-c', 'cat', '-i', str(listing), '-p'],
capture_output=True, text=True, cwd=repo_root)
assert res.returncode == 0, f"STDERR:\n{res.stderr}"

assert out1.exists()
assert out1.read_text().splitlines() == ['a1\tb1\tc1', 'a2\tb2\tc2']

assert out2.exists()
assert out2.read_text().splitlines() == ['x1\ty1\tz1', 'x2\ty2\tz2']
Loading