thammegowda · thammegowda · Apr 27, 2025 · May 27, 2025 · Sep 17, 2025 · Sep 18, 2025
diff --git a/mtdata/__init__.py b/mtdata/__init__.py
@@ -4,7 +4,7 @@
 # Created: 4/4/20
 
 
-__version__ = '0.4.3'
+__version__ = '0.4.4-dev'
 __description__ = 'mtdata is a tool to download datasets for machine translation'
 __author__ = 'Thamme Gowda'
 

diff --git a/mtdata/index/other.py b/mtdata/index/other.py
@@ -144,3 +144,23 @@ def load_all(index: Index):
              l2_ext = l2.replace('_', "-")
              index += Entry(did=f"Microsoft-ntrex-128-{l1}-{l2}", url=_url, filename="NTREX-52b9c57c.tar.gz",
                        in_ext='txt', in_paths=[f"*/NTREX-128/newstest2019-ref.{l1_ext}.txt", f"*/NTREX-128/newstest2019-ref.{l2_ext}.txt"])
+
+    ### English - Bhojpuri ###
+    url="https://github.com/shashwatup9k/BHLTR/archive/2d2550033222.zip"
+    filename = url.split('/')[-1]  # this will force to share the file across the entries
+    cite = ("ojha2019english",)
+    # Parallel corpora
+    for split, splitname in [ # shortname, fullname, suffix
+            ("train", "training"),
+            ("dev", "development"),
+            ("test", "test.*")]:
+        f1 = f"*/parallel-corpora/eng--bho.{splitname}.eng"
+        f2 = f"*/parallel-corpora/eng--bho.{splitname}.bho"
+        index += Entry(did=DatasetId(group='BHLTR', name=split, version='1', langs=('eng', 'bho')),
+                    url=url, filename=filename, ext='zip', in_ext='txt', in_paths=[f1, f2], cite=cite)
+    # monolingual corpora
+    for version, f1 in [
+            ("1", "*/mono-bho-corpus/monolingual.bho"),
+            ("2", "*/mono-bho-corpus/monolingual-v0.2.bho")]:
+        index += Entry(did=DatasetId(group='BHLTR', name=f'mono', version=version, langs=('bho',)),
+                    url=url, filename=filename, ext='zip', in_ext='txt', in_paths=[f1], cite=cite)
diff --git a/mtdata/map.py b/mtdata/map.py
@@ -22,7 +22,8 @@
 from mtdata.utils import IO
 
 
-DELIM = '\t'
+#DELIM = '\t'
+DELIM = None
 SENTINEL = None
 
 
@@ -49,11 +50,11 @@ def read_paths(paths: Iterator[List[Path]]) -> Iterator[Union[dict,list]]:
             for rec in zip_longest(*streams):
                 if len(inps) > 1 and any(x is None for x in rec):
                     raise ValueError(f"Unequal number of lines detected in {inps} @ count: {counter}")
-                rec = '\t'.join(x.strip().replace('\t', ' ') for x in rec)
+                rec = '\t'.join(x.strip() for x in rec)
                 yield rec
                 counter += 1
             n_data += counter
-            log.info(f"Producer: end of {inps}; count: {counter}")
+            log.info(f"Producer: End of {','.join(str(x) for x in inps)}; count: {counter}")
         except Exception as e:
             log.exception(f"Producer: error in {inps}: {e}")
     log.info(f"Producer: finishing... n_ctrls: {n_ctrls};  n_data: {n_data:,}")
@@ -188,7 +189,11 @@ def read_stream(cls, paths: Iterator[List[Path]]) -> Iterator[Union[dict,list]]:
 
 def read_input_paths(input, delim=DELIM):
     for line in input:
-        parts = line.rstrip("\n").split(delim)
+        parts = line.rstrip("\n")
+        if delim:
+            parts = parts.split(delim)
+        else:
+            parts = parts.split() # white spaces
         parts = [Path(p) for p in parts]
         yield parts
 
@@ -219,26 +224,44 @@ def main():
         stream = trim_stream(stream, skip=n_skip, limit=n_limit)
 
     mapper = SubprocMapper(cmdline=args['cmdline'])
+    out = None
     try:
         out_stream = mapper(stream)
         for rec in out_stream:
-            print(rec)
+            if isinstance(rec, dict):
+                if out is not None:
+                    log.info(f"[[closing]] {out.name}")
+                    out.close()
+                log.info(f"[[opening]] {rec['output']}")
+                out_path = Path(rec['output'])
+                if args['make_parents']:
+                    out_path.parent.mkdir(parents=True, exist_ok=True)
+                out = out_path.open('w', encoding='utf-8', errors='replace')
+            else:
+                assert out is not None, f"Output file is not opened yet"
+                out.write(rec + '\n')
     except Exception as e:
         mapper.close()
         raise
-
+    finally:
+        if out is not None:
+            log.info(f"((closing)) {out.name}")
+            out.close()
 
 def parse_args():
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument('-c', '--cmd', dest='cmdline', type=str, required=True,
         help="Mapper command that maps line-by-line, maintains 1:1 mapping and the input order. For example: 'cat'")
     parser.add_argument('-i', '--input', type=argparse.FileType('r'), default=sys.stdin,
         help="Listing file containing file paths. Atleast two paths per line is expected first one is input and last one is output")
-    parser.add_argument('-d', '--delim', type=str, default=DELIM, help="delimiter for paths in input")
+    parser.add_argument('-d', '--delim', type=str, default=DELIM, help="delimiter for paths in input. default=None => split by all whitespaces (space, tab etc.)")
     parser.add_argument('-l', '--limit', type=int, default=0,
                         help="Limit data stream to these many lines. Score: for debugging and testing")
     parser.add_argument('-s', '--skip', type=int, default=0,
                         help="Skip the first n records. Scope: for debugging and testing")
+    parser.add_argument('-p', '--parents',  action='store_true', dest='make_parents',
+                        help="Create parent directories for output files if they do not exist")
+    # return the parsed arguments
     return parser.parse_args()
 
 if __name__ == '__main__':

diff --git a/mtdata/resource/refs.bib b/mtdata/resource/refs.bib
@@ -720,3 +720,10 @@ @misc{nagata2024japanesechinese
     archivePrefix={arXiv},
     primaryClass={cs.CL},
 }
+
+@article{ojha2019english,
+  title={English-Bhojpuri SMT System: Insights from the Karaka Model},
+  author={Ojha, Atul Kr},
+  journal={arXiv preprint arXiv:1905.02239},
+  year={2019}
+}
diff --git a/pyproject.toml b/pyproject.toml
@@ -61,6 +61,7 @@ test = [ "pytest", "pytest-cov[all]", "black", "isort", "mypy"]
 mtdata = "mtdata.__main__:main"
 mtdata-iso = "mtdata.iso.__main__:main"
 mtdata-bcp47 = "mtdata.iso.bcp47:main"
+mtdata-map = "mtdata.map:main"
 
 
 [tool.setuptools.dynamic]

diff --git a/tests/test_map_cli.py b/tests/test_map_cli.py
@@ -0,0 +1,89 @@
+import sys
+import subprocess
+from pathlib import Path
+
+
+def run_map_cmd(cmd_args, cwd):
+    cmd = [sys.executable, '-m', 'mtdata.map', '-c', 'cat', '-i', str(cmd_args['list_file'])]
+    if cmd_args.get('make_parents'):
+        cmd.append('-p')
+    if cmd_args.get('skip'):
+        cmd.extend(['-s', str(cmd_args['skip'])])
+    if cmd_args.get('limit'):
+        cmd.extend(['-l', str(cmd_args['limit'])])
+    return subprocess.run(cmd, capture_output=True, text=True, cwd=cwd)
+
+
+def test_map_cli_basic(tmp_path):
+    repo_root = Path(__file__).resolve().parents[1]
+    # create a simple input file
+    inp = tmp_path / 'in1.txt'
+    inp.write_text('line1\nline2\n')
+
+    outp = tmp_path / 'out' / 'o1.txt'
+    listing = tmp_path / 'list.txt'
+    listing.write_text(f"{inp}\t{outp}\n")
+
+    res = run_map_cmd({'list_file': listing, 'make_parents': True}, cwd=repo_root)
+    assert res.returncode == 0, f"STDERR:\n{res.stderr}"
+
+    assert outp.exists()
+    assert outp.read_text().splitlines() == ['line1', 'line2']
+
+
+def test_map_cli_skip_limit(tmp_path):
+    repo_root = Path(__file__).resolve().parents[1]
+    inp = tmp_path / 'in2.txt'
+    inp.write_text('L1\nL2\nL3\n')
+
+    outp = tmp_path / 'out2.txt'
+    listing = tmp_path / 'list2.txt'
+    listing.write_text(f"{inp}\t{outp}\n")
+
+    # skip=1 should skip the first data line; limit=3 yields ctrl + two data lines
+    res = subprocess.run([sys.executable, '-m', 'mtdata.map', '-c', 'cat', '-i', str(listing), '-p', '-s', '1', '-l', '3'],
+                         capture_output=True, text=True, cwd=repo_root)
+    assert res.returncode == 0, f"STDERR:\n{res.stderr}"
+
+    assert outp.exists()
+    assert outp.read_text().splitlines() == ['L2', 'L3']
+
+
+def test_map_cli_multi_columns(tmp_path):
+    repo_root = Path(__file__).resolve().parents[1]
+
+    # first group of three input files
+    in1a = tmp_path / 'g1_a.txt'
+    in1b = tmp_path / 'g1_b.txt'
+    in1c = tmp_path / 'g1_c.txt'
+    in1a.write_text('a1\na2\n')
+    in1b.write_text('b1\nb2\n')
+    in1c.write_text('c1\nc2\n')
+
+    out1 = tmp_path / 'out_g1.txt'
+
+    # second group of three input files
+    in2a = tmp_path / 'g2_a.txt'
+    in2b = tmp_path / 'g2_b.txt'
+    in2c = tmp_path / 'g2_c.txt'
+    in2a.write_text('x1\nx2\n')
+    in2b.write_text('y1\ny2\n')
+    in2c.write_text('z1\nz2\n')
+
+    out2 = tmp_path / 'out_g2.txt'
+
+    listing = tmp_path / 'list_multi.txt'
+    listing.write_text(
+        f"{in1a}\t{in1b}\t{in1c}\t{out1}\n"
+        f"{in2a}\t{in2b}\t{in2c}\t{out2}\n"
+    )
+
+    res = subprocess.run([sys.executable, '-m', 'mtdata.map', '-c', 'cat', '-i', str(listing), '-p'],
+                         capture_output=True, text=True, cwd=repo_root)
+    assert res.returncode == 0, f"STDERR:\n{res.stderr}"
+
+    assert out1.exists()
+    assert out1.read_text().splitlines() == ['a1\tb1\tc1', 'a2\tb2\tc2']
+
+    assert out2.exists()
+    assert out2.read_text().splitlines() == ['x1\ty1\tz1', 'x2\ty2\tz2']