11 changes: 3 additions & 8 deletions ConvertNER/convert_NER.py
@@ -1,7 +1,6 @@
import xml.etree.ElementTree as ET
from spacy.lang.pl import Polish
from spacy.gold import biluo_tags_from_offsets
import spacy
from spacy.lang.pl import Polish
import json
import os

@@ -139,8 +138,8 @@ def required_files_exist(dir):
    if not os.path.isdir((os.path.join(path_prefix,corpus_path,current_folder))):
        continue

    # we skip the docs that don't have the required annotations (certain .xml files)
    if not required_files_exist(current_folder):
        # doc_id +=1 ?
        continue

    tree_morphosyntax = ET.parse(os.path.join(path_prefix,corpus_path,current_folder,morphosyntax_xml))
@@ -209,9 +208,5 @@ def required_files_exist(dir):
    doc_id += 1
    corpus += [doc_json]

out_path = os.path.expanduser(os.path.join(path_prefix, output_path))
if not os.path.exists(out_path):
    os.makedirs(out_path)

with open(os.path.join(out_path, output), 'w+') as f:
with open(os.path.expanduser(os.path.join(path_prefix, output_path, output)), 'w+') as f:
    json.dump(corpus, f)
185 changes: 185 additions & 0 deletions ConvertNER/convert_NER_wroc.py
@@ -0,0 +1,185 @@
import xml.etree.ElementTree as ET
import json
import os

path_prefix = './'
corpus_path = 'data/kpwr-1.1/'
output_path = 'data/NER/'
output = 'NER_wroc.json'
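
# Input/output locations: the KPWr 1.1 corpus is expected under data/kpwr-1.1/
# (tracked via data/kpwr-1.1.dvc in this change) and the converted corpus is
# written as spaCy-style training JSON to data/NER/NER_wroc.json.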

doc_id = 0
corpus = []

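# Mapping from NKJP-style entity type names to spaCy labels. Note that it is not
# applied anywhere below; the raw KPWr annotation channel names collected in
# process_token() are what ends up in the output.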
NE_njkp_to_spacy = {'persName': 'PERSON',
                    'placeName': 'LOC',
                    'orgName': 'ORG',
                    'date': 'DATE',
                    'time': 'TIME',
                    'geogName': 'LOC'}


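# Minimal token container: `orth` is the surface form, `attribs` the list of NE
# annotation channels the token belongs to, and `id` its running index in the document.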
class Token:
    def __init__(self, orth, attribs, id):
        self.orth = orth
        self.attribs = attribs
        self.id = id

    def is_NE(self):
        return len(self.attribs) != 0

    def get_NE(self):
        return self.attribs[0] if len(self.attribs) > 0 else ""

    def __str__(self):
        return (self.orth + ":" + str(self.attribs))


def get_subdirs(dir):
    return [name for name in os.listdir(dir) if os.path.isdir(os.path.join(dir, name))]


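# A <tok> element holds its surface form in <orth> and one <ann> per annotation
# channel; channels whose name ends in "nam" and whose value is non-zero mark the
# token as belonging to a named entity of that type.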
def process_token(tok):
    attribs = []
    orth = tok.find("orth").text
    for ann in tok.iter("ann"):
        if ann.attrib['chan'].endswith("nam") and ann.text != "0":
            attribs += [ann.attrib['chan']]

    return Token(orth, attribs, -1)


def get_common_tag(t1, t2):
    set1 = set(t1.attribs)
    set2 = set(t2.attribs)
    common = list(set1 & set2)
    return common[0] if len(common) > 0 else None


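# pick_tags reduces tokens annotated with several NE channels to a single tag:
# each token prefers the tag it shares with a neighbour (previous first, then next)
# and falls back to its own first tag otherwise. The first and last tokens have
# only one neighbour each, so they are handled separately.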
def pick_tags(tokens):
    # first and last separately
    if len(tokens) == 0:
        return tokens
    if len(tokens) == 1:
        if tokens[0].is_NE():
            tokens[0].attribs = [tokens[0].attribs[0]]
        return tokens

    t0 = tokens[0]
    if len(t0.attribs) > 1:
        new_tag = get_common_tag(t0, tokens[1])
        if new_tag is None:
            t0.attribs = [t0.attribs[0]]
        else:
            t0.attribs = [new_tag]

    for i in range(1, len(tokens) - 1):
        if len(tokens[i].attribs) > 1:
            new_tag = get_common_tag(tokens[i - 1], tokens[i])
            if new_tag is None:
                new_tag = get_common_tag(tokens[i], tokens[i + 1])
                if new_tag is None:
                    tokens[i].attribs = [tokens[i].attribs[0]]
                else:
                    tokens[i].attribs = [new_tag]
            else:
                tokens[i].attribs = [new_tag]

    te = tokens[-1]
    if len(te.attribs) > 1:
        new_tag = get_common_tag(te, tokens[-2])
        if new_tag is None:
            te.attribs = [te.attribs[0]]
        else:
            te.attribs = [new_tag]

    assert all(len(t.attribs) <= 1 for t in tokens)
    # t0 and te alias tokens[0] and tokens[-1], so the middle slice must be [1:-1];
    # a [1:-2] slice would silently drop the second-to-last token.
    return [t0] + tokens[1:-1] + [te]


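# convert_to_biluo rewrites the single NE tag on each token into spaCy's BILUO
# scheme: B-/I-/L- for the beginning, inside and last token of a multi-token entity,
# U- for a single-token entity, and O for tokens outside any entity. Consecutive
# tokens carrying the same channel are treated as one entity.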
def convert_to_biluo(tokens):
    out = []
    in_ne = False
    for i, token in enumerate(tokens[:-1]):
        if in_ne:
            if token.is_NE():
                if tokens[i + 1].is_NE() and token.get_NE() == tokens[i + 1].get_NE():
                    # inner NE
                    out += [Token(token.orth, ["I-" + token.get_NE()], token.id)]
                else:
                    # last NE
                    out += [Token(token.orth, ["L-" + token.get_NE()], token.id)]
                    in_ne = False
            else:
                # we shouldn't ever get here
                assert (False)

        else:
            if token.is_NE():
                # new NE
                if tokens[i + 1].is_NE() and token.get_NE() == tokens[i + 1].get_NE():
                    # beginning NE
                    out += [Token(token.orth, ["B-" + token.get_NE()], token.id)]
                    in_ne = True
                else:
                    # unit NE
                    out += [Token(token.orth, ["U-" + token.get_NE()], token.id)]
                    in_ne = False
            else:
                # outside of NE
                out += [Token(token.orth, ["O"], token.id)]

    # process last token
    token = tokens[-1]
    if in_ne:
        out += [Token(token.orth, ["L-" + token.get_NE()], token.id)]
    else:
        if token.is_NE():
            out += [Token(token.orth, ["U-" + token.get_NE()], token.id)]
        else:
            out += [Token(token.orth, ["O"], token.id)]

    return out


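# Main conversion loop: walk each subdirectory of the KPWr corpus, parse every
# annotation XML (relation files and .ini configs are skipped), run each sentence
# through pick_tags and convert_to_biluo, and emit one document per file in the
# spaCy JSON training layout ({'id': ..., 'paragraphs': [{'sentences': [...]}]}).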
docs = []
doc_idx = 0
for subfolder in get_subdirs(os.path.join(path_prefix, corpus_path)):
    for file in os.listdir(os.path.join(path_prefix, corpus_path, subfolder)):
        if not file.endswith("rel.xml") and not file.endswith(".ini"):
            doc_json = {}
            sentences = []
            token_idx = 0
            raw = ""
            tree = ET.parse(os.path.join(path_prefix, corpus_path, subfolder, file))
            root = tree.getroot()
            sents = root.iter("sentence")
            for sent in sents:
                tokens = []
                for tok in sent.iter("tok"):
                    token = process_token(tok)
                    token.id = token_idx
                    token_idx += 1
                    tokens += [token]

                tokens = pick_tags(tokens)
                tokens = convert_to_biluo(tokens)

                sent = {'tokens': [{'orth': t.orth,
                                    'id': t.id,
                                    'ner': t.get_NE()}
                                   for t in tokens],
                        'brackets': []}

                sentences += [sent]

            doc_json = {
                'id': doc_idx,
                'paragraphs': [{'sentences': sentences}]
            }
            corpus += [doc_json]
            doc_idx += 1

with open(os.path.expanduser(os.path.join(path_prefix, output_path, output)), 'w+') as f:
    json.dump(corpus, f)
1 change: 1 addition & 0 deletions data/.gitignore
@@ -11,3 +11,4 @@ vocab.jsonl
/vectors_300.txt
tagmap.py
NKJP-PodkorpusMilionowy-1.2
/kpwr-1.1
7 changes: 7 additions & 0 deletions data/kpwr-1.1.dvc
@@ -0,0 +1,7 @@
md5: 58cbc0bd05749d04e4b6a5e4c9d78c01
outs:
- cache: true
  md5: d84971d4b907e5efc5d9320de6691027.dir
  metric: false
  path: kpwr-1.1
wdir: .
3 changes: 2 additions & 1 deletion data/lemmatizer_data/.gitignore
@@ -4,4 +4,5 @@
/rules.json

/lemma_sources
/sjp_ispell.tar.bz2
/sjp_ispell.tar.bz2
/lemma_sources_exp
14 changes: 14 additions & 0 deletions lemma_sources_exp.dvc
@@ -0,0 +1,14 @@
cmd: python lemma_rules_extraction/yield_all_suffixes.py
deps:
- md5: 947b48802b53bdff2ad02122c04063e5.dir
  path: data/lemmatizer_data/lemma_sources
- md5: abb151c19621000ccc91d8e4b494674f
  path: lemma_rules_extraction/yield_all_suffixes.py
md5: 53ab70bcbc27640766d00b4977b4733d
outs:
- cache: true
  md5: 4f29377d4c9dd5c9997c74750af31321.dir
  metric: false
  path: data/lemmatizer_data/lemma_sources_exp
  persist: false
wdir: .