64 changes: 64 additions & 0 deletions ConvertNER/NER_pwr_to_wiki.py
@@ -0,0 +1,64 @@
+# uncompyle6 version 3.3.1
+# Python bytecode 3.7 (3394)
+# Decompiled from: Python 3.7.1 (default, Nov 6 2018, 18:46:03)
+# [Clang 10.0.0 (clang-1000.11.45.5)]
+# Embedded file name: /Users/quark/studia/zpp/new/utils/ConvertNER/NER_pwr_to_spacy.py
+# Size of source mod 2**32: 1685 bytes
+NER_pwr_to_spacy = {'person_nam':'PER', 'institution_nam':'ORG',
+ 'city_nam':'LOC',
+ 'person_last_nam':'PER',
+ 'person_first_nam':'PER',
+ 'document_nam':'MISC',
+ 'event_nam':'MISC',
+ 'organization_nam':'ORG',
+ 'country_nam':'LOC',
+ 'title_nam':'MISC',
+ 'band_nam':'ORG',
+ 'periodic_nam':'MISC',
+ 'company_nam':'ORG',
+ 'facility_nam':'ORG',
+ 'brand_nam':'ORG',
+ 'political_party_nam':'ORG',
+ 'road_nam':'LOC',
+ 'admin1_nam':'LOC',
+ 'person_add_nam':'PER',
+ 'software_nam':'MISC',
+ 'nation_nam':'MISC',
+ 'tech_nam':'MISC',
+ 'nam':'MISC',
+ 'treaty_nam':'MISC',
+ 'web_nam':'MISC',
+ 'admin2_nam':'LOC',
+ 'award_nam':'MISC',
+ 'continent_nam':'LOC',
+ 'astronomical_nam':'LOC',
+ 'media_nam':'ORG',
+ 'river_nam':'LOC',
+ 'currency_nam':'MISC',
+ 'toponym_nam':'LOC',
+ 'mountain_nam':'LOC',
+ 'historical_region_nam':'LOC',
+ 'district_nam':'LOC',
+ 'country_region_nam':'LOC',
+ 'subdivision_nam':'ORG',
+ 'admin3_nam':'LOC',
+ 'region_nam':'LOC',
+ 'square_nam':'LOC',
+ 'park_nam':'LOC',
+ 'island_nam':'LOC',
+ 'system_nam':'MISC',
+ 'www_nam':'MISC',
+ 'person_group_nam':'MISC',
+ 'license_nam':'MISC',
+ 'lake_nam':'LOC',
+ 'animal_nam':'MISC',
+ 'sea_nam':'LOC',
+ 'person_adj_nam':'PER',
+ 'bay_nam':'LOC',
+ 'peninsula_nam':'LOC',
+ 'conurbation_nam':'LOC',
+ 'vehicle_nam':'MISC',
+ 'organization_sub_nam':'ORG',
+ 'ocean_nam':'LOC',
+ 'cape_nam':'LOC'}
+# okay decompiling NER_pwr_to_spacy.cpython-37.pyc
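
A quick sanity check on what this mapping does (an editor's sketch, not part of the diff): every fine-grained KPWr "*_nam" channel collapses onto one of four coarse CoNLL-style classes: PER, ORG, LOC and MISC. The import path below is an assumption taken from the new file's name; adjust it to wherever the module actually lands.

# Hedged sketch: the import path is assumed from the diff header above.
from collections import Counter

from ConvertNER.NER_pwr_to_wiki import NER_pwr_to_spacy

# The codomain should be exactly the four coarse classes.
assert set(NER_pwr_to_spacy.values()) == {'PER', 'ORG', 'LOC', 'MISC'}

# How many fine-grained channels land in each coarse class.
print(Counter(NER_pwr_to_spacy.values()))
# As of this diff: Counter({'LOC': 25, 'MISC': 18, 'ORG': 10, 'PER': 5})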
231 changes: 136 additions & 95 deletions ConvertNER/convert_NER_wroc.py
@@ -34,13 +34,15 @@ def __init__(self, orth, attribs, id):
         self.id = id
 
     def is_NE(self):
-        return len(self.attribs) != 0
+        return self.get_NE() is not None and self.get_NE() != "O"
 
     def get_NE(self):
-        return self.attribs[0] if len(self.attribs) > 0 else ""
+        for attrib in self.attribs:
+            for k in attrib:
+                if attrib[k] != "0":
+                    return k
 
-    def get_cooccurences(self):
-        res = setCounter
+        return None
 
     def __str__(self):
         return (self.orth + ":" + str(self.attribs))
@@ -50,8 +52,8 @@ def process_token(tok):
     attribs = []
     orth = tok.find("orth").text
     for ann in tok.iter("ann"):
-        if ann.attrib['chan'].endswith("nam") and ann.text == "1":
-            attribs += [ann.attrib['chan']]
+        if ann.attrib['chan'].endswith("nam"): # and ann.text != "0":
+            attribs += [{ann.attrib['chan']: ann.text}]
 
     return Token(orth, attribs, -1)
 
@@ -83,50 +85,80 @@ def get_all_labels_with_cardinalities(tokens):
 
 def map_labels(tokens, map):
     for tok in tokens:
-        tok.attribs = [map[attrib] for attrib in tok.attribs]
+        tok.attribs = [{map[k]: v} for attrib in tok.attribs for k, v in attrib.items()]
 
     return tokens
 
 
-def pick_tags(tokens):
-    # first and last separately
-    if len(tokens) == 0:
-        return tokens
-    if len(tokens) == 1:
-        if tokens[0].is_NE():
-            tokens[0].attribs = [tokens[0].attribs[0]]
-        return tokens
-
-    t0 = tokens[0]
-    if len(t0.attribs) > 1:
-        new_tag = get_common_tag(t0, tokens[1])
-        if new_tag is None:
-            t0.attribs = [t0.attribs[0]]
-        else:
-            t0.attribs = [new_tag]
-
-    for i in range(1, len(tokens) - 1):
-        if len(tokens[i].attribs) > 1:
-            new_tag = get_common_tag(tokens[i - 1], tokens[i])
-            if new_tag is None:
-                new_tag = get_common_tag(tokens[i], tokens[i + 1])
-                if new_tag is None:
-                    tokens[i].attribs = [tokens[i].attribs[0]]
-                else:
-                    tokens[i].attribs = [new_tag]
-            else:
-                tokens[i].attribs = [new_tag]
+def still_in_sequence(v1, v2):
+    return any(v1e == v2e != "0" for v1e, v2e in zip(v1,v2))
+
+
+def get_last_label(v):
+    for i, e in enumerate(v):
+        if e != "0":
+            return i
+    return None
+
+
+def get_label_set(v):
+    res = set()
+    for i, e in enumerate(v):
+        if e != "0":
+            res.add(i)
+
+    return res
+
+
+import random
+def get_any_label(v):
+    if v == emptyset():
+        return None
+    return random.sample(v, 1)[0]
 
-    te = tokens[-1]
-    if len(te.attribs) > 1:
-        new_tag = get_common_tag(te, tokens[-2])
-        if new_tag is None:
-            te.attribs = [te.attribs[0]]
+def emptyset():
+    return set()
+
+def get_longest_sequences(tokens):
+    res = []
+    b = 0
+    e = 0
+    attribs = [k for d in tokens[0].attribs for k in d]
+    last_set = None
+    label_set = emptyset()
+    while e != len(tokens)-1:
+        current_token = tokens[e]
+
+        if last_set == None or label_set == emptyset():
+            last_set = [v for d in current_token.attribs for k, v in d.items()]
+            label_set = get_label_set(last_set)
+            b = e
         else:
-            te.attribs = [new_tag]
+            new_set = [v for d in current_token.attribs for k, v in d.items()]
+            label_set = label_set.intersection(get_label_set(new_set))
+            if not still_in_sequence(last_set, new_set):
+                label_id = get_any_label(label_set)
+                if(label_id != None):
+                    label = attribs[label_id]
+                    res.append((b, e, label))
+                b = e
+                label_set = emptyset()
+
+            last_set = new_set
+        e += 1
+
+    return res
 
-    assert (all(len(t.attribs) <= 1 for t in [t0] + tokens + [te]))
-    return [t0] + tokens[1:-2] + [te]
+
+# emptyset = set()
+def pick_tags(tokens):
+    longest_sequences = get_longest_sequences(tokens)
+    for b, e, label in longest_sequences:
+        seq = tokens[b:e]
+        for tok in seq:
+            tok.attribs = [{label: '1'}]
+        tokens[b:e] = seq
+    return tokens
 
 
 def convert_to_biluo(tokens):
@@ -137,10 +169,10 @@ def convert_to_biluo(tokens):
             if token.is_NE():
                 if tokens[i + 1].is_NE() and token.get_NE() == tokens[i + 1].get_NE():
                     # inner NE
-                    out += [Token(token.orth, ["I-" + token.get_NE()], token.id)]
+                    out += [Token(token.orth, [{"I-" + token.get_NE(): '1'}], token.id)]
                 else:
                     # last NE
-                    out += [Token(token.orth, ["L-" + token.get_NE()], token.id)]
+                    out += [Token(token.orth, [{"L-" + token.get_NE(): '1'}], token.id)]
                     in_ne = False
             else:
                 # we shouldn't ever get here
@@ -151,82 +183,91 @@
                 # new NE
                 if tokens[i + 1].is_NE() and token.get_NE() == tokens[i + 1].get_NE():
                     # beginning NE
-                    out += [Token(token.orth, ["B-" + token.get_NE()], token.id)]
+                    out += [Token(token.orth, [{"B-" + token.get_NE(): '1'}], token.id)]
                     in_ne = True
                 else:
                     # unit NE
-                    out += [Token(token.orth, ["U-" + token.get_NE()], token.id)]
+                    out += [Token(token.orth, [{"U-" + token.get_NE(): '1'}], token.id)]
                    in_ne = False
            else:
                # outside of NE
-                out += [Token(token.orth, ["O"], token.id)]
+                out += [Token(token.orth, [{"O": '1'}], token.id)]
 
     # process last token
     token = tokens[-1]
     if in_ne:
-        out += [Token(token.orth, ["L-" + token.get_NE()], token.id)]
+        out += [Token(token.orth, [{"L-" + token.get_NE(): '1'}], token.id)]
     else:
         if token.is_NE():
-            out += [Token(token.orth, ["U-" + token.get_NE()], token.id)]
+            out += [Token(token.orth, [{"U-" + token.get_NE(): '1'}], token.id)]
         else:
-            out += [Token(token.orth, ["O"], token.id)]
+            out += [Token(token.orth, [{"O": '1'}], token.id)]
 
     return out
 
 
+def get_file_paths(index_path):
+    with open(index_path) as index_file:
+        files = []
+        line = index_file.readline()
+        while line:
+            line = line.replace('\n', '')
+            files.append(line)
+            line = index_file.readline()
+
+    return files
+
+
 @click.command()
 @click.option("-m", "--use-label-map", type=bool, default=False)
 @click.argument("output_path", type=str)
 def main(
     use_label_map,
     output_path,
 ):
-    if use_label_map:
-        # classes = set(NER_pwr_to_spacy.values())
-        # output = f'NER_wroc_{len(classes)}.json'
-        # this would be a cool feature but I'm not sure if it's good for automatic pipelines
-        output = 'NER_wroc_spacy_labels.json'
-    all_labels = setCounter()
     corpus = []
     doc_idx = 0
-    for subfolder in get_subdirs(os.path.join(path_prefix, corpus_path)):
-        for file in os.listdir(os.path.join(path_prefix, corpus_path, subfolder)):
-            if not file.endswith("rel.xml") and not file.endswith(".ini"):
-                sentences = []
-                token_idx = 0
-                tree = ET.parse(os.path.join(path_prefix, corpus_path, subfolder, file))
-                root = tree.getroot()
-                sents = root.iter("sentence")
-                for sent in sents:
-                    tokens = []
-                    for tok in sent.iter("tok"):
-                        token = process_token(tok)
-                        token.id = token_idx
-                        token_idx += 1
-                        tokens += [token]
-
-                    all_labels.merge(get_all_labels_with_cardinalities(tokens)) # for debug and analysis
-                    tokens = pick_tags(tokens)
-                    if use_label_map:
-                        tokens = map_labels(tokens, NER_pwr_to_spacy)
-                    tokens = convert_to_biluo(tokens)
-
-                    sent = {'tokens': [{
-                        'orth': t.orth,
-                        'id': t.id,
-                        'ner': t.get_NE()}
-                        for t in tokens
-                    ], 'brackets': []
-                    }
-
-                    sentences += [sent]
-
-                doc_json = {
-                    'id': doc_idx,
-                    'paragraphs': [{'sentences': sentences}]
-                }
-                corpus += [doc_json]
-                doc_idx += 1
+    file_paths = get_file_paths(os.path.join(path_prefix, corpus_path, 'index_names.txt'))
+    for file in file_paths:
+        file = os.path.join(path_prefix, corpus_path, file)
+        assert(not file.endswith("rel.xml") and not file.endswith(".ini"))
+        sentences = []
+        token_idx = 0
+        tree = ET.parse(file)
+        root = tree.getroot()
+        sents = root.iter("sentence")
+        for sent in sents:
+            tokens = []
+            for tok in sent.iter("tok"):
+                token = process_token(tok)
+                token.id = token_idx
+                token_idx += 1
+                tokens += [token]
+
+            # all_labels.merge(get_all_labels_with_cardinalities(tokens)) # for debug and analysis
+            tokens = pick_tags(tokens)
+            # tokens = flatten_token_attrib_dicts(tokens)
+
+            if use_label_map:
+                tokens = map_labels(tokens, NER_pwr_to_spacy)
+            tokens = convert_to_biluo(tokens)
+
+            sent = {'tokens': [{
+                'orth': t.orth,
+                'id': t.id,
+                'ner': t.get_NE()}
+                for t in tokens
+            ], 'brackets': []
+            }
+
+            sentences += [sent]
+
+        doc_json = {
+            'id': doc_idx,
+            'paragraphs': [{'sentences': sentences}]
+        }
+        corpus += [doc_json]
+        doc_idx += 1
 
     with open(os.path.expanduser(output_path), 'w+') as f:
         json.dump(corpus, f)
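
For review context: convert_to_biluo emits the BILUO scheme that spaCy's v2 training JSON expects, with B(egin), I(nside) and L(ast) over multi-token entities, U(nit) for single-token entities, and O elsewhere. Below is a minimal, self-contained illustration of that span-to-BILUO logic. The helper tag_biluo and its flat label list are hypothetical stand-ins, not this file's Token-based implementation, which carries each label in a one-entry dict such as {'B-PER': '1'}.

# Hypothetical helper, not the function in convert_NER_wroc.py: same BILUO
# branch structure, but over a flat list of labels (None = outside any entity).
def tag_biluo(labels):
    tags = []
    n = len(labels)
    for i, lab in enumerate(labels):
        if lab is None:
            tags.append('O')
            continue
        starts = i == 0 or labels[i - 1] != lab
        ends = i == n - 1 or labels[i + 1] != lab
        if starts and ends:
            tags.append('U-' + lab)
        elif starts:
            tags.append('B-' + lab)
        elif ends:
            tags.append('L-' + lab)
        else:
            tags.append('I-' + lab)
    return tags

# "Jan Kowalski mieszka we Wrocławiu ." has a 2-token PER and a 1-token LOC:
print(tag_biluo(['PER', 'PER', None, None, 'LOC', None]))
# ['B-PER', 'L-PER', 'O', 'O', 'U-LOC', 'O']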
13 changes: 7 additions & 6 deletions NER_wroc-19.json.dvc
@@ -1,15 +1,16 @@
-cmd: python ConvertNER/convert_NER_wroc.py -m true data/NER/NER_wroc-19.json
+cmd: python ConvertNER/convert_NER_wroc.py -m true data/NER/NER_wroc-19.json
 deps:
-- md5: d84971d4b907e5efc5d9320de6691027.dir
+- md5: edb877fcf74af64289c0c32299288927.dir
   path: data/kpwr-1.1
-- md5: c8aa684e59762c66aeba79e2727c103f
+- md5: 0c7d0ba89998f4c6c7cf84a50d2a6654
   path: ConvertNER/convert_NER_wroc.py
-- md5: eee1569106fcf22473ee5a39f49f57bd
+- md5: d7d343ce8b47f93f20e3870d91c6150e
   path: ConvertNER/NER_pwr_to_spacy.py
-md5: 8edf603b1572083aedf1a95147deec94
+locked: true
+md5: 9618dffa84d6309d470a77da9e8de843
 outs:
 - cache: true
-  md5: ffd284e41307a7b0815d10623a0b4c99
+  md5: 25117be4c42e22d242c1e50d066fa35d
   metric: false
   path: data/NER/NER_wroc-19.json
   persist: false
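
The refreshed checksums pin the regenerated corpus to this version of the converter, so dvc repro will treat the stage as up to date. As a hedged spot-check of the output (assuming the spaCy v2 JSON layout produced by main above, run from the repository root):

# Sketch: tally BILUO tags in the regenerated corpus; the path matches the
# `outs` entry of this .dvc stage.
import json
from collections import Counter

with open('data/NER/NER_wroc-19.json') as f:
    corpus = json.load(f)

tags = Counter(
    tok['ner']
    for doc in corpus
    for para in doc['paragraphs']
    for sent in para['sentences']
    for tok in sent['tokens']
)
print(len(corpus), 'documents;', sum(tags.values()), 'tokens')
print(tags.most_common(8))  # 'O' should dominate; the rest are B-/I-/L-/U- tags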