From be44c727e1f6456750f01adf799a1fb2d1c746d8 Mon Sep 17 00:00:00 2001
From: Ksiazek <piotr.ksiazek96@gmail.com>
Date: Tue, 21 May 2019 13:36:57 +0200
Subject: [PATCH 1/8] Fix conversion script (messy atm)

---
 ConvertNER/convert_NER_wroc.py | 95 ++++++++++++++++++++++++++++++++--
 data/kpwr-1.1.dvc              |  3 +-
 2 files changed, 93 insertions(+), 5 deletions(-)

diff --git a/ConvertNER/convert_NER_wroc.py b/ConvertNER/convert_NER_wroc.py
index 76cebde..3ba598b 100644
--- a/ConvertNER/convert_NER_wroc.py
+++ b/ConvertNER/convert_NER_wroc.py
@@ -50,8 +50,8 @@ def process_token(tok):
     attribs = []
     orth = tok.find("orth").text
     for ann in tok.iter("ann"):
-        if ann.attrib['chan'].endswith("nam") and ann.text == "1":
-            attribs += [ann.attrib['chan']]
+        if ann.attrib['chan'].endswith("nam"): # and ann.text != "0":
+            attribs += [{ann.attrib['chan']:ann.text}]
 
     return Token(orth, attribs, -1)
 
@@ -88,6 +88,91 @@ def map_labels(tokens, map):
     return tokens
 
 
+def get_longest_sequences(tokens):
+    res = []
+    b = 0
+    e = 0
+    # type = None
+
+    last_set = set()
+    while e != len(tokens)-1:
+        current_token = tokens[e]
+
+        if last_set == emptyset:
+            last_set = set(current_token.attribs)
+            b = e
+        else:
+            new_set = set(current_token.attribs) & last_set
+            if new_set == emptyset:
+                label = list(last_set)[0]
+                res.append((b, e, label))
+                b = e
+
+            last_set = new_set
+        e += 1
+
+    return res
+
+
+def still_in_sequence(v1, v2):
+    return any(v1e == v2e != "0" for v1e in v1 for v2e in v2)
+
+
+def get_last_label(v):
+    for i, e in enumerate(v):
+        if e != "0":
+            return i
+    return None
+
+
+def get_longest_sequences_2(tokens):
+    res = []
+    b = 0
+    e = 0
+    attribs = [k for d in tokens[0].attribs for k in d]
+    last_set = None
+
+    while e != len(tokens)-1:
+        current_token = tokens[e]
+
+        if last_set == None:
+            last_set = [v for d in current_token.attribs for k, v in d.items()]
+            b = e
+        else:
+            new_set = [v for d in current_token.attribs for k, v in d.items()]
+            if not still_in_sequence(last_set, new_set):
+                label_id = get_last_label(last_set)
+                if(label_id != None):
+                    label = attribs[label_id]
+                    res.append((b, e, label))
+                b = e
+
+            last_set = new_set
+        e += 1
+
+    return res
+
+
+emptyset = set()
+def pick_tags_2(tokens):
+    longest_sequences = get_longest_sequences_2(tokens)
+    res = []
+    for b, e, label in longest_sequences:
+        seq = tokens[b:e]
+        for tok in seq:
+            tok.attribs = [{label: '1'}]
+        # res += seq
+        tokens[b:e] = seq
+    return tokens
+
+
+def flatten_token_attrib_dicts(tokens):
+    for tok in tokens:
+        tok.attribs = [k for k in tok.attribs[0].keys()] if len(tok.attribs) > 0 is not None else []
+
+    return tokens
+
+
 def pick_tags(tokens):
     # first and last separately
     if len(tokens) == 0:
@@ -205,8 +290,10 @@ def main(
                         token_idx += 1
                         tokens += [token]
 
-                    all_labels.merge(get_all_labels_with_cardinalities(tokens))  # for debug and analysis
-                    tokens = pick_tags(tokens)
+                    # all_labels.merge(get_all_labels_with_cardinalities(tokens))  # for debug and analysis
+                    tokens = pick_tags_2(tokens)
+                    tokens = flatten_token_attrib_dicts(tokens)
+
                     if use_label_map:
                         tokens = map_labels(tokens, NER_pwr_to_spacy)
                     tokens = convert_to_biluo(tokens)
diff --git a/data/kpwr-1.1.dvc b/data/kpwr-1.1.dvc
index 275217c..69e04b8 100644
--- a/data/kpwr-1.1.dvc
+++ b/data/kpwr-1.1.dvc
@@ -1,7 +1,8 @@
-md5: 58cbc0bd05749d04e4b6a5e4c9d78c01
+md5: 86d998d87357a866a9993c1c0458b169
 outs:
 - cache: true
   md5: d84971d4b907e5efc5d9320de6691027.dir
   metric: false
   path: kpwr-1.1
+  persist: false
 wdir: .

From 448cc3384cdf9bcd8b8f7182f4e06fa48e7faa72 Mon Sep 17 00:00:00 2001
From: Ksiazek <piotr.ksiazek96@gmail.com>
Date: Tue, 21 May 2019 20:13:25 +0200
Subject: [PATCH 2/8] Fix bug in fix

---
 ConvertNER/convert_NER_wroc.py | 111 +++++++--------------------------
 1 file changed, 21 insertions(+), 90 deletions(-)

diff --git a/ConvertNER/convert_NER_wroc.py b/ConvertNER/convert_NER_wroc.py
index 3ba598b..92f3b92 100644
--- a/ConvertNER/convert_NER_wroc.py
+++ b/ConvertNER/convert_NER_wroc.py
@@ -34,10 +34,15 @@ def __init__(self, orth, attribs, id):
         self.id = id
 
     def is_NE(self):
-        return len(self.attribs) != 0
+        return self.get_NE() is not None
 
     def get_NE(self):
-        return self.attribs[0] if len(self.attribs) > 0 else ""
+        for attrib in self.attribs:
+            for k in attrib:
+                if attrib[k] != "0":
+                    return k
+
+        return None
 
     def get_cooccurences(self):
         res = setCounter
@@ -83,37 +88,11 @@ def get_all_labels_with_cardinalities(tokens):
 
 def map_labels(tokens, map):
     for tok in tokens:
-        tok.attribs = [map[attrib] for attrib in tok.attribs]
+        tok.attribs = [{map[k]:v} for attrib in tok.attribs for k,v in attrib.items()]
 
     return tokens
 
 
-def get_longest_sequences(tokens):
-    res = []
-    b = 0
-    e = 0
-    # type = None
-
-    last_set = set()
-    while e != len(tokens)-1:
-        current_token = tokens[e]
-
-        if last_set == emptyset:
-            last_set = set(current_token.attribs)
-            b = e
-        else:
-            new_set = set(current_token.attribs) & last_set
-            if new_set == emptyset:
-                label = list(last_set)[0]
-                res.append((b, e, label))
-                b = e
-
-            last_set = new_set
-        e += 1
-
-    return res
-
-
 def still_in_sequence(v1, v2):
     return any(v1e == v2e != "0" for v1e in v1 for v2e in v2)
 
@@ -125,7 +104,7 @@ def get_last_label(v):
     return None
 
 
-def get_longest_sequences_2(tokens):
+def get_longest_sequences(tokens):
     res = []
     b = 0
     e = 0
@@ -154,8 +133,8 @@ def get_longest_sequences_2(tokens):
 
 
 emptyset = set()
-def pick_tags_2(tokens):
-    longest_sequences = get_longest_sequences_2(tokens)
+def pick_tags(tokens):
+    longest_sequences = get_longest_sequences(tokens)
     res = []
     for b, e, label in longest_sequences:
         seq = tokens[b:e]
@@ -166,54 +145,6 @@ def pick_tags_2(tokens):
     return tokens
 
 
-def flatten_token_attrib_dicts(tokens):
-    for tok in tokens:
-        tok.attribs = [k for k in tok.attribs[0].keys()] if len(tok.attribs) > 0 is not None else []
-
-    return tokens
-
-
-def pick_tags(tokens):
-    # first and last separately
-    if len(tokens) == 0:
-        return tokens
-    if len(tokens) == 1:
-        if tokens[0].is_NE():
-            tokens[0].attribs = [tokens[0].attribs[0]]
-        return tokens
-
-    t0 = tokens[0]
-    if len(t0.attribs) > 1:
-        new_tag = get_common_tag(t0, tokens[1])
-        if new_tag is None:
-            t0.attribs = [t0.attribs[0]]
-        else:
-            t0.attribs = [new_tag]
-
-    for i in range(1, len(tokens) - 1):
-        if len(tokens[i].attribs) > 1:
-            new_tag = get_common_tag(tokens[i - 1], tokens[i])
-            if new_tag is None:
-                new_tag = get_common_tag(tokens[i], tokens[i + 1])
-                if new_tag is None:
-                    tokens[i].attribs = [tokens[i].attribs[0]]
-                else:
-                    tokens[i].attribs = [new_tag]
-            else:
-                tokens[i].attribs = [new_tag]
-
-    te = tokens[-1]
-    if len(te.attribs) > 1:
-        new_tag = get_common_tag(te, tokens[-2])
-        if new_tag is None:
-            te.attribs = [te.attribs[0]]
-        else:
-            te.attribs = [new_tag]
-
-    assert (all(len(t.attribs) <= 1 for t in [t0] + tokens + [te]))
-    return [t0] + tokens[1:-2] + [te]
-
-
 def convert_to_biluo(tokens):
     out = []
     in_ne = False
@@ -222,10 +153,10 @@ def convert_to_biluo(tokens):
             if token.is_NE():
                 if tokens[i + 1].is_NE() and token.get_NE() == tokens[i + 1].get_NE():
                     # inner NE
-                    out += [Token(token.orth, ["I-" + token.get_NE()], token.id)]
+                    out += [Token(token.orth, [{"I-" + token.get_NE(): '1'}], token.id)]
                 else:
                     # last NE
-                    out += [Token(token.orth, ["L-" + token.get_NE()], token.id)]
+                    out += [Token(token.orth, [{"L-" + token.get_NE(): '1'}], token.id)]
                     in_ne = False
             else:
                 # we shouldn't ever get here
@@ -236,25 +167,25 @@ def convert_to_biluo(tokens):
                 # new NE
                 if tokens[i + 1].is_NE() and token.get_NE() == tokens[i + 1].get_NE():
                     # beginning NE
-                    out += [Token(token.orth, ["B-" + token.get_NE()], token.id)]
+                    out += [Token(token.orth, [{"B-" + token.get_NE(): '1'}], token.id)]
                     in_ne = True
                 else:
                     # unit NE
-                    out += [Token(token.orth, ["U-" + token.get_NE()], token.id)]
+                    out += [Token(token.orth, [{"U-" + token.get_NE(): '1'}], token.id)]
                     in_ne = False
             else:
                 # outside of NE
-                out += [Token(token.orth, ["O"], token.id)]
+                out += [Token(token.orth, [{"O": '1'}], token.id)]
 
     # process last token
     token = tokens[-1]
     if in_ne:
-        out += [Token(token.orth, ["L-" + token.get_NE()], token.id)]
+        out += [Token(token.orth, [{"L-" + token.get_NE(): '1'}], token.id)]
     else:
         if token.is_NE():
-            out += [Token(token.orth, ["U-" + token.get_NE()], token.id)]
+            out += [Token(token.orth, [{"U-" + token.get_NE(): '1'}], token.id)]
         else:
-            out += [Token(token.orth, ["O"], token.id)]
+            out += [Token(token.orth, [{"O": '1'}], token.id)]
 
     return out
 
@@ -291,8 +222,8 @@ def main(
                         tokens += [token]
 
                     # all_labels.merge(get_all_labels_with_cardinalities(tokens))  # for debug and analysis
-                    tokens = pick_tags_2(tokens)
-                    tokens = flatten_token_attrib_dicts(tokens)
+                    tokens = pick_tags(tokens)
+                    # tokens = flatten_token_attrib_dicts(tokens)
 
                     if use_label_map:
                         tokens = map_labels(tokens, NER_pwr_to_spacy)

From 6cebb3a024826c47fe129ec65c8e866b94f9e951 Mon Sep 17 00:00:00 2001
From: Mateusz Olko <mateusz.olko@gmail.com>
Date: Wed, 22 May 2019 14:38:27 +0200
Subject: [PATCH 3/8] Remove redundant lines

---
 ConvertNER/convert_NER_wroc.py | 23 +++++++----------------
 1 file changed, 7 insertions(+), 16 deletions(-)

diff --git a/ConvertNER/convert_NER_wroc.py b/ConvertNER/convert_NER_wroc.py
index 92f3b92..8cc1795 100644
--- a/ConvertNER/convert_NER_wroc.py
+++ b/ConvertNER/convert_NER_wroc.py
@@ -44,9 +44,6 @@ def get_NE(self):
 
         return None
 
-    def get_cooccurences(self):
-        res = setCounter
-
     def __str__(self):
         return (self.orth + ":" + str(self.attribs))
 
@@ -55,8 +52,8 @@ def process_token(tok):
     attribs = []
     orth = tok.find("orth").text
     for ann in tok.iter("ann"):
-        if ann.attrib['chan'].endswith("nam"): # and ann.text != "0":
-            attribs += [{ann.attrib['chan']:ann.text}]
+        if ann.attrib['chan'].endswith("nam"):  # and ann.text != "0":
+            attribs += [{ann.attrib['chan']: ann.text}]
 
     return Token(orth, attribs, -1)
 
@@ -88,7 +85,7 @@ def get_all_labels_with_cardinalities(tokens):
 
 def map_labels(tokens, map):
     for tok in tokens:
-        tok.attribs = [{map[k]:v} for attrib in tok.attribs for k,v in attrib.items()]
+        tok.attribs = [{map[k]: v} for attrib in tok.attribs for k, v in attrib.items()]
 
     return tokens
 
@@ -111,7 +108,7 @@ def get_longest_sequences(tokens):
     attribs = [k for d in tokens[0].attribs for k in d]
     last_set = None
 
-    while e != len(tokens)-1:
+    while e != len(tokens) - 1:
         current_token = tokens[e]
 
         if last_set == None:
@@ -121,7 +118,7 @@ def get_longest_sequences(tokens):
             new_set = [v for d in current_token.attribs for k, v in d.items()]
             if not still_in_sequence(last_set, new_set):
                 label_id = get_last_label(last_set)
-                if(label_id != None):
+                if (label_id != None):
                     label = attribs[label_id]
                     res.append((b, e, label))
                 b = e
@@ -133,14 +130,14 @@ def get_longest_sequences(tokens):
 
 
 emptyset = set()
+
+
 def pick_tags(tokens):
     longest_sequences = get_longest_sequences(tokens)
-    res = []
     for b, e, label in longest_sequences:
         seq = tokens[b:e]
         for tok in seq:
             tok.attribs = [{label: '1'}]
-        # res += seq
         tokens[b:e] = seq
     return tokens
 
@@ -197,12 +194,6 @@ def main(
         use_label_map,
         output_path,
 ):
-    if use_label_map:
-        # classes = set(NER_pwr_to_spacy.values())
-        # output = f'NER_wroc_{len(classes)}.json'
-        # this would be a cool feature but I'm not sure if it's good for automatic pipelines
-        output = 'NER_wroc_spacy_labels.json'
-    all_labels = setCounter()
     corpus = []
     doc_idx = 0
     for subfolder in get_subdirs(os.path.join(path_prefix, corpus_path)):

From be41f7bdbd52d3795cfbe682c57501059a284bf2 Mon Sep 17 00:00:00 2001
From: Ksiazek <piotr.ksiazek96@gmail.com>
Date: Thu, 30 May 2019 12:38:10 +0200
Subject: [PATCH 4/8] Add label set in squashing algorithm

---
 ConvertNER/convert_NER_wroc.py | 124 +++++++++++++++++++++------------
 NER_wroc-19.json.dvc           |  12 ++--
 data/NER/.gitignore            |   3 +-
 data/kpwr-1.1.dvc              |   4 +-
 data/training/NER/.gitignore   |   5 ++
 ner-wroc-19-train.json.dvc     |  10 +--
 ner_wroc-19_word2vec.dvc       |  10 +--
 7 files changed, 104 insertions(+), 64 deletions(-)

diff --git a/ConvertNER/convert_NER_wroc.py b/ConvertNER/convert_NER_wroc.py
index 92f3b92..41b6e1c 100644
--- a/ConvertNER/convert_NER_wroc.py
+++ b/ConvertNER/convert_NER_wroc.py
@@ -94,7 +94,7 @@ def map_labels(tokens, map):
 
 
 def still_in_sequence(v1, v2):
-    return any(v1e == v2e != "0" for v1e in v1 for v2e in v2)
+    return any(v1e == v2e != "0" for v1e, v2e in zip(v1,v2))
 
 
 def get_last_label(v):
@@ -104,27 +104,48 @@ def get_last_label(v):
     return None
 
 
+def get_label_set(v):
+    """Return the set of positions in v whose entry differs from "0".
+
+    v is walked with enumerate(); the result is empty when every entry
+    equals "0" (or when v itself is empty).
+    """
+    return {position for position, flag in enumerate(v) if flag != "0"}
+
+
+import random
+def get_any_label(v):
+    """Return a randomly chosen member of the label-index set v, or None if v is empty."""
+    # random.sample() no longer accepts a set (TypeError on Python >= 3.11): pick from a sorted list.
+    return random.choice(sorted(v)) if v else None
+
+def emptyset():
+    return set()  # a new, empty set on every call; nothing is shared between callers
+
 def get_longest_sequences(tokens):
     res = []
     b = 0
     e = 0
     attribs = [k for d in tokens[0].attribs for k in d]
     last_set = None
-
+    label_set = emptyset()
     while e != len(tokens)-1:
         current_token = tokens[e]
 
-        if last_set == None:
+        if last_set == None or label_set == emptyset():
             last_set = [v for d in current_token.attribs for k, v in d.items()]
+            label_set = get_label_set(last_set)
             b = e
         else:
             new_set = [v for d in current_token.attribs for k, v in d.items()]
+            label_set = label_set.intersection(get_label_set(new_set))
             if not still_in_sequence(last_set, new_set):
-                label_id = get_last_label(last_set)
+                label_id = get_any_label(label_set)
                 if(label_id != None):
                     label = attribs[label_id]
                     res.append((b, e, label))
                 b = e
+                label_set = emptyset()
 
             last_set = new_set
         e += 1
@@ -132,7 +153,7 @@ def get_longest_sequences(tokens):
     return res
 
 
-emptyset = set()
+# emptyset = set()
 def pick_tags(tokens):
     longest_sequences = get_longest_sequences(tokens)
     res = []
@@ -190,6 +211,18 @@ def convert_to_biluo(tokens):
     return out
 
 
+def get_file_paths(index_path):
+    """Return the file paths listed in the index file at index_path.
+
+    Each line holds one path. The trailing newline is dropped and blank
+    lines are skipped, so a stray empty line (e.g. at the end of the index)
+    no longer yields an '' entry for the caller to trip over.
+    """
+    with open(index_path) as index_file:
+        entries = (line.replace('\n', '') for line in index_file)
+        return [entry for entry in entries if entry]
+
+
 @click.command()
 @click.option("-m", "--use-label-map", type=bool, default=False)
 @click.argument("output_path", type=str)
@@ -205,46 +238,47 @@ def main(
     all_labels = setCounter()
     corpus = []
     doc_idx = 0
-    for subfolder in get_subdirs(os.path.join(path_prefix, corpus_path)):
-        for file in os.listdir(os.path.join(path_prefix, corpus_path, subfolder)):
-            if not file.endswith("rel.xml") and not file.endswith(".ini"):
-                sentences = []
-                token_idx = 0
-                tree = ET.parse(os.path.join(path_prefix, corpus_path, subfolder, file))
-                root = tree.getroot()
-                sents = root.iter("sentence")
-                for sent in sents:
-                    tokens = []
-                    for tok in sent.iter("tok"):
-                        token = process_token(tok)
-                        token.id = token_idx
-                        token_idx += 1
-                        tokens += [token]
-
-                    # all_labels.merge(get_all_labels_with_cardinalities(tokens))  # for debug and analysis
-                    tokens = pick_tags(tokens)
-                    # tokens = flatten_token_attrib_dicts(tokens)
-
-                    if use_label_map:
-                        tokens = map_labels(tokens, NER_pwr_to_spacy)
-                    tokens = convert_to_biluo(tokens)
-
-                    sent = {'tokens': [{
-                        'orth': t.orth,
-                        'id': t.id,
-                        'ner': t.get_NE()}
-                        for t in tokens
-                    ], 'brackets': []
-                    }
-
-                    sentences += [sent]
-
-                doc_json = {
-                    'id': doc_idx,
-                    'paragraphs': [{'sentences': sentences}]
-                }
-                corpus += [doc_json]
-                doc_idx += 1
+    file_paths = get_file_paths(os.path.join(path_prefix, corpus_path, 'index_names.txt'))
+    for file in file_paths:
+        file = os.path.join(path_prefix, corpus_path, file)
+        assert(not file.endswith("rel.xml") and not file.endswith(".ini"))
+        sentences = []
+        token_idx = 0
+        tree = ET.parse(file)
+        root = tree.getroot()
+        sents = root.iter("sentence")
+        for sent in sents:
+            tokens = []
+            for tok in sent.iter("tok"):
+                token = process_token(tok)
+                token.id = token_idx
+                token_idx += 1
+                tokens += [token]
+
+            # all_labels.merge(get_all_labels_with_cardinalities(tokens))  # for debug and analysis
+            tokens = pick_tags(tokens)
+            # tokens = flatten_token_attrib_dicts(tokens)
+
+            if use_label_map:
+                tokens = map_labels(tokens, NER_pwr_to_spacy)
+            tokens = convert_to_biluo(tokens)
+
+            sent = {'tokens': [{
+                'orth': t.orth,
+                'id': t.id,
+                'ner': t.get_NE()}
+                for t in tokens
+            ], 'brackets': []
+            }
+
+            sentences += [sent]
+
+        doc_json = {
+            'id': doc_idx,
+            'paragraphs': [{'sentences': sentences}]
+        }
+        corpus += [doc_json]
+        doc_idx += 1
 
     with open(os.path.expanduser(output_path), 'w+') as f:
         json.dump(corpus, f)
diff --git a/NER_wroc-19.json.dvc b/NER_wroc-19.json.dvc
index 2ca22da..b977cd6 100644
--- a/NER_wroc-19.json.dvc
+++ b/NER_wroc-19.json.dvc
@@ -1,15 +1,15 @@
-cmd: python ConvertNER/convert_NER_wroc.py -m true data/NER/NER_wroc-19.json 
+cmd: python ConvertNER/convert_NER_wroc.py -m true data/NER/NER_wroc-19.json
 deps:
-- md5: d84971d4b907e5efc5d9320de6691027.dir
+- md5: 94b53a67af0d7202fbd760d8ca1e0998.dir
   path: data/kpwr-1.1
-- md5: c8aa684e59762c66aeba79e2727c103f
+- md5: e29fb8a7101c096712d632cb117131c6
   path: ConvertNER/convert_NER_wroc.py
-- md5: eee1569106fcf22473ee5a39f49f57bd
+- md5: d7d343ce8b47f93f20e3870d91c6150e
   path: ConvertNER/NER_pwr_to_spacy.py
-md5: 8edf603b1572083aedf1a95147deec94
+md5: 6506f60af20d47077f8e24c91e4033d4
 outs:
 - cache: true
-  md5: ffd284e41307a7b0815d10623a0b4c99
+  md5: 25117be4c42e22d242c1e50d066fa35d
   metric: false
   path: data/NER/NER_wroc-19.json
   persist: false
diff --git a/data/NER/.gitignore b/data/NER/.gitignore
index b222295..a602b47 100644
--- a/data/NER/.gitignore
+++ b/data/NER/.gitignore
@@ -1,4 +1,5 @@
 /NER.json
 /NER_wroc.json
 /NER_wroc_19.json
-/NER_wroc_spacy_labels.json
\ No newline at end of file
+/NER_wroc_spacy_labels.json
+/NER_wroc-19.json
\ No newline at end of file
diff --git a/data/kpwr-1.1.dvc b/data/kpwr-1.1.dvc
index 69e04b8..7e17ff2 100644
--- a/data/kpwr-1.1.dvc
+++ b/data/kpwr-1.1.dvc
@@ -1,7 +1,7 @@
-md5: 86d998d87357a866a9993c1c0458b169
+md5: 776829dcd16aeae294a70cd147183d9c
 outs:
 - cache: true
-  md5: d84971d4b907e5efc5d9320de6691027.dir
+  md5: 94b53a67af0d7202fbd760d8ca1e0998.dir
   metric: false
   path: kpwr-1.1
   persist: false
diff --git a/data/training/NER/.gitignore b/data/training/NER/.gitignore
index 99cb319..da15df0 100644
--- a/data/training/NER/.gitignore
+++ b/data/training/NER/.gitignore
@@ -2,3 +2,8 @@
 /ner-train.json
 /ner-test.json
 /ner-validation.json
+
+
+/ner-wroc-19-train.json
+/ner-wroc-19-validation.json
+/ner-wroc-19-test.json
\ No newline at end of file
diff --git a/ner-wroc-19-train.json.dvc b/ner-wroc-19-train.json.dvc
index 1feac85..a36cb44 100644
--- a/ner-wroc-19-train.json.dvc
+++ b/ner-wroc-19-train.json.dvc
@@ -2,22 +2,22 @@ cmd: python training/split-data.py --input-file data/NER/NER_wroc-19.json --trai
   data/training/NER/ner-wroc-19-train.json --validation-output data/training/NER/ner-wroc-19-validation.json
   --test-output data/training/NER/ner-wroc-19-test.json
 deps:
-- md5: 1f40fe3247574c0debdad863a63ae4de
+- md5: 25117be4c42e22d242c1e50d066fa35d
   path: data/NER/NER_wroc-19.json
-md5: 39ad2d77ad532f5705ec894a80c2b344
+md5: 4b9cf46376a1706104898c24954aa8d6
 outs:
 - cache: true
-  md5: 2a8a96bd480cc7908e137d18ba1c06de
+  md5: a821c9f3c70d36c977673f7b06914c16
   metric: false
   path: data/training/NER/ner-wroc-19-train.json
   persist: false
 - cache: true
-  md5: 3096af2e4e0434b2a869586e5b08954b
+  md5: 9ed18e190b97651637df77b1d541642c
   metric: false
   path: data/training/NER/ner-wroc-19-validation.json
   persist: false
 - cache: true
-  md5: fef72e0918b1e197d0d2e4d891de42f7
+  md5: add29d611966be1070ebcb1cd9fc0aa8
   metric: false
   path: data/training/NER/ner-wroc-19-test.json
   persist: false
diff --git a/ner_wroc-19_word2vec.dvc b/ner_wroc-19_word2vec.dvc
index b463f25..9d36393 100644
--- a/ner_wroc-19_word2vec.dvc
+++ b/ner_wroc-19_word2vec.dvc
@@ -2,16 +2,16 @@ cmd: python -m spacy train pl models/ner_wroc-19_word2vec data/training/NER/ner-
   data/training/NER/ner-wroc-19-validation.json --vectors models/blank_NKJP_word2vec
   -p ner -g 0 -n 20 -e 2
 deps:
-- md5: 61eda27883b647a6c0be5725d3eb3ccb.dir
+- md5: b1be412efe7f8bcbf97cf43fc2c80ef0.dir
   path: models/blank_NKJP_word2vec
-- md5: 2a8a96bd480cc7908e137d18ba1c06de
+- md5: a821c9f3c70d36c977673f7b06914c16
   path: data/training/NER/ner-wroc-19-train.json
-- md5: 3096af2e4e0434b2a869586e5b08954b
+- md5: 9ed18e190b97651637df77b1d541642c
   path: data/training/NER/ner-wroc-19-validation.json
-md5: 28f3c4a56cacccda75862abfe01e121d
+md5: 8cb208a1300c8020a0c038994d46544b
 outs:
 - cache: true
-  md5: 3af0b7f53c402e4e9a7d36045b8ab8b6.dir
+  md5: 37074c5206ed8912bbdf746d04d34b4c.dir
   metric: false
   path: models/ner_wroc-19_word2vec
   persist: false

From 884aedc027f5f66be76575b8644c606a4ff4d209 Mon Sep 17 00:00:00 2001
From: Ksiazek <piotr.ksiazek96@gmail.com>
Date: Wed, 12 Jun 2019 16:45:20 +0200
Subject: [PATCH 5/8] Add small label set conversion map

---
 ConvertNER/NER_pwr_to_wiki.py | 64 +++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 ConvertNER/NER_pwr_to_wiki.py

diff --git a/ConvertNER/NER_pwr_to_wiki.py b/ConvertNER/NER_pwr_to_wiki.py
new file mode 100644
index 0000000..0e5e304
--- /dev/null
+++ b/ConvertNER/NER_pwr_to_wiki.py
@@ -0,0 +1,64 @@
+# Coarse-grained label map: each key is a label name ending in "nam" and each
+# value is one of the four classes PER, ORG, LOC or MISC.
+# Provenance: decompiled by uncompyle6 3.3.1 from Python 3.7 bytecode whose
+# embedded file name was .../ConvertNER/NER_pwr_to_spacy.py (1685-byte source).
+# NOTE(review): the dict keeps the name NER_pwr_to_spacy although this file is
+# NER_pwr_to_wiki.py - presumably so it can replace the other map; confirm.
+NER_pwr_to_spacy = {'person_nam':'PER',  'institution_nam':'ORG',
+ 'city_nam':'LOC',
+ 'person_last_nam':'PER',
+ 'person_first_nam':'PER',
+ 'document_nam':'MISC',
+ 'event_nam':'MISC',
+ 'organization_nam':'ORG',
+ 'country_nam':'LOC',
+ 'title_nam':'MISC',
+ 'band_nam':'ORG',
+ 'periodic_nam':'MISC',
+ 'company_nam':'ORG',
+ 'facility_nam':'ORG',
+ 'brand_nam':'ORG',
+ 'political_party_nam':'ORG',
+ 'road_nam':'LOC',
+ 'admin1_nam':'LOC',
+ 'person_add_nam':'PER',
+ 'software_nam':'MISC',
+ 'nation_nam':'MISC',
+ 'tech_nam':'MISC',
+ 'nam':'MISC',
+ 'treaty_nam':'MISC',
+ 'web_nam':'MISC',
+ 'admin2_nam':'LOC',
+ 'award_nam':'MISC',
+ 'continent_nam':'LOC',
+ 'astronomical_nam':'LOC',
+ 'media_nam':'ORG',
+ 'river_nam':'LOC',
+ 'currency_nam':'MISC',
+ 'toponym_nam':'LOC',
+ 'mountain_nam':'LOC',
+ 'historical_region_nam':'LOC',
+ 'district_nam':'LOC',
+ 'country_region_nam':'LOC',
+ 'subdivision_nam':'ORG',
+ 'admin3_nam':'LOC',
+ 'region_nam':'LOC',
+ 'square_nam':'LOC',
+ 'park_nam':'LOC',
+ 'island_nam':'LOC',
+ 'system_nam':'MISC',
+ 'www_nam':'MISC',
+ 'person_group_nam':'MISC',
+ 'license_nam':'MISC',
+ 'lake_nam':'LOC',
+ 'animal_nam':'MISC',
+ 'sea_nam':'LOC',
+ 'person_adj_nam':'PER',
+ 'bay_nam':'LOC',
+ 'peninsula_nam':'LOC',
+ 'conurbation_nam':'LOC',
+ 'vehicle_nam':'MISC',
+ 'organization_sub_nam':'ORG',
+ 'ocean_nam':'LOC',
+ 'cape_nam':'LOC'}
+# okay decompiling NER_pwr_to_spacy.cpython-37.pyc

From dc86cd8dba1f715d68f1e4cfb95d2e7c9ccabbf9 Mon Sep 17 00:00:00 2001
From: Ksiazek <piotr.ksiazek96@gmail.com>
Date: Tue, 25 Jun 2019 23:58:40 +0200
Subject: [PATCH 6/8] is_NE change

---
 ConvertNER/convert_NER_wroc.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ConvertNER/convert_NER_wroc.py b/ConvertNER/convert_NER_wroc.py
index f155b27..9e6df87 100644
--- a/ConvertNER/convert_NER_wroc.py
+++ b/ConvertNER/convert_NER_wroc.py
@@ -34,7 +34,7 @@ def __init__(self, orth, attribs, id):
         self.id = id
 
     def is_NE(self):
-        return self.get_NE() is not None
+        return self.get_NE() is not None and self.get_NE() != "O"
 
     def get_NE(self):
         for attrib in self.attribs:

From d48eb5e2ec237541c0ae4a2f1df430d226935b31 Mon Sep 17 00:00:00 2001
From: Ksiazek <piotr.ksiazek96@gmail.com>
Date: Wed, 26 Jun 2019 00:09:27 +0200
Subject: [PATCH 7/8] Add missing (?) dvc file

---
 NER_wroc-19.json.dvc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/NER_wroc-19.json.dvc b/NER_wroc-19.json.dvc
index b977cd6..fcbf993 100644
--- a/NER_wroc-19.json.dvc
+++ b/NER_wroc-19.json.dvc
@@ -1,12 +1,12 @@
 cmd: python ConvertNER/convert_NER_wroc.py -m true data/NER/NER_wroc-19.json
 deps:
-- md5: 94b53a67af0d7202fbd760d8ca1e0998.dir
+- md5: edb877fcf74af64289c0c32299288927.dir
   path: data/kpwr-1.1
-- md5: e29fb8a7101c096712d632cb117131c6
+- md5: 0c7d0ba89998f4c6c7cf84a50d2a6654
   path: ConvertNER/convert_NER_wroc.py
 - md5: d7d343ce8b47f93f20e3870d91c6150e
   path: ConvertNER/NER_pwr_to_spacy.py
-md5: 6506f60af20d47077f8e24c91e4033d4
+md5: 9618dffa84d6309d470a77da9e8de843
 outs:
 - cache: true
   md5: 25117be4c42e22d242c1e50d066fa35d

From 69c6f843a22dcc59a9d5f5d2c9bdcc1a1f5147a2 Mon Sep 17 00:00:00 2001
From: Krzysztof Kowalczyk <k.kowaalczyk@gmail.com>
Date: Mon, 8 Jul 2019 18:26:40 +0000
Subject: [PATCH 8/8] Updated NER with 60.2 F-score, 1.0.0 models release

---
 NER_wroc-19.json.dvc         |  1 +
 NER_wroc.json.dvc            | 11 ++++++-----
 data/kpwr-1.1.dvc            |  4 ++--
 data/training/NER/.gitignore |  5 ++++-
 deployment/deploy.sh         |  2 +-
 models/.gitignore            |  4 +++-
 ner-wroc-train.json.dvc      | 10 +++++-----
 ner_wroc-19_fasttext.dvc     | 18 ++++++++++++++++++
 ner_wroc_fasttext.dvc        | 18 ++++++++++++++++++
 9 files changed, 58 insertions(+), 15 deletions(-)
 create mode 100644 ner_wroc-19_fasttext.dvc
 create mode 100644 ner_wroc_fasttext.dvc

diff --git a/NER_wroc-19.json.dvc b/NER_wroc-19.json.dvc
index fcbf993..5749899 100644
--- a/NER_wroc-19.json.dvc
+++ b/NER_wroc-19.json.dvc
@@ -6,6 +6,7 @@ deps:
   path: ConvertNER/convert_NER_wroc.py
 - md5: d7d343ce8b47f93f20e3870d91c6150e
   path: ConvertNER/NER_pwr_to_spacy.py
+locked: true
 md5: 9618dffa84d6309d470a77da9e8de843
 outs:
 - cache: true
diff --git a/NER_wroc.json.dvc b/NER_wroc.json.dvc
index b5dfb2d..835e9b1 100644
--- a/NER_wroc.json.dvc
+++ b/NER_wroc.json.dvc
@@ -1,15 +1,16 @@
 cmd: python ConvertNER/convert_NER_wroc.py -m false data/NER/NER_wroc.json
 deps:
-- md5: d84971d4b907e5efc5d9320de6691027.dir
+- md5: 96d1abc9f866c7f713a5d655cacb453a.dir
   path: data/kpwr-1.1
-- md5: c8aa684e59762c66aeba79e2727c103f
+- md5: 0c7d0ba89998f4c6c7cf84a50d2a6654
   path: ConvertNER/convert_NER_wroc.py
-- md5: eee1569106fcf22473ee5a39f49f57bd
+- md5: d7d343ce8b47f93f20e3870d91c6150e
   path: ConvertNER/NER_pwr_to_spacy.py
-md5: 8edf603b1572083aedf1a95147deec94
+locked: true
+md5: 9f27d06e80815d0cd8d7a5489f47dbce
 outs:
 - cache: true
-  md5: ffd284e41307a7b0815d10623a0b4c99
+  md5: ca5e2a82931ad7dced2b2f838761ea3b
   metric: false
   path: data/NER/NER_wroc.json
   persist: false
diff --git a/data/kpwr-1.1.dvc b/data/kpwr-1.1.dvc
index 7e17ff2..4e30a4d 100644
--- a/data/kpwr-1.1.dvc
+++ b/data/kpwr-1.1.dvc
@@ -1,7 +1,7 @@
-md5: 776829dcd16aeae294a70cd147183d9c
+md5: 14d0df72a86e1141a7938134eb3009d1
 outs:
 - cache: true
-  md5: 94b53a67af0d7202fbd760d8ca1e0998.dir
+  md5: 96d1abc9f866c7f713a5d655cacb453a.dir
   metric: false
   path: kpwr-1.1
   persist: false
diff --git a/data/training/NER/.gitignore b/data/training/NER/.gitignore
index da15df0..6b35a4d 100644
--- a/data/training/NER/.gitignore
+++ b/data/training/NER/.gitignore
@@ -6,4 +6,7 @@
 
 /ner-wroc-19-train.json
 /ner-wroc-19-validation.json
-/ner-wroc-19-test.json
\ No newline at end of file
+/ner-wroc-19-test.json
+/ner-wroc-train.json
+/ner-wroc-validation.json
+/ner-wroc-test.json
\ No newline at end of file
diff --git a/deployment/deploy.sh b/deployment/deploy.sh
index 44c813d..55a7e7a 100755
--- a/deployment/deploy.sh
+++ b/deployment/deploy.sh
@@ -6,7 +6,7 @@ echo ""
 # --- SETTINGS ---
 
 PACKAGE_DIR="release"  # same as passed to combine or spacy.cli.package
-MODEL_NAME="pl_model-0.2.0"  # same as inputted in spacy.cli.package
+MODEL_NAME="pl_model-1.0.0"  # same as inputted in spacy.cli.package
 
 BUCKET_NAME="gs://spacy-pl-public-models"
 BUCKET_PUBLIC_URL="https://storage.googleapis.com/spacy-pl-public-models"
diff --git a/models/.gitignore b/models/.gitignore
index 5d0257c..cfa234a 100644
--- a/models/.gitignore
+++ b/models/.gitignore
@@ -11,4 +11,6 @@
 /pos_NKJP-justpos_fasttext
 /trees_LFG_fasttext
 /ner_nkjp_fasttext
-/ner_wroc-19_word2vec
\ No newline at end of file
+/ner_wroc-19_word2vec
+/ner_wroc_fasttext
+/ner_wroc-19_fasttext
\ No newline at end of file
diff --git a/ner-wroc-train.json.dvc b/ner-wroc-train.json.dvc
index 454548f..2cc4129 100644
--- a/ner-wroc-train.json.dvc
+++ b/ner-wroc-train.json.dvc
@@ -2,22 +2,22 @@ cmd: python training/split-data.py --input-file data/NER/NER_wroc.json --train-o
   data/training/NER/ner-wroc-train.json --validation-output data/training/NER/ner-wroc-validation.json
   --test-output data/training/NER/ner-wroc-test.json
 deps:
-- md5: ffd284e41307a7b0815d10623a0b4c99
+- md5: ca5e2a82931ad7dced2b2f838761ea3b
   path: data/NER/NER_wroc.json
-md5: e8749979d1f6bff59eb4b724a1e1d0d5
+md5: c00b5ca9040325a0201f017df51b5332
 outs:
 - cache: true
-  md5: 09e76ef24c694cdc9b5ce263cc6deca4
+  md5: 583ae4d3d540b935495158f436f848ef
   metric: false
   path: data/training/NER/ner-wroc-train.json
   persist: false
 - cache: true
-  md5: ad8ce2a4657262084b8d81e9ed07ac1d
+  md5: 51eaf4ea624b692fb1b0daec14736224
   metric: false
   path: data/training/NER/ner-wroc-validation.json
   persist: false
 - cache: true
-  md5: c02d5d314cf98f8327723edba692b8a4
+  md5: c781b90cca2a14e35d20c9d597b49b5d
   metric: false
   path: data/training/NER/ner-wroc-test.json
   persist: false
diff --git a/ner_wroc-19_fasttext.dvc b/ner_wroc-19_fasttext.dvc
new file mode 100644
index 0000000..49bf3c8
--- /dev/null
+++ b/ner_wroc-19_fasttext.dvc
@@ -0,0 +1,18 @@
+cmd: python -m spacy train pl models/ner_wroc-19_fasttext data/training/NER/ner-wroc-19-train.json
+  data/training/NER/ner-wroc-19-validation.json --vectors models/blank_fasttext -p
+  ner -g 0 -n 80 -e 8
+deps:
+- md5: b1be412efe7f8bcbf97cf43fc2c80ef0.dir
+  path: models/blank_NKJP_word2vec
+- md5: a821c9f3c70d36c977673f7b06914c16
+  path: data/training/NER/ner-wroc-19-train.json
+- md5: 9ed18e190b97651637df77b1d541642c
+  path: data/training/NER/ner-wroc-19-validation.json
+md5: 0947bbffdb3707555a0da9011f9a2f16
+outs:
+- cache: true
+  md5: 099eb0933b6d2131641ff457013d43ba.dir
+  metric: false
+  path: models/ner_wroc-19_fasttext
+  persist: false
+wdir: .
diff --git a/ner_wroc_fasttext.dvc b/ner_wroc_fasttext.dvc
new file mode 100644
index 0000000..254f3b4
--- /dev/null
+++ b/ner_wroc_fasttext.dvc
@@ -0,0 +1,18 @@
+cmd: python -m spacy train pl models/ner_wroc_fasttext data/training/NER/ner-wroc-train.json
+  data/training/NER/ner-wroc-validation.json --vectors models/blank_fasttext -p ner
+  -g 0 -n 80 -e 8
+deps:
+- md5: fe3ebcb89593a8e1026e7668ffe6de23.dir
+  path: models/blank_fasttext
+- md5: 583ae4d3d540b935495158f436f848ef
+  path: data/training/NER/ner-wroc-train.json
+- md5: 51eaf4ea624b692fb1b0daec14736224
+  path: data/training/NER/ner-wroc-validation.json
+md5: f5c883da955b6daf11cb2bcdaf5492e0
+outs:
+- cache: true
+  md5: 55a741e0221cc1041446eb2bfcf79908.dir
+  metric: false
+  path: models/ner_wroc_fasttext
+  persist: false
+wdir: .