From e0b16973051500ecc75531dbf8c9434e78f6a005 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Ksi=C4=85=C5=BCek?= Date: Wed, 6 Mar 2019 19:59:21 +0100 Subject: [PATCH 01/14] Working-ish solution @TODO convert tags (use the map from tagmap) @TODO refactor @TODO head? @TODO dep? Change the script to work from repository's root folder Working-ish solution @TODO convert tags (use the map from tagmap) @TODO refactor @TODO head? @TODO dep? Change the script to work from repository's root folder Fix output format Working-ish solution @TODO convert tags (use the map from tagmap) @TODO refactor @TODO head? @TODO dep? Change the script to work from repository's root folder Working-ish solution @TODO convert tags (use the map from tagmap) @TODO refactor @TODO head? @TODO dep? Change the script to work from repository's root folder --- ConvertNER/convert_NER.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/ConvertNER/convert_NER.py b/ConvertNER/convert_NER.py index b6eefd3..0948085 100644 --- a/ConvertNER/convert_NER.py +++ b/ConvertNER/convert_NER.py @@ -1,5 +1,4 @@ import xml.etree.ElementTree as ET -from spacy.lang.pl import Polish from spacy.gold import biluo_tags_from_offsets import spacy import json @@ -121,7 +120,7 @@ def required_files_exist(dir): return True -nlp = Polish() +nlp = spacy.load('en_core_web_sm') doc_id = 0 corpus = [] @@ -139,8 +138,8 @@ def required_files_exist(dir): if not os.path.isdir((os.path.join(path_prefix,corpus_path,current_folder))): continue - # we skip the docs that don't have the required annotations (certain .xml files) if not required_files_exist(current_folder): + # doc_id +=1 ? continue tree_morphosyntax = ET.parse(os.path.join(path_prefix,corpus_path,current_folder,morphosyntax_xml)) @@ -199,7 +198,7 @@ def required_files_exist(dir): biluo_tags = biluo_tags_from_offsets(doc, entities) sentences = set_biluo_tags(sentences, biluo_tags) - paragraph_json['sentences'] = [{'tokens': tok, 'brackets': []} for tok in sentences] + paragraph_json['sentences'] = sentences paragraph_json['raw'] = pg_text paragraphs += [paragraph_json] @@ -209,9 +208,5 @@ def required_files_exist(dir): doc_id += 1 corpus += [doc_json] -out_path = os.path.expanduser(os.path.join(path_prefix, output_path)) -if not os.path.exists(out_path): - os.makedirs(out_path) - -with open(os.path.join(out_path, output), 'w+') as f: +with open(os.path.expanduser(os.path.join(path_prefix, output_path, output)), 'w+') as f: json.dump(corpus, f) From 84914b7d2d094d1b14bdce3434e46b329196fc0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Ksi=C4=85=C5=BCek?= Date: Wed, 27 Mar 2019 13:20:57 +0100 Subject: [PATCH 02/14] Fix output format --- ConvertNER/convert_NER.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ConvertNER/convert_NER.py b/ConvertNER/convert_NER.py index 0948085..3c12602 100644 --- a/ConvertNER/convert_NER.py +++ b/ConvertNER/convert_NER.py @@ -198,7 +198,7 @@ def required_files_exist(dir): biluo_tags = biluo_tags_from_offsets(doc, entities) sentences = set_biluo_tags(sentences, biluo_tags) - paragraph_json['sentences'] = sentences + paragraph_json['sentences'] = [{'tokens': tok, 'brackets': []} for tok in sentences] paragraph_json['raw'] = pg_text paragraphs += [paragraph_json] From 326698cb8817ab7f0f6ea644cf4033e9d27594a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Ksi=C4=85=C5=BCek?= Date: Thu, 28 Mar 2019 01:40:22 +0100 Subject: [PATCH 03/14] [WIP] Some really ugly stuff --- ConvertNER/convert_NER_wroc.py | 387 
+++++++++++++++++++++++++++++++++ 1 file changed, 387 insertions(+) create mode 100644 ConvertNER/convert_NER_wroc.py diff --git a/ConvertNER/convert_NER_wroc.py b/ConvertNER/convert_NER_wroc.py new file mode 100644 index 0000000..ca25117 --- /dev/null +++ b/ConvertNER/convert_NER_wroc.py @@ -0,0 +1,387 @@ +import xml.etree.ElementTree as ET +from spacy.gold import biluo_tags_from_offsets +# import spacy +import json +import os + +path_prefix = '../' +corpus_path = 'data/kpwr-1.1/' +output_path = 'data/NER/' +output = 'NER_wroc.json' + +def get_subdirs(dir): + return [name for name in os.listdir(dir) if os.path.isdir(os.path.join(dir,name))] + +morphosyntax_xml = 'ann_morphosyntax.xml' +groups_xml = 'ann_groups.xml' +named_xml = 'ann_named.xml' +senses_xml = 'ann_senses.xml' +header_xml = 'header.xml' +segmentation_xml = 'ann_segmentation.xml' +words_xml = 'ann_words.xml' +text_xml = 'text.xml' + +def print_children_recursively(n, i=0): + if i > 10: + return + for c in n: + print(' '*(3*i), c.attrib, c.tag) + print_children_recursively(c, i+1) + +def get(node, k, v): + if node is None: + return + for c in node: + if c.attrib.get(k)==v: + return c + +def get_morph(seg): + for c in seg: + if c.attrib['type']=='morph': + return c + +def get_orth(seg): + morph = get(seg, 'type', 'morph') + orth = get(morph, 'name', 'orth') + return orth[0].text if orth is not None else None + +def get_named(seg): + named = get(seg, 'type', 'named') + orth = get(named, 'name', 'orth') + return orth[0].text if orth is not None else None + +def get_named_type(seg): + named = get(seg, 'type', 'named') + type = get(named, 'name', 'type') + return type[0].attrib['value'] + +def get_ctag(seg): + morph = get(seg, 'type', 'morph') + interps = get(morph, 'name', 'interps') + lex = get(interps, 'type', 'lex') + ctag = get(lex, 'name', 'ctag') + return ctag[0].attrib['value'] + +def get_corresp_morph(sent): + return sent.attrib['corresp'].split('#')[1] + +def get_entity_maps(root): + result = {} + for sent in root.iter('{http://www.tei-c.org/ns/1.0}s'): + tmp = [] + for seg in sent: + text = get_named(seg) + type = get_named_type(seg) + tmp += [(text, type)] + + result[get_corresp_morph(sent)] = dict(tmp) + + return result + +def get_segmentation_text_maps(root): + res = {} + for paragraph in root.iter('{http://www.tei-c.org/ns/1.0}p'): + key = paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + value = paragraph.attrib['corresp'].split('#')[1] + res[key]=value + + return res + +def get_text_maps(root): + result = {} + for paragraph in root.iter('{http://www.tei-c.org/ns/1.0}div'): + key = paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + text = '' + for child in paragraph: + text += child.text + + result[key]=text + + return result + +def get_sent_id(sent): + return sent.attrib['{http://www.w3.org/XML/1998/namespace}id'] + +def get_paragraph_text(paragraph, segm_text_map, text_maps): + paragraph_id = paragraph.attrib['corresp'].split('#')[1] + return text_maps[segm_text_map[paragraph_id]] + +def set_biluo_tags(sentences, tags): + i = 0 + for sent in sentences: + for token in sent: + token['ner'] = tags[i] + i += 1 + + return sentences + +def required_files_exist(dir): + required_files = [segmentation_xml, text_xml, named_xml, morphosyntax_xml] + for file in required_files: + if not os.path.isfile(os.path.join(path_prefix,corpus_path,dir,file)): + return False + + return True + +# nlp = spacy.load('en_core_web_sm') +doc_id = 0 +corpus = [] + +NE_njkp_to_spacy = {'persName': 'PERSON', + 'placeName': 
'LOC', + 'orgName': 'ORG', + 'date': 'DATE', + 'time': 'TIME', + 'geogName': 'LOC'} + +class Token: + # def __init__(self, orth, attribs): + # self.orth = orth + # self.attribs = attribs + # self.id = None #this is fugly + + + def __init__(self, orth, attribs, id): + self.orth = orth + self.attribs = attribs + self.id = id + + def is_NE(self): + return len(self.attribs) != 0 + + def get_NE(self): + return self.attribs[0] if len(self.attribs) > 0 else "" + + def __str__(self): + return (self.orth + ":" + str(self.attribs)) + + +def process_token(tok): + attribs = [] + orth = tok.find("orth").text + for ann in tok.iter("ann"): + if ann.attrib['chan'].endswith("nam") and ann.text=="1": + attribs += [ann.attrib['chan']] + + return Token(orth, attribs, -1) + +def get_common_tag(t1, t2): + set1 = set(t1.attribs) + set2 = set(t2.attribs) + common = list(set1 & set2) + return common[0] if len(common) > 0 else None + +def get_all_labels(tokens): + labels = set() + for tok in tokens: + for attr in tok.attribs: + labels.add(attr) + + return labels + +def pick_tags(tokens): + # first and last separately + if len(tokens) == 0: + return tokens + if len(tokens) == 1: + if tokens[0].is_NE(): + tokens[0].attribs = [token[0].attribs[0]] + return tokens + + t0 = tokens[0] + if len(t0.attribs) > 1: + new_tag = get_common_tag(t0, tokens[1]) + if new_tag is None: + t0.attribs = [t0.attribs[0]] + else: + t0.attribs = [new_tag] + + for i in range(1, len(tokens)-1): + if len(tokens[i].attribs) > 1: + new_tag = get_common_tag(tokens[i-1], tokens[i]) + if new_tag is None: + new_tag = get_common_tag(tokens[i], tokens[i+1]) + if new_tag is None: + tokens[i].attribs = [tokens[i].attribs[0]] + else: + tokens[i].attribs = [new_tag] + else: + tokens[i].attribs = [new_tag] + + te = tokens[-1] + if len(te.attribs) > 1: + new_tag = get_common_tag(te, tokens[-2]) + if new_tag is None: + te.attribs = [te.attribs[0]] + else: + te.attribs = [new_tag] + + assert(all(len(t.attribs)<=1 for t in [t0] + tokens+ [te])) + return [t0] + tokens[1:-2] + [te] + +def convert_to_biluo(tokens): + out = [] + in_ne = False + for i, token in enumerate(tokens[:-1]): + if in_ne: + if token.is_NE(): + if tokens[i+1].is_NE() and token.get_NE() == tokens[i+1].get_NE(): + # inner NE + out += [Token(token.orth, ["I_"+token.get_NE()], token.id)] + else: + # last NE + out += [Token(token.orth, ["L_"+token.get_NE()], token.id)] + in_ne = False + else: + # we shouldn't ever get here + assert(False) + + else: + if token.is_NE(): + # new NE + if tokens[i+1].is_NE() and token.get_NE() == tokens[i+1].get_NE(): + # beginning NE + out += [Token(token.orth, ["B_"+token.get_NE()], token.id)] + in_ne = True + else: + # unit NE + out += [Token(token.orth, ["U_"+token.get_NE()], token.id)] + in_ne = False + else: + # outside of NE + out += [Token(token.orth, ["O"], token.id)] + + # process last token + token = tokens[-1] + if in_ne: + out += [Token(token.orth, ["L_" + token.get_NE()], token.id)] + else: + if token.is_NE(): + out += [Token(token.orth, ["U_" + token.get_NE()], token.id)] + else: + out += [Token(token.orth, ["O"], token.id)] + + return out + + + + + + +print(get_subdirs(os.path.join(path_prefix, corpus_path))) + +all_labels = set() +for subfolder in get_subdirs(os.path.join(path_prefix, corpus_path)): + for file in os.listdir(os.path.join(path_prefix, corpus_path, subfolder)): + doc_idx = 0 + if not file.endswith("rel.xml") and not file.endswith(".ini"): + token_idx = 0 + tree = ET.parse(os.path.join(path_prefix, corpus_path, subfolder, file)) + root = 
tree.getroot() + sents = root.iter("sentence") + for sent in sents: + tokens = [] + for tok in sent.iter("tok"): + token = process_token(tok) + token.id = token_idx + token_idx += 1 + # if token.is_NE(): print(token) + tokens += [token] + + all_labels |= get_all_labels(tokens) + tokens = pick_tags(tokens) + tokens = convert_to_biluo(tokens) + + sent = {'tokens': [{ + 'orth': t.orth, + 'id': t.id, + 'ner': t.get_NE()} # change to t.get_NE() + for t in tokens + ], 'brackets': [] + } + print(sent) + doc_idx +=1 + + break + +print(all_labels) + +# for f in os.listdir(os.path.join(path_prefix, corpus_path)): +# doc_json = {} +# current_folder = f +# +# if not os.path.isdir((os.path.join(path_prefix,corpus_path,current_folder))): +# continue +# +# if not required_files_exist(current_folder): +# # doc_id +=1 ? +# continue +# +# tree_morphosyntax = ET.parse(os.path.join(path_prefix,corpus_path,current_folder,morphosyntax_xml)) +# root_morphosyntax = tree_morphosyntax.getroot() +# +# tree_named = ET.parse(os.path.join(path_prefix,corpus_path,current_folder,named_xml)) +# root_named = tree_named.getroot() +# +# tree_text = ET.parse(os.path.join(path_prefix,corpus_path,current_folder,text_xml)) +# root_text = tree_text.getroot() +# +# tree_segmentation = ET.parse(os.path.join(path_prefix,corpus_path,current_folder,segmentation_xml)) +# root_segmentation = tree_segmentation.getroot() +# +# segmentation_text_map = get_segmentation_text_maps(root_segmentation) +# entity_maps = get_entity_maps(root_named) +# text_maps = get_text_maps(root_text) +# +# token_idx = 0 +# paragraphs = [] +# for paragraph in root_morphosyntax.iter('{http://www.tei-c.org/ns/1.0}p'): +# paragraph_json = {} +# pg_text = get_paragraph_text(paragraph, segmentation_text_map, text_maps) +# +# text = '' +# nes = [] +# sentences = [] +# for sentence in paragraph: +# sent_id = get_sent_id(sentence) +# sentence_entity_map = entity_maps[sent_id] +# sentence_json = [] +# for seg in sentence: +# token = {} +# ctag = get_ctag(seg) +# orth = get_orth(seg) +# ne = sentence_entity_map.get(orth) +# +# text += orth + ' ' +# +# if ne is not None: +# ne = NE_njkp_to_spacy[ne] +# nes += [(len(text)-1-len(orth), len(text)-1, ne)] +# +# token['ctag'] = ctag +# token['orth'] = orth +# token['head'] = 0 # @TODO +# token['dep'] = 'NA' # @TODO +# token['id'] = token_idx +# token['ner'] = ne +# token_idx += 1 +# sentence_json += [token] +# sentences += [sentence_json] +# +# doc = nlp(text) +# entities = nes +# biluo_tags = biluo_tags_from_offsets(doc, entities) +# +# sentences = set_biluo_tags(sentences, biluo_tags) +# paragraph_json['sentences'] = [{'tokens': tok, 'brackets': []} for tok in sentences] +# paragraph_json['raw'] = pg_text +# paragraphs += [paragraph_json] +# +# doc_json['id'] = doc_id +# doc_json['paragraphs'] = paragraphs +# +# doc_id += 1 +# corpus += [doc_json] +# +# with open(os.path.expanduser(os.path.join(path_prefix, output_path, output)), 'w+') as f: +# json.dump(corpus, f) \ No newline at end of file From 06e6e4b25ec044771dae64dcc53864acfda1f691 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Ksi=C4=85=C5=BCek?= Date: Sat, 30 Mar 2019 13:50:18 +0100 Subject: [PATCH 04/14] Save training data, needs a bug fix --- ConvertNER/convert_NER_wroc.py | 59 +++++++++++++++++++++++++--------- data/.gitignore | 3 +- data/kpwr-1.1.dvc | 7 ++++ 3 files changed, 53 insertions(+), 16 deletions(-) create mode 100644 data/kpwr-1.1.dvc diff --git a/ConvertNER/convert_NER_wroc.py b/ConvertNER/convert_NER_wroc.py index ca25117..2849d5a 100644 --- 
a/ConvertNER/convert_NER_wroc.py +++ b/ConvertNER/convert_NER_wroc.py @@ -4,7 +4,7 @@ import json import os -path_prefix = '../' +path_prefix = './' corpus_path = 'data/kpwr-1.1/' output_path = 'data/NER/' output = 'NER_wroc.json' @@ -185,7 +185,7 @@ def pick_tags(tokens): return tokens if len(tokens) == 1: if tokens[0].is_NE(): - tokens[0].attribs = [token[0].attribs[0]] + tokens[0].attribs = [tokens[0].attribs[0]] return tokens t0 = tokens[0] @@ -263,19 +263,40 @@ def convert_to_biluo(tokens): return out +def get_text(tokens): + raw = "" + for token in tokens: + raw += token.orth + " " + _punct = r'… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ · । ، ؛ ٪ . ! ?' + _quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‘‘ ’’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉' + _hyphens = '- – — -- --- —— ~' + _brackets_pref = ") ] }" + _brackets_post = "( [ {" + interp_pref = _punct.split(" ") + _quotes.split(" ") + _hyphens.split(" ") + _brackets_pref.split(" ") + interp_post = _brackets_post.split(" ") + raw = raw[:-1] + for char in interp_pref: + raw = raw.replace(" "+char, char) + for char in interp_post: + raw = raw.replace(char+" ", char) + + return raw -print(get_subdirs(os.path.join(path_prefix, corpus_path))) all_labels = set() +docs = [] +doc_idx = 0 for subfolder in get_subdirs(os.path.join(path_prefix, corpus_path)): for file in os.listdir(os.path.join(path_prefix, corpus_path, subfolder)): - doc_idx = 0 if not file.endswith("rel.xml") and not file.endswith(".ini"): + doc_json = {} + sentences = [] token_idx = 0 + raw = "" tree = ET.parse(os.path.join(path_prefix, corpus_path, subfolder, file)) root = tree.getroot() sents = root.iter("sentence") @@ -295,17 +316,27 @@ def convert_to_biluo(tokens): sent = {'tokens': [{ 'orth': t.orth, 'id': t.id, - 'ner': t.get_NE()} # change to t.get_NE() + 'ner': t.get_NE()} for t in tokens ], 'brackets': [] } - print(sent) - doc_idx +=1 - - break - -print(all_labels) - + # print(sent) + # print(get_text(tokens)) + + text = get_text(tokens) + sentences += [sent] + raw += "\n"+text + + doc_json = { + 'id': doc_idx, + 'paragraphs': [{'sentences': sentences}] + } + corpus += [doc_json] + doc_idx +=1 + +# print(corpus) +with open(os.path.expanduser(os.path.join(path_prefix, output_path, output)), 'w+') as f: + json.dump(corpus, f) # for f in os.listdir(os.path.join(path_prefix, corpus_path)): # doc_json = {} # current_folder = f @@ -382,6 +413,4 @@ def convert_to_biluo(tokens): # # doc_id += 1 # corpus += [doc_json] -# -# with open(os.path.expanduser(os.path.join(path_prefix, output_path, output)), 'w+') as f: -# json.dump(corpus, f) \ No newline at end of file +# \ No newline at end of file diff --git a/data/.gitignore b/data/.gitignore index 6cefda7..e74d18a 100644 --- a/data/.gitignore +++ b/data/.gitignore @@ -10,4 +10,5 @@ vocab.jsonl /vocab.jsonl /vectors_300.txt tagmap.py -NKJP-PodkorpusMilionowy-1.2 + +/kpwr-1.1 \ No newline at end of file diff --git a/data/kpwr-1.1.dvc b/data/kpwr-1.1.dvc new file mode 100644 index 0000000..275217c --- /dev/null +++ b/data/kpwr-1.1.dvc @@ -0,0 +1,7 @@ +md5: 58cbc0bd05749d04e4b6a5e4c9d78c01 +outs: +- cache: true + md5: d84971d4b907e5efc5d9320de6691027.dir + metric: false + path: kpwr-1.1 +wdir: . 
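The next patch switches the generated tags from underscore-joined names (I_, L_, ...) to hyphen-joined ones (I-, L-, ...). That is the BILUO spelling spaCy's own gold helpers produce, so downstream tooling agrees on the tag format. A minimal sketch of that format, reusing the spacy.gold.biluo_tags_from_offsets helper and the spacy.lang.pl.Polish pipeline that convert_NER.py already imports; the sentence and the (start, end, label) offsets below are made up for illustration:

    # Sketch of the hyphenated BILUO tags spaCy emits (spaCy 2.x API, as used in convert_NER.py).
    # The example sentence and character offsets are illustrative only.
    from spacy.lang.pl import Polish
    from spacy.gold import biluo_tags_from_offsets

    nlp = Polish()
    doc = nlp("Adam mieszka w Warszawie")
    tags = biluo_tags_from_offsets(doc, [(0, 4, "PERSON"), (15, 24, "LOC")])
    print(tags)  # expected: ['U-PERSON', 'O', 'O', 'U-LOC']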
From 525a283216cd81c6a2af1e1010c76b0b50b82acc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Ksi=C4=85=C5=BCek?= Date: Sat, 30 Mar 2019 15:17:21 +0100 Subject: [PATCH 05/14] Fix BILUO --- ConvertNER/convert_NER_wroc.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ConvertNER/convert_NER_wroc.py b/ConvertNER/convert_NER_wroc.py index 2849d5a..6c8fa3e 100644 --- a/ConvertNER/convert_NER_wroc.py +++ b/ConvertNER/convert_NER_wroc.py @@ -227,10 +227,10 @@ def convert_to_biluo(tokens): if token.is_NE(): if tokens[i+1].is_NE() and token.get_NE() == tokens[i+1].get_NE(): # inner NE - out += [Token(token.orth, ["I_"+token.get_NE()], token.id)] + out += [Token(token.orth, ["I-"+token.get_NE()], token.id)] else: # last NE - out += [Token(token.orth, ["L_"+token.get_NE()], token.id)] + out += [Token(token.orth, ["L-"+token.get_NE()], token.id)] in_ne = False else: # we shouldn't ever get here @@ -241,11 +241,11 @@ def convert_to_biluo(tokens): # new NE if tokens[i+1].is_NE() and token.get_NE() == tokens[i+1].get_NE(): # beginning NE - out += [Token(token.orth, ["B_"+token.get_NE()], token.id)] + out += [Token(token.orth, ["B-"+token.get_NE()], token.id)] in_ne = True else: # unit NE - out += [Token(token.orth, ["U_"+token.get_NE()], token.id)] + out += [Token(token.orth, ["U-"+token.get_NE()], token.id)] in_ne = False else: # outside of NE @@ -254,10 +254,10 @@ def convert_to_biluo(tokens): # process last token token = tokens[-1] if in_ne: - out += [Token(token.orth, ["L_" + token.get_NE()], token.id)] + out += [Token(token.orth, ["L-" + token.get_NE()], token.id)] else: if token.is_NE(): - out += [Token(token.orth, ["U_" + token.get_NE()], token.id)] + out += [Token(token.orth, ["U-" + token.get_NE()], token.id)] else: out += [Token(token.orth, ["O"], token.id)] From 2f06b47ee64ad9df1a95bac9d5649156870b35eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Ksi=C4=85=C5=BCek?= Date: Sat, 30 Mar 2019 15:51:02 +0100 Subject: [PATCH 06/14] Add analysis --- ConvertNER/convert_NER_wroc.py | 108 +++------ NER-wroc-analysis/get_analysis.py | 365 ++++++++++++++++++++++++++++++ 2 files changed, 393 insertions(+), 80 deletions(-) create mode 100644 NER-wroc-analysis/get_analysis.py diff --git a/ConvertNER/convert_NER_wroc.py b/ConvertNER/convert_NER_wroc.py index 6c8fa3e..ecd27d6 100644 --- a/ConvertNER/convert_NER_wroc.py +++ b/ConvertNER/convert_NER_wroc.py @@ -179,6 +179,28 @@ def get_all_labels(tokens): return labels +class setCounter: + def __init__(self): + self.contents = {} + + def count(self, k, times=1): + if k in self.contents: + self.contents[k] += times + else: + self.contents[k] = times + + def merge(self, other): + for k in other.contents: + self.count(k, other.contents[k]) + +def get_all_labels_with_cardinalities(tokens): + labels = setCounter() + for tok in tokens: + for attr in tok.attribs: + labels.count(attr) + + return labels + def pick_tags(tokens): # first and last separately if len(tokens) == 0: @@ -287,7 +309,7 @@ def get_text(tokens): -all_labels = set() +all_labels = setCounter() docs = [] doc_idx = 0 for subfolder in get_subdirs(os.path.join(path_prefix, corpus_path)): @@ -309,7 +331,8 @@ def get_text(tokens): # if token.is_NE(): print(token) tokens += [token] - all_labels |= get_all_labels(tokens) + # all_labels |= get_all_labels(tokens) + all_labels.merge(get_all_labels_with_cardinalities(tokens)) tokens = pick_tags(tokens) tokens = convert_to_biluo(tokens) @@ -334,83 +357,8 @@ def get_text(tokens): corpus += [doc_json] doc_idx +=1 -# print(corpus) 
with open(os.path.expanduser(os.path.join(path_prefix, output_path, output)), 'w+') as f: json.dump(corpus, f) -# for f in os.listdir(os.path.join(path_prefix, corpus_path)): -# doc_json = {} -# current_folder = f -# -# if not os.path.isdir((os.path.join(path_prefix,corpus_path,current_folder))): -# continue -# -# if not required_files_exist(current_folder): -# # doc_id +=1 ? -# continue -# -# tree_morphosyntax = ET.parse(os.path.join(path_prefix,corpus_path,current_folder,morphosyntax_xml)) -# root_morphosyntax = tree_morphosyntax.getroot() -# -# tree_named = ET.parse(os.path.join(path_prefix,corpus_path,current_folder,named_xml)) -# root_named = tree_named.getroot() -# -# tree_text = ET.parse(os.path.join(path_prefix,corpus_path,current_folder,text_xml)) -# root_text = tree_text.getroot() -# -# tree_segmentation = ET.parse(os.path.join(path_prefix,corpus_path,current_folder,segmentation_xml)) -# root_segmentation = tree_segmentation.getroot() -# -# segmentation_text_map = get_segmentation_text_maps(root_segmentation) -# entity_maps = get_entity_maps(root_named) -# text_maps = get_text_maps(root_text) -# -# token_idx = 0 -# paragraphs = [] -# for paragraph in root_morphosyntax.iter('{http://www.tei-c.org/ns/1.0}p'): -# paragraph_json = {} -# pg_text = get_paragraph_text(paragraph, segmentation_text_map, text_maps) -# -# text = '' -# nes = [] -# sentences = [] -# for sentence in paragraph: -# sent_id = get_sent_id(sentence) -# sentence_entity_map = entity_maps[sent_id] -# sentence_json = [] -# for seg in sentence: -# token = {} -# ctag = get_ctag(seg) -# orth = get_orth(seg) -# ne = sentence_entity_map.get(orth) -# -# text += orth + ' ' -# -# if ne is not None: -# ne = NE_njkp_to_spacy[ne] -# nes += [(len(text)-1-len(orth), len(text)-1, ne)] -# -# token['ctag'] = ctag -# token['orth'] = orth -# token['head'] = 0 # @TODO -# token['dep'] = 'NA' # @TODO -# token['id'] = token_idx -# token['ner'] = ne -# token_idx += 1 -# sentence_json += [token] -# sentences += [sentence_json] -# -# doc = nlp(text) -# entities = nes -# biluo_tags = biluo_tags_from_offsets(doc, entities) -# -# sentences = set_biluo_tags(sentences, biluo_tags) -# paragraph_json['sentences'] = [{'tokens': tok, 'brackets': []} for tok in sentences] -# paragraph_json['raw'] = pg_text -# paragraphs += [paragraph_json] -# -# doc_json['id'] = doc_id -# doc_json['paragraphs'] = paragraphs -# -# doc_id += 1 -# corpus += [doc_json] -# \ No newline at end of file + +# with open(os.path.expanduser(os.path.join(path_prefix, output_path, "analysis.json")), 'w+') as f: +# json.dump(all_labels.contents, f) \ No newline at end of file diff --git a/NER-wroc-analysis/get_analysis.py b/NER-wroc-analysis/get_analysis.py new file mode 100644 index 0000000..ea6972c --- /dev/null +++ b/NER-wroc-analysis/get_analysis.py @@ -0,0 +1,365 @@ +import xml.etree.ElementTree as ET +from spacy.gold import biluo_tags_from_offsets +# import spacy +import json +import os + +path_prefix = './' +corpus_path = 'data/kpwr-1.1/' +output_path = 'data/NER/' +output = 'NER_wroc.json' + +def get_subdirs(dir): + return [name for name in os.listdir(dir) if os.path.isdir(os.path.join(dir,name))] + +morphosyntax_xml = 'ann_morphosyntax.xml' +groups_xml = 'ann_groups.xml' +named_xml = 'ann_named.xml' +senses_xml = 'ann_senses.xml' +header_xml = 'header.xml' +segmentation_xml = 'ann_segmentation.xml' +words_xml = 'ann_words.xml' +text_xml = 'text.xml' + +def print_children_recursively(n, i=0): + if i > 10: + return + for c in n: + print(' '*(3*i), c.attrib, c.tag) + 
print_children_recursively(c, i+1) + +def get(node, k, v): + if node is None: + return + for c in node: + if c.attrib.get(k)==v: + return c + +def get_morph(seg): + for c in seg: + if c.attrib['type']=='morph': + return c + +def get_orth(seg): + morph = get(seg, 'type', 'morph') + orth = get(morph, 'name', 'orth') + return orth[0].text if orth is not None else None + +def get_named(seg): + named = get(seg, 'type', 'named') + orth = get(named, 'name', 'orth') + return orth[0].text if orth is not None else None + +def get_named_type(seg): + named = get(seg, 'type', 'named') + type = get(named, 'name', 'type') + return type[0].attrib['value'] + +def get_ctag(seg): + morph = get(seg, 'type', 'morph') + interps = get(morph, 'name', 'interps') + lex = get(interps, 'type', 'lex') + ctag = get(lex, 'name', 'ctag') + return ctag[0].attrib['value'] + +def get_corresp_morph(sent): + return sent.attrib['corresp'].split('#')[1] + +def get_entity_maps(root): + result = {} + for sent in root.iter('{http://www.tei-c.org/ns/1.0}s'): + tmp = [] + for seg in sent: + text = get_named(seg) + type = get_named_type(seg) + tmp += [(text, type)] + + result[get_corresp_morph(sent)] = dict(tmp) + + return result + +def get_segmentation_text_maps(root): + res = {} + for paragraph in root.iter('{http://www.tei-c.org/ns/1.0}p'): + key = paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + value = paragraph.attrib['corresp'].split('#')[1] + res[key]=value + + return res + +def get_text_maps(root): + result = {} + for paragraph in root.iter('{http://www.tei-c.org/ns/1.0}div'): + key = paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + text = '' + for child in paragraph: + text += child.text + + result[key]=text + + return result + +def get_sent_id(sent): + return sent.attrib['{http://www.w3.org/XML/1998/namespace}id'] + +def get_paragraph_text(paragraph, segm_text_map, text_maps): + paragraph_id = paragraph.attrib['corresp'].split('#')[1] + return text_maps[segm_text_map[paragraph_id]] + +def set_biluo_tags(sentences, tags): + i = 0 + for sent in sentences: + for token in sent: + token['ner'] = tags[i] + i += 1 + + return sentences + +def required_files_exist(dir): + required_files = [segmentation_xml, text_xml, named_xml, morphosyntax_xml] + for file in required_files: + if not os.path.isfile(os.path.join(path_prefix,corpus_path,dir,file)): + return False + + return True + +# nlp = spacy.load('en_core_web_sm') +doc_id = 0 +corpus = [] + +NE_njkp_to_spacy = {'persName': 'PERSON', + 'placeName': 'LOC', + 'orgName': 'ORG', + 'date': 'DATE', + 'time': 'TIME', + 'geogName': 'LOC'} + +class Token: + # def __init__(self, orth, attribs): + # self.orth = orth + # self.attribs = attribs + # self.id = None #this is fugly + + + def __init__(self, orth, attribs, id): + self.orth = orth + self.attribs = attribs + self.id = id + + def is_NE(self): + return len(self.attribs) != 0 + + def get_NE(self): + return self.attribs[0] if len(self.attribs) > 0 else "" + + def __str__(self): + return (self.orth + ":" + str(self.attribs)) + + +def process_token(tok): + attribs = [] + orth = tok.find("orth").text + for ann in tok.iter("ann"): + if ann.attrib['chan'].endswith("nam") and ann.text=="1": + attribs += [ann.attrib['chan']] + + return Token(orth, attribs, -1) + +def get_common_tag(t1, t2): + set1 = set(t1.attribs) + set2 = set(t2.attribs) + common = list(set1 & set2) + return common[0] if len(common) > 0 else None + +def get_all_labels(tokens): + labels = set() + for tok in tokens: + for attr in tok.attribs: + 
labels.add(attr) + + return labels + +class setCounter: + def __init__(self): + self.contents = {} + + def count(self, k, times=1): + if k in self.contents: + self.contents[k] += times + else: + self.contents[k] = times + + def merge(self, other): + for k in other.contents: + self.count(k, other.contents[k]) + +def get_all_labels_with_cardinalities(tokens): + labels = setCounter() + for tok in tokens: + for attr in tok.attribs: + labels.count(attr) + + return labels + +def pick_tags(tokens): + # first and last separately + if len(tokens) == 0: + return tokens + if len(tokens) == 1: + if tokens[0].is_NE(): + tokens[0].attribs = [tokens[0].attribs[0]] + return tokens + + t0 = tokens[0] + if len(t0.attribs) > 1: + new_tag = get_common_tag(t0, tokens[1]) + if new_tag is None: + t0.attribs = [t0.attribs[0]] + else: + t0.attribs = [new_tag] + + for i in range(1, len(tokens)-1): + if len(tokens[i].attribs) > 1: + new_tag = get_common_tag(tokens[i-1], tokens[i]) + if new_tag is None: + new_tag = get_common_tag(tokens[i], tokens[i+1]) + if new_tag is None: + tokens[i].attribs = [tokens[i].attribs[0]] + else: + tokens[i].attribs = [new_tag] + else: + tokens[i].attribs = [new_tag] + + te = tokens[-1] + if len(te.attribs) > 1: + new_tag = get_common_tag(te, tokens[-2]) + if new_tag is None: + te.attribs = [te.attribs[0]] + else: + te.attribs = [new_tag] + + assert(all(len(t.attribs)<=1 for t in [t0] + tokens+ [te])) + return [t0] + tokens[1:-2] + [te] + +def convert_to_biluo(tokens): + out = [] + in_ne = False + for i, token in enumerate(tokens[:-1]): + if in_ne: + if token.is_NE(): + if tokens[i+1].is_NE() and token.get_NE() == tokens[i+1].get_NE(): + # inner NE + out += [Token(token.orth, ["I-"+token.get_NE()], token.id)] + else: + # last NE + out += [Token(token.orth, ["L-"+token.get_NE()], token.id)] + in_ne = False + else: + # we shouldn't ever get here + assert(False) + + else: + if token.is_NE(): + # new NE + if tokens[i+1].is_NE() and token.get_NE() == tokens[i+1].get_NE(): + # beginning NE + out += [Token(token.orth, ["B-"+token.get_NE()], token.id)] + in_ne = True + else: + # unit NE + out += [Token(token.orth, ["U-"+token.get_NE()], token.id)] + in_ne = False + else: + # outside of NE + out += [Token(token.orth, ["O"], token.id)] + + # process last token + token = tokens[-1] + if in_ne: + out += [Token(token.orth, ["L-" + token.get_NE()], token.id)] + else: + if token.is_NE(): + out += [Token(token.orth, ["U-" + token.get_NE()], token.id)] + else: + out += [Token(token.orth, ["O"], token.id)] + + return out + +def get_text(tokens): + raw = "" + for token in tokens: + raw += token.orth + " " + + _punct = r'… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ · । ، ؛ ٪ . ! ?' 
+ _quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‘‘ ’’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉' + _hyphens = '- – — -- --- —— ~' + _brackets_pref = ") ] }" + _brackets_post = "( [ {" + + interp_pref = _punct.split(" ") + _quotes.split(" ") + _hyphens.split(" ") + _brackets_pref.split(" ") + interp_post = _brackets_post.split(" ") + raw = raw[:-1] + for char in interp_pref: + raw = raw.replace(" "+char, char) + + for char in interp_post: + raw = raw.replace(char+" ", char) + + return raw + + + +all_labels = setCounter() +docs = [] +doc_idx = 0 +for subfolder in get_subdirs(os.path.join(path_prefix, corpus_path)): + for file in os.listdir(os.path.join(path_prefix, corpus_path, subfolder)): + if not file.endswith("rel.xml") and not file.endswith(".ini"): + doc_json = {} + sentences = [] + token_idx = 0 + raw = "" + tree = ET.parse(os.path.join(path_prefix, corpus_path, subfolder, file)) + root = tree.getroot() + sents = root.iter("sentence") + for sent in sents: + tokens = [] + for tok in sent.iter("tok"): + token = process_token(tok) + token.id = token_idx + token_idx += 1 + # if token.is_NE(): print(token) + tokens += [token] + + # all_labels |= get_all_labels(tokens) + all_labels.merge(get_all_labels_with_cardinalities(tokens)) + tokens = pick_tags(tokens) + tokens = convert_to_biluo(tokens) + + sent = {'tokens': [{ + 'orth': t.orth, + 'id': t.id, + 'ner': t.get_NE()} + for t in tokens + ], 'brackets': [] + } + # print(sent) + # print(get_text(tokens)) + + text = get_text(tokens) + sentences += [sent] + raw += "\n"+text + + doc_json = { + 'id': doc_idx, + 'paragraphs': [{'sentences': sentences}] + } + corpus += [doc_json] + doc_idx +=1 + +# print(corpus) +# with open(os.path.expanduser(os.path.join(path_prefix, output_path, output)), 'w+') as f: +# json.dump(corpus, f) + +with open(os.path.expanduser(os.path.join(path_prefix, output_path, "analysis.json")), 'w+') as f: + json.dump(all_labels.contents, f) \ No newline at end of file From 9d3224be4e7f0b765c2a4bc4eadeb8d85690f787 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Ksi=C4=85=C5=BCek?= Date: Wed, 3 Apr 2019 21:22:31 +0200 Subject: [PATCH 07/14] Remove analysis (moved to another branch) --- NER-wroc-analysis/get_analysis.py | 365 ------------------------------ 1 file changed, 365 deletions(-) delete mode 100644 NER-wroc-analysis/get_analysis.py diff --git a/NER-wroc-analysis/get_analysis.py b/NER-wroc-analysis/get_analysis.py deleted file mode 100644 index ea6972c..0000000 --- a/NER-wroc-analysis/get_analysis.py +++ /dev/null @@ -1,365 +0,0 @@ -import xml.etree.ElementTree as ET -from spacy.gold import biluo_tags_from_offsets -# import spacy -import json -import os - -path_prefix = './' -corpus_path = 'data/kpwr-1.1/' -output_path = 'data/NER/' -output = 'NER_wroc.json' - -def get_subdirs(dir): - return [name for name in os.listdir(dir) if os.path.isdir(os.path.join(dir,name))] - -morphosyntax_xml = 'ann_morphosyntax.xml' -groups_xml = 'ann_groups.xml' -named_xml = 'ann_named.xml' -senses_xml = 'ann_senses.xml' -header_xml = 'header.xml' -segmentation_xml = 'ann_segmentation.xml' -words_xml = 'ann_words.xml' -text_xml = 'text.xml' - -def print_children_recursively(n, i=0): - if i > 10: - return - for c in n: - print(' '*(3*i), c.attrib, c.tag) - print_children_recursively(c, i+1) - -def get(node, k, v): - if node is None: - return - for c in node: - if c.attrib.get(k)==v: - return c - -def get_morph(seg): - for c in seg: - if c.attrib['type']=='morph': - return c - -def get_orth(seg): - morph = get(seg, 'type', 'morph') - orth = get(morph, 
'name', 'orth') - return orth[0].text if orth is not None else None - -def get_named(seg): - named = get(seg, 'type', 'named') - orth = get(named, 'name', 'orth') - return orth[0].text if orth is not None else None - -def get_named_type(seg): - named = get(seg, 'type', 'named') - type = get(named, 'name', 'type') - return type[0].attrib['value'] - -def get_ctag(seg): - morph = get(seg, 'type', 'morph') - interps = get(morph, 'name', 'interps') - lex = get(interps, 'type', 'lex') - ctag = get(lex, 'name', 'ctag') - return ctag[0].attrib['value'] - -def get_corresp_morph(sent): - return sent.attrib['corresp'].split('#')[1] - -def get_entity_maps(root): - result = {} - for sent in root.iter('{http://www.tei-c.org/ns/1.0}s'): - tmp = [] - for seg in sent: - text = get_named(seg) - type = get_named_type(seg) - tmp += [(text, type)] - - result[get_corresp_morph(sent)] = dict(tmp) - - return result - -def get_segmentation_text_maps(root): - res = {} - for paragraph in root.iter('{http://www.tei-c.org/ns/1.0}p'): - key = paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] - value = paragraph.attrib['corresp'].split('#')[1] - res[key]=value - - return res - -def get_text_maps(root): - result = {} - for paragraph in root.iter('{http://www.tei-c.org/ns/1.0}div'): - key = paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] - text = '' - for child in paragraph: - text += child.text - - result[key]=text - - return result - -def get_sent_id(sent): - return sent.attrib['{http://www.w3.org/XML/1998/namespace}id'] - -def get_paragraph_text(paragraph, segm_text_map, text_maps): - paragraph_id = paragraph.attrib['corresp'].split('#')[1] - return text_maps[segm_text_map[paragraph_id]] - -def set_biluo_tags(sentences, tags): - i = 0 - for sent in sentences: - for token in sent: - token['ner'] = tags[i] - i += 1 - - return sentences - -def required_files_exist(dir): - required_files = [segmentation_xml, text_xml, named_xml, morphosyntax_xml] - for file in required_files: - if not os.path.isfile(os.path.join(path_prefix,corpus_path,dir,file)): - return False - - return True - -# nlp = spacy.load('en_core_web_sm') -doc_id = 0 -corpus = [] - -NE_njkp_to_spacy = {'persName': 'PERSON', - 'placeName': 'LOC', - 'orgName': 'ORG', - 'date': 'DATE', - 'time': 'TIME', - 'geogName': 'LOC'} - -class Token: - # def __init__(self, orth, attribs): - # self.orth = orth - # self.attribs = attribs - # self.id = None #this is fugly - - - def __init__(self, orth, attribs, id): - self.orth = orth - self.attribs = attribs - self.id = id - - def is_NE(self): - return len(self.attribs) != 0 - - def get_NE(self): - return self.attribs[0] if len(self.attribs) > 0 else "" - - def __str__(self): - return (self.orth + ":" + str(self.attribs)) - - -def process_token(tok): - attribs = [] - orth = tok.find("orth").text - for ann in tok.iter("ann"): - if ann.attrib['chan'].endswith("nam") and ann.text=="1": - attribs += [ann.attrib['chan']] - - return Token(orth, attribs, -1) - -def get_common_tag(t1, t2): - set1 = set(t1.attribs) - set2 = set(t2.attribs) - common = list(set1 & set2) - return common[0] if len(common) > 0 else None - -def get_all_labels(tokens): - labels = set() - for tok in tokens: - for attr in tok.attribs: - labels.add(attr) - - return labels - -class setCounter: - def __init__(self): - self.contents = {} - - def count(self, k, times=1): - if k in self.contents: - self.contents[k] += times - else: - self.contents[k] = times - - def merge(self, other): - for k in other.contents: - self.count(k, 
other.contents[k]) - -def get_all_labels_with_cardinalities(tokens): - labels = setCounter() - for tok in tokens: - for attr in tok.attribs: - labels.count(attr) - - return labels - -def pick_tags(tokens): - # first and last separately - if len(tokens) == 0: - return tokens - if len(tokens) == 1: - if tokens[0].is_NE(): - tokens[0].attribs = [tokens[0].attribs[0]] - return tokens - - t0 = tokens[0] - if len(t0.attribs) > 1: - new_tag = get_common_tag(t0, tokens[1]) - if new_tag is None: - t0.attribs = [t0.attribs[0]] - else: - t0.attribs = [new_tag] - - for i in range(1, len(tokens)-1): - if len(tokens[i].attribs) > 1: - new_tag = get_common_tag(tokens[i-1], tokens[i]) - if new_tag is None: - new_tag = get_common_tag(tokens[i], tokens[i+1]) - if new_tag is None: - tokens[i].attribs = [tokens[i].attribs[0]] - else: - tokens[i].attribs = [new_tag] - else: - tokens[i].attribs = [new_tag] - - te = tokens[-1] - if len(te.attribs) > 1: - new_tag = get_common_tag(te, tokens[-2]) - if new_tag is None: - te.attribs = [te.attribs[0]] - else: - te.attribs = [new_tag] - - assert(all(len(t.attribs)<=1 for t in [t0] + tokens+ [te])) - return [t0] + tokens[1:-2] + [te] - -def convert_to_biluo(tokens): - out = [] - in_ne = False - for i, token in enumerate(tokens[:-1]): - if in_ne: - if token.is_NE(): - if tokens[i+1].is_NE() and token.get_NE() == tokens[i+1].get_NE(): - # inner NE - out += [Token(token.orth, ["I-"+token.get_NE()], token.id)] - else: - # last NE - out += [Token(token.orth, ["L-"+token.get_NE()], token.id)] - in_ne = False - else: - # we shouldn't ever get here - assert(False) - - else: - if token.is_NE(): - # new NE - if tokens[i+1].is_NE() and token.get_NE() == tokens[i+1].get_NE(): - # beginning NE - out += [Token(token.orth, ["B-"+token.get_NE()], token.id)] - in_ne = True - else: - # unit NE - out += [Token(token.orth, ["U-"+token.get_NE()], token.id)] - in_ne = False - else: - # outside of NE - out += [Token(token.orth, ["O"], token.id)] - - # process last token - token = tokens[-1] - if in_ne: - out += [Token(token.orth, ["L-" + token.get_NE()], token.id)] - else: - if token.is_NE(): - out += [Token(token.orth, ["U-" + token.get_NE()], token.id)] - else: - out += [Token(token.orth, ["O"], token.id)] - - return out - -def get_text(tokens): - raw = "" - for token in tokens: - raw += token.orth + " " - - _punct = r'… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ · । ، ؛ ٪ . ! ?' 
- _quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‘‘ ’’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉' - _hyphens = '- – — -- --- —— ~' - _brackets_pref = ") ] }" - _brackets_post = "( [ {" - - interp_pref = _punct.split(" ") + _quotes.split(" ") + _hyphens.split(" ") + _brackets_pref.split(" ") - interp_post = _brackets_post.split(" ") - raw = raw[:-1] - for char in interp_pref: - raw = raw.replace(" "+char, char) - - for char in interp_post: - raw = raw.replace(char+" ", char) - - return raw - - - -all_labels = setCounter() -docs = [] -doc_idx = 0 -for subfolder in get_subdirs(os.path.join(path_prefix, corpus_path)): - for file in os.listdir(os.path.join(path_prefix, corpus_path, subfolder)): - if not file.endswith("rel.xml") and not file.endswith(".ini"): - doc_json = {} - sentences = [] - token_idx = 0 - raw = "" - tree = ET.parse(os.path.join(path_prefix, corpus_path, subfolder, file)) - root = tree.getroot() - sents = root.iter("sentence") - for sent in sents: - tokens = [] - for tok in sent.iter("tok"): - token = process_token(tok) - token.id = token_idx - token_idx += 1 - # if token.is_NE(): print(token) - tokens += [token] - - # all_labels |= get_all_labels(tokens) - all_labels.merge(get_all_labels_with_cardinalities(tokens)) - tokens = pick_tags(tokens) - tokens = convert_to_biluo(tokens) - - sent = {'tokens': [{ - 'orth': t.orth, - 'id': t.id, - 'ner': t.get_NE()} - for t in tokens - ], 'brackets': [] - } - # print(sent) - # print(get_text(tokens)) - - text = get_text(tokens) - sentences += [sent] - raw += "\n"+text - - doc_json = { - 'id': doc_idx, - 'paragraphs': [{'sentences': sentences}] - } - corpus += [doc_json] - doc_idx +=1 - -# print(corpus) -# with open(os.path.expanduser(os.path.join(path_prefix, output_path, output)), 'w+') as f: -# json.dump(corpus, f) - -with open(os.path.expanduser(os.path.join(path_prefix, output_path, "analysis.json")), 'w+') as f: - json.dump(all_labels.contents, f) \ No newline at end of file From 59491bb082fe4b50bb3ed24960ad4a0a04109ef1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Ksi=C4=85=C5=BCek?= Date: Wed, 3 Apr 2019 21:32:01 +0200 Subject: [PATCH 08/14] Remove commented out code --- ConvertNER/convert_NER_wroc.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/ConvertNER/convert_NER_wroc.py b/ConvertNER/convert_NER_wroc.py index ecd27d6..194f229 100644 --- a/ConvertNER/convert_NER_wroc.py +++ b/ConvertNER/convert_NER_wroc.py @@ -135,12 +135,6 @@ def required_files_exist(dir): 'geogName': 'LOC'} class Token: - # def __init__(self, orth, attribs): - # self.orth = orth - # self.attribs = attribs - # self.id = None #this is fugly - - def __init__(self, orth, attribs, id): self.orth = orth self.attribs = attribs @@ -328,10 +322,8 @@ def get_text(tokens): token = process_token(tok) token.id = token_idx token_idx += 1 - # if token.is_NE(): print(token) tokens += [token] - # all_labels |= get_all_labels(tokens) all_labels.merge(get_all_labels_with_cardinalities(tokens)) tokens = pick_tags(tokens) tokens = convert_to_biluo(tokens) @@ -343,8 +335,6 @@ def get_text(tokens): for t in tokens ], 'brackets': [] } - # print(sent) - # print(get_text(tokens)) text = get_text(tokens) sentences += [sent] @@ -358,7 +348,4 @@ def get_text(tokens): doc_idx +=1 with open(os.path.expanduser(os.path.join(path_prefix, output_path, output)), 'w+') as f: - json.dump(corpus, f) - -# with open(os.path.expanduser(os.path.join(path_prefix, output_path, "analysis.json")), 'w+') as f: -# json.dump(all_labels.contents, f) \ No newline at end of file + 
json.dump(corpus, f) \ No newline at end of file From b7cd6103e8c9de87d8ff386c56111e1053a017ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Ksi=C4=85=C5=BCek?= Date: Wed, 3 Apr 2019 22:17:34 +0200 Subject: [PATCH 09/14] this field has non-binary values (it can even be 2, or -) --- ConvertNER/convert_NER_wroc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ConvertNER/convert_NER_wroc.py b/ConvertNER/convert_NER_wroc.py index 194f229..e762599 100644 --- a/ConvertNER/convert_NER_wroc.py +++ b/ConvertNER/convert_NER_wroc.py @@ -154,7 +154,7 @@ def process_token(tok): attribs = [] orth = tok.find("orth").text for ann in tok.iter("ann"): - if ann.attrib['chan'].endswith("nam") and ann.text=="1": + if ann.attrib['chan'].endswith("nam") and ann.text!="0": attribs += [ann.attrib['chan']] return Token(orth, attribs, -1) From 70785b21e23fc961a65a55308d95b5bec04864cc Mon Sep 17 00:00:00 2001 From: Ksiazek Date: Wed, 17 Apr 2019 21:43:05 +0200 Subject: [PATCH 10/14] Code cleanup --- ConvertNER/convert_NER_wroc.py | 200 +++++--------------------------- data/lemmatizer_data/.gitignore | 3 +- lemma_sources_exp.dvc | 14 +++ 3 files changed, 43 insertions(+), 174 deletions(-) create mode 100644 lemma_sources_exp.dvc diff --git a/ConvertNER/convert_NER_wroc.py b/ConvertNER/convert_NER_wroc.py index e762599..20abd3d 100644 --- a/ConvertNER/convert_NER_wroc.py +++ b/ConvertNER/convert_NER_wroc.py @@ -1,6 +1,4 @@ import xml.etree.ElementTree as ET -from spacy.gold import biluo_tags_from_offsets -# import spacy import json import os @@ -9,130 +7,16 @@ output_path = 'data/NER/' output = 'NER_wroc.json' -def get_subdirs(dir): - return [name for name in os.listdir(dir) if os.path.isdir(os.path.join(dir,name))] - -morphosyntax_xml = 'ann_morphosyntax.xml' -groups_xml = 'ann_groups.xml' -named_xml = 'ann_named.xml' -senses_xml = 'ann_senses.xml' -header_xml = 'header.xml' -segmentation_xml = 'ann_segmentation.xml' -words_xml = 'ann_words.xml' -text_xml = 'text.xml' - -def print_children_recursively(n, i=0): - if i > 10: - return - for c in n: - print(' '*(3*i), c.attrib, c.tag) - print_children_recursively(c, i+1) - -def get(node, k, v): - if node is None: - return - for c in node: - if c.attrib.get(k)==v: - return c - -def get_morph(seg): - for c in seg: - if c.attrib['type']=='morph': - return c - -def get_orth(seg): - morph = get(seg, 'type', 'morph') - orth = get(morph, 'name', 'orth') - return orth[0].text if orth is not None else None - -def get_named(seg): - named = get(seg, 'type', 'named') - orth = get(named, 'name', 'orth') - return orth[0].text if orth is not None else None - -def get_named_type(seg): - named = get(seg, 'type', 'named') - type = get(named, 'name', 'type') - return type[0].attrib['value'] - -def get_ctag(seg): - morph = get(seg, 'type', 'morph') - interps = get(morph, 'name', 'interps') - lex = get(interps, 'type', 'lex') - ctag = get(lex, 'name', 'ctag') - return ctag[0].attrib['value'] - -def get_corresp_morph(sent): - return sent.attrib['corresp'].split('#')[1] - -def get_entity_maps(root): - result = {} - for sent in root.iter('{http://www.tei-c.org/ns/1.0}s'): - tmp = [] - for seg in sent: - text = get_named(seg) - type = get_named_type(seg) - tmp += [(text, type)] - - result[get_corresp_morph(sent)] = dict(tmp) - - return result - -def get_segmentation_text_maps(root): - res = {} - for paragraph in root.iter('{http://www.tei-c.org/ns/1.0}p'): - key = paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] - value = 
paragraph.attrib['corresp'].split('#')[1] - res[key]=value - - return res - -def get_text_maps(root): - result = {} - for paragraph in root.iter('{http://www.tei-c.org/ns/1.0}div'): - key = paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] - text = '' - for child in paragraph: - text += child.text - - result[key]=text - - return result - -def get_sent_id(sent): - return sent.attrib['{http://www.w3.org/XML/1998/namespace}id'] - -def get_paragraph_text(paragraph, segm_text_map, text_maps): - paragraph_id = paragraph.attrib['corresp'].split('#')[1] - return text_maps[segm_text_map[paragraph_id]] - -def set_biluo_tags(sentences, tags): - i = 0 - for sent in sentences: - for token in sent: - token['ner'] = tags[i] - i += 1 - - return sentences - -def required_files_exist(dir): - required_files = [segmentation_xml, text_xml, named_xml, morphosyntax_xml] - for file in required_files: - if not os.path.isfile(os.path.join(path_prefix,corpus_path,dir,file)): - return False - - return True - -# nlp = spacy.load('en_core_web_sm') doc_id = 0 corpus = [] NE_njkp_to_spacy = {'persName': 'PERSON', - 'placeName': 'LOC', - 'orgName': 'ORG', - 'date': 'DATE', - 'time': 'TIME', - 'geogName': 'LOC'} + 'placeName': 'LOC', + 'orgName': 'ORG', + 'date': 'DATE', + 'time': 'TIME', + 'geogName': 'LOC'} + class Token: def __init__(self, orth, attribs, id): @@ -154,46 +38,18 @@ def process_token(tok): attribs = [] orth = tok.find("orth").text for ann in tok.iter("ann"): - if ann.attrib['chan'].endswith("nam") and ann.text!="0": + if ann.attrib['chan'].endswith("nam") and ann.text != "0": attribs += [ann.attrib['chan']] return Token(orth, attribs, -1) + def get_common_tag(t1, t2): set1 = set(t1.attribs) set2 = set(t2.attribs) common = list(set1 & set2) return common[0] if len(common) > 0 else None -def get_all_labels(tokens): - labels = set() - for tok in tokens: - for attr in tok.attribs: - labels.add(attr) - - return labels - -class setCounter: - def __init__(self): - self.contents = {} - - def count(self, k, times=1): - if k in self.contents: - self.contents[k] += times - else: - self.contents[k] = times - - def merge(self, other): - for k in other.contents: - self.count(k, other.contents[k]) - -def get_all_labels_with_cardinalities(tokens): - labels = setCounter() - for tok in tokens: - for attr in tok.attribs: - labels.count(attr) - - return labels def pick_tags(tokens): # first and last separately @@ -212,11 +68,11 @@ def pick_tags(tokens): else: t0.attribs = [new_tag] - for i in range(1, len(tokens)-1): + for i in range(1, len(tokens) - 1): if len(tokens[i].attribs) > 1: - new_tag = get_common_tag(tokens[i-1], tokens[i]) + new_tag = get_common_tag(tokens[i - 1], tokens[i]) if new_tag is None: - new_tag = get_common_tag(tokens[i], tokens[i+1]) + new_tag = get_common_tag(tokens[i], tokens[i + 1]) if new_tag is None: tokens[i].attribs = [tokens[i].attribs[0]] else: @@ -232,36 +88,37 @@ def pick_tags(tokens): else: te.attribs = [new_tag] - assert(all(len(t.attribs)<=1 for t in [t0] + tokens+ [te])) + assert (all(len(t.attribs) <= 1 for t in [t0] + tokens + [te])) return [t0] + tokens[1:-2] + [te] + def convert_to_biluo(tokens): out = [] in_ne = False for i, token in enumerate(tokens[:-1]): if in_ne: if token.is_NE(): - if tokens[i+1].is_NE() and token.get_NE() == tokens[i+1].get_NE(): + if tokens[i + 1].is_NE() and token.get_NE() == tokens[i + 1].get_NE(): # inner NE - out += [Token(token.orth, ["I-"+token.get_NE()], token.id)] + out += [Token(token.orth, ["I-" + token.get_NE()], token.id)] else: # last 
NE - out += [Token(token.orth, ["L-"+token.get_NE()], token.id)] + out += [Token(token.orth, ["L-" + token.get_NE()], token.id)] in_ne = False else: # we shouldn't ever get here - assert(False) + assert (False) else: if token.is_NE(): # new NE - if tokens[i+1].is_NE() and token.get_NE() == tokens[i+1].get_NE(): + if tokens[i + 1].is_NE() and token.get_NE() == tokens[i + 1].get_NE(): # beginning NE - out += [Token(token.orth, ["B-"+token.get_NE()], token.id)] + out += [Token(token.orth, ["B-" + token.get_NE()], token.id)] in_ne = True else: # unit NE - out += [Token(token.orth, ["U-"+token.get_NE()], token.id)] + out += [Token(token.orth, ["U-" + token.get_NE()], token.id)] in_ne = False else: # outside of NE @@ -279,6 +136,7 @@ def convert_to_biluo(tokens): return out + def get_text(tokens): raw = "" for token in tokens: @@ -287,23 +145,20 @@ def get_text(tokens): _punct = r'… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ · । ، ؛ ٪ . ! ?' _quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‘‘ ’’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉' _hyphens = '- – — -- --- —— ~' - _brackets_pref = ") ] }" + _brackets_pref = ") ] }" _brackets_post = "( [ {" interp_pref = _punct.split(" ") + _quotes.split(" ") + _hyphens.split(" ") + _brackets_pref.split(" ") interp_post = _brackets_post.split(" ") - raw = raw[:-1] for char in interp_pref: - raw = raw.replace(" "+char, char) + raw = raw.replace(" " + char, char) for char in interp_post: - raw = raw.replace(char+" ", char) + raw = raw.replace(char + " ", char) return raw - -all_labels = setCounter() docs = [] doc_idx = 0 for subfolder in get_subdirs(os.path.join(path_prefix, corpus_path)): @@ -324,7 +179,6 @@ def get_text(tokens): token_idx += 1 tokens += [token] - all_labels.merge(get_all_labels_with_cardinalities(tokens)) tokens = pick_tags(tokens) tokens = convert_to_biluo(tokens) @@ -338,14 +192,14 @@ def get_text(tokens): text = get_text(tokens) sentences += [sent] - raw += "\n"+text + raw += "\n" + text doc_json = { 'id': doc_idx, 'paragraphs': [{'sentences': sentences}] } corpus += [doc_json] - doc_idx +=1 + doc_idx += 1 with open(os.path.expanduser(os.path.join(path_prefix, output_path, output)), 'w+') as f: - json.dump(corpus, f) \ No newline at end of file + json.dump(corpus, f) diff --git a/data/lemmatizer_data/.gitignore b/data/lemmatizer_data/.gitignore index a6cd652..cdafee2 100644 --- a/data/lemmatizer_data/.gitignore +++ b/data/lemmatizer_data/.gitignore @@ -4,4 +4,5 @@ /rules.json /lemma_sources -/sjp_ispell.tar.bz2 \ No newline at end of file +/sjp_ispell.tar.bz2 +/lemma_sources_exp \ No newline at end of file diff --git a/lemma_sources_exp.dvc b/lemma_sources_exp.dvc new file mode 100644 index 0000000..76ca409 --- /dev/null +++ b/lemma_sources_exp.dvc @@ -0,0 +1,14 @@ +cmd: python lemma_rules_extraction/yield_all_suffixes.py +deps: +- md5: 947b48802b53bdff2ad02122c04063e5.dir + path: data/lemmatizer_data/lemma_sources +- md5: abb151c19621000ccc91d8e4b494674f + path: lemma_rules_extraction/yield_all_suffixes.py +md5: 53ab70bcbc27640766d00b4977b4733d +outs: +- cache: true + md5: 4f29377d4c9dd5c9997c74750af31321.dir + metric: false + path: data/lemmatizer_data/lemma_sources_exp + persist: false +wdir: . 
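At this point the converter writes a single JSON list of documents, each document holding 'id' and 'paragraphs', every paragraph a list of 'sentences', and every sentence a list of 'tokens' carrying 'orth', 'id' and a BILUO 'ner' tag. A quick way to sanity-check the generated file is to load it and count the tags; the sketch below assumes the script's default output location, data/NER/NER_wroc.json, relative to the repository root:

    # Sanity-check sketch: count the BILUO 'ner' tags in the generated training file.
    # The path assumes the defaults set at the top of convert_NER_wroc.py.
    import json
    from collections import Counter

    with open("data/NER/NER_wroc.json") as f:
        corpus = json.load(f)

    tag_counts = Counter(
        token["ner"]
        for doc in corpus
        for paragraph in doc["paragraphs"]
        for sentence in paragraph["sentences"]
        for token in sentence["tokens"]
    )
    print(tag_counts.most_common(10))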
From 1b371b2e027ac2517d18f9fb7aa3343e6c3fb297 Mon Sep 17 00:00:00 2001 From: Ksiazek Date: Wed, 17 Apr 2019 23:14:03 +0200 Subject: [PATCH 11/14] Auto stash before rebase of "master" --- data/.gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/.gitignore b/data/.gitignore index e74d18a..5f63d8c 100644 --- a/data/.gitignore +++ b/data/.gitignore @@ -10,5 +10,5 @@ vocab.jsonl /vocab.jsonl /vectors_300.txt tagmap.py - +NKJP-PodkorpusMilionowy-1.2 /kpwr-1.1 \ No newline at end of file From de41d270b06ae55c3a76c3ffbe9f308fc70bb166 Mon Sep 17 00:00:00 2001 From: Ksiazek Date: Wed, 17 Apr 2019 23:34:32 +0200 Subject: [PATCH 12/14] Add accidentally removed function --- ConvertNER/convert_NER_wroc.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ConvertNER/convert_NER_wroc.py b/ConvertNER/convert_NER_wroc.py index 20abd3d..98282ee 100644 --- a/ConvertNER/convert_NER_wroc.py +++ b/ConvertNER/convert_NER_wroc.py @@ -34,6 +34,10 @@ def __str__(self): return (self.orth + ":" + str(self.attribs)) +def get_subdirs(dir): + return [name for name in os.listdir(dir) if os.path.isdir(os.path.join(dir, name))] + + def process_token(tok): attribs = [] orth = tok.find("orth").text From 050dbbbf461b7eef19e942dabb186bb0ba2a3170 Mon Sep 17 00:00:00 2001 From: Ksiazek Date: Thu, 25 Apr 2019 09:13:04 +0200 Subject: [PATCH 13/14] Fix rebase bugs --- ConvertNER/convert_NER.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ConvertNER/convert_NER.py b/ConvertNER/convert_NER.py index 3c12602..d8a1db2 100644 --- a/ConvertNER/convert_NER.py +++ b/ConvertNER/convert_NER.py @@ -1,6 +1,6 @@ import xml.etree.ElementTree as ET from spacy.gold import biluo_tags_from_offsets -import spacy +from spacy.lang.pl import Polish import json import os @@ -120,7 +120,7 @@ def required_files_exist(dir): return True -nlp = spacy.load('en_core_web_sm') +nlp = Polish() doc_id = 0 corpus = [] From 96f9e2dc9bade3ebf7705d76831b2c826bd80e0b Mon Sep 17 00:00:00 2001 From: Ksiazek Date: Sat, 18 May 2019 11:01:48 +0200 Subject: [PATCH 14/14] Remove remains of detokenization --- ConvertNER/convert_NER_wroc.py | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/ConvertNER/convert_NER_wroc.py b/ConvertNER/convert_NER_wroc.py index 98282ee..fa98707 100644 --- a/ConvertNER/convert_NER_wroc.py +++ b/ConvertNER/convert_NER_wroc.py @@ -141,28 +141,6 @@ def convert_to_biluo(tokens): return out -def get_text(tokens): - raw = "" - for token in tokens: - raw += token.orth + " " - - _punct = r'… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ · । ، ؛ ٪ . ! ?' - _quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‘‘ ’’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉' - _hyphens = '- – — -- --- —— ~' - _brackets_pref = ") ] }" - _brackets_post = "( [ {" - - interp_pref = _punct.split(" ") + _quotes.split(" ") + _hyphens.split(" ") + _brackets_pref.split(" ") - interp_post = _brackets_post.split(" ") - for char in interp_pref: - raw = raw.replace(" " + char, char) - - for char in interp_post: - raw = raw.replace(char + " ", char) - - return raw - - docs = [] doc_idx = 0 for subfolder in get_subdirs(os.path.join(path_prefix, corpus_path)): @@ -194,9 +172,7 @@ def get_text(tokens): ], 'brackets': [] } - text = get_text(tokens) sentences += [sent] - raw += "\n" + text doc_json = { 'id': doc_idx,