From e0b16973051500ecc75531dbf8c9434e78f6a005 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Ksi=C4=85=C5=BCek?= Date: Wed, 6 Mar 2019 19:59:21 +0100 Subject: [PATCH 01/14] Working-ish solution @TODO convert tags (use the map from tagmap) @TODO refactor @TODO head? @TODO dep? Change the script to work from repository's root folder Working-ish solution @TODO convert tags (use the map from tagmap) @TODO refactor @TODO head? @TODO dep? Change the script to work from repository's root folder Fix output format Working-ish solution @TODO convert tags (use the map from tagmap) @TODO refactor @TODO head? @TODO dep? Change the script to work from repository's root folder Working-ish solution @TODO convert tags (use the map from tagmap) @TODO refactor @TODO head? @TODO dep? Change the script to work from repository's root folder --- ConvertNER/convert_NER.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/ConvertNER/convert_NER.py b/ConvertNER/convert_NER.py index b6eefd3..0948085 100644 --- a/ConvertNER/convert_NER.py +++ b/ConvertNER/convert_NER.py @@ -1,5 +1,4 @@ import xml.etree.ElementTree as ET -from spacy.lang.pl import Polish from spacy.gold import biluo_tags_from_offsets import spacy import json @@ -121,7 +120,7 @@ def required_files_exist(dir): return True -nlp = Polish() +nlp = spacy.load('en_core_web_sm') doc_id = 0 corpus = [] @@ -139,8 +138,8 @@ def required_files_exist(dir): if not os.path.isdir((os.path.join(path_prefix,corpus_path,current_folder))): continue - # we skip the docs that don't have the required annotations (certain .xml files) if not required_files_exist(current_folder): + # doc_id +=1 ? continue tree_morphosyntax = ET.parse(os.path.join(path_prefix,corpus_path,current_folder,morphosyntax_xml)) @@ -199,7 +198,7 @@ def required_files_exist(dir): biluo_tags = biluo_tags_from_offsets(doc, entities) sentences = set_biluo_tags(sentences, biluo_tags) - paragraph_json['sentences'] = [{'tokens': tok, 'brackets': []} for tok in sentences] + paragraph_json['sentences'] = sentences paragraph_json['raw'] = pg_text paragraphs += [paragraph_json] @@ -209,9 +208,5 @@ def required_files_exist(dir): doc_id += 1 corpus += [doc_json] -out_path = os.path.expanduser(os.path.join(path_prefix, output_path)) -if not os.path.exists(out_path): - os.makedirs(out_path) - -with open(os.path.join(out_path, output), 'w+') as f: +with open(os.path.expanduser(os.path.join(path_prefix, output_path, output)), 'w+') as f: json.dump(corpus, f) From 84914b7d2d094d1b14bdce3434e46b329196fc0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Ksi=C4=85=C5=BCek?= Date: Wed, 27 Mar 2019 13:20:57 +0100 Subject: [PATCH 02/14] Fix output format --- ConvertNER/convert_NER.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ConvertNER/convert_NER.py b/ConvertNER/convert_NER.py index 0948085..3c12602 100644 --- a/ConvertNER/convert_NER.py +++ b/ConvertNER/convert_NER.py @@ -198,7 +198,7 @@ def required_files_exist(dir): biluo_tags = biluo_tags_from_offsets(doc, entities) sentences = set_biluo_tags(sentences, biluo_tags) - paragraph_json['sentences'] = sentences + paragraph_json['sentences'] = [{'tokens': tok, 'brackets': []} for tok in sentences] paragraph_json['raw'] = pg_text paragraphs += [paragraph_json] From 326698cb8817ab7f0f6ea644cf4033e9d27594a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Ksi=C4=85=C5=BCek?= Date: Thu, 28 Mar 2019 01:40:22 +0100 Subject: [PATCH 03/14] [WIP] Some really ugly stuff --- ConvertNER/convert_NER_wroc.py | 387 
+++++++++++++++++++++++++++++++++ 1 file changed, 387 insertions(+) create mode 100644 ConvertNER/convert_NER_wroc.py diff --git a/ConvertNER/convert_NER_wroc.py b/ConvertNER/convert_NER_wroc.py new file mode 100644 index 0000000..ca25117 --- /dev/null +++ b/ConvertNER/convert_NER_wroc.py @@ -0,0 +1,387 @@ +import xml.etree.ElementTree as ET +from spacy.gold import biluo_tags_from_offsets +# import spacy +import json +import os + +path_prefix = '../' +corpus_path = 'data/kpwr-1.1/' +output_path = 'data/NER/' +output = 'NER_wroc.json' + +def get_subdirs(dir): + return [name for name in os.listdir(dir) if os.path.isdir(os.path.join(dir,name))] + +morphosyntax_xml = 'ann_morphosyntax.xml' +groups_xml = 'ann_groups.xml' +named_xml = 'ann_named.xml' +senses_xml = 'ann_senses.xml' +header_xml = 'header.xml' +segmentation_xml = 'ann_segmentation.xml' +words_xml = 'ann_words.xml' +text_xml = 'text.xml' + +def print_children_recursively(n, i=0): + if i > 10: + return + for c in n: + print(' '*(3*i), c.attrib, c.tag) + print_children_recursively(c, i+1) + +def get(node, k, v): + if node is None: + return + for c in node: + if c.attrib.get(k)==v: + return c + +def get_morph(seg): + for c in seg: + if c.attrib['type']=='morph': + return c + +def get_orth(seg): + morph = get(seg, 'type', 'morph') + orth = get(morph, 'name', 'orth') + return orth[0].text if orth is not None else None + +def get_named(seg): + named = get(seg, 'type', 'named') + orth = get(named, 'name', 'orth') + return orth[0].text if orth is not None else None + +def get_named_type(seg): + named = get(seg, 'type', 'named') + type = get(named, 'name', 'type') + return type[0].attrib['value'] + +def get_ctag(seg): + morph = get(seg, 'type', 'morph') + interps = get(morph, 'name', 'interps') + lex = get(interps, 'type', 'lex') + ctag = get(lex, 'name', 'ctag') + return ctag[0].attrib['value'] + +def get_corresp_morph(sent): + return sent.attrib['corresp'].split('#')[1] + +def get_entity_maps(root): + result = {} + for sent in root.iter('{http://www.tei-c.org/ns/1.0}s'): + tmp = [] + for seg in sent: + text = get_named(seg) + type = get_named_type(seg) + tmp += [(text, type)] + + result[get_corresp_morph(sent)] = dict(tmp) + + return result + +def get_segmentation_text_maps(root): + res = {} + for paragraph in root.iter('{http://www.tei-c.org/ns/1.0}p'): + key = paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + value = paragraph.attrib['corresp'].split('#')[1] + res[key]=value + + return res + +def get_text_maps(root): + result = {} + for paragraph in root.iter('{http://www.tei-c.org/ns/1.0}div'): + key = paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + text = '' + for child in paragraph: + text += child.text + + result[key]=text + + return result + +def get_sent_id(sent): + return sent.attrib['{http://www.w3.org/XML/1998/namespace}id'] + +def get_paragraph_text(paragraph, segm_text_map, text_maps): + paragraph_id = paragraph.attrib['corresp'].split('#')[1] + return text_maps[segm_text_map[paragraph_id]] + +def set_biluo_tags(sentences, tags): + i = 0 + for sent in sentences: + for token in sent: + token['ner'] = tags[i] + i += 1 + + return sentences + +def required_files_exist(dir): + required_files = [segmentation_xml, text_xml, named_xml, morphosyntax_xml] + for file in required_files: + if not os.path.isfile(os.path.join(path_prefix,corpus_path,dir,file)): + return False + + return True + +# nlp = spacy.load('en_core_web_sm') +doc_id = 0 +corpus = [] + +NE_njkp_to_spacy = {'persName': 'PERSON', + 'placeName': 
'LOC', + 'orgName': 'ORG', + 'date': 'DATE', + 'time': 'TIME', + 'geogName': 'LOC'} + +class Token: + # def __init__(self, orth, attribs): + # self.orth = orth + # self.attribs = attribs + # self.id = None #this is fugly + + + def __init__(self, orth, attribs, id): + self.orth = orth + self.attribs = attribs + self.id = id + + def is_NE(self): + return len(self.attribs) != 0 + + def get_NE(self): + return self.attribs[0] if len(self.attribs) > 0 else "" + + def __str__(self): + return (self.orth + ":" + str(self.attribs)) + + +def process_token(tok): + attribs = [] + orth = tok.find("orth").text + for ann in tok.iter("ann"): + if ann.attrib['chan'].endswith("nam") and ann.text=="1": + attribs += [ann.attrib['chan']] + + return Token(orth, attribs, -1) + +def get_common_tag(t1, t2): + set1 = set(t1.attribs) + set2 = set(t2.attribs) + common = list(set1 & set2) + return common[0] if len(common) > 0 else None + +def get_all_labels(tokens): + labels = set() + for tok in tokens: + for attr in tok.attribs: + labels.add(attr) + + return labels + +def pick_tags(tokens): + # first and last separately + if len(tokens) == 0: + return tokens + if len(tokens) == 1: + if tokens[0].is_NE(): + tokens[0].attribs = [token[0].attribs[0]] + return tokens + + t0 = tokens[0] + if len(t0.attribs) > 1: + new_tag = get_common_tag(t0, tokens[1]) + if new_tag is None: + t0.attribs = [t0.attribs[0]] + else: + t0.attribs = [new_tag] + + for i in range(1, len(tokens)-1): + if len(tokens[i].attribs) > 1: + new_tag = get_common_tag(tokens[i-1], tokens[i]) + if new_tag is None: + new_tag = get_common_tag(tokens[i], tokens[i+1]) + if new_tag is None: + tokens[i].attribs = [tokens[i].attribs[0]] + else: + tokens[i].attribs = [new_tag] + else: + tokens[i].attribs = [new_tag] + + te = tokens[-1] + if len(te.attribs) > 1: + new_tag = get_common_tag(te, tokens[-2]) + if new_tag is None: + te.attribs = [te.attribs[0]] + else: + te.attribs = [new_tag] + + assert(all(len(t.attribs)<=1 for t in [t0] + tokens+ [te])) + return [t0] + tokens[1:-2] + [te] + +def convert_to_biluo(tokens): + out = [] + in_ne = False + for i, token in enumerate(tokens[:-1]): + if in_ne: + if token.is_NE(): + if tokens[i+1].is_NE() and token.get_NE() == tokens[i+1].get_NE(): + # inner NE + out += [Token(token.orth, ["I_"+token.get_NE()], token.id)] + else: + # last NE + out += [Token(token.orth, ["L_"+token.get_NE()], token.id)] + in_ne = False + else: + # we shouldn't ever get here + assert(False) + + else: + if token.is_NE(): + # new NE + if tokens[i+1].is_NE() and token.get_NE() == tokens[i+1].get_NE(): + # beginning NE + out += [Token(token.orth, ["B_"+token.get_NE()], token.id)] + in_ne = True + else: + # unit NE + out += [Token(token.orth, ["U_"+token.get_NE()], token.id)] + in_ne = False + else: + # outside of NE + out += [Token(token.orth, ["O"], token.id)] + + # process last token + token = tokens[-1] + if in_ne: + out += [Token(token.orth, ["L_" + token.get_NE()], token.id)] + else: + if token.is_NE(): + out += [Token(token.orth, ["U_" + token.get_NE()], token.id)] + else: + out += [Token(token.orth, ["O"], token.id)] + + return out + + + + + + +print(get_subdirs(os.path.join(path_prefix, corpus_path))) + +all_labels = set() +for subfolder in get_subdirs(os.path.join(path_prefix, corpus_path)): + for file in os.listdir(os.path.join(path_prefix, corpus_path, subfolder)): + doc_idx = 0 + if not file.endswith("rel.xml") and not file.endswith(".ini"): + token_idx = 0 + tree = ET.parse(os.path.join(path_prefix, corpus_path, subfolder, file)) + root = 
tree.getroot() + sents = root.iter("sentence") + for sent in sents: + tokens = [] + for tok in sent.iter("tok"): + token = process_token(tok) + token.id = token_idx + token_idx += 1 + # if token.is_NE(): print(token) + tokens += [token] + + all_labels |= get_all_labels(tokens) + tokens = pick_tags(tokens) + tokens = convert_to_biluo(tokens) + + sent = {'tokens': [{ + 'orth': t.orth, + 'id': t.id, + 'ner': t.get_NE()} # change to t.get_NE() + for t in tokens + ], 'brackets': [] + } + print(sent) + doc_idx +=1 + + break + +print(all_labels) + +# for f in os.listdir(os.path.join(path_prefix, corpus_path)): +# doc_json = {} +# current_folder = f +# +# if not os.path.isdir((os.path.join(path_prefix,corpus_path,current_folder))): +# continue +# +# if not required_files_exist(current_folder): +# # doc_id +=1 ? +# continue +# +# tree_morphosyntax = ET.parse(os.path.join(path_prefix,corpus_path,current_folder,morphosyntax_xml)) +# root_morphosyntax = tree_morphosyntax.getroot() +# +# tree_named = ET.parse(os.path.join(path_prefix,corpus_path,current_folder,named_xml)) +# root_named = tree_named.getroot() +# +# tree_text = ET.parse(os.path.join(path_prefix,corpus_path,current_folder,text_xml)) +# root_text = tree_text.getroot() +# +# tree_segmentation = ET.parse(os.path.join(path_prefix,corpus_path,current_folder,segmentation_xml)) +# root_segmentation = tree_segmentation.getroot() +# +# segmentation_text_map = get_segmentation_text_maps(root_segmentation) +# entity_maps = get_entity_maps(root_named) +# text_maps = get_text_maps(root_text) +# +# token_idx = 0 +# paragraphs = [] +# for paragraph in root_morphosyntax.iter('{http://www.tei-c.org/ns/1.0}p'): +# paragraph_json = {} +# pg_text = get_paragraph_text(paragraph, segmentation_text_map, text_maps) +# +# text = '' +# nes = [] +# sentences = [] +# for sentence in paragraph: +# sent_id = get_sent_id(sentence) +# sentence_entity_map = entity_maps[sent_id] +# sentence_json = [] +# for seg in sentence: +# token = {} +# ctag = get_ctag(seg) +# orth = get_orth(seg) +# ne = sentence_entity_map.get(orth) +# +# text += orth + ' ' +# +# if ne is not None: +# ne = NE_njkp_to_spacy[ne] +# nes += [(len(text)-1-len(orth), len(text)-1, ne)] +# +# token['ctag'] = ctag +# token['orth'] = orth +# token['head'] = 0 # @TODO +# token['dep'] = 'NA' # @TODO +# token['id'] = token_idx +# token['ner'] = ne +# token_idx += 1 +# sentence_json += [token] +# sentences += [sentence_json] +# +# doc = nlp(text) +# entities = nes +# biluo_tags = biluo_tags_from_offsets(doc, entities) +# +# sentences = set_biluo_tags(sentences, biluo_tags) +# paragraph_json['sentences'] = [{'tokens': tok, 'brackets': []} for tok in sentences] +# paragraph_json['raw'] = pg_text +# paragraphs += [paragraph_json] +# +# doc_json['id'] = doc_id +# doc_json['paragraphs'] = paragraphs +# +# doc_id += 1 +# corpus += [doc_json] +# +# with open(os.path.expanduser(os.path.join(path_prefix, output_path, output)), 'w+') as f: +# json.dump(corpus, f) \ No newline at end of file From 06e6e4b25ec044771dae64dcc53864acfda1f691 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Ksi=C4=85=C5=BCek?= Date: Sat, 30 Mar 2019 13:50:18 +0100 Subject: [PATCH 04/14] Save training data, needs a bug fix --- ConvertNER/convert_NER_wroc.py | 59 +++++++++++++++++++++++++--------- data/.gitignore | 3 +- data/kpwr-1.1.dvc | 7 ++++ 3 files changed, 53 insertions(+), 16 deletions(-) create mode 100644 data/kpwr-1.1.dvc diff --git a/ConvertNER/convert_NER_wroc.py b/ConvertNER/convert_NER_wroc.py index ca25117..2849d5a 100644 --- 
a/ConvertNER/convert_NER_wroc.py +++ b/ConvertNER/convert_NER_wroc.py @@ -4,7 +4,7 @@ import json import os -path_prefix = '../' +path_prefix = './' corpus_path = 'data/kpwr-1.1/' output_path = 'data/NER/' output = 'NER_wroc.json' @@ -185,7 +185,7 @@ def pick_tags(tokens): return tokens if len(tokens) == 1: if tokens[0].is_NE(): - tokens[0].attribs = [token[0].attribs[0]] + tokens[0].attribs = [tokens[0].attribs[0]] return tokens t0 = tokens[0] @@ -263,19 +263,40 @@ def convert_to_biluo(tokens): return out +def get_text(tokens): + raw = "" + for token in tokens: + raw += token.orth + " " + _punct = r'… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ · । ، ؛ ٪ . ! ?' + _quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‘‘ ’’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉' + _hyphens = '- – — -- --- —— ~' + _brackets_pref = ") ] }" + _brackets_post = "( [ {" + interp_pref = _punct.split(" ") + _quotes.split(" ") + _hyphens.split(" ") + _brackets_pref.split(" ") + interp_post = _brackets_post.split(" ") + raw = raw[:-1] + for char in interp_pref: + raw = raw.replace(" "+char, char) + for char in interp_post: + raw = raw.replace(char+" ", char) + + return raw -print(get_subdirs(os.path.join(path_prefix, corpus_path))) all_labels = set() +docs = [] +doc_idx = 0 for subfolder in get_subdirs(os.path.join(path_prefix, corpus_path)): for file in os.listdir(os.path.join(path_prefix, corpus_path, subfolder)): - doc_idx = 0 if not file.endswith("rel.xml") and not file.endswith(".ini"): + doc_json = {} + sentences = [] token_idx = 0 + raw = "" tree = ET.parse(os.path.join(path_prefix, corpus_path, subfolder, file)) root = tree.getroot() sents = root.iter("sentence") @@ -295,17 +316,27 @@ def convert_to_biluo(tokens): sent = {'tokens': [{ 'orth': t.orth, 'id': t.id, - 'ner': t.get_NE()} # change to t.get_NE() + 'ner': t.get_NE()} for t in tokens ], 'brackets': [] } - print(sent) - doc_idx +=1 - - break - -print(all_labels) - + # print(sent) + # print(get_text(tokens)) + + text = get_text(tokens) + sentences += [sent] + raw += "\n"+text + + doc_json = { + 'id': doc_idx, + 'paragraphs': [{'sentences': sentences}] + } + corpus += [doc_json] + doc_idx +=1 + +# print(corpus) +with open(os.path.expanduser(os.path.join(path_prefix, output_path, output)), 'w+') as f: + json.dump(corpus, f) # for f in os.listdir(os.path.join(path_prefix, corpus_path)): # doc_json = {} # current_folder = f @@ -382,6 +413,4 @@ def convert_to_biluo(tokens): # # doc_id += 1 # corpus += [doc_json] -# -# with open(os.path.expanduser(os.path.join(path_prefix, output_path, output)), 'w+') as f: -# json.dump(corpus, f) \ No newline at end of file +# \ No newline at end of file diff --git a/data/.gitignore b/data/.gitignore index 6cefda7..e74d18a 100644 --- a/data/.gitignore +++ b/data/.gitignore @@ -10,4 +10,5 @@ vocab.jsonl /vocab.jsonl /vectors_300.txt tagmap.py -NKJP-PodkorpusMilionowy-1.2 + +/kpwr-1.1 \ No newline at end of file diff --git a/data/kpwr-1.1.dvc b/data/kpwr-1.1.dvc new file mode 100644 index 0000000..275217c --- /dev/null +++ b/data/kpwr-1.1.dvc @@ -0,0 +1,7 @@ +md5: 58cbc0bd05749d04e4b6a5e4c9d78c01 +outs: +- cache: true + md5: d84971d4b907e5efc5d9320de6691027.dir + metric: false + path: kpwr-1.1 +wdir: . 
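The next patch switches the generated tags from underscore-joined names (I_, L_, ...) to hyphen-joined ones (I-, L-, ...). That is the BILUO spelling spaCy's own gold helpers produce, so downstream tooling agrees on the tag format. A minimal sketch of that format, reusing the spacy.gold.biluo_tags_from_offsets helper and the spacy.lang.pl.Polish pipeline that convert_NER.py already imports; the sentence and the (start, end, label) offsets below are made up for illustration:

    # Sketch of the hyphenated BILUO tags spaCy emits (spaCy 2.x API, as used in convert_NER.py).
    # The example sentence and character offsets are illustrative only.
    from spacy.lang.pl import Polish
    from spacy.gold import biluo_tags_from_offsets

    nlp = Polish()
    doc = nlp("Adam mieszka w Warszawie")
    tags = biluo_tags_from_offsets(doc, [(0, 4, "PERSON"), (15, 24, "LOC")])
    print(tags)  # expected: ['U-PERSON', 'O', 'O', 'U-LOC']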
From 525a283216cd81c6a2af1e1010c76b0b50b82acc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Ksi=C4=85=C5=BCek?= Date: Sat, 30 Mar 2019 15:17:21 +0100 Subject: [PATCH 05/14] Fix BILUO --- ConvertNER/convert_NER_wroc.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ConvertNER/convert_NER_wroc.py b/ConvertNER/convert_NER_wroc.py index 2849d5a..6c8fa3e 100644 --- a/ConvertNER/convert_NER_wroc.py +++ b/ConvertNER/convert_NER_wroc.py @@ -227,10 +227,10 @@ def convert_to_biluo(tokens): if token.is_NE(): if tokens[i+1].is_NE() and token.get_NE() == tokens[i+1].get_NE(): # inner NE - out += [Token(token.orth, ["I_"+token.get_NE()], token.id)] + out += [Token(token.orth, ["I-"+token.get_NE()], token.id)] else: # last NE - out += [Token(token.orth, ["L_"+token.get_NE()], token.id)] + out += [Token(token.orth, ["L-"+token.get_NE()], token.id)] in_ne = False else: # we shouldn't ever get here @@ -241,11 +241,11 @@ def convert_to_biluo(tokens): # new NE if tokens[i+1].is_NE() and token.get_NE() == tokens[i+1].get_NE(): # beginning NE - out += [Token(token.orth, ["B_"+token.get_NE()], token.id)] + out += [Token(token.orth, ["B-"+token.get_NE()], token.id)] in_ne = True else: # unit NE - out += [Token(token.orth, ["U_"+token.get_NE()], token.id)] + out += [Token(token.orth, ["U-"+token.get_NE()], token.id)] in_ne = False else: # outside of NE @@ -254,10 +254,10 @@ def convert_to_biluo(tokens): # process last token token = tokens[-1] if in_ne: - out += [Token(token.orth, ["L_" + token.get_NE()], token.id)] + out += [Token(token.orth, ["L-" + token.get_NE()], token.id)] else: if token.is_NE(): - out += [Token(token.orth, ["U_" + token.get_NE()], token.id)] + out += [Token(token.orth, ["U-" + token.get_NE()], token.id)] else: out += [Token(token.orth, ["O"], token.id)] From 2f06b47ee64ad9df1a95bac9d5649156870b35eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Ksi=C4=85=C5=BCek?= Date: Sat, 30 Mar 2019 15:51:02 +0100 Subject: [PATCH 06/14] Add analysis --- ConvertNER/convert_NER_wroc.py | 108 +++------ NER-wroc-analysis/get_analysis.py | 365 ++++++++++++++++++++++++++++++ 2 files changed, 393 insertions(+), 80 deletions(-) create mode 100644 NER-wroc-analysis/get_analysis.py diff --git a/ConvertNER/convert_NER_wroc.py b/ConvertNER/convert_NER_wroc.py index 6c8fa3e..ecd27d6 100644 --- a/ConvertNER/convert_NER_wroc.py +++ b/ConvertNER/convert_NER_wroc.py @@ -179,6 +179,28 @@ def get_all_labels(tokens): return labels +class setCounter: + def __init__(self): + self.contents = {} + + def count(self, k, times=1): + if k in self.contents: + self.contents[k] += times + else: + self.contents[k] = times + + def merge(self, other): + for k in other.contents: + self.count(k, other.contents[k]) + +def get_all_labels_with_cardinalities(tokens): + labels = setCounter() + for tok in tokens: + for attr in tok.attribs: + labels.count(attr) + + return labels + def pick_tags(tokens): # first and last separately if len(tokens) == 0: @@ -287,7 +309,7 @@ def get_text(tokens): -all_labels = set() +all_labels = setCounter() docs = [] doc_idx = 0 for subfolder in get_subdirs(os.path.join(path_prefix, corpus_path)): @@ -309,7 +331,8 @@ def get_text(tokens): # if token.is_NE(): print(token) tokens += [token] - all_labels |= get_all_labels(tokens) + # all_labels |= get_all_labels(tokens) + all_labels.merge(get_all_labels_with_cardinalities(tokens)) tokens = pick_tags(tokens) tokens = convert_to_biluo(tokens) @@ -334,83 +357,8 @@ def get_text(tokens): corpus += [doc_json] doc_idx +=1 -# print(corpus) 
with open(os.path.expanduser(os.path.join(path_prefix, output_path, output)), 'w+') as f: json.dump(corpus, f) -# for f in os.listdir(os.path.join(path_prefix, corpus_path)): -# doc_json = {} -# current_folder = f -# -# if not os.path.isdir((os.path.join(path_prefix,corpus_path,current_folder))): -# continue -# -# if not required_files_exist(current_folder): -# # doc_id +=1 ? -# continue -# -# tree_morphosyntax = ET.parse(os.path.join(path_prefix,corpus_path,current_folder,morphosyntax_xml)) -# root_morphosyntax = tree_morphosyntax.getroot() -# -# tree_named = ET.parse(os.path.join(path_prefix,corpus_path,current_folder,named_xml)) -# root_named = tree_named.getroot() -# -# tree_text = ET.parse(os.path.join(path_prefix,corpus_path,current_folder,text_xml)) -# root_text = tree_text.getroot() -# -# tree_segmentation = ET.parse(os.path.join(path_prefix,corpus_path,current_folder,segmentation_xml)) -# root_segmentation = tree_segmentation.getroot() -# -# segmentation_text_map = get_segmentation_text_maps(root_segmentation) -# entity_maps = get_entity_maps(root_named) -# text_maps = get_text_maps(root_text) -# -# token_idx = 0 -# paragraphs = [] -# for paragraph in root_morphosyntax.iter('{http://www.tei-c.org/ns/1.0}p'): -# paragraph_json = {} -# pg_text = get_paragraph_text(paragraph, segmentation_text_map, text_maps) -# -# text = '' -# nes = [] -# sentences = [] -# for sentence in paragraph: -# sent_id = get_sent_id(sentence) -# sentence_entity_map = entity_maps[sent_id] -# sentence_json = [] -# for seg in sentence: -# token = {} -# ctag = get_ctag(seg) -# orth = get_orth(seg) -# ne = sentence_entity_map.get(orth) -# -# text += orth + ' ' -# -# if ne is not None: -# ne = NE_njkp_to_spacy[ne] -# nes += [(len(text)-1-len(orth), len(text)-1, ne)] -# -# token['ctag'] = ctag -# token['orth'] = orth -# token['head'] = 0 # @TODO -# token['dep'] = 'NA' # @TODO -# token['id'] = token_idx -# token['ner'] = ne -# token_idx += 1 -# sentence_json += [token] -# sentences += [sentence_json] -# -# doc = nlp(text) -# entities = nes -# biluo_tags = biluo_tags_from_offsets(doc, entities) -# -# sentences = set_biluo_tags(sentences, biluo_tags) -# paragraph_json['sentences'] = [{'tokens': tok, 'brackets': []} for tok in sentences] -# paragraph_json['raw'] = pg_text -# paragraphs += [paragraph_json] -# -# doc_json['id'] = doc_id -# doc_json['paragraphs'] = paragraphs -# -# doc_id += 1 -# corpus += [doc_json] -# \ No newline at end of file + +# with open(os.path.expanduser(os.path.join(path_prefix, output_path, "analysis.json")), 'w+') as f: +# json.dump(all_labels.contents, f) \ No newline at end of file diff --git a/NER-wroc-analysis/get_analysis.py b/NER-wroc-analysis/get_analysis.py new file mode 100644 index 0000000..ea6972c --- /dev/null +++ b/NER-wroc-analysis/get_analysis.py @@ -0,0 +1,365 @@ +import xml.etree.ElementTree as ET +from spacy.gold import biluo_tags_from_offsets +# import spacy +import json +import os + +path_prefix = './' +corpus_path = 'data/kpwr-1.1/' +output_path = 'data/NER/' +output = 'NER_wroc.json' + +def get_subdirs(dir): + return [name for name in os.listdir(dir) if os.path.isdir(os.path.join(dir,name))] + +morphosyntax_xml = 'ann_morphosyntax.xml' +groups_xml = 'ann_groups.xml' +named_xml = 'ann_named.xml' +senses_xml = 'ann_senses.xml' +header_xml = 'header.xml' +segmentation_xml = 'ann_segmentation.xml' +words_xml = 'ann_words.xml' +text_xml = 'text.xml' + +def print_children_recursively(n, i=0): + if i > 10: + return + for c in n: + print(' '*(3*i), c.attrib, c.tag) + 
print_children_recursively(c, i+1) + +def get(node, k, v): + if node is None: + return + for c in node: + if c.attrib.get(k)==v: + return c + +def get_morph(seg): + for c in seg: + if c.attrib['type']=='morph': + return c + +def get_orth(seg): + morph = get(seg, 'type', 'morph') + orth = get(morph, 'name', 'orth') + return orth[0].text if orth is not None else None + +def get_named(seg): + named = get(seg, 'type', 'named') + orth = get(named, 'name', 'orth') + return orth[0].text if orth is not None else None + +def get_named_type(seg): + named = get(seg, 'type', 'named') + type = get(named, 'name', 'type') + return type[0].attrib['value'] + +def get_ctag(seg): + morph = get(seg, 'type', 'morph') + interps = get(morph, 'name', 'interps') + lex = get(interps, 'type', 'lex') + ctag = get(lex, 'name', 'ctag') + return ctag[0].attrib['value'] + +def get_corresp_morph(sent): + return sent.attrib['corresp'].split('#')[1] + +def get_entity_maps(root): + result = {} + for sent in root.iter('{http://www.tei-c.org/ns/1.0}s'): + tmp = [] + for seg in sent: + text = get_named(seg) + type = get_named_type(seg) + tmp += [(text, type)] + + result[get_corresp_morph(sent)] = dict(tmp) + + return result + +def get_segmentation_text_maps(root): + res = {} + for paragraph in root.iter('{http://www.tei-c.org/ns/1.0}p'): + key = paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + value = paragraph.attrib['corresp'].split('#')[1] + res[key]=value + + return res + +def get_text_maps(root): + result = {} + for paragraph in root.iter('{http://www.tei-c.org/ns/1.0}div'): + key = paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] + text = '' + for child in paragraph: + text += child.text + + result[key]=text + + return result + +def get_sent_id(sent): + return sent.attrib['{http://www.w3.org/XML/1998/namespace}id'] + +def get_paragraph_text(paragraph, segm_text_map, text_maps): + paragraph_id = paragraph.attrib['corresp'].split('#')[1] + return text_maps[segm_text_map[paragraph_id]] + +def set_biluo_tags(sentences, tags): + i = 0 + for sent in sentences: + for token in sent: + token['ner'] = tags[i] + i += 1 + + return sentences + +def required_files_exist(dir): + required_files = [segmentation_xml, text_xml, named_xml, morphosyntax_xml] + for file in required_files: + if not os.path.isfile(os.path.join(path_prefix,corpus_path,dir,file)): + return False + + return True + +# nlp = spacy.load('en_core_web_sm') +doc_id = 0 +corpus = [] + +NE_njkp_to_spacy = {'persName': 'PERSON', + 'placeName': 'LOC', + 'orgName': 'ORG', + 'date': 'DATE', + 'time': 'TIME', + 'geogName': 'LOC'} + +class Token: + # def __init__(self, orth, attribs): + # self.orth = orth + # self.attribs = attribs + # self.id = None #this is fugly + + + def __init__(self, orth, attribs, id): + self.orth = orth + self.attribs = attribs + self.id = id + + def is_NE(self): + return len(self.attribs) != 0 + + def get_NE(self): + return self.attribs[0] if len(self.attribs) > 0 else "" + + def __str__(self): + return (self.orth + ":" + str(self.attribs)) + + +def process_token(tok): + attribs = [] + orth = tok.find("orth").text + for ann in tok.iter("ann"): + if ann.attrib['chan'].endswith("nam") and ann.text=="1": + attribs += [ann.attrib['chan']] + + return Token(orth, attribs, -1) + +def get_common_tag(t1, t2): + set1 = set(t1.attribs) + set2 = set(t2.attribs) + common = list(set1 & set2) + return common[0] if len(common) > 0 else None + +def get_all_labels(tokens): + labels = set() + for tok in tokens: + for attr in tok.attribs: + 
labels.add(attr) + + return labels + +class setCounter: + def __init__(self): + self.contents = {} + + def count(self, k, times=1): + if k in self.contents: + self.contents[k] += times + else: + self.contents[k] = times + + def merge(self, other): + for k in other.contents: + self.count(k, other.contents[k]) + +def get_all_labels_with_cardinalities(tokens): + labels = setCounter() + for tok in tokens: + for attr in tok.attribs: + labels.count(attr) + + return labels + +def pick_tags(tokens): + # first and last separately + if len(tokens) == 0: + return tokens + if len(tokens) == 1: + if tokens[0].is_NE(): + tokens[0].attribs = [tokens[0].attribs[0]] + return tokens + + t0 = tokens[0] + if len(t0.attribs) > 1: + new_tag = get_common_tag(t0, tokens[1]) + if new_tag is None: + t0.attribs = [t0.attribs[0]] + else: + t0.attribs = [new_tag] + + for i in range(1, len(tokens)-1): + if len(tokens[i].attribs) > 1: + new_tag = get_common_tag(tokens[i-1], tokens[i]) + if new_tag is None: + new_tag = get_common_tag(tokens[i], tokens[i+1]) + if new_tag is None: + tokens[i].attribs = [tokens[i].attribs[0]] + else: + tokens[i].attribs = [new_tag] + else: + tokens[i].attribs = [new_tag] + + te = tokens[-1] + if len(te.attribs) > 1: + new_tag = get_common_tag(te, tokens[-2]) + if new_tag is None: + te.attribs = [te.attribs[0]] + else: + te.attribs = [new_tag] + + assert(all(len(t.attribs)<=1 for t in [t0] + tokens+ [te])) + return [t0] + tokens[1:-2] + [te] + +def convert_to_biluo(tokens): + out = [] + in_ne = False + for i, token in enumerate(tokens[:-1]): + if in_ne: + if token.is_NE(): + if tokens[i+1].is_NE() and token.get_NE() == tokens[i+1].get_NE(): + # inner NE + out += [Token(token.orth, ["I-"+token.get_NE()], token.id)] + else: + # last NE + out += [Token(token.orth, ["L-"+token.get_NE()], token.id)] + in_ne = False + else: + # we shouldn't ever get here + assert(False) + + else: + if token.is_NE(): + # new NE + if tokens[i+1].is_NE() and token.get_NE() == tokens[i+1].get_NE(): + # beginning NE + out += [Token(token.orth, ["B-"+token.get_NE()], token.id)] + in_ne = True + else: + # unit NE + out += [Token(token.orth, ["U-"+token.get_NE()], token.id)] + in_ne = False + else: + # outside of NE + out += [Token(token.orth, ["O"], token.id)] + + # process last token + token = tokens[-1] + if in_ne: + out += [Token(token.orth, ["L-" + token.get_NE()], token.id)] + else: + if token.is_NE(): + out += [Token(token.orth, ["U-" + token.get_NE()], token.id)] + else: + out += [Token(token.orth, ["O"], token.id)] + + return out + +def get_text(tokens): + raw = "" + for token in tokens: + raw += token.orth + " " + + _punct = r'… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ · । ، ؛ ٪ . ! ?' 
+ _quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‘‘ ’’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉' + _hyphens = '- – — -- --- —— ~' + _brackets_pref = ") ] }" + _brackets_post = "( [ {" + + interp_pref = _punct.split(" ") + _quotes.split(" ") + _hyphens.split(" ") + _brackets_pref.split(" ") + interp_post = _brackets_post.split(" ") + raw = raw[:-1] + for char in interp_pref: + raw = raw.replace(" "+char, char) + + for char in interp_post: + raw = raw.replace(char+" ", char) + + return raw + + + +all_labels = setCounter() +docs = [] +doc_idx = 0 +for subfolder in get_subdirs(os.path.join(path_prefix, corpus_path)): + for file in os.listdir(os.path.join(path_prefix, corpus_path, subfolder)): + if not file.endswith("rel.xml") and not file.endswith(".ini"): + doc_json = {} + sentences = [] + token_idx = 0 + raw = "" + tree = ET.parse(os.path.join(path_prefix, corpus_path, subfolder, file)) + root = tree.getroot() + sents = root.iter("sentence") + for sent in sents: + tokens = [] + for tok in sent.iter("tok"): + token = process_token(tok) + token.id = token_idx + token_idx += 1 + # if token.is_NE(): print(token) + tokens += [token] + + # all_labels |= get_all_labels(tokens) + all_labels.merge(get_all_labels_with_cardinalities(tokens)) + tokens = pick_tags(tokens) + tokens = convert_to_biluo(tokens) + + sent = {'tokens': [{ + 'orth': t.orth, + 'id': t.id, + 'ner': t.get_NE()} + for t in tokens + ], 'brackets': [] + } + # print(sent) + # print(get_text(tokens)) + + text = get_text(tokens) + sentences += [sent] + raw += "\n"+text + + doc_json = { + 'id': doc_idx, + 'paragraphs': [{'sentences': sentences}] + } + corpus += [doc_json] + doc_idx +=1 + +# print(corpus) +# with open(os.path.expanduser(os.path.join(path_prefix, output_path, output)), 'w+') as f: +# json.dump(corpus, f) + +with open(os.path.expanduser(os.path.join(path_prefix, output_path, "analysis.json")), 'w+') as f: + json.dump(all_labels.contents, f) \ No newline at end of file From 9d3224be4e7f0b765c2a4bc4eadeb8d85690f787 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Ksi=C4=85=C5=BCek?= Date: Wed, 3 Apr 2019 21:22:31 +0200 Subject: [PATCH 07/14] Remove analysis (moved to another branch) --- NER-wroc-analysis/get_analysis.py | 365 ------------------------------ 1 file changed, 365 deletions(-) delete mode 100644 NER-wroc-analysis/get_analysis.py diff --git a/NER-wroc-analysis/get_analysis.py b/NER-wroc-analysis/get_analysis.py deleted file mode 100644 index ea6972c..0000000 --- a/NER-wroc-analysis/get_analysis.py +++ /dev/null @@ -1,365 +0,0 @@ -import xml.etree.ElementTree as ET -from spacy.gold import biluo_tags_from_offsets -# import spacy -import json -import os - -path_prefix = './' -corpus_path = 'data/kpwr-1.1/' -output_path = 'data/NER/' -output = 'NER_wroc.json' - -def get_subdirs(dir): - return [name for name in os.listdir(dir) if os.path.isdir(os.path.join(dir,name))] - -morphosyntax_xml = 'ann_morphosyntax.xml' -groups_xml = 'ann_groups.xml' -named_xml = 'ann_named.xml' -senses_xml = 'ann_senses.xml' -header_xml = 'header.xml' -segmentation_xml = 'ann_segmentation.xml' -words_xml = 'ann_words.xml' -text_xml = 'text.xml' - -def print_children_recursively(n, i=0): - if i > 10: - return - for c in n: - print(' '*(3*i), c.attrib, c.tag) - print_children_recursively(c, i+1) - -def get(node, k, v): - if node is None: - return - for c in node: - if c.attrib.get(k)==v: - return c - -def get_morph(seg): - for c in seg: - if c.attrib['type']=='morph': - return c - -def get_orth(seg): - morph = get(seg, 'type', 'morph') - orth = get(morph, 
'name', 'orth') - return orth[0].text if orth is not None else None - -def get_named(seg): - named = get(seg, 'type', 'named') - orth = get(named, 'name', 'orth') - return orth[0].text if orth is not None else None - -def get_named_type(seg): - named = get(seg, 'type', 'named') - type = get(named, 'name', 'type') - return type[0].attrib['value'] - -def get_ctag(seg): - morph = get(seg, 'type', 'morph') - interps = get(morph, 'name', 'interps') - lex = get(interps, 'type', 'lex') - ctag = get(lex, 'name', 'ctag') - return ctag[0].attrib['value'] - -def get_corresp_morph(sent): - return sent.attrib['corresp'].split('#')[1] - -def get_entity_maps(root): - result = {} - for sent in root.iter('{http://www.tei-c.org/ns/1.0}s'): - tmp = [] - for seg in sent: - text = get_named(seg) - type = get_named_type(seg) - tmp += [(text, type)] - - result[get_corresp_morph(sent)] = dict(tmp) - - return result - -def get_segmentation_text_maps(root): - res = {} - for paragraph in root.iter('{http://www.tei-c.org/ns/1.0}p'): - key = paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] - value = paragraph.attrib['corresp'].split('#')[1] - res[key]=value - - return res - -def get_text_maps(root): - result = {} - for paragraph in root.iter('{http://www.tei-c.org/ns/1.0}div'): - key = paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] - text = '' - for child in paragraph: - text += child.text - - result[key]=text - - return result - -def get_sent_id(sent): - return sent.attrib['{http://www.w3.org/XML/1998/namespace}id'] - -def get_paragraph_text(paragraph, segm_text_map, text_maps): - paragraph_id = paragraph.attrib['corresp'].split('#')[1] - return text_maps[segm_text_map[paragraph_id]] - -def set_biluo_tags(sentences, tags): - i = 0 - for sent in sentences: - for token in sent: - token['ner'] = tags[i] - i += 1 - - return sentences - -def required_files_exist(dir): - required_files = [segmentation_xml, text_xml, named_xml, morphosyntax_xml] - for file in required_files: - if not os.path.isfile(os.path.join(path_prefix,corpus_path,dir,file)): - return False - - return True - -# nlp = spacy.load('en_core_web_sm') -doc_id = 0 -corpus = [] - -NE_njkp_to_spacy = {'persName': 'PERSON', - 'placeName': 'LOC', - 'orgName': 'ORG', - 'date': 'DATE', - 'time': 'TIME', - 'geogName': 'LOC'} - -class Token: - # def __init__(self, orth, attribs): - # self.orth = orth - # self.attribs = attribs - # self.id = None #this is fugly - - - def __init__(self, orth, attribs, id): - self.orth = orth - self.attribs = attribs - self.id = id - - def is_NE(self): - return len(self.attribs) != 0 - - def get_NE(self): - return self.attribs[0] if len(self.attribs) > 0 else "" - - def __str__(self): - return (self.orth + ":" + str(self.attribs)) - - -def process_token(tok): - attribs = [] - orth = tok.find("orth").text - for ann in tok.iter("ann"): - if ann.attrib['chan'].endswith("nam") and ann.text=="1": - attribs += [ann.attrib['chan']] - - return Token(orth, attribs, -1) - -def get_common_tag(t1, t2): - set1 = set(t1.attribs) - set2 = set(t2.attribs) - common = list(set1 & set2) - return common[0] if len(common) > 0 else None - -def get_all_labels(tokens): - labels = set() - for tok in tokens: - for attr in tok.attribs: - labels.add(attr) - - return labels - -class setCounter: - def __init__(self): - self.contents = {} - - def count(self, k, times=1): - if k in self.contents: - self.contents[k] += times - else: - self.contents[k] = times - - def merge(self, other): - for k in other.contents: - self.count(k, 
other.contents[k]) - -def get_all_labels_with_cardinalities(tokens): - labels = setCounter() - for tok in tokens: - for attr in tok.attribs: - labels.count(attr) - - return labels - -def pick_tags(tokens): - # first and last separately - if len(tokens) == 0: - return tokens - if len(tokens) == 1: - if tokens[0].is_NE(): - tokens[0].attribs = [tokens[0].attribs[0]] - return tokens - - t0 = tokens[0] - if len(t0.attribs) > 1: - new_tag = get_common_tag(t0, tokens[1]) - if new_tag is None: - t0.attribs = [t0.attribs[0]] - else: - t0.attribs = [new_tag] - - for i in range(1, len(tokens)-1): - if len(tokens[i].attribs) > 1: - new_tag = get_common_tag(tokens[i-1], tokens[i]) - if new_tag is None: - new_tag = get_common_tag(tokens[i], tokens[i+1]) - if new_tag is None: - tokens[i].attribs = [tokens[i].attribs[0]] - else: - tokens[i].attribs = [new_tag] - else: - tokens[i].attribs = [new_tag] - - te = tokens[-1] - if len(te.attribs) > 1: - new_tag = get_common_tag(te, tokens[-2]) - if new_tag is None: - te.attribs = [te.attribs[0]] - else: - te.attribs = [new_tag] - - assert(all(len(t.attribs)<=1 for t in [t0] + tokens+ [te])) - return [t0] + tokens[1:-2] + [te] - -def convert_to_biluo(tokens): - out = [] - in_ne = False - for i, token in enumerate(tokens[:-1]): - if in_ne: - if token.is_NE(): - if tokens[i+1].is_NE() and token.get_NE() == tokens[i+1].get_NE(): - # inner NE - out += [Token(token.orth, ["I-"+token.get_NE()], token.id)] - else: - # last NE - out += [Token(token.orth, ["L-"+token.get_NE()], token.id)] - in_ne = False - else: - # we shouldn't ever get here - assert(False) - - else: - if token.is_NE(): - # new NE - if tokens[i+1].is_NE() and token.get_NE() == tokens[i+1].get_NE(): - # beginning NE - out += [Token(token.orth, ["B-"+token.get_NE()], token.id)] - in_ne = True - else: - # unit NE - out += [Token(token.orth, ["U-"+token.get_NE()], token.id)] - in_ne = False - else: - # outside of NE - out += [Token(token.orth, ["O"], token.id)] - - # process last token - token = tokens[-1] - if in_ne: - out += [Token(token.orth, ["L-" + token.get_NE()], token.id)] - else: - if token.is_NE(): - out += [Token(token.orth, ["U-" + token.get_NE()], token.id)] - else: - out += [Token(token.orth, ["O"], token.id)] - - return out - -def get_text(tokens): - raw = "" - for token in tokens: - raw += token.orth + " " - - _punct = r'… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ · । ، ؛ ٪ . ! ?' 
- _quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‘‘ ’’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉' - _hyphens = '- – — -- --- —— ~' - _brackets_pref = ") ] }" - _brackets_post = "( [ {" - - interp_pref = _punct.split(" ") + _quotes.split(" ") + _hyphens.split(" ") + _brackets_pref.split(" ") - interp_post = _brackets_post.split(" ") - raw = raw[:-1] - for char in interp_pref: - raw = raw.replace(" "+char, char) - - for char in interp_post: - raw = raw.replace(char+" ", char) - - return raw - - - -all_labels = setCounter() -docs = [] -doc_idx = 0 -for subfolder in get_subdirs(os.path.join(path_prefix, corpus_path)): - for file in os.listdir(os.path.join(path_prefix, corpus_path, subfolder)): - if not file.endswith("rel.xml") and not file.endswith(".ini"): - doc_json = {} - sentences = [] - token_idx = 0 - raw = "" - tree = ET.parse(os.path.join(path_prefix, corpus_path, subfolder, file)) - root = tree.getroot() - sents = root.iter("sentence") - for sent in sents: - tokens = [] - for tok in sent.iter("tok"): - token = process_token(tok) - token.id = token_idx - token_idx += 1 - # if token.is_NE(): print(token) - tokens += [token] - - # all_labels |= get_all_labels(tokens) - all_labels.merge(get_all_labels_with_cardinalities(tokens)) - tokens = pick_tags(tokens) - tokens = convert_to_biluo(tokens) - - sent = {'tokens': [{ - 'orth': t.orth, - 'id': t.id, - 'ner': t.get_NE()} - for t in tokens - ], 'brackets': [] - } - # print(sent) - # print(get_text(tokens)) - - text = get_text(tokens) - sentences += [sent] - raw += "\n"+text - - doc_json = { - 'id': doc_idx, - 'paragraphs': [{'sentences': sentences}] - } - corpus += [doc_json] - doc_idx +=1 - -# print(corpus) -# with open(os.path.expanduser(os.path.join(path_prefix, output_path, output)), 'w+') as f: -# json.dump(corpus, f) - -with open(os.path.expanduser(os.path.join(path_prefix, output_path, "analysis.json")), 'w+') as f: - json.dump(all_labels.contents, f) \ No newline at end of file From 59491bb082fe4b50bb3ed24960ad4a0a04109ef1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Ksi=C4=85=C5=BCek?= Date: Wed, 3 Apr 2019 21:32:01 +0200 Subject: [PATCH 08/14] Remove commented out code --- ConvertNER/convert_NER_wroc.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/ConvertNER/convert_NER_wroc.py b/ConvertNER/convert_NER_wroc.py index ecd27d6..194f229 100644 --- a/ConvertNER/convert_NER_wroc.py +++ b/ConvertNER/convert_NER_wroc.py @@ -135,12 +135,6 @@ def required_files_exist(dir): 'geogName': 'LOC'} class Token: - # def __init__(self, orth, attribs): - # self.orth = orth - # self.attribs = attribs - # self.id = None #this is fugly - - def __init__(self, orth, attribs, id): self.orth = orth self.attribs = attribs @@ -328,10 +322,8 @@ def get_text(tokens): token = process_token(tok) token.id = token_idx token_idx += 1 - # if token.is_NE(): print(token) tokens += [token] - # all_labels |= get_all_labels(tokens) all_labels.merge(get_all_labels_with_cardinalities(tokens)) tokens = pick_tags(tokens) tokens = convert_to_biluo(tokens) @@ -343,8 +335,6 @@ def get_text(tokens): for t in tokens ], 'brackets': [] } - # print(sent) - # print(get_text(tokens)) text = get_text(tokens) sentences += [sent] @@ -358,7 +348,4 @@ def get_text(tokens): doc_idx +=1 with open(os.path.expanduser(os.path.join(path_prefix, output_path, output)), 'w+') as f: - json.dump(corpus, f) - -# with open(os.path.expanduser(os.path.join(path_prefix, output_path, "analysis.json")), 'w+') as f: -# json.dump(all_labels.contents, f) \ No newline at end of file + 
json.dump(corpus, f) \ No newline at end of file From b7cd6103e8c9de87d8ff386c56111e1053a017ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Ksi=C4=85=C5=BCek?= Date: Wed, 3 Apr 2019 22:17:34 +0200 Subject: [PATCH 09/14] this field has non-binary values (it can even be 2, or -) --- ConvertNER/convert_NER_wroc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ConvertNER/convert_NER_wroc.py b/ConvertNER/convert_NER_wroc.py index 194f229..e762599 100644 --- a/ConvertNER/convert_NER_wroc.py +++ b/ConvertNER/convert_NER_wroc.py @@ -154,7 +154,7 @@ def process_token(tok): attribs = [] orth = tok.find("orth").text for ann in tok.iter("ann"): - if ann.attrib['chan'].endswith("nam") and ann.text=="1": + if ann.attrib['chan'].endswith("nam") and ann.text!="0": attribs += [ann.attrib['chan']] return Token(orth, attribs, -1) From 70785b21e23fc961a65a55308d95b5bec04864cc Mon Sep 17 00:00:00 2001 From: Ksiazek Date: Wed, 17 Apr 2019 21:43:05 +0200 Subject: [PATCH 10/14] Code cleanup --- ConvertNER/convert_NER_wroc.py | 200 +++++--------------------------- data/lemmatizer_data/.gitignore | 3 +- lemma_sources_exp.dvc | 14 +++ 3 files changed, 43 insertions(+), 174 deletions(-) create mode 100644 lemma_sources_exp.dvc diff --git a/ConvertNER/convert_NER_wroc.py b/ConvertNER/convert_NER_wroc.py index e762599..20abd3d 100644 --- a/ConvertNER/convert_NER_wroc.py +++ b/ConvertNER/convert_NER_wroc.py @@ -1,6 +1,4 @@ import xml.etree.ElementTree as ET -from spacy.gold import biluo_tags_from_offsets -# import spacy import json import os @@ -9,130 +7,16 @@ output_path = 'data/NER/' output = 'NER_wroc.json' -def get_subdirs(dir): - return [name for name in os.listdir(dir) if os.path.isdir(os.path.join(dir,name))] - -morphosyntax_xml = 'ann_morphosyntax.xml' -groups_xml = 'ann_groups.xml' -named_xml = 'ann_named.xml' -senses_xml = 'ann_senses.xml' -header_xml = 'header.xml' -segmentation_xml = 'ann_segmentation.xml' -words_xml = 'ann_words.xml' -text_xml = 'text.xml' - -def print_children_recursively(n, i=0): - if i > 10: - return - for c in n: - print(' '*(3*i), c.attrib, c.tag) - print_children_recursively(c, i+1) - -def get(node, k, v): - if node is None: - return - for c in node: - if c.attrib.get(k)==v: - return c - -def get_morph(seg): - for c in seg: - if c.attrib['type']=='morph': - return c - -def get_orth(seg): - morph = get(seg, 'type', 'morph') - orth = get(morph, 'name', 'orth') - return orth[0].text if orth is not None else None - -def get_named(seg): - named = get(seg, 'type', 'named') - orth = get(named, 'name', 'orth') - return orth[0].text if orth is not None else None - -def get_named_type(seg): - named = get(seg, 'type', 'named') - type = get(named, 'name', 'type') - return type[0].attrib['value'] - -def get_ctag(seg): - morph = get(seg, 'type', 'morph') - interps = get(morph, 'name', 'interps') - lex = get(interps, 'type', 'lex') - ctag = get(lex, 'name', 'ctag') - return ctag[0].attrib['value'] - -def get_corresp_morph(sent): - return sent.attrib['corresp'].split('#')[1] - -def get_entity_maps(root): - result = {} - for sent in root.iter('{http://www.tei-c.org/ns/1.0}s'): - tmp = [] - for seg in sent: - text = get_named(seg) - type = get_named_type(seg) - tmp += [(text, type)] - - result[get_corresp_morph(sent)] = dict(tmp) - - return result - -def get_segmentation_text_maps(root): - res = {} - for paragraph in root.iter('{http://www.tei-c.org/ns/1.0}p'): - key = paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] - value = 
paragraph.attrib['corresp'].split('#')[1] - res[key]=value - - return res - -def get_text_maps(root): - result = {} - for paragraph in root.iter('{http://www.tei-c.org/ns/1.0}div'): - key = paragraph.attrib['{http://www.w3.org/XML/1998/namespace}id'] - text = '' - for child in paragraph: - text += child.text - - result[key]=text - - return result - -def get_sent_id(sent): - return sent.attrib['{http://www.w3.org/XML/1998/namespace}id'] - -def get_paragraph_text(paragraph, segm_text_map, text_maps): - paragraph_id = paragraph.attrib['corresp'].split('#')[1] - return text_maps[segm_text_map[paragraph_id]] - -def set_biluo_tags(sentences, tags): - i = 0 - for sent in sentences: - for token in sent: - token['ner'] = tags[i] - i += 1 - - return sentences - -def required_files_exist(dir): - required_files = [segmentation_xml, text_xml, named_xml, morphosyntax_xml] - for file in required_files: - if not os.path.isfile(os.path.join(path_prefix,corpus_path,dir,file)): - return False - - return True - -# nlp = spacy.load('en_core_web_sm') doc_id = 0 corpus = [] NE_njkp_to_spacy = {'persName': 'PERSON', - 'placeName': 'LOC', - 'orgName': 'ORG', - 'date': 'DATE', - 'time': 'TIME', - 'geogName': 'LOC'} + 'placeName': 'LOC', + 'orgName': 'ORG', + 'date': 'DATE', + 'time': 'TIME', + 'geogName': 'LOC'} + class Token: def __init__(self, orth, attribs, id): @@ -154,46 +38,18 @@ def process_token(tok): attribs = [] orth = tok.find("orth").text for ann in tok.iter("ann"): - if ann.attrib['chan'].endswith("nam") and ann.text!="0": + if ann.attrib['chan'].endswith("nam") and ann.text != "0": attribs += [ann.attrib['chan']] return Token(orth, attribs, -1) + def get_common_tag(t1, t2): set1 = set(t1.attribs) set2 = set(t2.attribs) common = list(set1 & set2) return common[0] if len(common) > 0 else None -def get_all_labels(tokens): - labels = set() - for tok in tokens: - for attr in tok.attribs: - labels.add(attr) - - return labels - -class setCounter: - def __init__(self): - self.contents = {} - - def count(self, k, times=1): - if k in self.contents: - self.contents[k] += times - else: - self.contents[k] = times - - def merge(self, other): - for k in other.contents: - self.count(k, other.contents[k]) - -def get_all_labels_with_cardinalities(tokens): - labels = setCounter() - for tok in tokens: - for attr in tok.attribs: - labels.count(attr) - - return labels def pick_tags(tokens): # first and last separately @@ -212,11 +68,11 @@ def pick_tags(tokens): else: t0.attribs = [new_tag] - for i in range(1, len(tokens)-1): + for i in range(1, len(tokens) - 1): if len(tokens[i].attribs) > 1: - new_tag = get_common_tag(tokens[i-1], tokens[i]) + new_tag = get_common_tag(tokens[i - 1], tokens[i]) if new_tag is None: - new_tag = get_common_tag(tokens[i], tokens[i+1]) + new_tag = get_common_tag(tokens[i], tokens[i + 1]) if new_tag is None: tokens[i].attribs = [tokens[i].attribs[0]] else: @@ -232,36 +88,37 @@ def pick_tags(tokens): else: te.attribs = [new_tag] - assert(all(len(t.attribs)<=1 for t in [t0] + tokens+ [te])) + assert (all(len(t.attribs) <= 1 for t in [t0] + tokens + [te])) return [t0] + tokens[1:-2] + [te] + def convert_to_biluo(tokens): out = [] in_ne = False for i, token in enumerate(tokens[:-1]): if in_ne: if token.is_NE(): - if tokens[i+1].is_NE() and token.get_NE() == tokens[i+1].get_NE(): + if tokens[i + 1].is_NE() and token.get_NE() == tokens[i + 1].get_NE(): # inner NE - out += [Token(token.orth, ["I-"+token.get_NE()], token.id)] + out += [Token(token.orth, ["I-" + token.get_NE()], token.id)] else: # last 
NE - out += [Token(token.orth, ["L-"+token.get_NE()], token.id)] + out += [Token(token.orth, ["L-" + token.get_NE()], token.id)] in_ne = False else: # we shouldn't ever get here - assert(False) + assert (False) else: if token.is_NE(): # new NE - if tokens[i+1].is_NE() and token.get_NE() == tokens[i+1].get_NE(): + if tokens[i + 1].is_NE() and token.get_NE() == tokens[i + 1].get_NE(): # beginning NE - out += [Token(token.orth, ["B-"+token.get_NE()], token.id)] + out += [Token(token.orth, ["B-" + token.get_NE()], token.id)] in_ne = True else: # unit NE - out += [Token(token.orth, ["U-"+token.get_NE()], token.id)] + out += [Token(token.orth, ["U-" + token.get_NE()], token.id)] in_ne = False else: # outside of NE @@ -279,6 +136,7 @@ def convert_to_biluo(tokens): return out + def get_text(tokens): raw = "" for token in tokens: @@ -287,23 +145,20 @@ def get_text(tokens): _punct = r'… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ · । ، ؛ ٪ . ! ?' _quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‘‘ ’’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉' _hyphens = '- – — -- --- —— ~' - _brackets_pref = ") ] }" + _brackets_pref = ") ] }" _brackets_post = "( [ {" interp_pref = _punct.split(" ") + _quotes.split(" ") + _hyphens.split(" ") + _brackets_pref.split(" ") interp_post = _brackets_post.split(" ") - raw = raw[:-1] for char in interp_pref: - raw = raw.replace(" "+char, char) + raw = raw.replace(" " + char, char) for char in interp_post: - raw = raw.replace(char+" ", char) + raw = raw.replace(char + " ", char) return raw - -all_labels = setCounter() docs = [] doc_idx = 0 for subfolder in get_subdirs(os.path.join(path_prefix, corpus_path)): @@ -324,7 +179,6 @@ def get_text(tokens): token_idx += 1 tokens += [token] - all_labels.merge(get_all_labels_with_cardinalities(tokens)) tokens = pick_tags(tokens) tokens = convert_to_biluo(tokens) @@ -338,14 +192,14 @@ def get_text(tokens): text = get_text(tokens) sentences += [sent] - raw += "\n"+text + raw += "\n" + text doc_json = { 'id': doc_idx, 'paragraphs': [{'sentences': sentences}] } corpus += [doc_json] - doc_idx +=1 + doc_idx += 1 with open(os.path.expanduser(os.path.join(path_prefix, output_path, output)), 'w+') as f: - json.dump(corpus, f) \ No newline at end of file + json.dump(corpus, f) diff --git a/data/lemmatizer_data/.gitignore b/data/lemmatizer_data/.gitignore index a6cd652..cdafee2 100644 --- a/data/lemmatizer_data/.gitignore +++ b/data/lemmatizer_data/.gitignore @@ -4,4 +4,5 @@ /rules.json /lemma_sources -/sjp_ispell.tar.bz2 \ No newline at end of file +/sjp_ispell.tar.bz2 +/lemma_sources_exp \ No newline at end of file diff --git a/lemma_sources_exp.dvc b/lemma_sources_exp.dvc new file mode 100644 index 0000000..76ca409 --- /dev/null +++ b/lemma_sources_exp.dvc @@ -0,0 +1,14 @@ +cmd: python lemma_rules_extraction/yield_all_suffixes.py +deps: +- md5: 947b48802b53bdff2ad02122c04063e5.dir + path: data/lemmatizer_data/lemma_sources +- md5: abb151c19621000ccc91d8e4b494674f + path: lemma_rules_extraction/yield_all_suffixes.py +md5: 53ab70bcbc27640766d00b4977b4733d +outs: +- cache: true + md5: 4f29377d4c9dd5c9997c74750af31321.dir + metric: false + path: data/lemmatizer_data/lemma_sources_exp + persist: false +wdir: . 
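At this point the converter writes a single JSON list of documents, each document holding 'id' and 'paragraphs', every paragraph a list of 'sentences', and every sentence a list of 'tokens' carrying 'orth', 'id' and a BILUO 'ner' tag. A quick way to sanity-check the generated file is to load it and count the tags; the sketch below assumes the script's default output location, data/NER/NER_wroc.json, relative to the repository root:

    # Sanity-check sketch: count the BILUO 'ner' tags in the generated training file.
    # The path assumes the defaults set at the top of convert_NER_wroc.py.
    import json
    from collections import Counter

    with open("data/NER/NER_wroc.json") as f:
        corpus = json.load(f)

    tag_counts = Counter(
        token["ner"]
        for doc in corpus
        for paragraph in doc["paragraphs"]
        for sentence in paragraph["sentences"]
        for token in sentence["tokens"]
    )
    print(tag_counts.most_common(10))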
From 1b371b2e027ac2517d18f9fb7aa3343e6c3fb297 Mon Sep 17 00:00:00 2001 From: Ksiazek Date: Wed, 17 Apr 2019 23:14:03 +0200 Subject: [PATCH 11/14] Auto stash before rebase of "master" --- data/.gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/.gitignore b/data/.gitignore index e74d18a..5f63d8c 100644 --- a/data/.gitignore +++ b/data/.gitignore @@ -10,5 +10,5 @@ vocab.jsonl /vocab.jsonl /vectors_300.txt tagmap.py - +NKJP-PodkorpusMilionowy-1.2 /kpwr-1.1 \ No newline at end of file From de41d270b06ae55c3a76c3ffbe9f308fc70bb166 Mon Sep 17 00:00:00 2001 From: Ksiazek Date: Wed, 17 Apr 2019 23:34:32 +0200 Subject: [PATCH 12/14] Add accidentally removed function --- ConvertNER/convert_NER_wroc.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ConvertNER/convert_NER_wroc.py b/ConvertNER/convert_NER_wroc.py index 20abd3d..98282ee 100644 --- a/ConvertNER/convert_NER_wroc.py +++ b/ConvertNER/convert_NER_wroc.py @@ -34,6 +34,10 @@ def __str__(self): return (self.orth + ":" + str(self.attribs)) +def get_subdirs(dir): + return [name for name in os.listdir(dir) if os.path.isdir(os.path.join(dir, name))] + + def process_token(tok): attribs = [] orth = tok.find("orth").text From 050dbbbf461b7eef19e942dabb186bb0ba2a3170 Mon Sep 17 00:00:00 2001 From: Ksiazek Date: Thu, 25 Apr 2019 09:13:04 +0200 Subject: [PATCH 13/14] Fix rebase bugs --- ConvertNER/convert_NER.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ConvertNER/convert_NER.py b/ConvertNER/convert_NER.py index 3c12602..d8a1db2 100644 --- a/ConvertNER/convert_NER.py +++ b/ConvertNER/convert_NER.py @@ -1,6 +1,6 @@ import xml.etree.ElementTree as ET from spacy.gold import biluo_tags_from_offsets -import spacy +from spacy.lang.pl import Polish import json import os @@ -120,7 +120,7 @@ def required_files_exist(dir): return True -nlp = spacy.load('en_core_web_sm') +nlp = Polish() doc_id = 0 corpus = [] From 96f9e2dc9bade3ebf7705d76831b2c826bd80e0b Mon Sep 17 00:00:00 2001 From: Ksiazek Date: Sat, 18 May 2019 11:01:48 +0200 Subject: [PATCH 14/14] Remove remains of detokenization --- ConvertNER/convert_NER_wroc.py | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/ConvertNER/convert_NER_wroc.py b/ConvertNER/convert_NER_wroc.py index 98282ee..fa98707 100644 --- a/ConvertNER/convert_NER_wroc.py +++ b/ConvertNER/convert_NER_wroc.py @@ -141,28 +141,6 @@ def convert_to_biluo(tokens): return out -def get_text(tokens): - raw = "" - for token in tokens: - raw += token.orth + " " - - _punct = r'… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ · । ، ؛ ٪ . ! ?' - _quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‘‘ ’’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉' - _hyphens = '- – — -- --- —— ~' - _brackets_pref = ") ] }" - _brackets_post = "( [ {" - - interp_pref = _punct.split(" ") + _quotes.split(" ") + _hyphens.split(" ") + _brackets_pref.split(" ") - interp_post = _brackets_post.split(" ") - for char in interp_pref: - raw = raw.replace(" " + char, char) - - for char in interp_post: - raw = raw.replace(char + " ", char) - - return raw - - docs = [] doc_idx = 0 for subfolder in get_subdirs(os.path.join(path_prefix, corpus_path)): @@ -194,9 +172,7 @@ def get_text(tokens): ], 'brackets': [] } - text = get_text(tokens) sentences += [sent] - raw += "\n" + text doc_json = { 'id': doc_idx,