From be44c727e1f6456750f01adf799a1fb2d1c746d8 Mon Sep 17 00:00:00 2001 From: Ksiazek Date: Tue, 21 May 2019 13:36:57 +0200 Subject: [PATCH 1/8] Fix conversion script (messy atm) --- ConvertNER/convert_NER_wroc.py | 95 ++++++++++++++++++++++++++++++++-- data/kpwr-1.1.dvc | 3 +- 2 files changed, 93 insertions(+), 5 deletions(-) diff --git a/ConvertNER/convert_NER_wroc.py b/ConvertNER/convert_NER_wroc.py index 76cebde..3ba598b 100644 --- a/ConvertNER/convert_NER_wroc.py +++ b/ConvertNER/convert_NER_wroc.py @@ -50,8 +50,8 @@ def process_token(tok): attribs = [] orth = tok.find("orth").text for ann in tok.iter("ann"): - if ann.attrib['chan'].endswith("nam") and ann.text == "1": - attribs += [ann.attrib['chan']] + if ann.attrib['chan'].endswith("nam"): # and ann.text != "0": + attribs += [{ann.attrib['chan']:ann.text}] return Token(orth, attribs, -1) @@ -88,6 +88,91 @@ def map_labels(tokens, map): return tokens +def get_longest_sequences(tokens): + res = [] + b = 0 + e = 0 + # type = None + + last_set = set() + while e != len(tokens)-1: + current_token = tokens[e] + + if last_set == emptyset: + last_set = set(current_token.attribs) + b = e + else: + new_set = set(current_token.attribs) & last_set + if new_set == emptyset: + label = list(last_set)[0] + res.append((b, e, label)) + b = e + + last_set = new_set + e += 1 + + return res + + +def still_in_sequence(v1, v2): + return any(v1e == v2e != "0" for v1e in v1 for v2e in v2) + + +def get_last_label(v): + for i, e in enumerate(v): + if e != "0": + return i + return None + + +def get_longest_sequences_2(tokens): + res = [] + b = 0 + e = 0 + attribs = [k for d in tokens[0].attribs for k in d] + last_set = None + + while e != len(tokens)-1: + current_token = tokens[e] + + if last_set == None: + last_set = [v for d in current_token.attribs for k, v in d.items()] + b = e + else: + new_set = [v for d in current_token.attribs for k, v in d.items()] + if not still_in_sequence(last_set, new_set): + label_id = get_last_label(last_set) + if(label_id != None): + label = attribs[label_id] + res.append((b, e, label)) + b = e + + last_set = new_set + e += 1 + + return res + + +emptyset = set() +def pick_tags_2(tokens): + longest_sequences = get_longest_sequences_2(tokens) + res = [] + for b, e, label in longest_sequences: + seq = tokens[b:e] + for tok in seq: + tok.attribs = [{label: '1'}] + # res += seq + tokens[b:e] = seq + return tokens + + +def flatten_token_attrib_dicts(tokens): + for tok in tokens: + tok.attribs = [k for k in tok.attribs[0].keys()] if len(tok.attribs) > 0 is not None else [] + + return tokens + + def pick_tags(tokens): # first and last separately if len(tokens) == 0: @@ -205,8 +290,10 @@ def main( token_idx += 1 tokens += [token] - all_labels.merge(get_all_labels_with_cardinalities(tokens)) # for debug and analysis - tokens = pick_tags(tokens) + # all_labels.merge(get_all_labels_with_cardinalities(tokens)) # for debug and analysis + tokens = pick_tags_2(tokens) + tokens = flatten_token_attrib_dicts(tokens) + if use_label_map: tokens = map_labels(tokens, NER_pwr_to_spacy) tokens = convert_to_biluo(tokens) diff --git a/data/kpwr-1.1.dvc b/data/kpwr-1.1.dvc index 275217c..69e04b8 100644 --- a/data/kpwr-1.1.dvc +++ b/data/kpwr-1.1.dvc @@ -1,7 +1,8 @@ -md5: 58cbc0bd05749d04e4b6a5e4c9d78c01 +md5: 86d998d87357a866a9993c1c0458b169 outs: - cache: true md5: d84971d4b907e5efc5d9320de6691027.dir metric: false path: kpwr-1.1 + persist: false wdir: . From 448cc3384cdf9bcd8b8f7182f4e06fa48e7faa72 Mon Sep 17 00:00:00 2001 From: Ksiazek Date: Tue, 21 May 2019 20:13:25 +0200 Subject: [PATCH 2/8] Fix bug in fix --- ConvertNER/convert_NER_wroc.py | 111 +++++++-------------------------- 1 file changed, 21 insertions(+), 90 deletions(-) diff --git a/ConvertNER/convert_NER_wroc.py b/ConvertNER/convert_NER_wroc.py index 3ba598b..92f3b92 100644 --- a/ConvertNER/convert_NER_wroc.py +++ b/ConvertNER/convert_NER_wroc.py @@ -34,10 +34,15 @@ def __init__(self, orth, attribs, id): self.id = id def is_NE(self): - return len(self.attribs) != 0 + return self.get_NE() is not None def get_NE(self): - return self.attribs[0] if len(self.attribs) > 0 else "" + for attrib in self.attribs: + for k in attrib: + if attrib[k] != "0": + return k + + return None def get_cooccurences(self): res = setCounter @@ -83,37 +88,11 @@ def get_all_labels_with_cardinalities(tokens): def map_labels(tokens, map): for tok in tokens: - tok.attribs = [map[attrib] for attrib in tok.attribs] + tok.attribs = [{map[k]:v} for attrib in tok.attribs for k,v in attrib.items()] return tokens -def get_longest_sequences(tokens): - res = [] - b = 0 - e = 0 - # type = None - - last_set = set() - while e != len(tokens)-1: - current_token = tokens[e] - - if last_set == emptyset: - last_set = set(current_token.attribs) - b = e - else: - new_set = set(current_token.attribs) & last_set - if new_set == emptyset: - label = list(last_set)[0] - res.append((b, e, label)) - b = e - - last_set = new_set - e += 1 - - return res - - def still_in_sequence(v1, v2): return any(v1e == v2e != "0" for v1e in v1 for v2e in v2) @@ -125,7 +104,7 @@ def get_last_label(v): return None -def get_longest_sequences_2(tokens): +def get_longest_sequences(tokens): res = [] b = 0 e = 0 @@ -154,8 +133,8 @@ def get_longest_sequences_2(tokens): emptyset = set() -def pick_tags_2(tokens): - longest_sequences = get_longest_sequences_2(tokens) +def pick_tags(tokens): + longest_sequences = get_longest_sequences(tokens) res = [] for b, e, label in longest_sequences: seq = tokens[b:e] @@ -166,54 +145,6 @@ def pick_tags_2(tokens): return tokens -def flatten_token_attrib_dicts(tokens): - for tok in tokens: - tok.attribs = [k for k in tok.attribs[0].keys()] if len(tok.attribs) > 0 is not None else [] - - return tokens - - -def pick_tags(tokens): - # first and last separately - if len(tokens) == 0: - return tokens - if len(tokens) == 1: - if tokens[0].is_NE(): - tokens[0].attribs = [tokens[0].attribs[0]] - return tokens - - t0 = tokens[0] - if len(t0.attribs) > 1: - new_tag = get_common_tag(t0, tokens[1]) - if new_tag is None: - t0.attribs = [t0.attribs[0]] - else: - t0.attribs = [new_tag] - - for i in range(1, len(tokens) - 1): - if len(tokens[i].attribs) > 1: - new_tag = get_common_tag(tokens[i - 1], tokens[i]) - if new_tag is None: - new_tag = get_common_tag(tokens[i], tokens[i + 1]) - if new_tag is None: - tokens[i].attribs = [tokens[i].attribs[0]] - else: - tokens[i].attribs = [new_tag] - else: - tokens[i].attribs = [new_tag] - - te = tokens[-1] - if len(te.attribs) > 1: - new_tag = get_common_tag(te, tokens[-2]) - if new_tag is None: - te.attribs = [te.attribs[0]] - else: - te.attribs = [new_tag] - - assert (all(len(t.attribs) <= 1 for t in [t0] + tokens + [te])) - return [t0] + tokens[1:-2] + [te] - - def convert_to_biluo(tokens): out = [] in_ne = False @@ -222,10 +153,10 @@ def convert_to_biluo(tokens): if token.is_NE(): if tokens[i + 1].is_NE() and token.get_NE() == tokens[i + 1].get_NE(): # inner NE - out += [Token(token.orth, ["I-" + token.get_NE()], token.id)] + out += [Token(token.orth, [{"I-" + token.get_NE(): '1'}], token.id)] else: # last NE - out += [Token(token.orth, ["L-" + token.get_NE()], token.id)] + out += [Token(token.orth, [{"L-" + token.get_NE(): '1'}], token.id)] in_ne = False else: # we shouldn't ever get here @@ -236,25 +167,25 @@ def convert_to_biluo(tokens): # new NE if tokens[i + 1].is_NE() and token.get_NE() == tokens[i + 1].get_NE(): # beginning NE - out += [Token(token.orth, ["B-" + token.get_NE()], token.id)] + out += [Token(token.orth, [{"B-" + token.get_NE(): '1'}], token.id)] in_ne = True else: # unit NE - out += [Token(token.orth, ["U-" + token.get_NE()], token.id)] + out += [Token(token.orth, [{"U-" + token.get_NE(): '1'}], token.id)] in_ne = False else: # outside of NE - out += [Token(token.orth, ["O"], token.id)] + out += [Token(token.orth, [{"O": '1'}], token.id)] # process last token token = tokens[-1] if in_ne: - out += [Token(token.orth, ["L-" + token.get_NE()], token.id)] + out += [Token(token.orth, [{"L-" + token.get_NE(): '1'}], token.id)] else: if token.is_NE(): - out += [Token(token.orth, ["U-" + token.get_NE()], token.id)] + out += [Token(token.orth, [{"U-" + token.get_NE(): '1'}], token.id)] else: - out += [Token(token.orth, ["O"], token.id)] + out += [Token(token.orth, [{"O": '1'}], token.id)] return out @@ -291,8 +222,8 @@ def main( tokens += [token] # all_labels.merge(get_all_labels_with_cardinalities(tokens)) # for debug and analysis - tokens = pick_tags_2(tokens) - tokens = flatten_token_attrib_dicts(tokens) + tokens = pick_tags(tokens) + # tokens = flatten_token_attrib_dicts(tokens) if use_label_map: tokens = map_labels(tokens, NER_pwr_to_spacy) From 6cebb3a024826c47fe129ec65c8e866b94f9e951 Mon Sep 17 00:00:00 2001 From: Mateusz Olko Date: Wed, 22 May 2019 14:38:27 +0200 Subject: [PATCH 3/8] Remove redundant lines --- ConvertNER/convert_NER_wroc.py | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/ConvertNER/convert_NER_wroc.py b/ConvertNER/convert_NER_wroc.py index 92f3b92..8cc1795 100644 --- a/ConvertNER/convert_NER_wroc.py +++ b/ConvertNER/convert_NER_wroc.py @@ -44,9 +44,6 @@ def get_NE(self): return None - def get_cooccurences(self): - res = setCounter - def __str__(self): return (self.orth + ":" + str(self.attribs)) @@ -55,8 +52,8 @@ def process_token(tok): attribs = [] orth = tok.find("orth").text for ann in tok.iter("ann"): - if ann.attrib['chan'].endswith("nam"): # and ann.text != "0": - attribs += [{ann.attrib['chan']:ann.text}] + if ann.attrib['chan'].endswith("nam"): # and ann.text != "0": + attribs += [{ann.attrib['chan']: ann.text}] return Token(orth, attribs, -1) @@ -88,7 +85,7 @@ def get_all_labels_with_cardinalities(tokens): def map_labels(tokens, map): for tok in tokens: - tok.attribs = [{map[k]:v} for attrib in tok.attribs for k,v in attrib.items()] + tok.attribs = [{map[k]: v} for attrib in tok.attribs for k, v in attrib.items()] return tokens @@ -111,7 +108,7 @@ def get_longest_sequences(tokens): attribs = [k for d in tokens[0].attribs for k in d] last_set = None - while e != len(tokens)-1: + while e != len(tokens) - 1: current_token = tokens[e] if last_set == None: @@ -121,7 +118,7 @@ def get_longest_sequences(tokens): new_set = [v for d in current_token.attribs for k, v in d.items()] if not still_in_sequence(last_set, new_set): label_id = get_last_label(last_set) - if(label_id != None): + if (label_id != None): label = attribs[label_id] res.append((b, e, label)) b = e @@ -133,14 +130,14 @@ def get_longest_sequences(tokens): emptyset = set() + + def pick_tags(tokens): longest_sequences = get_longest_sequences(tokens) - res = [] for b, e, label in longest_sequences: seq = tokens[b:e] for tok in seq: tok.attribs = [{label: '1'}] - # res += seq tokens[b:e] = seq return tokens @@ -197,12 +194,6 @@ def main( use_label_map, output_path, ): - if use_label_map: - # classes = set(NER_pwr_to_spacy.values()) - # output = f'NER_wroc_{len(classes)}.json' - # this would be a cool feature but I'm not sure if it's good for automatic pipelines - output = 'NER_wroc_spacy_labels.json' - all_labels = setCounter() corpus = [] doc_idx = 0 for subfolder in get_subdirs(os.path.join(path_prefix, corpus_path)): From be41f7bdbd52d3795cfbe682c57501059a284bf2 Mon Sep 17 00:00:00 2001 From: Ksiazek Date: Thu, 30 May 2019 12:38:10 +0200 Subject: [PATCH 4/8] Add label set in squashing algorithm --- ConvertNER/convert_NER_wroc.py | 124 +++++++++++++++++++++------------ NER_wroc-19.json.dvc | 12 ++-- data/NER/.gitignore | 3 +- data/kpwr-1.1.dvc | 4 +- data/training/NER/.gitignore | 5 ++ ner-wroc-19-train.json.dvc | 10 +-- ner_wroc-19_word2vec.dvc | 10 +-- 7 files changed, 104 insertions(+), 64 deletions(-) diff --git a/ConvertNER/convert_NER_wroc.py b/ConvertNER/convert_NER_wroc.py index 92f3b92..41b6e1c 100644 --- a/ConvertNER/convert_NER_wroc.py +++ b/ConvertNER/convert_NER_wroc.py @@ -94,7 +94,7 @@ def map_labels(tokens, map): def still_in_sequence(v1, v2): - return any(v1e == v2e != "0" for v1e in v1 for v2e in v2) + return any(v1e == v2e != "0" for v1e, v2e in zip(v1,v2)) def get_last_label(v): @@ -104,27 +104,48 @@ def get_last_label(v): return None +def get_label_set(v): + res = set() + for i, e in enumerate(v): + if e != "0": + res.add(i) + + return res + + +import random +def get_any_label(v): + if v == emptyset(): + return None + return random.sample(v, 1)[0] + +def emptyset(): + return set() + def get_longest_sequences(tokens): res = [] b = 0 e = 0 attribs = [k for d in tokens[0].attribs for k in d] last_set = None - + label_set = emptyset() while e != len(tokens)-1: current_token = tokens[e] - if last_set == None: + if last_set == None or label_set == emptyset(): last_set = [v for d in current_token.attribs for k, v in d.items()] + label_set = get_label_set(last_set) b = e else: new_set = [v for d in current_token.attribs for k, v in d.items()] + label_set = label_set.intersection(get_label_set(new_set)) if not still_in_sequence(last_set, new_set): - label_id = get_last_label(last_set) + label_id = get_any_label(label_set) if(label_id != None): label = attribs[label_id] res.append((b, e, label)) b = e + label_set = emptyset() last_set = new_set e += 1 @@ -132,7 +153,7 @@ def get_longest_sequences(tokens): return res -emptyset = set() +# emptyset = set() def pick_tags(tokens): longest_sequences = get_longest_sequences(tokens) res = [] @@ -190,6 +211,18 @@ def convert_to_biluo(tokens): return out +def get_file_paths(index_path): + with open(index_path) as index_file: + files = [] + line = index_file.readline() + while line: + line = line.replace('\n', '') + files.append(line) + line = index_file.readline() + + return files + + @click.command() @click.option("-m", "--use-label-map", type=bool, default=False) @click.argument("output_path", type=str) @@ -205,46 +238,47 @@ def main( all_labels = setCounter() corpus = [] doc_idx = 0 - for subfolder in get_subdirs(os.path.join(path_prefix, corpus_path)): - for file in os.listdir(os.path.join(path_prefix, corpus_path, subfolder)): - if not file.endswith("rel.xml") and not file.endswith(".ini"): - sentences = [] - token_idx = 0 - tree = ET.parse(os.path.join(path_prefix, corpus_path, subfolder, file)) - root = tree.getroot() - sents = root.iter("sentence") - for sent in sents: - tokens = [] - for tok in sent.iter("tok"): - token = process_token(tok) - token.id = token_idx - token_idx += 1 - tokens += [token] - - # all_labels.merge(get_all_labels_with_cardinalities(tokens)) # for debug and analysis - tokens = pick_tags(tokens) - # tokens = flatten_token_attrib_dicts(tokens) - - if use_label_map: - tokens = map_labels(tokens, NER_pwr_to_spacy) - tokens = convert_to_biluo(tokens) - - sent = {'tokens': [{ - 'orth': t.orth, - 'id': t.id, - 'ner': t.get_NE()} - for t in tokens - ], 'brackets': [] - } - - sentences += [sent] - - doc_json = { - 'id': doc_idx, - 'paragraphs': [{'sentences': sentences}] - } - corpus += [doc_json] - doc_idx += 1 + file_paths = get_file_paths(os.path.join(path_prefix, corpus_path, 'index_names.txt')) + for file in file_paths: + file = os.path.join(path_prefix, corpus_path, file) + assert(not file.endswith("rel.xml") and not file.endswith(".ini")) + sentences = [] + token_idx = 0 + tree = ET.parse(file) + root = tree.getroot() + sents = root.iter("sentence") + for sent in sents: + tokens = [] + for tok in sent.iter("tok"): + token = process_token(tok) + token.id = token_idx + token_idx += 1 + tokens += [token] + + # all_labels.merge(get_all_labels_with_cardinalities(tokens)) # for debug and analysis + tokens = pick_tags(tokens) + # tokens = flatten_token_attrib_dicts(tokens) + + if use_label_map: + tokens = map_labels(tokens, NER_pwr_to_spacy) + tokens = convert_to_biluo(tokens) + + sent = {'tokens': [{ + 'orth': t.orth, + 'id': t.id, + 'ner': t.get_NE()} + for t in tokens + ], 'brackets': [] + } + + sentences += [sent] + + doc_json = { + 'id': doc_idx, + 'paragraphs': [{'sentences': sentences}] + } + corpus += [doc_json] + doc_idx += 1 with open(os.path.expanduser(output_path), 'w+') as f: json.dump(corpus, f) diff --git a/NER_wroc-19.json.dvc b/NER_wroc-19.json.dvc index 2ca22da..b977cd6 100644 --- a/NER_wroc-19.json.dvc +++ b/NER_wroc-19.json.dvc @@ -1,15 +1,15 @@ -cmd: python ConvertNER/convert_NER_wroc.py -m true data/NER/NER_wroc-19.json +cmd: python ConvertNER/convert_NER_wroc.py -m true data/NER/NER_wroc-19.json deps: -- md5: d84971d4b907e5efc5d9320de6691027.dir +- md5: 94b53a67af0d7202fbd760d8ca1e0998.dir path: data/kpwr-1.1 -- md5: c8aa684e59762c66aeba79e2727c103f +- md5: e29fb8a7101c096712d632cb117131c6 path: ConvertNER/convert_NER_wroc.py -- md5: eee1569106fcf22473ee5a39f49f57bd +- md5: d7d343ce8b47f93f20e3870d91c6150e path: ConvertNER/NER_pwr_to_spacy.py -md5: 8edf603b1572083aedf1a95147deec94 +md5: 6506f60af20d47077f8e24c91e4033d4 outs: - cache: true - md5: ffd284e41307a7b0815d10623a0b4c99 + md5: 25117be4c42e22d242c1e50d066fa35d metric: false path: data/NER/NER_wroc-19.json persist: false diff --git a/data/NER/.gitignore b/data/NER/.gitignore index b222295..a602b47 100644 --- a/data/NER/.gitignore +++ b/data/NER/.gitignore @@ -1,4 +1,5 @@ /NER.json /NER_wroc.json /NER_wroc_19.json -/NER_wroc_spacy_labels.json \ No newline at end of file +/NER_wroc_spacy_labels.json +/NER_wroc-19.json \ No newline at end of file diff --git a/data/kpwr-1.1.dvc b/data/kpwr-1.1.dvc index 69e04b8..7e17ff2 100644 --- a/data/kpwr-1.1.dvc +++ b/data/kpwr-1.1.dvc @@ -1,7 +1,7 @@ -md5: 86d998d87357a866a9993c1c0458b169 +md5: 776829dcd16aeae294a70cd147183d9c outs: - cache: true - md5: d84971d4b907e5efc5d9320de6691027.dir + md5: 94b53a67af0d7202fbd760d8ca1e0998.dir metric: false path: kpwr-1.1 persist: false diff --git a/data/training/NER/.gitignore b/data/training/NER/.gitignore index 99cb319..da15df0 100644 --- a/data/training/NER/.gitignore +++ b/data/training/NER/.gitignore @@ -2,3 +2,8 @@ /ner-train.json /ner-test.json /ner-validation.json + + +/ner-wroc-19-train.json +/ner-wroc-19-validation.json +/ner-wroc-19-test.json \ No newline at end of file diff --git a/ner-wroc-19-train.json.dvc b/ner-wroc-19-train.json.dvc index 1feac85..a36cb44 100644 --- a/ner-wroc-19-train.json.dvc +++ b/ner-wroc-19-train.json.dvc @@ -2,22 +2,22 @@ cmd: python training/split-data.py --input-file data/NER/NER_wroc-19.json --trai data/training/NER/ner-wroc-19-train.json --validation-output data/training/NER/ner-wroc-19-validation.json --test-output data/training/NER/ner-wroc-19-test.json deps: -- md5: 1f40fe3247574c0debdad863a63ae4de +- md5: 25117be4c42e22d242c1e50d066fa35d path: data/NER/NER_wroc-19.json -md5: 39ad2d77ad532f5705ec894a80c2b344 +md5: 4b9cf46376a1706104898c24954aa8d6 outs: - cache: true - md5: 2a8a96bd480cc7908e137d18ba1c06de + md5: a821c9f3c70d36c977673f7b06914c16 metric: false path: data/training/NER/ner-wroc-19-train.json persist: false - cache: true - md5: 3096af2e4e0434b2a869586e5b08954b + md5: 9ed18e190b97651637df77b1d541642c metric: false path: data/training/NER/ner-wroc-19-validation.json persist: false - cache: true - md5: fef72e0918b1e197d0d2e4d891de42f7 + md5: add29d611966be1070ebcb1cd9fc0aa8 metric: false path: data/training/NER/ner-wroc-19-test.json persist: false diff --git a/ner_wroc-19_word2vec.dvc b/ner_wroc-19_word2vec.dvc index b463f25..9d36393 100644 --- a/ner_wroc-19_word2vec.dvc +++ b/ner_wroc-19_word2vec.dvc @@ -2,16 +2,16 @@ cmd: python -m spacy train pl models/ner_wroc-19_word2vec data/training/NER/ner- data/training/NER/ner-wroc-19-validation.json --vectors models/blank_NKJP_word2vec -p ner -g 0 -n 20 -e 2 deps: -- md5: 61eda27883b647a6c0be5725d3eb3ccb.dir +- md5: b1be412efe7f8bcbf97cf43fc2c80ef0.dir path: models/blank_NKJP_word2vec -- md5: 2a8a96bd480cc7908e137d18ba1c06de +- md5: a821c9f3c70d36c977673f7b06914c16 path: data/training/NER/ner-wroc-19-train.json -- md5: 3096af2e4e0434b2a869586e5b08954b +- md5: 9ed18e190b97651637df77b1d541642c path: data/training/NER/ner-wroc-19-validation.json -md5: 28f3c4a56cacccda75862abfe01e121d +md5: 8cb208a1300c8020a0c038994d46544b outs: - cache: true - md5: 3af0b7f53c402e4e9a7d36045b8ab8b6.dir + md5: 37074c5206ed8912bbdf746d04d34b4c.dir metric: false path: models/ner_wroc-19_word2vec persist: false From 884aedc027f5f66be76575b8644c606a4ff4d209 Mon Sep 17 00:00:00 2001 From: Ksiazek Date: Wed, 12 Jun 2019 16:45:20 +0200 Subject: [PATCH 5/8] Add small label set conversion map --- ConvertNER/NER_pwr_to_wiki.py | 64 +++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 ConvertNER/NER_pwr_to_wiki.py diff --git a/ConvertNER/NER_pwr_to_wiki.py b/ConvertNER/NER_pwr_to_wiki.py new file mode 100644 index 0000000..0e5e304 --- /dev/null +++ b/ConvertNER/NER_pwr_to_wiki.py @@ -0,0 +1,64 @@ +# uncompyle6 version 3.3.1 +# Python bytecode 3.7 (3394) +# Decompiled from: Python 3.7.1 (default, Nov 6 2018, 18:46:03) +# [Clang 10.0.0 (clang-1000.11.45.5)] +# Embedded file name: /Users/quark/studia/zpp/new/utils/ConvertNER/NER_pwr_to_spacy.py +# Size of source mod 2**32: 1685 bytes +NER_pwr_to_spacy = {'person_nam':'PER', 'institution_nam':'ORG', + 'city_nam':'LOC', + 'person_last_nam':'PER', + 'person_first_nam':'PER', + 'document_nam':'MISC', + 'event_nam':'MISC', + 'organization_nam':'ORG', + 'country_nam':'LOC', + 'title_nam':'MISC', + 'band_nam':'ORG', + 'periodic_nam':'MISC', + 'company_nam':'ORG', + 'facility_nam':'ORG', + 'brand_nam':'ORG', + 'political_party_nam':'ORG', + 'road_nam':'LOC', + 'admin1_nam':'LOC', + 'person_add_nam':'PER', + 'software_nam':'MISC', + 'nation_nam':'MISC', + 'tech_nam':'MISC', + 'nam':'MISC', + 'treaty_nam':'MISC', + 'web_nam':'MISC', + 'admin2_nam':'LOC', + 'award_nam':'MISC', + 'continent_nam':'LOC', + 'astronomical_nam':'LOC', + 'media_nam':'ORG', + 'river_nam':'LOC', + 'currency_nam':'MISC', + 'toponym_nam':'LOC', + 'mountain_nam':'LOC', + 'historical_region_nam':'LOC', + 'district_nam':'LOC', + 'country_region_nam':'LOC', + 'subdivision_nam':'ORG', + 'admin3_nam':'LOC', + 'region_nam':'LOC', + 'square_nam':'LOC', + 'park_nam':'LOC', + 'island_nam':'LOC', + 'system_nam':'MISC', + 'www_nam':'MISC', + 'person_group_nam':'MISC', + 'license_nam':'MISC', + 'lake_nam':'LOC', + 'animal_nam':'MISC', + 'sea_nam':'LOC', + 'person_adj_nam':'PER', + 'bay_nam':'LOC', + 'peninsula_nam':'LOC', + 'conurbation_nam':'LOC', + 'vehicle_nam':'MISC', + 'organization_sub_nam':'ORG', + 'ocean_nam':'LOC', + 'cape_nam':'LOC'} +# okay decompiling NER_pwr_to_spacy.cpython-37.pyc From dc86cd8dba1f715d68f1e4cfb95d2e7c9ccabbf9 Mon Sep 17 00:00:00 2001 From: Ksiazek Date: Tue, 25 Jun 2019 23:58:40 +0200 Subject: [PATCH 6/8] is_NE change --- ConvertNER/convert_NER_wroc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ConvertNER/convert_NER_wroc.py b/ConvertNER/convert_NER_wroc.py index f155b27..9e6df87 100644 --- a/ConvertNER/convert_NER_wroc.py +++ b/ConvertNER/convert_NER_wroc.py @@ -34,7 +34,7 @@ def __init__(self, orth, attribs, id): self.id = id def is_NE(self): - return self.get_NE() is not None + return self.get_NE() is not None and self.get_NE() != "O" def get_NE(self): for attrib in self.attribs: From d48eb5e2ec237541c0ae4a2f1df430d226935b31 Mon Sep 17 00:00:00 2001 From: Ksiazek Date: Wed, 26 Jun 2019 00:09:27 +0200 Subject: [PATCH 7/8] Add missing (?) dvc file --- NER_wroc-19.json.dvc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/NER_wroc-19.json.dvc b/NER_wroc-19.json.dvc index b977cd6..fcbf993 100644 --- a/NER_wroc-19.json.dvc +++ b/NER_wroc-19.json.dvc @@ -1,12 +1,12 @@ cmd: python ConvertNER/convert_NER_wroc.py -m true data/NER/NER_wroc-19.json deps: -- md5: 94b53a67af0d7202fbd760d8ca1e0998.dir +- md5: edb877fcf74af64289c0c32299288927.dir path: data/kpwr-1.1 -- md5: e29fb8a7101c096712d632cb117131c6 +- md5: 0c7d0ba89998f4c6c7cf84a50d2a6654 path: ConvertNER/convert_NER_wroc.py - md5: d7d343ce8b47f93f20e3870d91c6150e path: ConvertNER/NER_pwr_to_spacy.py -md5: 6506f60af20d47077f8e24c91e4033d4 +md5: 9618dffa84d6309d470a77da9e8de843 outs: - cache: true md5: 25117be4c42e22d242c1e50d066fa35d From 69c6f843a22dcc59a9d5f5d2c9bdcc1a1f5147a2 Mon Sep 17 00:00:00 2001 From: Krzysztof Kowalczyk Date: Mon, 8 Jul 2019 18:26:40 +0000 Subject: [PATCH 8/8] Updated NER with 60.2 F-score, 1.0.0 models release --- NER_wroc-19.json.dvc | 1 + NER_wroc.json.dvc | 11 ++++++----- data/kpwr-1.1.dvc | 4 ++-- data/training/NER/.gitignore | 5 ++++- deployment/deploy.sh | 2 +- models/.gitignore | 4 +++- ner-wroc-train.json.dvc | 10 +++++----- ner_wroc-19_fasttext.dvc | 18 ++++++++++++++++++ ner_wroc_fasttext.dvc | 18 ++++++++++++++++++ 9 files changed, 58 insertions(+), 15 deletions(-) create mode 100644 ner_wroc-19_fasttext.dvc create mode 100644 ner_wroc_fasttext.dvc diff --git a/NER_wroc-19.json.dvc b/NER_wroc-19.json.dvc index fcbf993..5749899 100644 --- a/NER_wroc-19.json.dvc +++ b/NER_wroc-19.json.dvc @@ -6,6 +6,7 @@ deps: path: ConvertNER/convert_NER_wroc.py - md5: d7d343ce8b47f93f20e3870d91c6150e path: ConvertNER/NER_pwr_to_spacy.py +locked: true md5: 9618dffa84d6309d470a77da9e8de843 outs: - cache: true diff --git a/NER_wroc.json.dvc b/NER_wroc.json.dvc index b5dfb2d..835e9b1 100644 --- a/NER_wroc.json.dvc +++ b/NER_wroc.json.dvc @@ -1,15 +1,16 @@ cmd: python ConvertNER/convert_NER_wroc.py -m false data/NER/NER_wroc.json deps: -- md5: d84971d4b907e5efc5d9320de6691027.dir +- md5: 96d1abc9f866c7f713a5d655cacb453a.dir path: data/kpwr-1.1 -- md5: c8aa684e59762c66aeba79e2727c103f +- md5: 0c7d0ba89998f4c6c7cf84a50d2a6654 path: ConvertNER/convert_NER_wroc.py -- md5: eee1569106fcf22473ee5a39f49f57bd +- md5: d7d343ce8b47f93f20e3870d91c6150e path: ConvertNER/NER_pwr_to_spacy.py -md5: 8edf603b1572083aedf1a95147deec94 +locked: true +md5: 9f27d06e80815d0cd8d7a5489f47dbce outs: - cache: true - md5: ffd284e41307a7b0815d10623a0b4c99 + md5: ca5e2a82931ad7dced2b2f838761ea3b metric: false path: data/NER/NER_wroc.json persist: false diff --git a/data/kpwr-1.1.dvc b/data/kpwr-1.1.dvc index 7e17ff2..4e30a4d 100644 --- a/data/kpwr-1.1.dvc +++ b/data/kpwr-1.1.dvc @@ -1,7 +1,7 @@ -md5: 776829dcd16aeae294a70cd147183d9c +md5: 14d0df72a86e1141a7938134eb3009d1 outs: - cache: true - md5: 94b53a67af0d7202fbd760d8ca1e0998.dir + md5: 96d1abc9f866c7f713a5d655cacb453a.dir metric: false path: kpwr-1.1 persist: false diff --git a/data/training/NER/.gitignore b/data/training/NER/.gitignore index da15df0..6b35a4d 100644 --- a/data/training/NER/.gitignore +++ b/data/training/NER/.gitignore @@ -6,4 +6,7 @@ /ner-wroc-19-train.json /ner-wroc-19-validation.json -/ner-wroc-19-test.json \ No newline at end of file +/ner-wroc-19-test.json +/ner-wroc-train.json +/ner-wroc-validation.json +/ner-wroc-test.json \ No newline at end of file diff --git a/deployment/deploy.sh b/deployment/deploy.sh index 44c813d..55a7e7a 100755 --- a/deployment/deploy.sh +++ b/deployment/deploy.sh @@ -6,7 +6,7 @@ echo "" # --- SETTINGS --- PACKAGE_DIR="release" # same as passed to combine or spacy.cli.package -MODEL_NAME="pl_model-0.2.0" # same as inputted in spacy.cli.package +MODEL_NAME="pl_model-1.0.0" # same as inputted in spacy.cli.package BUCKET_NAME="gs://spacy-pl-public-models" BUCKET_PUBLIC_URL="https://storage.googleapis.com/spacy-pl-public-models" diff --git a/models/.gitignore b/models/.gitignore index 5d0257c..cfa234a 100644 --- a/models/.gitignore +++ b/models/.gitignore @@ -11,4 +11,6 @@ /pos_NKJP-justpos_fasttext /trees_LFG_fasttext /ner_nkjp_fasttext -/ner_wroc-19_word2vec \ No newline at end of file +/ner_wroc-19_word2vec +/ner_wroc_fasttext +/ner_wroc-19_fasttext \ No newline at end of file diff --git a/ner-wroc-train.json.dvc b/ner-wroc-train.json.dvc index 454548f..2cc4129 100644 --- a/ner-wroc-train.json.dvc +++ b/ner-wroc-train.json.dvc @@ -2,22 +2,22 @@ cmd: python training/split-data.py --input-file data/NER/NER_wroc.json --train-o data/training/NER/ner-wroc-train.json --validation-output data/training/NER/ner-wroc-validation.json --test-output data/training/NER/ner-wroc-test.json deps: -- md5: ffd284e41307a7b0815d10623a0b4c99 +- md5: ca5e2a82931ad7dced2b2f838761ea3b path: data/NER/NER_wroc.json -md5: e8749979d1f6bff59eb4b724a1e1d0d5 +md5: c00b5ca9040325a0201f017df51b5332 outs: - cache: true - md5: 09e76ef24c694cdc9b5ce263cc6deca4 + md5: 583ae4d3d540b935495158f436f848ef metric: false path: data/training/NER/ner-wroc-train.json persist: false - cache: true - md5: ad8ce2a4657262084b8d81e9ed07ac1d + md5: 51eaf4ea624b692fb1b0daec14736224 metric: false path: data/training/NER/ner-wroc-validation.json persist: false - cache: true - md5: c02d5d314cf98f8327723edba692b8a4 + md5: c781b90cca2a14e35d20c9d597b49b5d metric: false path: data/training/NER/ner-wroc-test.json persist: false diff --git a/ner_wroc-19_fasttext.dvc b/ner_wroc-19_fasttext.dvc new file mode 100644 index 0000000..49bf3c8 --- /dev/null +++ b/ner_wroc-19_fasttext.dvc @@ -0,0 +1,18 @@ +cmd: python -m spacy train pl models/ner_wroc-19_fasttext data/training/NER/ner-wroc-19-train.json + data/training/NER/ner-wroc-19-validation.json --vectors models/blank_fasttext -p + ner -g 0 -n 80 -e 8 +deps: +- md5: b1be412efe7f8bcbf97cf43fc2c80ef0.dir + path: models/blank_NKJP_word2vec +- md5: a821c9f3c70d36c977673f7b06914c16 + path: data/training/NER/ner-wroc-19-train.json +- md5: 9ed18e190b97651637df77b1d541642c + path: data/training/NER/ner-wroc-19-validation.json +md5: 0947bbffdb3707555a0da9011f9a2f16 +outs: +- cache: true + md5: 099eb0933b6d2131641ff457013d43ba.dir + metric: false + path: models/ner_wroc-19_fasttext + persist: false +wdir: . diff --git a/ner_wroc_fasttext.dvc b/ner_wroc_fasttext.dvc new file mode 100644 index 0000000..254f3b4 --- /dev/null +++ b/ner_wroc_fasttext.dvc @@ -0,0 +1,18 @@ +cmd: python -m spacy train pl models/ner_wroc_fasttext data/training/NER/ner-wroc-train.json + data/training/NER/ner-wroc-validation.json --vectors models/blank_fasttext -p ner + -g 0 -n 80 -e 8 +deps: +- md5: fe3ebcb89593a8e1026e7668ffe6de23.dir + path: models/blank_fasttext +- md5: 583ae4d3d540b935495158f436f848ef + path: data/training/NER/ner-wroc-train.json +- md5: 51eaf4ea624b692fb1b0daec14736224 + path: data/training/NER/ner-wroc-validation.json +md5: f5c883da955b6daf11cb2bcdaf5492e0 +outs: +- cache: true + md5: 55a741e0221cc1041446eb2bfcf79908.dir + metric: false + path: models/ner_wroc_fasttext + persist: false +wdir: .