diff --git a/ConvertNER/NER_pwr_to_wiki.py b/ConvertNER/NER_pwr_to_wiki.py new file mode 100644 index 0000000..0e5e304 --- /dev/null +++ b/ConvertNER/NER_pwr_to_wiki.py @@ -0,0 +1,64 @@ +# uncompyle6 version 3.3.1 +# Python bytecode 3.7 (3394) +# Decompiled from: Python 3.7.1 (default, Nov 6 2018, 18:46:03) +# [Clang 10.0.0 (clang-1000.11.45.5)] +# Embedded file name: /Users/quark/studia/zpp/new/utils/ConvertNER/NER_pwr_to_spacy.py +# Size of source mod 2**32: 1685 bytes +NER_pwr_to_spacy = {'person_nam':'PER', 'institution_nam':'ORG', + 'city_nam':'LOC', + 'person_last_nam':'PER', + 'person_first_nam':'PER', + 'document_nam':'MISC', + 'event_nam':'MISC', + 'organization_nam':'ORG', + 'country_nam':'LOC', + 'title_nam':'MISC', + 'band_nam':'ORG', + 'periodic_nam':'MISC', + 'company_nam':'ORG', + 'facility_nam':'ORG', + 'brand_nam':'ORG', + 'political_party_nam':'ORG', + 'road_nam':'LOC', + 'admin1_nam':'LOC', + 'person_add_nam':'PER', + 'software_nam':'MISC', + 'nation_nam':'MISC', + 'tech_nam':'MISC', + 'nam':'MISC', + 'treaty_nam':'MISC', + 'web_nam':'MISC', + 'admin2_nam':'LOC', + 'award_nam':'MISC', + 'continent_nam':'LOC', + 'astronomical_nam':'LOC', + 'media_nam':'ORG', + 'river_nam':'LOC', + 'currency_nam':'MISC', + 'toponym_nam':'LOC', + 'mountain_nam':'LOC', + 'historical_region_nam':'LOC', + 'district_nam':'LOC', + 'country_region_nam':'LOC', + 'subdivision_nam':'ORG', + 'admin3_nam':'LOC', + 'region_nam':'LOC', + 'square_nam':'LOC', + 'park_nam':'LOC', + 'island_nam':'LOC', + 'system_nam':'MISC', + 'www_nam':'MISC', + 'person_group_nam':'MISC', + 'license_nam':'MISC', + 'lake_nam':'LOC', + 'animal_nam':'MISC', + 'sea_nam':'LOC', + 'person_adj_nam':'PER', + 'bay_nam':'LOC', + 'peninsula_nam':'LOC', + 'conurbation_nam':'LOC', + 'vehicle_nam':'MISC', + 'organization_sub_nam':'ORG', + 'ocean_nam':'LOC', + 'cape_nam':'LOC'} +# okay decompiling NER_pwr_to_spacy.cpython-37.pyc diff --git a/ConvertNER/convert_NER_wroc.py b/ConvertNER/convert_NER_wroc.py index 76cebde..9e6df87 100644 --- a/ConvertNER/convert_NER_wroc.py +++ b/ConvertNER/convert_NER_wroc.py @@ -34,13 +34,15 @@ def __init__(self, orth, attribs, id): self.id = id def is_NE(self): - return len(self.attribs) != 0 + return self.get_NE() is not None and self.get_NE() != "O" def get_NE(self): - return self.attribs[0] if len(self.attribs) > 0 else "" + for attrib in self.attribs: + for k in attrib: + if attrib[k] != "0": + return k - def get_cooccurences(self): - res = setCounter + return None def __str__(self): return (self.orth + ":" + str(self.attribs)) @@ -50,8 +52,8 @@ def process_token(tok): attribs = [] orth = tok.find("orth").text for ann in tok.iter("ann"): - if ann.attrib['chan'].endswith("nam") and ann.text == "1": - attribs += [ann.attrib['chan']] + if ann.attrib['chan'].endswith("nam"): # and ann.text != "0": + attribs += [{ann.attrib['chan']: ann.text}] return Token(orth, attribs, -1) @@ -83,50 +85,80 @@ def get_all_labels_with_cardinalities(tokens): def map_labels(tokens, map): for tok in tokens: - tok.attribs = [map[attrib] for attrib in tok.attribs] + tok.attribs = [{map[k]: v} for attrib in tok.attribs for k, v in attrib.items()] return tokens -def pick_tags(tokens): - # first and last separately - if len(tokens) == 0: - return tokens - if len(tokens) == 1: - if tokens[0].is_NE(): - tokens[0].attribs = [tokens[0].attribs[0]] - return tokens - - t0 = tokens[0] - if len(t0.attribs) > 1: - new_tag = get_common_tag(t0, tokens[1]) - if new_tag is None: - t0.attribs = [t0.attribs[0]] - else: - t0.attribs = [new_tag] - - for i in range(1, len(tokens) - 1): - if len(tokens[i].attribs) > 1: - new_tag = get_common_tag(tokens[i - 1], tokens[i]) - if new_tag is None: - new_tag = get_common_tag(tokens[i], tokens[i + 1]) - if new_tag is None: - tokens[i].attribs = [tokens[i].attribs[0]] - else: - tokens[i].attribs = [new_tag] - else: - tokens[i].attribs = [new_tag] +def still_in_sequence(v1, v2): + return any(v1e == v2e != "0" for v1e, v2e in zip(v1,v2)) + + +def get_last_label(v): + for i, e in enumerate(v): + if e != "0": + return i + return None + + +def get_label_set(v): + res = set() + for i, e in enumerate(v): + if e != "0": + res.add(i) + + return res + + +import random +def get_any_label(v): + if v == emptyset(): + return None + return random.sample(v, 1)[0] - te = tokens[-1] - if len(te.attribs) > 1: - new_tag = get_common_tag(te, tokens[-2]) - if new_tag is None: - te.attribs = [te.attribs[0]] +def emptyset(): + return set() + +def get_longest_sequences(tokens): + res = [] + b = 0 + e = 0 + attribs = [k for d in tokens[0].attribs for k in d] + last_set = None + label_set = emptyset() + while e != len(tokens)-1: + current_token = tokens[e] + + if last_set == None or label_set == emptyset(): + last_set = [v for d in current_token.attribs for k, v in d.items()] + label_set = get_label_set(last_set) + b = e else: - te.attribs = [new_tag] + new_set = [v for d in current_token.attribs for k, v in d.items()] + label_set = label_set.intersection(get_label_set(new_set)) + if not still_in_sequence(last_set, new_set): + label_id = get_any_label(label_set) + if(label_id != None): + label = attribs[label_id] + res.append((b, e, label)) + b = e + label_set = emptyset() + + last_set = new_set + e += 1 + + return res + - assert (all(len(t.attribs) <= 1 for t in [t0] + tokens + [te])) - return [t0] + tokens[1:-2] + [te] +# emptyset = set() +def pick_tags(tokens): + longest_sequences = get_longest_sequences(tokens) + for b, e, label in longest_sequences: + seq = tokens[b:e] + for tok in seq: + tok.attribs = [{label: '1'}] + tokens[b:e] = seq + return tokens def convert_to_biluo(tokens): @@ -137,10 +169,10 @@ def convert_to_biluo(tokens): if token.is_NE(): if tokens[i + 1].is_NE() and token.get_NE() == tokens[i + 1].get_NE(): # inner NE - out += [Token(token.orth, ["I-" + token.get_NE()], token.id)] + out += [Token(token.orth, [{"I-" + token.get_NE(): '1'}], token.id)] else: # last NE - out += [Token(token.orth, ["L-" + token.get_NE()], token.id)] + out += [Token(token.orth, [{"L-" + token.get_NE(): '1'}], token.id)] in_ne = False else: # we shouldn't ever get here @@ -151,29 +183,41 @@ def convert_to_biluo(tokens): # new NE if tokens[i + 1].is_NE() and token.get_NE() == tokens[i + 1].get_NE(): # beginning NE - out += [Token(token.orth, ["B-" + token.get_NE()], token.id)] + out += [Token(token.orth, [{"B-" + token.get_NE(): '1'}], token.id)] in_ne = True else: # unit NE - out += [Token(token.orth, ["U-" + token.get_NE()], token.id)] + out += [Token(token.orth, [{"U-" + token.get_NE(): '1'}], token.id)] in_ne = False else: # outside of NE - out += [Token(token.orth, ["O"], token.id)] + out += [Token(token.orth, [{"O": '1'}], token.id)] # process last token token = tokens[-1] if in_ne: - out += [Token(token.orth, ["L-" + token.get_NE()], token.id)] + out += [Token(token.orth, [{"L-" + token.get_NE(): '1'}], token.id)] else: if token.is_NE(): - out += [Token(token.orth, ["U-" + token.get_NE()], token.id)] + out += [Token(token.orth, [{"U-" + token.get_NE(): '1'}], token.id)] else: - out += [Token(token.orth, ["O"], token.id)] + out += [Token(token.orth, [{"O": '1'}], token.id)] return out +def get_file_paths(index_path): + with open(index_path) as index_file: + files = [] + line = index_file.readline() + while line: + line = line.replace('\n', '') + files.append(line) + line = index_file.readline() + + return files + + @click.command() @click.option("-m", "--use-label-map", type=bool, default=False) @click.argument("output_path", type=str) @@ -181,52 +225,49 @@ def main( use_label_map, output_path, ): - if use_label_map: - # classes = set(NER_pwr_to_spacy.values()) - # output = f'NER_wroc_{len(classes)}.json' - # this would be a cool feature but I'm not sure if it's good for automatic pipelines - output = 'NER_wroc_spacy_labels.json' - all_labels = setCounter() corpus = [] doc_idx = 0 - for subfolder in get_subdirs(os.path.join(path_prefix, corpus_path)): - for file in os.listdir(os.path.join(path_prefix, corpus_path, subfolder)): - if not file.endswith("rel.xml") and not file.endswith(".ini"): - sentences = [] - token_idx = 0 - tree = ET.parse(os.path.join(path_prefix, corpus_path, subfolder, file)) - root = tree.getroot() - sents = root.iter("sentence") - for sent in sents: - tokens = [] - for tok in sent.iter("tok"): - token = process_token(tok) - token.id = token_idx - token_idx += 1 - tokens += [token] - - all_labels.merge(get_all_labels_with_cardinalities(tokens)) # for debug and analysis - tokens = pick_tags(tokens) - if use_label_map: - tokens = map_labels(tokens, NER_pwr_to_spacy) - tokens = convert_to_biluo(tokens) - - sent = {'tokens': [{ - 'orth': t.orth, - 'id': t.id, - 'ner': t.get_NE()} - for t in tokens - ], 'brackets': [] - } - - sentences += [sent] - - doc_json = { - 'id': doc_idx, - 'paragraphs': [{'sentences': sentences}] - } - corpus += [doc_json] - doc_idx += 1 + file_paths = get_file_paths(os.path.join(path_prefix, corpus_path, 'index_names.txt')) + for file in file_paths: + file = os.path.join(path_prefix, corpus_path, file) + assert(not file.endswith("rel.xml") and not file.endswith(".ini")) + sentences = [] + token_idx = 0 + tree = ET.parse(file) + root = tree.getroot() + sents = root.iter("sentence") + for sent in sents: + tokens = [] + for tok in sent.iter("tok"): + token = process_token(tok) + token.id = token_idx + token_idx += 1 + tokens += [token] + + # all_labels.merge(get_all_labels_with_cardinalities(tokens)) # for debug and analysis + tokens = pick_tags(tokens) + # tokens = flatten_token_attrib_dicts(tokens) + + if use_label_map: + tokens = map_labels(tokens, NER_pwr_to_spacy) + tokens = convert_to_biluo(tokens) + + sent = {'tokens': [{ + 'orth': t.orth, + 'id': t.id, + 'ner': t.get_NE()} + for t in tokens + ], 'brackets': [] + } + + sentences += [sent] + + doc_json = { + 'id': doc_idx, + 'paragraphs': [{'sentences': sentences}] + } + corpus += [doc_json] + doc_idx += 1 with open(os.path.expanduser(output_path), 'w+') as f: json.dump(corpus, f) diff --git a/NER_wroc-19.json.dvc b/NER_wroc-19.json.dvc index 2ca22da..5749899 100644 --- a/NER_wroc-19.json.dvc +++ b/NER_wroc-19.json.dvc @@ -1,15 +1,16 @@ -cmd: python ConvertNER/convert_NER_wroc.py -m true data/NER/NER_wroc-19.json +cmd: python ConvertNER/convert_NER_wroc.py -m true data/NER/NER_wroc-19.json deps: -- md5: d84971d4b907e5efc5d9320de6691027.dir +- md5: edb877fcf74af64289c0c32299288927.dir path: data/kpwr-1.1 -- md5: c8aa684e59762c66aeba79e2727c103f +- md5: 0c7d0ba89998f4c6c7cf84a50d2a6654 path: ConvertNER/convert_NER_wroc.py -- md5: eee1569106fcf22473ee5a39f49f57bd +- md5: d7d343ce8b47f93f20e3870d91c6150e path: ConvertNER/NER_pwr_to_spacy.py -md5: 8edf603b1572083aedf1a95147deec94 +locked: true +md5: 9618dffa84d6309d470a77da9e8de843 outs: - cache: true - md5: ffd284e41307a7b0815d10623a0b4c99 + md5: 25117be4c42e22d242c1e50d066fa35d metric: false path: data/NER/NER_wroc-19.json persist: false diff --git a/NER_wroc.json.dvc b/NER_wroc.json.dvc index b5dfb2d..835e9b1 100644 --- a/NER_wroc.json.dvc +++ b/NER_wroc.json.dvc @@ -1,15 +1,16 @@ cmd: python ConvertNER/convert_NER_wroc.py -m false data/NER/NER_wroc.json deps: -- md5: d84971d4b907e5efc5d9320de6691027.dir +- md5: 96d1abc9f866c7f713a5d655cacb453a.dir path: data/kpwr-1.1 -- md5: c8aa684e59762c66aeba79e2727c103f +- md5: 0c7d0ba89998f4c6c7cf84a50d2a6654 path: ConvertNER/convert_NER_wroc.py -- md5: eee1569106fcf22473ee5a39f49f57bd +- md5: d7d343ce8b47f93f20e3870d91c6150e path: ConvertNER/NER_pwr_to_spacy.py -md5: 8edf603b1572083aedf1a95147deec94 +locked: true +md5: 9f27d06e80815d0cd8d7a5489f47dbce outs: - cache: true - md5: ffd284e41307a7b0815d10623a0b4c99 + md5: ca5e2a82931ad7dced2b2f838761ea3b metric: false path: data/NER/NER_wroc.json persist: false diff --git a/data/NER/.gitignore b/data/NER/.gitignore index b222295..a602b47 100644 --- a/data/NER/.gitignore +++ b/data/NER/.gitignore @@ -1,4 +1,5 @@ /NER.json /NER_wroc.json /NER_wroc_19.json -/NER_wroc_spacy_labels.json \ No newline at end of file +/NER_wroc_spacy_labels.json +/NER_wroc-19.json \ No newline at end of file diff --git a/data/kpwr-1.1.dvc b/data/kpwr-1.1.dvc index 275217c..4e30a4d 100644 --- a/data/kpwr-1.1.dvc +++ b/data/kpwr-1.1.dvc @@ -1,7 +1,8 @@ -md5: 58cbc0bd05749d04e4b6a5e4c9d78c01 +md5: 14d0df72a86e1141a7938134eb3009d1 outs: - cache: true - md5: d84971d4b907e5efc5d9320de6691027.dir + md5: 96d1abc9f866c7f713a5d655cacb453a.dir metric: false path: kpwr-1.1 + persist: false wdir: . diff --git a/data/training/NER/.gitignore b/data/training/NER/.gitignore index 99cb319..6b35a4d 100644 --- a/data/training/NER/.gitignore +++ b/data/training/NER/.gitignore @@ -2,3 +2,11 @@ /ner-train.json /ner-test.json /ner-validation.json + + +/ner-wroc-19-train.json +/ner-wroc-19-validation.json +/ner-wroc-19-test.json +/ner-wroc-train.json +/ner-wroc-validation.json +/ner-wroc-test.json \ No newline at end of file diff --git a/deployment/deploy.sh b/deployment/deploy.sh index 44c813d..55a7e7a 100755 --- a/deployment/deploy.sh +++ b/deployment/deploy.sh @@ -6,7 +6,7 @@ echo "" # --- SETTINGS --- PACKAGE_DIR="release" # same as passed to combine or spacy.cli.package -MODEL_NAME="pl_model-0.2.0" # same as inputted in spacy.cli.package +MODEL_NAME="pl_model-1.0.0" # same as inputted in spacy.cli.package BUCKET_NAME="gs://spacy-pl-public-models" BUCKET_PUBLIC_URL="https://storage.googleapis.com/spacy-pl-public-models" diff --git a/models/.gitignore b/models/.gitignore index 5d0257c..cfa234a 100644 --- a/models/.gitignore +++ b/models/.gitignore @@ -11,4 +11,6 @@ /pos_NKJP-justpos_fasttext /trees_LFG_fasttext /ner_nkjp_fasttext -/ner_wroc-19_word2vec \ No newline at end of file +/ner_wroc-19_word2vec +/ner_wroc_fasttext +/ner_wroc-19_fasttext \ No newline at end of file diff --git a/ner-wroc-19-train.json.dvc b/ner-wroc-19-train.json.dvc index 1feac85..a36cb44 100644 --- a/ner-wroc-19-train.json.dvc +++ b/ner-wroc-19-train.json.dvc @@ -2,22 +2,22 @@ cmd: python training/split-data.py --input-file data/NER/NER_wroc-19.json --trai data/training/NER/ner-wroc-19-train.json --validation-output data/training/NER/ner-wroc-19-validation.json --test-output data/training/NER/ner-wroc-19-test.json deps: -- md5: 1f40fe3247574c0debdad863a63ae4de +- md5: 25117be4c42e22d242c1e50d066fa35d path: data/NER/NER_wroc-19.json -md5: 39ad2d77ad532f5705ec894a80c2b344 +md5: 4b9cf46376a1706104898c24954aa8d6 outs: - cache: true - md5: 2a8a96bd480cc7908e137d18ba1c06de + md5: a821c9f3c70d36c977673f7b06914c16 metric: false path: data/training/NER/ner-wroc-19-train.json persist: false - cache: true - md5: 3096af2e4e0434b2a869586e5b08954b + md5: 9ed18e190b97651637df77b1d541642c metric: false path: data/training/NER/ner-wroc-19-validation.json persist: false - cache: true - md5: fef72e0918b1e197d0d2e4d891de42f7 + md5: add29d611966be1070ebcb1cd9fc0aa8 metric: false path: data/training/NER/ner-wroc-19-test.json persist: false diff --git a/ner-wroc-train.json.dvc b/ner-wroc-train.json.dvc index 454548f..2cc4129 100644 --- a/ner-wroc-train.json.dvc +++ b/ner-wroc-train.json.dvc @@ -2,22 +2,22 @@ cmd: python training/split-data.py --input-file data/NER/NER_wroc.json --train-o data/training/NER/ner-wroc-train.json --validation-output data/training/NER/ner-wroc-validation.json --test-output data/training/NER/ner-wroc-test.json deps: -- md5: ffd284e41307a7b0815d10623a0b4c99 +- md5: ca5e2a82931ad7dced2b2f838761ea3b path: data/NER/NER_wroc.json -md5: e8749979d1f6bff59eb4b724a1e1d0d5 +md5: c00b5ca9040325a0201f017df51b5332 outs: - cache: true - md5: 09e76ef24c694cdc9b5ce263cc6deca4 + md5: 583ae4d3d540b935495158f436f848ef metric: false path: data/training/NER/ner-wroc-train.json persist: false - cache: true - md5: ad8ce2a4657262084b8d81e9ed07ac1d + md5: 51eaf4ea624b692fb1b0daec14736224 metric: false path: data/training/NER/ner-wroc-validation.json persist: false - cache: true - md5: c02d5d314cf98f8327723edba692b8a4 + md5: c781b90cca2a14e35d20c9d597b49b5d metric: false path: data/training/NER/ner-wroc-test.json persist: false diff --git a/ner_wroc-19_fasttext.dvc b/ner_wroc-19_fasttext.dvc new file mode 100644 index 0000000..49bf3c8 --- /dev/null +++ b/ner_wroc-19_fasttext.dvc @@ -0,0 +1,18 @@ +cmd: python -m spacy train pl models/ner_wroc-19_fasttext data/training/NER/ner-wroc-19-train.json + data/training/NER/ner-wroc-19-validation.json --vectors models/blank_fasttext -p + ner -g 0 -n 80 -e 8 +deps: +- md5: b1be412efe7f8bcbf97cf43fc2c80ef0.dir + path: models/blank_NKJP_word2vec +- md5: a821c9f3c70d36c977673f7b06914c16 + path: data/training/NER/ner-wroc-19-train.json +- md5: 9ed18e190b97651637df77b1d541642c + path: data/training/NER/ner-wroc-19-validation.json +md5: 0947bbffdb3707555a0da9011f9a2f16 +outs: +- cache: true + md5: 099eb0933b6d2131641ff457013d43ba.dir + metric: false + path: models/ner_wroc-19_fasttext + persist: false +wdir: . diff --git a/ner_wroc-19_word2vec.dvc b/ner_wroc-19_word2vec.dvc index b463f25..9d36393 100644 --- a/ner_wroc-19_word2vec.dvc +++ b/ner_wroc-19_word2vec.dvc @@ -2,16 +2,16 @@ cmd: python -m spacy train pl models/ner_wroc-19_word2vec data/training/NER/ner- data/training/NER/ner-wroc-19-validation.json --vectors models/blank_NKJP_word2vec -p ner -g 0 -n 20 -e 2 deps: -- md5: 61eda27883b647a6c0be5725d3eb3ccb.dir +- md5: b1be412efe7f8bcbf97cf43fc2c80ef0.dir path: models/blank_NKJP_word2vec -- md5: 2a8a96bd480cc7908e137d18ba1c06de +- md5: a821c9f3c70d36c977673f7b06914c16 path: data/training/NER/ner-wroc-19-train.json -- md5: 3096af2e4e0434b2a869586e5b08954b +- md5: 9ed18e190b97651637df77b1d541642c path: data/training/NER/ner-wroc-19-validation.json -md5: 28f3c4a56cacccda75862abfe01e121d +md5: 8cb208a1300c8020a0c038994d46544b outs: - cache: true - md5: 3af0b7f53c402e4e9a7d36045b8ab8b6.dir + md5: 37074c5206ed8912bbdf746d04d34b4c.dir metric: false path: models/ner_wroc-19_word2vec persist: false diff --git a/ner_wroc_fasttext.dvc b/ner_wroc_fasttext.dvc new file mode 100644 index 0000000..254f3b4 --- /dev/null +++ b/ner_wroc_fasttext.dvc @@ -0,0 +1,18 @@ +cmd: python -m spacy train pl models/ner_wroc_fasttext data/training/NER/ner-wroc-train.json + data/training/NER/ner-wroc-validation.json --vectors models/blank_fasttext -p ner + -g 0 -n 80 -e 8 +deps: +- md5: fe3ebcb89593a8e1026e7668ffe6de23.dir + path: models/blank_fasttext +- md5: 583ae4d3d540b935495158f436f848ef + path: data/training/NER/ner-wroc-train.json +- md5: 51eaf4ea624b692fb1b0daec14736224 + path: data/training/NER/ner-wroc-validation.json +md5: f5c883da955b6daf11cb2bcdaf5492e0 +outs: +- cache: true + md5: 55a741e0221cc1041446eb2bfcf79908.dir + metric: false + path: models/ner_wroc_fasttext + persist: false +wdir: .