diff --git a/ConvertNER/convert_NER.py b/ConvertNER/convert_NER.py
index b6eefd3..d8a1db2 100644
--- a/ConvertNER/convert_NER.py
+++ b/ConvertNER/convert_NER.py
@@ -1,7 +1,6 @@
 import xml.etree.ElementTree as ET
-from spacy.lang.pl import Polish
 from spacy.gold import biluo_tags_from_offsets
-import spacy
+from spacy.lang.pl import Polish
 import json
 import os
 
@@ -139,8 +138,8 @@ def required_files_exist(dir):
     if not os.path.isdir((os.path.join(path_prefix,corpus_path,current_folder))):
         continue
 
-    # we skip the docs that don't have the required annotations (certain .xml files)
     if not required_files_exist(current_folder):
+        # doc_id +=1 ?
         continue
 
     tree_morphosyntax = ET.parse(os.path.join(path_prefix,corpus_path,current_folder,morphosyntax_xml))
@@ -209,9 +208,5 @@ def required_files_exist(dir):
     doc_id += 1
     corpus += [doc_json]
 
-out_path = os.path.expanduser(os.path.join(path_prefix, output_path))
-if not os.path.exists(out_path):
-    os.makedirs(out_path)
-
-with open(os.path.join(out_path, output), 'w+') as f:
+with open(os.path.expanduser(os.path.join(path_prefix, output_path, output)), 'w+') as f:
     json.dump(corpus, f)
diff --git a/ConvertNER/convert_NER_wroc.py b/ConvertNER/convert_NER_wroc.py
new file mode 100644
index 0000000..fa98707
--- /dev/null
+++ b/ConvertNER/convert_NER_wroc.py
@@ -0,0 +1,191 @@
+import xml.etree.ElementTree as ET
+import json
+import os
+
+path_prefix = './'
+corpus_path = 'data/kpwr-1.1/'
+output_path = 'data/NER/'
+output = 'NER_wroc.json'
+
+doc_id = 0
+corpus = []
+
+NE_njkp_to_spacy = {'persName': 'PERSON',
+                    'placeName': 'LOC',
+                    'orgName': 'ORG',
+                    'date': 'DATE',
+                    'time': 'TIME',
+                    'geogName': 'LOC'}
+
+
+# a corpus token: its surface form plus the NE channels annotated on it
+class Token:
+    def __init__(self, orth, attribs, id):
+        self.orth = orth
+        self.attribs = attribs
+        self.id = id
+
+    def is_NE(self):
+        return len(self.attribs) != 0
+
+    def get_NE(self):
+        return self.attribs[0] if len(self.attribs) > 0 else ""
+
+    def __str__(self):
+        return (self.orth + ":" + str(self.attribs))
+
+
+def get_subdirs(dir):
+    return [name for name in os.listdir(dir) if os.path.isdir(os.path.join(dir, name))]
+
+
+# keep only the NE channels (names ending in "nam") that carry a real annotation
+def process_token(tok):
+    attribs = []
+    orth = tok.find("orth").text
+    for ann in tok.iter("ann"):
+        if ann.attrib['chan'].endswith("nam") and ann.text != "0":
+            attribs += [ann.attrib['chan']]
+
+    return Token(orth, attribs, -1)
+
+
+def get_common_tag(t1, t2):
+    set1 = set(t1.attribs)
+    set2 = set(t2.attribs)
+    common = list(set1 & set2)
+    return common[0] if len(common) > 0 else None
+
+
+def pick_tags(tokens):
+    # resolve multi-channel tokens to a single channel; first and last tokens are special-cased
+    if len(tokens) == 0:
+        return tokens
+    if len(tokens) == 1:
+        if tokens[0].is_NE():
+            tokens[0].attribs = [tokens[0].attribs[0]]
+        return tokens
+
+    t0 = tokens[0]
+    if len(t0.attribs) > 1:
+        new_tag = get_common_tag(t0, tokens[1])
+        if new_tag is None:
+            t0.attribs = [t0.attribs[0]]
+        else:
+            t0.attribs = [new_tag]
+
+    for i in range(1, len(tokens) - 1):
+        if len(tokens[i].attribs) > 1:
+            new_tag = get_common_tag(tokens[i - 1], tokens[i])
+            if new_tag is None:
+                new_tag = get_common_tag(tokens[i], tokens[i + 1])
+                if new_tag is None:
+                    tokens[i].attribs = [tokens[i].attribs[0]]
+                else:
+                    tokens[i].attribs = [new_tag]
+            else:
+                tokens[i].attribs = [new_tag]
+
+    te = tokens[-1]
+    if len(te.attribs) > 1:
+        new_tag = get_common_tag(te, tokens[-2])
+        if new_tag is None:
+            te.attribs = [te.attribs[0]]
+        else:
+            te.attribs = [new_tag]
+
+    assert all(len(t.attribs) <= 1 for t in tokens)
+    return [t0] + tokens[1:-1] + [te]
+
+
+def convert_to_biluo(tokens):
+    # an empty sentence would otherwise crash on tokens[-1] below
+    if len(tokens) == 0:
+        return []
+    out = []
+    in_ne = False
+    for i, token in enumerate(tokens[:-1]):
+        if in_ne:
+            if token.is_NE():
+                if tokens[i + 1].is_NE() and token.get_NE() == tokens[i + 1].get_NE():
+                    # inner NE
+                    out += [Token(token.orth, ["I-" + token.get_NE()], token.id)]
+                else:
+                    # last NE
+                    out += [Token(token.orth, ["L-" + token.get_NE()], token.id)]
+                    in_ne = False
+            else:
+                # we shouldn't ever get here: in_ne implies the current token is a NE
+                assert False
+
+        else:
+            if token.is_NE():
+                # new NE
+                if tokens[i + 1].is_NE() and token.get_NE() == tokens[i + 1].get_NE():
+                    # beginning NE
+                    out += [Token(token.orth, ["B-" + token.get_NE()], token.id)]
+                    in_ne = True
+                else:
+                    # unit NE
+                    out += [Token(token.orth, ["U-" + token.get_NE()], token.id)]
+                    in_ne = False
+            else:
+                # outside of NE
+                out += [Token(token.orth, ["O"], token.id)]
+
+    # process last token
+    token = tokens[-1]
+    if in_ne:
+        out += [Token(token.orth, ["L-" + token.get_NE()], token.id)]
+    else:
+        if token.is_NE():
+            out += [Token(token.orth, ["U-" + token.get_NE()], token.id)]
+        else:
+            out += [Token(token.orth, ["O"], token.id)]
+
+    return out
+
+
+# walk the corpus: every annotated XML file becomes one output document
+docs = []
+doc_idx = 0
+for subfolder in get_subdirs(os.path.join(path_prefix, corpus_path)):
+    for file in os.listdir(os.path.join(path_prefix, corpus_path, subfolder)):
+        if not file.endswith("rel.xml") and not file.endswith(".ini"):
+            doc_json = {}
+            sentences = []
+            token_idx = 0
+            raw = ""
+            tree = ET.parse(os.path.join(path_prefix, corpus_path, subfolder, file))
+            root = tree.getroot()
+            sents = root.iter("sentence")
+            for sent in sents:
+                tokens = []
+                for tok in sent.iter("tok"):
+                    token = process_token(tok)
+                    token.id = token_idx
+                    token_idx += 1
+                    tokens += [token]
+
+                tokens = pick_tags(tokens)
+                tokens = convert_to_biluo(tokens)
+
+                sent = {'tokens': [{
+                    'orth': t.orth,
+                    'id': t.id,
+                    'ner': t.get_NE()}
+                    for t in tokens
+                ], 'brackets': []
+                }
+
+                sentences += [sent]
+
+            doc_json = {
+                'id': doc_idx,
+                'paragraphs': [{'sentences': sentences}]
+            }
+            corpus += [doc_json]
+            doc_idx += 1
+
+with open(os.path.expanduser(os.path.join(path_prefix, output_path, output)), 'w+') as f:
+    json.dump(corpus, f)
diff --git a/data/.gitignore b/data/.gitignore
index 6cefda7..5f63d8c 100644
--- a/data/.gitignore
+++ b/data/.gitignore
@@ -11,3 +11,4 @@ vocab.jsonl
 /vectors_300.txt
 tagmap.py
 NKJP-PodkorpusMilionowy-1.2
+/kpwr-1.1
\ No newline at end of file
diff --git a/data/kpwr-1.1.dvc b/data/kpwr-1.1.dvc
new file mode 100644
index 0000000..275217c
--- /dev/null
+++ b/data/kpwr-1.1.dvc
@@ -0,0 +1,7 @@
+md5: 58cbc0bd05749d04e4b6a5e4c9d78c01
+outs:
+- cache: true
+  md5: d84971d4b907e5efc5d9320de6691027.dir
+  metric: false
+  path: kpwr-1.1
+wdir: .
diff --git a/data/lemmatizer_data/.gitignore b/data/lemmatizer_data/.gitignore
index a6cd652..cdafee2 100644
--- a/data/lemmatizer_data/.gitignore
+++ b/data/lemmatizer_data/.gitignore
@@ -4,4 +4,5 @@
 
 /rules.json
 /lemma_sources
-/sjp_ispell.tar.bz2
\ No newline at end of file
+/sjp_ispell.tar.bz2
+/lemma_sources_exp
\ No newline at end of file
diff --git a/lemma_sources_exp.dvc b/lemma_sources_exp.dvc
new file mode 100644
index 0000000..76ca409
--- /dev/null
+++ b/lemma_sources_exp.dvc
@@ -0,0 +1,14 @@
+cmd: python lemma_rules_extraction/yield_all_suffixes.py
+deps:
+- md5: 947b48802b53bdff2ad02122c04063e5.dir
+  path: data/lemmatizer_data/lemma_sources
+- md5: abb151c19621000ccc91d8e4b494674f
+  path: lemma_rules_extraction/yield_all_suffixes.py
+md5: 53ab70bcbc27640766d00b4977b4733d
+outs:
+- cache: true
+  md5: 4f29377d4c9dd5c9997c74750af31321.dir
+  metric: false
+  path: data/lemmatizer_data/lemma_sources_exp
+  persist: false
+wdir: .
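
Note on the new converter: convert_NER_wroc.py emits spaCy 2's nested JSON training layout, a list of documents, each holding paragraphs, sentences, and tokens whose "ner" field carries a BILUO tag over a KPWr channel name. A quick well-formedness check of the emitted file can catch conversion bugs early. The following is a minimal sketch of such a validator, not part of the patch, assuming the default paths from the diff (./data/NER/NER_wroc.json) and using only the standard library:

    import json
    import os

    # default output location assembled in convert_NER_wroc.py
    # (path_prefix + output_path + output)
    path = os.path.expanduser('./data/NER/NER_wroc.json')

    with open(path) as f:
        corpus = json.load(f)

    for doc in corpus:
        for para in doc['paragraphs']:
            for sent in para['sentences']:
                tags = [tok['ner'] for tok in sent['tokens']]
                # every tag is 'O' or a BILUO prefix plus a channel name
                assert all(t == 'O' or t[:2] in ('B-', 'I-', 'L-', 'U-') for t in tags)
                open_chan = None  # channel of the currently open B-...L- span
                for t in tags:
                    if t.startswith('B-'):
                        assert open_chan is None
                        open_chan = t[2:]
                    elif t.startswith(('I-', 'L-')):
                        assert open_chan == t[2:]
                        if t.startswith('L-'):
                            open_chan = None
                    else:  # 'O' or 'U-...'
                        assert open_chan is None
                assert open_chan is None  # no span left open at sentence end

    print(len(corpus), 'documents look well-formed')

Note that the emitted tags still use KPWr channel names; the NE_njkp_to_spacy mapping defined in the script is never applied, so any mapping to spaCy's label set would have to happen downstream.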