11 changes: 3 additions & 8 deletions ConvertNER/convert_NER.py
@@ -1,7 +1,6 @@
import xml.etree.ElementTree as ET
from spacy.lang.pl import Polish
from spacy.gold import biluo_tags_from_offsets
import spacy
from spacy.lang.pl import Polish
import json
import os

@@ -139,8 +138,8 @@ def required_files_exist(dir):
    if not os.path.isdir((os.path.join(path_prefix,corpus_path,current_folder))):
        continue

    # we skip the docs that don't have the required annotations (certain .xml files)
    if not required_files_exist(current_folder):
        # doc_id +=1 ?
        continue

    tree_morphosyntax = ET.parse(os.path.join(path_prefix,corpus_path,current_folder,morphosyntax_xml))
@@ -209,9 +208,5 @@ def required_files_exist(dir):
    doc_id += 1
    corpus += [doc_json]

out_path = os.path.expanduser(os.path.join(path_prefix, output_path))
if not os.path.exists(out_path):
    os.makedirs(out_path)

with open(os.path.join(out_path, output), 'w+') as f:
with open(os.path.expanduser(os.path.join(path_prefix, output_path, output)), 'w+') as f:
    json.dump(corpus, f)
185 changes: 185 additions & 0 deletions ConvertNER/convert_NER_wroc.py
@@ -0,0 +1,185 @@
import xml.etree.ElementTree as ET
import json
import os

path_prefix = './'
corpus_path = 'data/kpwr-1.1/'
output_path = 'data/NER/'
output = 'NER_wroc.json'
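
# Input/output locations: the KPWr 1.1 corpus is expected under data/kpwr-1.1/
# (tracked via data/kpwr-1.1.dvc in this change) and the converted corpus is
# written as spaCy-style training JSON to data/NER/NER_wroc.json.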

doc_id = 0
corpus = []

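# Mapping from NKJP-style entity type names to spaCy labels. Note that it is not
# applied anywhere below; the raw KPWr annotation channel names collected in
# process_token() are what ends up in the output.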
NE_njkp_to_spacy = {'persName': 'PERSON',
                    'placeName': 'LOC',
                    'orgName': 'ORG',
                    'date': 'DATE',
                    'time': 'TIME',
                    'geogName': 'LOC'}


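# Minimal token container: `orth` is the surface form, `attribs` the list of NE
# annotation channels the token belongs to, and `id` its running index in the document.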
class Token:
    def __init__(self, orth, attribs, id):
        self.orth = orth
        self.attribs = attribs
        self.id = id

    def is_NE(self):
        return len(self.attribs) != 0

    def get_NE(self):
        return self.attribs[0] if len(self.attribs) > 0 else ""

    def __str__(self):
        return (self.orth + ":" + str(self.attribs))


def get_subdirs(dir):
    return [name for name in os.listdir(dir) if os.path.isdir(os.path.join(dir, name))]


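# A <tok> element holds its surface form in <orth> and one <ann> per annotation
# channel; channels whose name ends in "nam" and whose value is non-zero mark the
# token as belonging to a named entity of that type.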
def process_token(tok):
    attribs = []
    orth = tok.find("orth").text
    for ann in tok.iter("ann"):
        if ann.attrib['chan'].endswith("nam") and ann.text != "0":
            attribs += [ann.attrib['chan']]

    return Token(orth, attribs, -1)


def get_common_tag(t1, t2):
    set1 = set(t1.attribs)
    set2 = set(t2.attribs)
    common = list(set1 & set2)
    return common[0] if len(common) > 0 else None


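# pick_tags reduces tokens annotated with several NE channels to a single tag:
# each token prefers the tag it shares with a neighbour (previous first, then next)
# and falls back to its own first tag otherwise. The first and last tokens have
# only one neighbour each, so they are handled separately.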
def pick_tags(tokens):
    # first and last separately
    if len(tokens) == 0:
        return tokens
    if len(tokens) == 1:
        if tokens[0].is_NE():
            tokens[0].attribs = [tokens[0].attribs[0]]
        return tokens

    t0 = tokens[0]
    if len(t0.attribs) > 1:
        new_tag = get_common_tag(t0, tokens[1])
        if new_tag is None:
            t0.attribs = [t0.attribs[0]]
        else:
            t0.attribs = [new_tag]

    for i in range(1, len(tokens) - 1):
        if len(tokens[i].attribs) > 1:
            new_tag = get_common_tag(tokens[i - 1], tokens[i])
            if new_tag is None:
                new_tag = get_common_tag(tokens[i], tokens[i + 1])
                if new_tag is None:
                    tokens[i].attribs = [tokens[i].attribs[0]]
                else:
                    tokens[i].attribs = [new_tag]
            else:
                tokens[i].attribs = [new_tag]

    te = tokens[-1]
    if len(te.attribs) > 1:
        new_tag = get_common_tag(te, tokens[-2])
        if new_tag is None:
            te.attribs = [te.attribs[0]]
        else:
            te.attribs = [new_tag]

    assert all(len(t.attribs) <= 1 for t in tokens)
    # t0 and te alias tokens[0] and tokens[-1], so the middle slice must be [1:-1];
    # a [1:-2] slice would silently drop the second-to-last token.
    return [t0] + tokens[1:-1] + [te]


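# convert_to_biluo rewrites the single NE tag on each token into spaCy's BILUO
# scheme: B-/I-/L- for the beginning, inside and last token of a multi-token entity,
# U- for a single-token entity, and O for tokens outside any entity. Consecutive
# tokens carrying the same channel are treated as one entity.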
def convert_to_biluo(tokens):
    out = []
    in_ne = False
    for i, token in enumerate(tokens[:-1]):
        if in_ne:
            if token.is_NE():
                if tokens[i + 1].is_NE() and token.get_NE() == tokens[i + 1].get_NE():
                    # inner NE
                    out += [Token(token.orth, ["I-" + token.get_NE()], token.id)]
                else:
                    # last NE
                    out += [Token(token.orth, ["L-" + token.get_NE()], token.id)]
                    in_ne = False
            else:
                # we shouldn't ever get here
                assert (False)

        else:
            if token.is_NE():
                # new NE
                if tokens[i + 1].is_NE() and token.get_NE() == tokens[i + 1].get_NE():
                    # beginning NE
                    out += [Token(token.orth, ["B-" + token.get_NE()], token.id)]
                    in_ne = True
                else:
                    # unit NE
                    out += [Token(token.orth, ["U-" + token.get_NE()], token.id)]
                    in_ne = False
            else:
                # outside of NE
                out += [Token(token.orth, ["O"], token.id)]

    # process last token
    token = tokens[-1]
    if in_ne:
        out += [Token(token.orth, ["L-" + token.get_NE()], token.id)]
    else:
        if token.is_NE():
            out += [Token(token.orth, ["U-" + token.get_NE()], token.id)]
        else:
            out += [Token(token.orth, ["O"], token.id)]

    return out


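# Main conversion loop: walk each subdirectory of the KPWr corpus, parse every
# annotation XML (relation files and .ini configs are skipped), run each sentence
# through pick_tags and convert_to_biluo, and emit one document per file in the
# spaCy JSON training layout ({'id': ..., 'paragraphs': [{'sentences': [...]}]}).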
docs = []
doc_idx = 0
for subfolder in get_subdirs(os.path.join(path_prefix, corpus_path)):
    for file in os.listdir(os.path.join(path_prefix, corpus_path, subfolder)):
        if not file.endswith("rel.xml") and not file.endswith(".ini"):
            doc_json = {}
            sentences = []
            token_idx = 0
            raw = ""
            tree = ET.parse(os.path.join(path_prefix, corpus_path, subfolder, file))
            root = tree.getroot()
            sents = root.iter("sentence")
            for sent in sents:
                tokens = []
                for tok in sent.iter("tok"):
                    token = process_token(tok)
                    token.id = token_idx
                    token_idx += 1
                    tokens += [token]

                tokens = pick_tags(tokens)
                tokens = convert_to_biluo(tokens)

                sent = {'tokens': [{'orth': t.orth,
                                    'id': t.id,
                                    'ner': t.get_NE()}
                                   for t in tokens],
                        'brackets': []}

                sentences += [sent]

            doc_json = {
                'id': doc_idx,
                'paragraphs': [{'sentences': sentences}]
            }
            corpus += [doc_json]
            doc_idx += 1

with open(os.path.expanduser(os.path.join(path_prefix, output_path, output)), 'w+') as f:
    json.dump(corpus, f)
1 change: 1 addition & 0 deletions data/.gitignore
@@ -11,3 +11,4 @@ vocab.jsonl
/vectors_300.txt
tagmap.py
NKJP-PodkorpusMilionowy-1.2
/kpwr-1.1
7 changes: 7 additions & 0 deletions data/kpwr-1.1.dvc
@@ -0,0 +1,7 @@
md5: 58cbc0bd05749d04e4b6a5e4c9d78c01
outs:
- cache: true
  md5: d84971d4b907e5efc5d9320de6691027.dir
  metric: false
  path: kpwr-1.1
wdir: .
3 changes: 2 additions & 1 deletion data/lemmatizer_data/.gitignore
@@ -4,4 +4,5 @@
/rules.json

/lemma_sources
/sjp_ispell.tar.bz2
/sjp_ispell.tar.bz2
/lemma_sources_exp
14 changes: 14 additions & 0 deletions lemma_sources_exp.dvc
@@ -0,0 +1,14 @@
cmd: python lemma_rules_extraction/yield_all_suffixes.py
deps:
- md5: 947b48802b53bdff2ad02122c04063e5.dir
  path: data/lemmatizer_data/lemma_sources
- md5: abb151c19621000ccc91d8e4b494674f
  path: lemma_rules_extraction/yield_all_suffixes.py
md5: 53ab70bcbc27640766d00b4977b4733d
outs:
- cache: true
  md5: 4f29377d4c9dd5c9997c74750af31321.dir
  metric: false
  path: data/lemmatizer_data/lemma_sources_exp
  persist: false
wdir: .