64 changes: 64 additions & 0 deletions ConvertNER/NER_pwr_to_wiki.py
@@ -0,0 +1,64 @@
+# uncompyle6 version 3.3.1
+# Python bytecode 3.7 (3394)
+# Decompiled from: Python 3.7.1 (default, Nov 6 2018, 18:46:03)
+# [Clang 10.0.0 (clang-1000.11.45.5)]
+# Embedded file name: /Users/quark/studia/zpp/new/utils/ConvertNER/NER_pwr_to_spacy.py
+# Size of source mod 2**32: 1685 bytes
+NER_pwr_to_spacy = {'person_nam':'PER', 'institution_nam':'ORG',
+ 'city_nam':'LOC',
+ 'person_last_nam':'PER',
+ 'person_first_nam':'PER',
+ 'document_nam':'MISC',
+ 'event_nam':'MISC',
+ 'organization_nam':'ORG',
+ 'country_nam':'LOC',
+ 'title_nam':'MISC',
+ 'band_nam':'ORG',
+ 'periodic_nam':'MISC',
+ 'company_nam':'ORG',
+ 'facility_nam':'ORG',
+ 'brand_nam':'ORG',
+ 'political_party_nam':'ORG',
+ 'road_nam':'LOC',
+ 'admin1_nam':'LOC',
+ 'person_add_nam':'PER',
+ 'software_nam':'MISC',
+ 'nation_nam':'MISC',
+ 'tech_nam':'MISC',
+ 'nam':'MISC',
+ 'treaty_nam':'MISC',
+ 'web_nam':'MISC',
+ 'admin2_nam':'LOC',
+ 'award_nam':'MISC',
+ 'continent_nam':'LOC',
+ 'astronomical_nam':'LOC',
+ 'media_nam':'ORG',
+ 'river_nam':'LOC',
+ 'currency_nam':'MISC',
+ 'toponym_nam':'LOC',
+ 'mountain_nam':'LOC',
+ 'historical_region_nam':'LOC',
+ 'district_nam':'LOC',
+ 'country_region_nam':'LOC',
+ 'subdivision_nam':'ORG',
+ 'admin3_nam':'LOC',
+ 'region_nam':'LOC',
+ 'square_nam':'LOC',
+ 'park_nam':'LOC',
+ 'island_nam':'LOC',
+ 'system_nam':'MISC',
+ 'www_nam':'MISC',
+ 'person_group_nam':'MISC',
+ 'license_nam':'MISC',
+ 'lake_nam':'LOC',
+ 'animal_nam':'MISC',
+ 'sea_nam':'LOC',
+ 'person_adj_nam':'PER',
+ 'bay_nam':'LOC',
+ 'peninsula_nam':'LOC',
+ 'conurbation_nam':'LOC',
+ 'vehicle_nam':'MISC',
+ 'organization_sub_nam':'ORG',
+ 'ocean_nam':'LOC',
+ 'cape_nam':'LOC'}
+# okay decompiling NER_pwr_to_spacy.cpython-37.pyc
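
A quick sanity check on what this mapping does (an editor's sketch, not part of the diff): every fine-grained KPWr "*_nam" channel collapses onto one of four coarse CoNLL-style classes: PER, ORG, LOC and MISC. The import path below is an assumption taken from the new file's name; adjust it to wherever the module actually lands.

# Hedged sketch: the import path is assumed from the diff header above.
from collections import Counter

from ConvertNER.NER_pwr_to_wiki import NER_pwr_to_spacy

# The codomain should be exactly the four coarse classes.
assert set(NER_pwr_to_spacy.values()) == {'PER', 'ORG', 'LOC', 'MISC'}

# How many fine-grained channels land in each coarse class.
print(Counter(NER_pwr_to_spacy.values()))
# As of this diff: Counter({'LOC': 25, 'MISC': 18, 'ORG': 10, 'PER': 5})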
231 changes: 136 additions & 95 deletions ConvertNER/convert_NER_wroc.py
@@ -34,13 +34,15 @@ def __init__(self, orth, attribs, id):
         self.id = id
 
     def is_NE(self):
-        return len(self.attribs) != 0
+        return self.get_NE() is not None and self.get_NE() != "O"
 
     def get_NE(self):
-        return self.attribs[0] if len(self.attribs) > 0 else ""
+        for attrib in self.attribs:
+            for k in attrib:
+                if attrib[k] != "0":
+                    return k
 
-    def get_cooccurences(self):
-        res = setCounter
+        return None
 
     def __str__(self):
         return (self.orth + ":" + str(self.attribs))
@@ -50,8 +52,8 @@ def process_token(tok):
     attribs = []
     orth = tok.find("orth").text
     for ann in tok.iter("ann"):
-        if ann.attrib['chan'].endswith("nam") and ann.text == "1":
-            attribs += [ann.attrib['chan']]
+        if ann.attrib['chan'].endswith("nam"): # and ann.text != "0":
+            attribs += [{ann.attrib['chan']: ann.text}]
 
     return Token(orth, attribs, -1)
 
@@ -83,50 +85,80 @@ def get_all_labels_with_cardinalities(tokens):
 
 def map_labels(tokens, map):
     for tok in tokens:
-        tok.attribs = [map[attrib] for attrib in tok.attribs]
+        tok.attribs = [{map[k]: v} for attrib in tok.attribs for k, v in attrib.items()]
 
     return tokens
 
 
-def pick_tags(tokens):
-    # first and last separately
-    if len(tokens) == 0:
-        return tokens
-    if len(tokens) == 1:
-        if tokens[0].is_NE():
-            tokens[0].attribs = [tokens[0].attribs[0]]
-        return tokens
-
-    t0 = tokens[0]
-    if len(t0.attribs) > 1:
-        new_tag = get_common_tag(t0, tokens[1])
-        if new_tag is None:
-            t0.attribs = [t0.attribs[0]]
-        else:
-            t0.attribs = [new_tag]
-
-    for i in range(1, len(tokens) - 1):
-        if len(tokens[i].attribs) > 1:
-            new_tag = get_common_tag(tokens[i - 1], tokens[i])
-            if new_tag is None:
-                new_tag = get_common_tag(tokens[i], tokens[i + 1])
-                if new_tag is None:
-                    tokens[i].attribs = [tokens[i].attribs[0]]
-                else:
-                    tokens[i].attribs = [new_tag]
-            else:
-                tokens[i].attribs = [new_tag]
+def still_in_sequence(v1, v2):
+    return any(v1e == v2e != "0" for v1e, v2e in zip(v1,v2))
+
+
+def get_last_label(v):
+    for i, e in enumerate(v):
+        if e != "0":
+            return i
+    return None
+
+
+def get_label_set(v):
+    res = set()
+    for i, e in enumerate(v):
+        if e != "0":
+            res.add(i)
+
+    return res
+
+
+import random
+def get_any_label(v):
+    if v == emptyset():
+        return None
+    return random.sample(v, 1)[0]
 
-    te = tokens[-1]
-    if len(te.attribs) > 1:
-        new_tag = get_common_tag(te, tokens[-2])
-        if new_tag is None:
-            te.attribs = [te.attribs[0]]
+def emptyset():
+    return set()
+
+def get_longest_sequences(tokens):
+    res = []
+    b = 0
+    e = 0
+    attribs = [k for d in tokens[0].attribs for k in d]
+    last_set = None
+    label_set = emptyset()
+    while e != len(tokens)-1:
+        current_token = tokens[e]
+
+        if last_set == None or label_set == emptyset():
+            last_set = [v for d in current_token.attribs for k, v in d.items()]
+            label_set = get_label_set(last_set)
+            b = e
         else:
-            te.attribs = [new_tag]
+            new_set = [v for d in current_token.attribs for k, v in d.items()]
+            label_set = label_set.intersection(get_label_set(new_set))
+            if not still_in_sequence(last_set, new_set):
+                label_id = get_any_label(label_set)
+                if(label_id != None):
+                    label = attribs[label_id]
+                    res.append((b, e, label))
+                b = e
+                label_set = emptyset()
+
+            last_set = new_set
+        e += 1
+
+    return res
 
-    assert (all(len(t.attribs) <= 1 for t in [t0] + tokens + [te]))
-    return [t0] + tokens[1:-2] + [te]
+
+# emptyset = set()
+def pick_tags(tokens):
+    longest_sequences = get_longest_sequences(tokens)
+    for b, e, label in longest_sequences:
+        seq = tokens[b:e]
+        for tok in seq:
+            tok.attribs = [{label: '1'}]
+        tokens[b:e] = seq
+    return tokens
 
 
 def convert_to_biluo(tokens):
@@ -137,10 +169,10 @@ def convert_to_biluo(tokens):
             if token.is_NE():
                 if tokens[i + 1].is_NE() and token.get_NE() == tokens[i + 1].get_NE():
                     # inner NE
-                    out += [Token(token.orth, ["I-" + token.get_NE()], token.id)]
+                    out += [Token(token.orth, [{"I-" + token.get_NE(): '1'}], token.id)]
                 else:
                     # last NE
-                    out += [Token(token.orth, ["L-" + token.get_NE()], token.id)]
+                    out += [Token(token.orth, [{"L-" + token.get_NE(): '1'}], token.id)]
                     in_ne = False
             else:
                 # we shouldn't ever get here
@@ -151,82 +183,91 @@
                 # new NE
                 if tokens[i + 1].is_NE() and token.get_NE() == tokens[i + 1].get_NE():
                     # beginning NE
-                    out += [Token(token.orth, ["B-" + token.get_NE()], token.id)]
+                    out += [Token(token.orth, [{"B-" + token.get_NE(): '1'}], token.id)]
                     in_ne = True
                 else:
                     # unit NE
-                    out += [Token(token.orth, ["U-" + token.get_NE()], token.id)]
+                    out += [Token(token.orth, [{"U-" + token.get_NE(): '1'}], token.id)]
                    in_ne = False
            else:
                # outside of NE
-                out += [Token(token.orth, ["O"], token.id)]
+                out += [Token(token.orth, [{"O": '1'}], token.id)]
 
     # process last token
     token = tokens[-1]
     if in_ne:
-        out += [Token(token.orth, ["L-" + token.get_NE()], token.id)]
+        out += [Token(token.orth, [{"L-" + token.get_NE(): '1'}], token.id)]
     else:
         if token.is_NE():
-            out += [Token(token.orth, ["U-" + token.get_NE()], token.id)]
+            out += [Token(token.orth, [{"U-" + token.get_NE(): '1'}], token.id)]
         else:
-            out += [Token(token.orth, ["O"], token.id)]
+            out += [Token(token.orth, [{"O": '1'}], token.id)]
 
     return out
 
 
+def get_file_paths(index_path):
+    with open(index_path) as index_file:
+        files = []
+        line = index_file.readline()
+        while line:
+            line = line.replace('\n', '')
+            files.append(line)
+            line = index_file.readline()
+
+    return files
+
+
 @click.command()
 @click.option("-m", "--use-label-map", type=bool, default=False)
 @click.argument("output_path", type=str)
 def main(
     use_label_map,
     output_path,
 ):
-    if use_label_map:
-        # classes = set(NER_pwr_to_spacy.values())
-        # output = f'NER_wroc_{len(classes)}.json'
-        # this would be a cool feature but I'm not sure if it's good for automatic pipelines
-        output = 'NER_wroc_spacy_labels.json'
-    all_labels = setCounter()
     corpus = []
     doc_idx = 0
-    for subfolder in get_subdirs(os.path.join(path_prefix, corpus_path)):
-        for file in os.listdir(os.path.join(path_prefix, corpus_path, subfolder)):
-            if not file.endswith("rel.xml") and not file.endswith(".ini"):
-                sentences = []
-                token_idx = 0
-                tree = ET.parse(os.path.join(path_prefix, corpus_path, subfolder, file))
-                root = tree.getroot()
-                sents = root.iter("sentence")
-                for sent in sents:
-                    tokens = []
-                    for tok in sent.iter("tok"):
-                        token = process_token(tok)
-                        token.id = token_idx
-                        token_idx += 1
-                        tokens += [token]
-
-                    all_labels.merge(get_all_labels_with_cardinalities(tokens)) # for debug and analysis
-                    tokens = pick_tags(tokens)
-                    if use_label_map:
-                        tokens = map_labels(tokens, NER_pwr_to_spacy)
-                    tokens = convert_to_biluo(tokens)
-
-                    sent = {'tokens': [{
-                        'orth': t.orth,
-                        'id': t.id,
-                        'ner': t.get_NE()}
-                        for t in tokens
-                    ], 'brackets': []
-                    }
-
-                    sentences += [sent]
-
-                doc_json = {
-                    'id': doc_idx,
-                    'paragraphs': [{'sentences': sentences}]
-                }
-                corpus += [doc_json]
-                doc_idx += 1
+    file_paths = get_file_paths(os.path.join(path_prefix, corpus_path, 'index_names.txt'))
+    for file in file_paths:
+        file = os.path.join(path_prefix, corpus_path, file)
+        assert(not file.endswith("rel.xml") and not file.endswith(".ini"))
+        sentences = []
+        token_idx = 0
+        tree = ET.parse(file)
+        root = tree.getroot()
+        sents = root.iter("sentence")
+        for sent in sents:
+            tokens = []
+            for tok in sent.iter("tok"):
+                token = process_token(tok)
+                token.id = token_idx
+                token_idx += 1
+                tokens += [token]
+
+            # all_labels.merge(get_all_labels_with_cardinalities(tokens)) # for debug and analysis
+            tokens = pick_tags(tokens)
+            # tokens = flatten_token_attrib_dicts(tokens)
+
+            if use_label_map:
+                tokens = map_labels(tokens, NER_pwr_to_spacy)
+            tokens = convert_to_biluo(tokens)
+
+            sent = {'tokens': [{
+                'orth': t.orth,
+                'id': t.id,
+                'ner': t.get_NE()}
+                for t in tokens
+            ], 'brackets': []
+            }
+
+            sentences += [sent]
+
+        doc_json = {
+            'id': doc_idx,
+            'paragraphs': [{'sentences': sentences}]
+        }
+        corpus += [doc_json]
+        doc_idx += 1
 
     with open(os.path.expanduser(output_path), 'w+') as f:
         json.dump(corpus, f)
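
For review context: convert_to_biluo emits the BILUO scheme that spaCy's v2 training JSON expects, with B(egin), I(nside) and L(ast) over multi-token entities, U(nit) for single-token entities, and O elsewhere. Below is a minimal, self-contained illustration of that span-to-BILUO logic. The helper tag_biluo and its flat label list are hypothetical stand-ins, not this file's Token-based implementation, which carries each label in a one-entry dict such as {'B-PER': '1'}.

# Hypothetical helper, not the function in convert_NER_wroc.py: same BILUO
# branch structure, but over a flat list of labels (None = outside any entity).
def tag_biluo(labels):
    tags = []
    n = len(labels)
    for i, lab in enumerate(labels):
        if lab is None:
            tags.append('O')
            continue
        starts = i == 0 or labels[i - 1] != lab
        ends = i == n - 1 or labels[i + 1] != lab
        if starts and ends:
            tags.append('U-' + lab)
        elif starts:
            tags.append('B-' + lab)
        elif ends:
            tags.append('L-' + lab)
        else:
            tags.append('I-' + lab)
    return tags

# "Jan Kowalski mieszka we Wrocławiu ." has a 2-token PER and a 1-token LOC:
print(tag_biluo(['PER', 'PER', None, None, 'LOC', None]))
# ['B-PER', 'L-PER', 'O', 'O', 'U-LOC', 'O']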
13 changes: 7 additions & 6 deletions NER_wroc-19.json.dvc
@@ -1,15 +1,16 @@
-cmd: python ConvertNER/convert_NER_wroc.py -m true data/NER/NER_wroc-19.json
+cmd: python ConvertNER/convert_NER_wroc.py -m true data/NER/NER_wroc-19.json
 deps:
-- md5: d84971d4b907e5efc5d9320de6691027.dir
+- md5: edb877fcf74af64289c0c32299288927.dir
   path: data/kpwr-1.1
-- md5: c8aa684e59762c66aeba79e2727c103f
+- md5: 0c7d0ba89998f4c6c7cf84a50d2a6654
   path: ConvertNER/convert_NER_wroc.py
-- md5: eee1569106fcf22473ee5a39f49f57bd
+- md5: d7d343ce8b47f93f20e3870d91c6150e
   path: ConvertNER/NER_pwr_to_spacy.py
-md5: 8edf603b1572083aedf1a95147deec94
+locked: true
+md5: 9618dffa84d6309d470a77da9e8de843
 outs:
 - cache: true
-  md5: ffd284e41307a7b0815d10623a0b4c99
+  md5: 25117be4c42e22d242c1e50d066fa35d
   metric: false
   path: data/NER/NER_wroc-19.json
   persist: false
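
The refreshed checksums pin the regenerated corpus to this version of the converter, so dvc repro will treat the stage as up to date. As a hedged spot-check of the output (assuming the spaCy v2 JSON layout produced by main above, run from the repository root):

# Sketch: tally BILUO tags in the regenerated corpus; the path matches the
# `outs` entry of this .dvc stage.
import json
from collections import Counter

with open('data/NER/NER_wroc-19.json') as f:
    corpus = json.load(f)

tags = Counter(
    tok['ner']
    for doc in corpus
    for para in doc['paragraphs']
    for sent in para['sentences']
    for tok in sent['tokens']
)
print(len(corpus), 'documents;', sum(tags.values()), 'tokens')
print(tags.most_common(8))  # 'O' should dominate; the rest are B-/I-/L-/U- tags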