import json
import os
import pandas as pd
from coreftools.formats import conll
from corefdb.layers import load_cache, save_cache, get_value, get_list_value
from corefdb.layers._conll_tree_parser import TreeParser
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
def extract_named_entities(outfpath, *infpaths, key_callback=None):
print("- extracting named entities")
data = {
#doc.key: {
# (start, stop): kind
# for start, stop, kind in [
# ne
# for sent in doc.sentences
# for ne in conll.col2spans(sent.iter_tokens(10),
# offset=sent.first_token_index)
# ]
#}
doc.key: {
(start, stop): kind
for sent in doc.sentences
for start, stop, kind in conll.col2spans(sent.iter_tokens(10),
offset=sent.first_token_index)
}
for doc in conll.read_files(*infpaths, key_callback=key_callback)
}
save_cache(outfpath, data)
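# The cache written above maps each document key to a {span: entity_type}
# dict, where spans are (start, stop) token offsets within the document.
# A purely hypothetical example of one entry (key and offsets are made up):
#
#   data['bn/cnn/00/cnn_0001'] = {
#       (12, 14): 'PERSON',   # tokens 12-13 form a PERSON mention
#       (20, 21): 'GPE',
#   }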
"""
# very slow because of `mentions.loc[i]`
#def annotate_name_entities(mentions, fpath):
dic = load_cache(fpath)
mentions['named_entity_type'] = [
get_value(
dic,
*mentions.loc[i][['text_id', 'text_start', 'text_stop']],
extend=1)
for i in mentions.index
]
mentions['is_named_entity'] = mentions['named_entity_type'].notna()
mentions['is_name'] = mentions['named_entity_type'].apply(
lambda x: x in {'PERSON', 'FAC', 'ORG', 'GPE', 'WORK_OF_ART'})
"""
"""
# better, but uses a bit more memory and is a bit slower because it creates a
# second DataFrame and then concatenates it
#def annotate_name_entities(mentions, fpath):
dic = load_cache(fpath)
def gen(text, start, stop):
res = get_value(dic, text, start, stop, extend=1)
if res is None:
return [None]*3
return (res, res is not None,
res in {'PERSON', 'FAC', 'ORG', 'GPE', 'WORK_OF_ART'})
df = pd.DataFrame(
data=(
gen(text, start, stop)
for text, start, stop in zip(
mentions['text_id'],
mentions['text_start'],
mentions['text_stop'],
)
),
columns=[
'named_entity_type',
'is_named_entity',
'is_name',
],
index=mentions.index,
)
return pd.concat([mentions, df], axis=1)
"""
# the best option: avoids `mentions.loc[i]` and does not create a second DataFrame
def annotate_name_entities(mentions, fpath):
print("- named entities")
dic = load_cache(fpath)
mentions['named_entity_type'] = [
get_value(dic, text, start, stop, extend=1)
for text, start, stop in zip(
mentions['text_id'],
mentions['text_start'],
mentions['text_stop'],
)
]
mentions['is_named_entity'] = [
not pd.isnull(val)
for val in mentions['named_entity_type']
]
mentions['is_name'] = mentions['named_entity_type'].apply(
lambda x: x in {
'PERSON',
'FAC', 'FACILITY',
'ORG',
'GPE',
'WORK_OF_ART',
'NORP',
'LOCATION',
'PRODUCT',
'EVENT',
'LAW',
'LANGUAGE',
})
# NOTE: this overrides the broader definition above, so in the end only
# PERSON entities count as names
mentions['is_name'] = mentions['named_entity_type'] == 'PERSON'
def extract_speakers(outfpath, *infpaths, key_callback=None):
print("- extracting speaker")
data = {
doc.key:[
sent.tokens[0][9] if sent.tokens[0][9] not in "-_" else None
for sent in doc.sentences
]
for doc in conll.read_files(*infpaths, key_callback=key_callback)
}
save_cache(outfpath, data, as_is=True)
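# The speaker cache stores, for every document, one speaker value per
# sentence (None when the speaker column is '-' or '_').  A purely
# hypothetical entry (document key and speaker names are made up):
#
#   data['bc/msnbc/00/msnbc_0000'] = ['speaker_1', 'speaker_1', None, 'speaker_2']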
def annotate_speakers(mentions, fpath):
print("- spearkers")
dic = load_cache(fpath, as_is=True)
#for id_, ser in mentions[['text_id', 'text_sent_index']].iterrows():
# print(f"mention {id_}, text {ser['text_id']}, sent {ser['text_sent_index']}")
# print(dic[ser['text_id']][ser['text_sent_index']])
mentions['speaker'] = [
dic[text_id][text_sent_index]
for text_id, text_sent_index in zip(
mentions['text_id'],
mentions['text_sent_index'],
)
]
def extract_argument_structures(outfpath, *infpaths, key_callback=None):
print("- extracting argument structures")
data = dict()
counter = 0
for doc in conll.read_files(*infpaths, key_callback=key_callback):
print("... from '%s'" % doc.key)
data[doc.key] = dict()
for sent in doc.sentences:
cols = list(range(11, len(sent.tokens[0])-1))
for col in cols:
args = list(conll.col2spans(
sent.iter_tokens(col),
offset=sent.first_token_index))
is_neg = any(x[2] == "ARGM-NEG" for x in args)
dic = {
(start, stop): [counter+i, i, kind, is_neg]
for i, (start, stop, kind) in enumerate(args)
if kind != 'V'
}
counter += len(dic)
data[doc.key].update(dic)
save_cache(outfpath, data)
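# Each cached value is [struct_id, arg_index, arg_type, struct_is_negative]
# (the names used by annotate_argument_structures below), keyed by the
# argument span; the predicate ('V') span itself is never stored.  A purely
# hypothetical entry (key, offsets and ids are made up):
#
#   data['nw/wsj/00/wsj_0001'] = {
#       (3, 5): [42, 0, 'ARG0', False],
#       (6, 9): [43, 2, 'ARG1', False],
#   }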
def annotate_argument_structures(mentions, fpath):
print("- argument structures")
dic = load_cache(fpath)
def gen(text, start, stop):
res = get_value(dic, text, start, stop, extend=1)
if res is None:
return (False, None, None, None, None, None)
return (True,
res[0],
res[1],
res[2],
res[3],
res[2] == "ARG0",
)
df = pd.DataFrame(
data=(
gen(text, start, stop)
for text, start, stop in zip(
mentions['text_id'],
mentions['text_start'],
mentions['text_stop'],
)
),
columns=[
'is_arg',
'struct_id',
'arg_index',
'arg_type',
'struct_is_negative',
'arg_is_agent',
],
index=mentions.index,
)
return pd.concat([mentions, df], axis=1)
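# Unlike annotate_name_entities() and annotate_speakers(), which add their
# columns in place, this function returns a new DataFrame, so the result has
# to be reassigned.  A minimal usage sketch (the cache path is hypothetical):
#
#   mentions = annotate_argument_structures(mentions, 'cache/struct')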
def extract_wordnet_synsets(*infpaths, outfpath, inventories2wordnet_fpath,
syntax_cache_fpath, key_callback=None):
"""NOTE: WN in nltk is 3.0, so we are look for v3.0"""
print("- extracting WordNet synsets")
data = dict()
# extract heads: data = { doc_key: {(head_start, head_stop): [None, None]} }
# where the two slots will hold the (official synset, guessed synset)
syntax_cache = load_cache(syntax_cache_fpath)
h_text_start_index = _NODE_ATTRIBUTE_LIST.index('h_text_start')
h_text_stop_index = _NODE_ATTRIBUTE_LIST.index('h_text_stop')
for doc_key, syntax in syntax_cache.items():
data[doc_key] = dict()
# iterate over the per-document span dict without shadowing `syntax`
for _pos, node_attrs in syntax.items():
h_pos = (node_attrs[h_text_start_index], node_attrs[h_text_stop_index])
data[doc_key][h_pos] = [None, None]
del syntax_cache
# load cached data
with open(inventories2wordnet_fpath) as f:
inventories2wordnet = json.load(f)
def synset_exists(synset):
try:
synset = wordnet.synset(synset)
if synset.lemmas()[0].name()[0].isupper():
return False
return True
except nltk.corpus.reader.wordnet.WordNetError:
return False
# For each head, extract (lemma, pos, word sense) and look it up in
# inventories2wordnet to find the WordNet synset. If none is found, set a
# guessed one. In both cases, check that the synset really exists.
lemmatizer = WordNetLemmatizer()
for doc in conll.read_files(*infpaths, key_callback=key_callback):
print("... from '%s'" % doc.key)
for sent in doc.sentences:
for i, (form, pos, lemma, sense) in enumerate(zip(
sent.iter_tokens(3),
sent.iter_tokens(4),
sent.iter_tokens(6),
sent.iter_tokens(8))):
i += sent.first_token_index
#print(i, form, lemma, pos, sense)
if (i,i+1) in data[doc.key]:
pos = pos[0].lower()
if pos not in 'nv':  # only nouns and verbs are looked up
continue
synset = None
# some sense recorded
if lemma != '-' and sense != '-':
inventory = "%s-%s-%s" % (lemma, pos, sense)
# with predicted pos tags the inventory key may be wrong or missing:
if inventory in inventories2wordnet:
for version, sense in inventories2wordnet[inventory]:
# the WordNet in nltk is 3.0, so we look for v3.0
if version == "3.0" and sense is not None:
synset = "%s.%s.%02d" % \
(lemma, pos, int(sense))
#input(synset)
if synset_exists(synset):
data[doc.key][(i,i+1)][0] = synset
#print('found (official)')
break
if not data[doc.key][(i,i+1)][0]: # not found
if lemma == '-':
lemma = lemmatizer.lemmatize(form.lower(), pos)
synset = "%s.%s.01" % (lemma, pos)
#input(synset)
if synset_exists(synset):
#print('found (guessed)')
data[doc.key][(i,i+1)][1] = synset
save_cache(outfpath, data)
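# Each cached value is a pair [official_synset, guessed_synset]; either slot
# may remain None.  A purely hypothetical entry (key, offsets and synsets are
# made up):
#
#   data['nw/wsj/00/wsj_0001'] = {
#       (7, 8): ['dog.n.01', None],    # sense mapped via inventories2wordnet
#       (9, 10): [None, 'cat.n.01'],   # fell back to "<lemma>.<pos>.01"
#   }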
def annotate_wordnet_synsets(mentions, fpath, keep_guessed=False,
merge_official_and_guessed=False):
print("- wordnet synsets")
dic = load_cache(fpath)
def gen(text, start, stop):
official, guessed = get_list_value(dic, text, start, stop, length=2)
if merge_official_and_guessed:
return official if official else guessed
return official, guessed
df = pd.DataFrame(
data=(
gen(text, start, stop)
for text, start, stop in zip(
mentions['text_id'],
mentions['h_text_start'],
mentions['h_text_stop'],
)
),
columns=['wn'] if merge_official_and_guessed else ['wn', 'guessed_wn'],
index=mentions.index,
)
if not keep_guessed and 'guessed_wn' in df.columns:
df = df.drop('guessed_wn', axis=1)
return pd.concat([mentions, df], axis=1)
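# A minimal usage sketch (the cache path is hypothetical): with
# merge_official_and_guessed=True a single 'wn' column is added; otherwise a
# 'wn' column plus, if keep_guessed=True, a 'guessed_wn' column.
#
#   mentions = annotate_wordnet_synsets(mentions, 'cache/wordnet',
#                                       keep_guessed=True)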
def _gattr(obj, attr, add=None):
if obj is None:
return None
attr = getattr(obj, attr)
if add is not None:
attr += add
return attr
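# _gattr is a small None-propagating helper used by the lambdas below; two
# illustrative (hypothetical) calls:
#
#   _gattr(None, 'tag')          -> None
#   _gattr(node, 'start', add=1) -> node.start + 1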
_NODE_ATTRIBUTES = (
# tag
'tag',
('parent_phrase_tag', lambda n: _gattr(n.parent_phrase, 'tag')),
('parent_clause_tag', lambda n: _gattr(n.parent_clause, 'tag')),
#'function_tag', # not annotated in conll2012
# nature
'is_clause',
'is_phrase',
('is_word', lambda n: n.is_leaf),
('pspeech', lambda n: n.pspeech if n.is_leaf else None),
# for relations
('parent_phrase_id', lambda n: _gattr(n.parent_phrase, 'id_')),
('parent_clause_id', lambda n: _gattr(n.parent_clause, 'id_')),
# preposition
('in_pp', lambda n: bool(n.pp)),
('preposition', lambda n: _gattr(_gattr(n.pp, 'preposition'), 'lemma')),
# position
'node_depth',
'clause_depth',
'phrase_depth',
'is_in_main_clause',
'is_in_matrix',
'is_embedded',
'is_in_embedded',
# dependent
'dependent_count',
'predependent_count',
'postdependent_count',
'adjective_dependent_counter',
'noun_dependent_counter',
'clause_dependent_counter',
'phrase_dependent_counter',
'other_dependent_counter',
# determiner
('determiner_string', lambda n: n.determiner_string.lower()),
('determiner_head_string', lambda n: n.determiner_head_string.lower()),
'determiner_type',
'has_bare_determiner',
'has_genetive_determiner',
'has_complex_determiner',
'is_possessive',
'is_genitive',
# head
('head', lambda n: _gattr(n.head, 'string')),
('h_pspeech', lambda n: _gattr(n.head, 'tag')),
('h_ud_pspeech', lambda n: _gattr(n.head, 'ud_pspeech')),
('h_broad_pspeech', lambda n: _gattr(n.head, 'broad_pspeech')),
('h_noun_type', lambda n: _gattr(n.head, 'noun_type')),
('h_number', lambda n: {"True":"p", "False":"s", "None":None} \
[str(_gattr(n.head, 'is_plural_noun'))]),
('h_lemma', lambda n: _gattr(n.head, 'lemma')),
('h_node_depth', lambda n: _gattr(n.head, 'node_depth')),
('h_start', lambda n: _gattr(n.head, 'start')),
('h_stop', lambda n: _gattr(n.head, 'stop')),
('h_text_start', lambda n: _gattr(n.head, 'text_start')),
('h_text_stop', lambda n: _gattr(n.head, 'text_stop')),
)
_NODE_ATTRIBUTE_LIST = [
x if isinstance(x, str) else x[0] for x in _NODE_ATTRIBUTES
]
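# _NODE_ATTRIBUTES drives both the cache layout and the columns added by
# annotate_syntax_tree(): plain strings are read with getattr(node, name),
# while (name, fn) pairs call fn(node).  _NODE_ATTRIBUTE_LIST keeps only the
# names, in the same order, so a cached value list can be indexed by name
# (as done in extract_wordnet_synsets() above).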
_NODE_ATTRIBUTES_FOR_RELATIONS = (
'descendant_list',
'phrase_descendant_list',
'clause_descendant_list',
)
def extract_syntax_tree(outfpath, outfpath_for_relations, *infpaths,
key_callback=None, head_pos_cache=None):
print("- extracting syntax tree")
heads = load_cache(head_pos_cache) if head_pos_cache else None
data = dict()
data_for_relations = dict()
for doc in conll.read_files(*infpaths, key_callback=key_callback):
print("... from %s" % doc.key)
data[doc.key] = dict()
data_for_relations[doc.key] = dict()
for sent in doc.sentences:
# cols
word_col = list(sent.iter_tokens(3))
pspeech_col = list(sent.iter_tokens(4))
tree_col = list(sent.iter_tokens(5))
lemma_col = list(sent.iter_tokens(6))
# prepare
tree_string = "\n".join(tree_col).replace('*', ' *')
lemma_col = [
l
if (l and l != '-') else w
for w, l in zip(word_col, lemma_col)
]
# parse
parser = TreeParser(
tree_string,
list(zip(pspeech_col, word_col, lemma_col)),
)
parser.parse()
# set text_start/text_stop on all nodes before any further processing
for node in parser.node_list:
node.text_start = node.start + sent.first_token_index
node.text_stop = node.stop + sent.first_token_index
if heads:
pos2nodes = {
(node.text_start, node.text_stop): node
for node in parser.node_list
}
doc_heads = heads[doc.key]
for node in parser.node_list:
pos = (node.text_start, node.text_stop)
if pos in doc_heads:
pos = (doc_heads[pos], doc_heads[pos]+1)
node.set_head(pos2nodes[pos])
print("Found!")
else:
print("WARNING: head not found")
for node in parser.node_list:
start = node.start + sent.first_token_index
stop = node.stop + sent.first_token_index
# keep the most nested element for each span (so later nodes overwrite
# earlier ones), except that a leaf never overwrites an existing entry:
if node.is_leaf and (start, stop) in data[doc.key]:
continue
data[doc.key][(start, stop)] = []
for attr in _NODE_ATTRIBUTES:
if not isinstance(attr, str):
attr = attr[1](node)
else:
attr = getattr(node, attr)
data[doc.key][(start, stop)].append(attr)
data_for_relations[doc.key][(start, stop)] = []
for attr in _NODE_ATTRIBUTES_FOR_RELATIONS:
attr = getattr(node, attr)
data_for_relations[doc.key][(start, stop)].append(attr)
save_cache(outfpath, data)
save_cache(outfpath_for_relations, data_for_relations)
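# The syntax cache mirrors the other layer caches: one dict per document,
# keyed by (text_start, text_stop) spans, whose values list the node
# attributes in _NODE_ATTRIBUTES order.  A purely hypothetical, truncated
# entry (key, offsets and values are made up):
#
#   data['nw/wsj/00/wsj_0001'][(3, 6)] = ['NP', 'VP', 'S', False, True, ...]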
def annotate_syntax_tree(mentions, fpath):
print("- syntax tree")
dic = load_cache(fpath)
length = len(_NODE_ATTRIBUTES)
df = pd.DataFrame(
data=(
get_list_value(dic, text, start, stop, length)
for text, start, stop in zip(
mentions['text_id'],
mentions['text_start'],
mentions['text_stop'],
)
),
columns=[
attr if isinstance(attr, str) else attr[0]
for attr in _NODE_ATTRIBUTES
],
index=mentions.index,
)
return pd.concat([mentions, df], axis=1)
def add_other_annotations(db):
print("- other annotations")
#db['texts']['genre'] = [x[0:2] for x in db['texts'].index]
db['texts']['source'] = [x[2:x.index('_')] for x in db['texts'].index]
def add_token_pos(db, *infpaths):
print("- token part of speech")
tokens = db['tokens']
grouped = tokens.groupby('text_id')
dic = dict()
for doc in conll.read_files(*infpaths):
df = grouped.get_group(doc.key)
df = df.sort_index()
tks = [ t for sent in doc.sentences for t in sent.iter_tokens(4) ]
assert len(df.index) == len(tks)
d = dict(zip(df.index, tks))
dic.update(d)
#for sent in doc.sentences:
# pspeech_col = list(sent.iter_tokens(4))
tokens['pos'] = [ dic[index] for index in tokens.index ]
def build_cache_conll2012(*infpaths, cache_dir, inventories2wordnet_fpath):
if not os.path.exists(cache_dir):
os.mkdir(cache_dir)
extract_named_entities(
os.path.join(cache_dir, 'ne'),
*infpaths,
)
extract_speakers(
os.path.join(cache_dir, 'spk'),
*infpaths,
)
extract_syntax_tree(
os.path.join(cache_dir, 'tree'),
os.path.join(cache_dir, 'tree_rel'),
*infpaths,
#head_pos_cache=None,
)
extract_argument_structures(
os.path.join(cache_dir, 'struct'),
*infpaths,
)
extract_wordnet_synsets(
*infpaths,
outfpath=os.path.join(cache_dir, 'wordnet'),
inventories2wordnet_fpath=inventories2wordnet_fpath,
syntax_cache_fpath=os.path.join(cache_dir, "tree"),
)
def add_conll2012_specific_annotations(db, cache_dir, *infpaths):
#print("... named entities")
# in place
annotate_name_entities(
db['mentions'],
fpath=os.path.join(cache_dir, 'ne'))
#print("... speaker")
# in place
annotate_speakers(
db['mentions'],
fpath=os.path.join(cache_dir, 'spk'))
#print("... syntax")
db['mentions'] = annotate_syntax_tree(
db['mentions'],
fpath=os.path.join(cache_dir, 'tree'),
)
#print("... argument structure")
db['mentions'] = annotate_argument_structures(
db['mentions'],
fpath=os.path.join(cache_dir, 'struct'),
)
#print("... wordnet")
db['mentions'] = annotate_wordnet_synsets(
db['mentions'],
fpath=os.path.join(cache_dir, 'wordnet'),
merge_official_and_guessed=False,
)
#print("Adding other informations...")
add_other_annotations(db)
add_token_pos(db, *infpaths)
return db
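# A minimal end-to-end sketch (all paths and the `db` loader are hypothetical;
# `db` is expected to hold at least the 'mentions', 'tokens' and 'texts'
# DataFrames used above):
#
#   conll_files = ['train.english.v4_gold_conll']
#   build_cache_conll2012(
#       *conll_files,
#       cache_dir='cache',
#       inventories2wordnet_fpath='inventories2wordnet.json',
#   )
#   db = add_conll2012_specific_annotations(db, 'cache', *conll_files)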