Not perfect, but most of the work is there. The next steps are to hardcode pronouns (such as 'it') and closed-class words, and to add multiple targets to handle things like 'shattered'. Not a one-liner, but not an impossible task!
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from pandas import Series, DataFrame
import collections
from nltk import wordnet
wn = wordnet.wordnet
def tag(x):
    """
    Tokenize a text string and part-of-speech tag its tokens.
    Arguments:
    x: Text to tag.
    Returns whatever pos_tag returns for the token list; the rest of this
    file consumes it as a sequence of (token, tag) pairs.
    """
    return pos_tag(word_tokenize(x))
try:
    # Python 3: the ABCs live in collections.abc (the aliases in
    # `collections` were removed in Python 3.10).
    from collections.abc import Iterable as _Iterable
except ImportError:
    # Python 2 fallback.
    from collections import Iterable as _Iterable

try:
    _ATOMIC_STRINGS = (basestring,)  # Python 2: covers str and unicode
except NameError:
    _ATOMIC_STRINGS = (str, bytes)  # Python 3: basestring no longer exists


def flatten(l):
    """
    Lazily yield the leaf elements of an arbitrarily nested iterable.
    Arguments:
    l: Iterable whose elements may themselves be iterables.
    Strings are treated as leaves and yielded whole; without that check a
    one-character string would contain itself and recurse forever.
    """
    for el in l:
        if isinstance(el, _Iterable) and not isinstance(el, _ATOMIC_STRINGS):
            # Explicit loop instead of `yield from` keeps Python 2 working.
            for sub in flatten(el):
                yield sub
        else:
            yield el
def noun_verb_match(phrase, nouns, verbs):
    """
    Collect every adjacent token pair whose tags form a noun/verb sequence.
    Arguments:
    phrase: Sequence of (token, tag) pairs.
    nouns: Container of tags accepted for the first token of a pair.
    verbs: Container of tags accepted for the second token of a pair.
    Returns a list of ((token, tag), (token, tag)) tuples in phrase order;
    the list is empty when phrase has fewer than two tokens.
    """
    # zip against the one-step-shifted phrase walks each adjacent pair once.
    return [
        (first, second)
        for first, second in zip(phrase, phrase[1:])
        if first[1] in nouns and second[1] in verbs
    ]
def hypernym_paths(word, pos):
    """
    Return the set of all synsets found on any hypernym path of a word.
    Arguments:
    word: Word to look up in Wordnet.
    pos: Part-of-speech restriction passed to wn.synsets (this file uses
    'n' and 'v').
    The result is empty when wn.synsets finds no sense for the word.
    """
    # Each sense yields a list of paths and each path is a list of synsets,
    # so two nested loops flatten the structure directly.
    return {
        synset
        for sense in wn.synsets(word, pos)
        for path in sense.hypernym_paths()
        for synset in path
    }
def bool_syn(double, noun_syn, verb_syn):
    """
    Tell whether a noun/verb double matches both target Wordnet Synsets.
    Arguments:
    double: ((noun, tag), (verb, tag))
    noun_syn, verb_syn: Wordnet Synset string (i.e., 'travel.v.01')
    Returns True only if the noun target is among the noun's hypernym-path
    synsets and the verb target is among the verb's.
    """
    noun = double[0][0]
    verb = double[1][0]
    # Resolve both targets up front so a bad synset name is reported no
    # matter how the noun test turns out.
    noun_target = wn.synset(noun_syn)
    verb_target = wn.synset(verb_syn)
    # `and` skips the verb lookup once the noun has already failed.
    return (noun_target in hypernym_paths(noun, 'n')
            and verb_target in hypernym_paths(verb, 'v'))
def bool_loop(l, f):
    """
    Tests list elements for truthiness under f and
    returns True if any is True.
    Arguments:
    l: List.
    f: Function returning boolean, called with one list element.
    Returns False for an empty list. Evaluation stops at the first element
    for which f is true.
    """
    # any() iterates instead of recursing, so long lists neither hit the
    # recursion limit nor get re-sliced on every step.
    return any(f(el) for el in l)
def bool_noun_verb(series, nouns, verbs, noun_synset_target, verb_synset_target):
    """
    Flag each text in a Series that contains a matching noun/verb double.
    Arguments:
    series: pandas Series of text strings.
    nouns, verbs: Tags accepted for the noun and the verb of a double.
    noun_synset_target, verb_synset_target: Wordnet Synset string
    (i.e., 'travel.v.01') that the noun / verb must have on a hypernym path.
    Returns a Series aligned with the input holding True where at least
    one adjacent noun/verb double satisfies both targets.
    """
    def matches_targets(double):
        return bool_syn(double, noun_synset_target, verb_synset_target)

    def has_target_double(text):
        # tag -> adjacent noun/verb doubles -> does any double hit both targets
        doubles = noun_verb_match(tag(text), nouns, verbs)
        return bool_loop(doubles, matches_targets)

    return series.map(has_target_double)
# Demo: mark the sentences in which the noun has noun_synset_target and the
# following verb has verb_synset_target on a hypernym path.
noun_synset_target = 'artifact.n.01'
verb_synset_target = 'travel.v.01'

# POS tags accepted for the noun and for the verb of an adjacent pair.
nouns = "NN NNP PRP NNS".split()
verbs = "VB VBD VBZ".split()

phrases = [
    'Box fell from shelf',
    'Bulb shattered on the ground',
    'A piece of plaster fell from the ceiling',
    'The blame fell on Sarah',
    'Berlin fell on May',
    'The temperature fell abruptly',
    'It fell on the floor',
]

df = DataFrame({'text': Series(phrases)})
df['fall'] = bool_noun_verb(
    df.text,
    nouns,
    verbs,
    noun_synset_target,
    verb_synset_target,
)
# Bare expression, presumably so a notebook/REPL echoes the resulting frame.
df