NLTK bag of bigrams words function raises "Don't know how to concatenate types" error - python

Question

Firstly, your variable words is not a list of tokens as you expect it to be, it is of type 'nltk.corpus.reader.util.StreamBackedCorpusView.

To verify that try:

def bag_of_bigrams_words(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    print type(words)
    print type(bigrams)
    return bag_of_words(words + bigrams)

[out]:

<class 'nltk.corpus.reader.util.StreamBackedCorpusView'>
<type 'list'>

Note that when you specify a fileid when using the .words() function of a corpus object, you will get different CorpusView objects, see:

from nltk.corpus import movie_reviews
first_fileid =  movie_reviews.fileids(categories=["pos"])[0]
print first_fileid
print type(movie_reviews.words(first_fileid))
print type(movie_reviews.words())

[out]:

pos/cv000_29590.txt
<class 'nltk.corpus.reader.util.StreamBackedCorpusView'>
<class 'nltk.corpus.reader.util.ConcatenatedCorpusView'>

Possibly, what you need is a list of tokens instead of a CorpusView object, so you would need iterate through the generator to get a list of string:

from nltk.corpus import movie_reviews
first_fileid =  movie_reviews.fileids(categories=["pos"])[0]
tokens_in_file = [i for i in movie_reviews.words(first_fileid)]
print tokens_in_file
print type(tokens_in_file)

[out]:

['films', 'adapted', 'from', 'comic', 'books', 'have', 'had', 'plenty', 'of', 'success', ',', 'whether', 'they', "'", 're', 'about', 'superheroes', '(', 'batman', ',', 'superman', ',', 'spawn', ')', ',', 'or', 'geared', 'toward', 'kids', '(', 'casper', ')', 'or', 'the', 'arthouse', 'crowd', '(', 'ghost', 'world', ')', ',', 'but', 'there', "'", 's', 'never', 'really', 'been', 'a', 'comic', 'book', 'like', 'from', 'hell', 'before', '.', 'for', 'starters', ',', 'it', 'was', 'created', 'by', 'alan', 'moore', '(', 'and', 'eddie', 'campbell', ')', ',', 'who', 'brought', 'the', 'medium', 'to', 'a', 'whole', 'new', 'level', 'in', 'the', 'mid', "'", '80s', 'with', 'a', '12', '-', 'part', 'series', 'called', 'the', 'watchmen', '.', 'to', 'say', 'moore', 'and', 'campbell', 'thoroughly', 'researched', 'the', 'subject', 'of', 'jack', 'the', 'ripper', 'would', 'be', 'like', 'saying', 'michael', 'jackson', 'is', 'starting', 'to', 'look', 'a', 'little', 'odd', '.', 'the', 'book', '(', 'or', '"', 'graphic', 'novel', ',', '"', 'if', 'you', 'will', ')', 'is', 'over', '500', 'pages', 'long', 'and', 'includes', 'nearly', '30', 'more', 'that', 'consist', 'of', 'nothing', 'but', 'footnotes', '.', 'in', 'other', 'words', ',', 'don', "'", 't', 'dismiss', 'this', 'film', 'because', 'of', 'its', 'source', '.', 'if', 'you', 'can', 'get', 'past', 'the', 'whole', 'comic', 'book', 'thing', ',', 'you', 'might', 'find', 'another', 'stumbling', 'block', 'in', 'from', 'hell', "'", 's', 'directors', ',', 'albert', 'and', 'allen', 'hughes', '.', 'getting', 'the', 'hughes', 'brothers', 'to', 'direct', 'this', 'seems', 'almost', 'as', 'ludicrous', 'as', 'casting', 'carrot', 'top', 'in', ',', 'well', ',', 'anything', ',', 'but', 'riddle', 'me', 'this', ':', 'who', 'better', 'to', 'direct', 'a', 'film', 'that', "'", 's', 'set', 'in', 'the', 'ghetto', 'and', 'features', 'really', 'violent', 'street', 'crime', 'than', 'the', 'mad', 'geniuses', 'behind', 'menace', 'ii', 'society', '?', 'the', 'ghetto', 'in', 'question', 'is', ',', 'of', 'course', ',', 'whitechapel', 'in', '1888', 'london', "'", 's', 'east', 'end', '.', 'it', "'", 's', 'a', 'filthy', ',', 'sooty', 'place', 'where', 'the', 'whores', '(', 'called', '"', 'unfortunates', '"', ')', 'are', 'starting', 'to', 'get', 'a', 'little', 'nervous', 'about', 'this', 'mysterious', 'psychopath', 'who', 'has', 'been', 'carving', 'through', 'their', 'profession', 'with', 'surgical', 'precision', '.', 'when', 'the', 'first', 'stiff', 'turns', 'up', ',', 'copper', 'peter', 'godley', '(', 'robbie', 'coltrane', ',', 'the', 'world', 'is', 'not', 'enough', ')', 'calls', 'in', 'inspector', 'frederick', 'abberline', '(', 'johnny', 'depp', ',', 'blow', ')', 'to', 'crack', 'the', 'case', '.', 'abberline', ',', 'a', 'widower', ',', 'has', 'prophetic', 'dreams', 'he', 'unsuccessfully', 'tries', 'to', 'quell', 'with', 'copious', 'amounts', 'of', 'absinthe', 'and', 'opium', '.', 'upon', 'arriving', 'in', 'whitechapel', ',', 'he', 'befriends', 'an', 'unfortunate', 'named', 'mary', 'kelly', '(', 'heather', 'graham', ',', 'say', 'it', 'isn', "'", 't', 'so', ')', 'and', 'proceeds', 'to', 'investigate', 'the', 'horribly', 'gruesome', 'crimes', 'that', 'even', 'the', 'police', 'surgeon', 'can', "'", 't', 'stomach', '.', 'i', 'don', "'", 't', 'think', 'anyone', 'needs', 'to', 'be', 'briefed', 'on', 'jack', 'the', 'ripper', ',', 'so', 'i', 'won', "'", 't', 'go', 'into', 'the', 'particulars', 'here', ',', 'other', 'than', 'to', 'say', 'moore', 'and', 'campbell', 'have', 'a', 'unique', 'and', 'interesting', 'theory', 'about', 'both', 'the', 'identity', 'of', 'the', 'killer', 'and', 'the', 'reasons', 'he', 'chooses', 'to', 'slay', '.', 'in', 'the', 'comic', ',', 'they', 'don', "'", 't', 'bother', 'cloaking', 'the', 'identity', 'of', 'the', 'ripper', ',', 'but', 'screenwriters', 'terry', 'hayes', '(', 'vertical', 'limit', ')', 'and', 'rafael', 'yglesias', '(', 'les', 'mis', '?', 'rables', ')', 'do', 'a', 'good', 'job', 'of', 'keeping', 'him', 'hidden', 'from', 'viewers', 'until', 'the', 'very', 'end', '.', 'it', "'", 's', 'funny', 'to', 'watch', 'the', 'locals', 'blindly', 'point', 'the', 'finger', 'of', 'blame', 'at', 'jews', 'and', 'indians', 'because', ',', 'after', 'all', ',', 'an', 'englishman', 'could', 'never', 'be', 'capable', 'of', 'committing', 'such', 'ghastly', 'acts', '.', 'and', 'from', 'hell', "'", 's', 'ending', 'had', 'me', 'whistling', 'the', 'stonecutters', 'song', 'from', 'the', 'simpsons', 'for', 'days', '(', '"', 'who', 'holds', 'back', 'the', 'electric', 'car', '/', 'who', 'made', 'steve', 'guttenberg', 'a', 'star', '?', '"', ')', '.', 'don', "'", 't', 'worry', '-', 'it', "'", 'll', 'all', 'make', 'sense', 'when', 'you', 'see', 'it', '.', 'now', 'onto', 'from', 'hell', "'", 's', 'appearance', ':', 'it', "'", 's', 'certainly', 'dark', 'and', 'bleak', 'enough', ',', 'and', 'it', "'", 's', 'surprising', 'to', 'see', 'how', 'much', 'more', 'it', 'looks', 'like', 'a', 'tim', 'burton', 'film', 'than', 'planet', 'of', 'the', 'apes', 'did', '(', 'at', 'times', ',', 'it', 'seems', 'like', 'sleepy', 'hollow', '2', ')', '.', 'the', 'print', 'i', 'saw', 'wasn', "'", 't', 'completely', 'finished', '(', 'both', 'color', 'and', 'music', 'had', 'not', 'been', 'finalized', ',', 'so', 'no', 'comments', 'about', 'marilyn', 'manson', ')', ',', 'but', 'cinematographer', 'peter', 'deming', '(', 'don', "'", 't', 'say', 'a', 'word', ')', 'ably', 'captures', 'the', 'dreariness', 'of', 'victorian', '-', 'era', 'london', 'and', 'helped', 'make', 'the', 'flashy', 'killing', 'scenes', 'remind', 'me', 'of', 'the', 'crazy', 'flashbacks', 'in', 'twin', 'peaks', ',', 'even', 'though', 'the', 'violence', 'in', 'the', 'film', 'pales', 'in', 'comparison', 'to', 'that', 'in', 'the', 'black', '-', 'and', '-', 'white', 'comic', '.', 'oscar', 'winner', 'martin', 'childs', "'", '(', 'shakespeare', 'in', 'love', ')', 'production', 'design', 'turns', 'the', 'original', 'prague', 'surroundings', 'into', 'one', 'creepy', 'place', '.', 'even', 'the', 'acting', 'in', 'from', 'hell', 'is', 'solid', ',', 'with', 'the', 'dreamy', 'depp', 'turning', 'in', 'a', 'typically', 'strong', 'performance', 'and', 'deftly', 'handling', 'a', 'british', 'accent', '.', 'ians', 'holm', '(', 'joe', 'gould', "'", 's', 'secret', ')', 'and', 'richardson', '(', '102', 'dalmatians', ')', 'log', 'in', 'great', 'supporting', 'roles', ',', 'but', 'the', 'big', 'surprise', 'here', 'is', 'graham', '.', 'i', 'cringed', 'the', 'first', 'time', 'she', 'opened', 'her', 'mouth', ',', 'imagining', 'her', 'attempt', 'at', 'an', 'irish', 'accent', ',', 'but', 'it', 'actually', 'wasn', "'", 't', 'half', 'bad', '.', 'the', 'film', ',', 'however', ',', 'is', 'all', 'good', '.', '2', ':', '00', '-', 'r', 'for', 'strong', 'violence', '/', 'gore', ',', 'sexuality', ',', 'language', 'and', 'drug', 'content']
<type 'list'>

And back to your code, you would need to iterate through a generator before return bag_of_words(words + bigrams):

import nltk, collections
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures


def bag_of_words(words):
    return dict([(word, True) for word in words])

def bag_of_bigrams_words(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    words = [i for i in words]
    return bag_of_words(words + bigrams)

def label_feats_from_corpus(corp, feature_detector=bag_of_bigrams_words):
    label_feats = collections.defaultdict(list)
    for label in corp.categories():
        for fileid in corp.fileids(categories=[label]):
            feats = feature_detector(corp.words(fileids=[fileid]))
            label_feats[label].append(feats)
    return label_feats

from nltk.corpus import movie_reviews
lfeats = label_feats_from_corpus(movie_reviews)
print lfeats