When doing a word frequency count on my corpus, the results seem inacurate (are not the most frequent words to my feeling, and the frequency count is only one or two) and some results show 'as over\xe2' and '\xad'. Can anyone help?
def toptenwords(mycorpus):
mywords = mycorpus.words()
nocapitals = [word.lower() for word in mywords]
filtered = [word for word in nocapitals if word not in stoplist]
nopunctuation= [s.translate(None, string.punctuation) for s in filtered]
woordcounter = {}
for word in nopunctuation:
if word in wordcounter:
woordcounter[word] += 1
else:
woordcounter[word] = 1
frequentwords = sorted(wordcounter.iteritems(), key = itemgetter(1), reverse = True)
top10 = frequentwords[:10]
woord1 = frequentwords[1]
woord2 = frequentwords[2]
woord3 = frequentwords[3]
woord4 = frequentwords[4]
woord5 = frequentwords[5]
woord6 = frequentwords[6]
woord7 = frequentwords[7]
woord8 = frequentwords[8]
woord9 = frequentwords[9]
woord10 = frequentwords[10]
print "De 10 meest frequente woorden zijn: ", woord1, ",", woord2, ",", woord3, ",", woord4, ",", woord5, ",", woord6, ",", woord7, ",", woord8, ",", woord9, "en", woord10
Code is originally in dutch, this is the NOT translated code:
def toptienwoorden(mycorpus):
woorden = mycorpus.words()
zonderhoofdletters = [word.lower() for word in woorden]
gefiltered = [word for word in zonderhoofdletters if word not in stoplijst]
geenleestekens = [s.translate(None, string.punctuation) for s in gefiltered]
woordteller = {}
for word in geenleestekens:
if word in woordteller:
woordteller[word] += 1
else:
woordteller[word] = 1
frequentewoorden = sorted(woordteller.iteritems(), key = itemgetter(1), reverse = True)
top10 = frequentewoorden[:10]
woord1 = frequentewoorden[1]
woord2 = frequentewoorden[2]
woord3 = frequentewoorden[3]
woord4 = frequentewoorden[4]
woord5 = frequentewoorden[5]
woord6 = frequentewoorden[6]
woord7 = frequentewoorden[7]
woord8 = frequentewoorden[8]
woord9 = frequentewoorden[9]
woord10 = frequentewoorden[10]
print "De 10 meest frequente woorden zijn: ", woord1, ",", woord2, ",", woord3, ",", woord4, ",", woord5, ",", woord6, ",", woord7, ",", woord8, ",", woord9, "en", woord10