I am combining the processed data of two essays into one. I want to create a set two count how many different words are used as well as other analysis. However, when I combine them, and do set(entire), I am returned with just a set of letters. I have the code below as well as the output I am getting. I would like for the output to be all the words being used.
print set(entire)
set([' ', '1', '0', '3', '2', '5', '4', '6', '9', 'a', 'c', 'b', 'e', 'd', 'g', 'f', 'i', 'h', 'k', 'j', 'm', 'l', 'o', 'n', 'p', 's', 'r', 'u', 't', 'w', 'v', 'y', 'x'])
from __future__ import division
import nltk
import csv
import re
from string import punctuation
import enchant
from enchant.checker import SpellChecker
dictionary = enchant.Dict("en_US")
chkr = SpellChecker("en_US")
with open('2012ShortAnswers.csv', 'rb') as csvfile:
data = csv.reader(csvfile, delimiter=",")
writer = csv.writer(open('2012output.csv', 'wb'))
for row in data:
row3 = row[3]
row3 = row3.lower().replace(' ', ' ')
row4 = row[4]
row4 = row4.lower().replace(' ', ' ')
row3 = row3.replace('\n', '')
row4 = row4.replace('\n', '')
for p in list(punctuation):
row3 = row3.replace(p, '')
row4 = row4.replace(p, '')
entire = row3 + row4
set(entire)