Question

I have two text files (1) sample of bad words and (2) sample of good words. Now I have perform nearest neighbor classification in which new word found will be classified as being good or bad.I want insight on how to approach this with my existing code. Thanks

class Words_Works():

def __init__(self):
    self.all_texts = {}
    self.categories = {}
    self.knn_results = {}
    self.stop_words = ['and','the','i','am','he','she','his',
                        'me','my','a','at','in','you','your',
                        'of','to','this','that','him','her',
                        'they','is','it','can','for','into',
                        'as','with','we','us','them','a', 
                        'it', 'on', 'so', 'too','k','the',
                        'but', 'are','though'
                        'very', 'here', 'even', 'from',
                        'then', 'than']

    self.leaf_words = ['s', 'es', 'ed', 'er', 'ly', 'ing']

def add_category(self,f,cat_name):
    f_in = open(f)
    self.text = f_in.read().lower()
    f_in.close()
    self.wordify()
    self.unstopify()
    self.unleafify()
    self.categories[cat_name] = {}
    for item  in self.unleaf:
        if self.categories[cat_name].has_key(item):
            self.categories[cat_name][item] += 1
        else:
            self.categories[cat_name][item] = 1


def load_categories(self):
    try:
        cat_db = open('tweetCategory.txt','rb')
        self.categories = cPickle.load(cat_db)
        cat_db.close()
        print 'File successfully loaded from categories db'
    except:
        print 'File not loaded from categories_db'


        # Finds the levenshtein's distance 
def levenshtein_distance(first, second):
"""Find the Levenshtein distance between two strings."""
if len(first) > len(second):
    first, second = second, first
    if len(second) == 0:
        return len(first)
        first_length = len(first) + 1
        second_length = len(second) + 1
        distance_matrix = [[0] * second_length for x in range(first_length)]
        for i in range(first_length):
            distance_matrix[i][0] = i
            for j in range(second_length):
               distance_matrix[0][j]=j
               for i in xrange(1, first_length):
                   for j in range(1, second_length):
                       deletion = distance_matrix[i-1][j] + 1
                       insertion = distance_matrix[i][j-1] + 1
                       substitution = distance_matrix[i-1][j-1]
                       if first[i-1] != second[j-1]:
                           substitution += 1
                           distance_matrix[i][j] = min(insertion, deletion, substitution)
    return distance_matrix[first_length-1][second_length-1]  

def add_text(self,f):
    f_in = open(f)
    self.text = f_in.read().lower()
    f_in.close()
    self.wordify()
    self.unstopify()
    self.unleafify()
    self.all_texts[f] = {}
    for item in self.unleaf:
        if self.all_texts[f].has_key(item):
            self.all_texts[f][item] += 1
        else:
            self.all_texts[f][item] = 1

def save_categories(self):
    cat_db = open('tweetCategory.txt','wb')
    cPickle.dump(cat_db,self.categories,-1)
    cat_db.close()

def unstopify(self):
    self.unstop = [item for item in self.words if item not in self.stop_words]

def unleafify(self):
    self.unleaf = self.unstop[:]
    for leaf in self.leaf_words:
        leaf_len = len(leaf)
        leaf_pattern = re.compile('%s$' % leaf)
        for i in range(len(self.unleaf)):
            if leaf_pattern.findall(self.unleaf[i]):
                self.unleaf[i] = self.unleaf[i][:-leaf_len]

def wordify(self):
    words_pattern = re.compile('//w+')
    self.words = words_pattern.findall(self.text)

def knn_calc(self):
    for text in self.all_texts.keys():
        self.knn_results[text] = {}
        for category in self.categories.keys():
            self.knn_results[text][category] = {}
            iterations = 0
            distance = 0
            for word in self.all_texts[text].keys():
                if word in self.categories[text].keys():

                    distance = levenshtein_distance(text,category)
                    self.knn_results[text][category]['Knn Distance'] = distance
                    self.knn_results [text][category]['Knn Iterations'] = iterations


def knn(self):
    for text in self.all_texts.keys():
        Result = None
        for category in self.categories.keys():
            if not result or self.knn_results[text][category]['Knn Distance'] < result:
                knn = category
                distance = self.knn_results[text][category]['Knn Distance']
                iterations = self.knn_results[text][category]['Knn Iterations']

                print 'File:',text
                print 'Knn:',category
                print 'Distance :', distance
                print 'Iterations :', iterations
                print 'End of nearest neighbour search'

And the test case to try it out:

mywork = Words_Works()

positive = 'positive.txt'
mywork.add_category(positive, 'Positive Tweets')               # Adding as category
negative = 'negative.txt'
mywork.add_category(negative, 'Negative Tweets')
neutral = 'neutral.txt'
mywork.add_category(neutral, 'Neutral Tweets')

for category in mywork.categories.keys():              # Print categories
    print category
    print mywork.categories[category]
    print
print

txts = ('samplegood.txt', 'samplebad.txt')                  # Creating list of files to

for text in txts:                                      # Adding them
    mywork.add_text(text)

for text in mywork.all_texts.keys():                   # Print texts
    print text
    print mywork.all_texts[text]
    print    
print

mywork.knn_calc()                                         # calculate knn

for files in mywork.knn_results.keys():                   # print detailed results
    print files
    for category in mywork.knn_results[files].keys():
        print category
        print mywork.knn_results[files][category]
    print
print    

mywork.knn()                                              # Display results
Was it helpful?

Solution

Two pieces of advice: first, as noted by @YvesDaoust, you should use the edit distance, also known as Levenshtein distance. You can find it in the python-Levenshtein package.

Second, use the unittest or doctest libraries in the standard library to test your code. It is a bad idea to use examples kept in external files to test your code, because a third person without access to those files (us, for instance) cannot know what is the input; avoid as well to print the output and manually inspect it, because this is slow, error-prone and again not reviewable by others.

OTHER TIPS

Use the Edit distance, you are not in Euclidean space. http://en.wikipedia.org/wiki/Edit_distance

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top