Question

I have a directory containing corpus text files. I want to build a table of word counts: each row corresponds to a document and each column to a unique word, so each cell holds the number of times that word appears in that document. All of this should be done in Python. Please help, thank you.

The table should look like this:

          word1   word2   word3  ...
doc1      14      5       45
doc2      6       1       0
 .
 .
 .

 

import nltk
import collections
import os.path

def cleanDoc(doc):
    stopset = set(nltk.corpus.stopwords.words('english'))
    stemmer = nltk.PorterStemmer()
    tokens = nltk.WordPunctTokenizer().tokenize(doc)
    clean = [token.lower() for token in tokens if token.lower() not in stopset and len(token) > 2]
    final = [stemmer.stem(word) for word in clean]
    return final

path = "c://Users/Desktop/corpus files"

i = 0

for file in os.listdir(path):
    f = open(os.path.join(path, "file%d.txt" % i), 'r')
    data = f.read()
    words = cleanDoc(data)
    fw = open("c://Users/Desktop/words/words%d.txt" % i, 'w')
    fd = collections.Counter(words)
    #fd = nltk.FreqDist(words)
    #plot(fd)

    row_format = "{:>15}" * (len(words) + 1)
    print row_format.format("document %d" % i, *words)

    fw.write(str(fd))
    fw.write(str(words))
    fw.close()
    i += 1
    f.close()

Solution

I think this is fairly close to, if not exactly, what you want. In case it isn't, I tried to make things easy to change.

To produce the desired table, processing is done in two phases. In the first, the unique words in each document file of the form file<document-number>.txt are found and saved in a corresponding words<document-number>.txt file, and they are also added to a set comprising all the unique words seen across all document files. This set is needed to produce table columns covering every unique word in every file, and it is why two phases of processing are required.

In the second phase, the word files are read back in and turned back into dictionaries, which are used to fill in the corresponding rows of the table being printed.
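The link between the two phases is the on-disk format: each Counter is written out as the repr() of a plain dict, and ast.literal_eval later parses that text safely back into a dict. A minimal sketch of that round trip, with made-up words:

import ast
import collections

fd = collections.Counter(['fox', 'run', 'fox'])
text = repr(dict(fd))               # e.g. "{'fox': 2, 'run': 1}"
restored = ast.literal_eval(text)   # back to a plain dict

The full program: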

import ast
import collections
import nltk
import re
import os

user_name = "UserName"
path = "c://Users/%s/Desktop/corpus files" % user_name

def cleanDoc(doc):
    stopset = set(nltk.corpus.stopwords.words('english'))
    stemmer = nltk.PorterStemmer()
    tokens = nltk.WordPunctTokenizer().tokenize(doc)
    clean = [token.lower() for token in tokens
                           if token.lower() not in stopset and len(token) > 2]
    final = [stemmer.stem(word) for word in clean]
    return final
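# Example: assuming the NLTK stopwords corpus is downloaded
# (nltk.download('stopwords')), a made-up input should yield roughly:
#   cleanDoc("The quick brown foxes are running fast")
#   -> ['quick', 'brown', 'fox', 'run', 'fast']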

# phase 1 -- find unique words, create word files, update overall unique word set

corpus_file_pattern = re.compile(r"""file(\d+).txt""")
unique_words = set()
longest_filename = 0
document_nums = []

for filename in os.listdir(path):
    corpus_file_match = corpus_file_pattern.match(filename)
    if corpus_file_match:  # corpus text file?
        if len(filename) > longest_filename:
            longest_filename = len(filename)
        document_num = int(corpus_file_match.group(1))
        document_nums.append(document_num)
        with open(os.path.join(path, filename)) as file:
            data = file.read()
        words = cleanDoc(data)
        unique_words.update(words)
        fd = collections.Counter(words)
        words_filename = "words%d.txt" % document_num
        with open(os.path.join(path, words_filename), mode='wt') as fw:
            fw.write(repr(dict(fd)) + '\n')  # write representation as dict
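        # each words file now holds a single dict literal, e.g. words0.txt
        # might contain (made-up counts): {'fox': 2, 'run': 1}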

# phase 2 -- create table using unique_words and data in word files

unique_words_list = sorted(unique_words)
unique_words_empty_counter = collections.Counter(
    {word: 0 for word in unique_words})
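# a fresh copy of this zero-filled Counter is made per document, so every
# unique word has an entry (0 when absent) and every column can be printed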
document_nums = sorted(document_nums)
padding = 2  # spaces between columns
min_col_width = 5
col_headings = ["Document"] + unique_words_list
col_widths = [max(min_col_width, len(word))+padding for word in col_headings]
col_widths[0] = longest_filename+padding  # first col is special case

# print table headings
for i, word in enumerate(col_headings):
    print "{:{align}{width}}".format(word, align='>' if i else '<',
                                     width=col_widths[i]),
print

for document_num in document_nums:
    # read word in document dictionary back in
    filename = "words%d.txt" % document_num
    file_words = unique_words_empty_counter.copy()
    with open(os.path.join(path, filename)) as file:
        data = file.read()
    # convert data read into dict and update with file word counts
    file_words.update(ast.literal_eval(data))
    # print row of data
    print "{:<{width}}".format(filename, width=col_widths[0]),
    for i, word in enumerate(col_headings[1:], 1):
        print "{:>{width}n}".format(file_words[word], width=col_widths[i]),
    print
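One caveat: the code above is Python 2 (print statements, with trailing commas to keep each row on one line). Under Python 3 only the printing needs to change; a minimal sketch of the header loop, assuming the same col_headings and col_widths:

for i, word in enumerate(col_headings):
    print("{:{align}{width}}".format(word, align='>' if i else '<',
                                     width=col_widths[i]), end=' ')
print()

The row-printing statements convert the same way.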
Licensed under: CC-BY-SA with attribution