Question

I have a project which uses Lucene.net (latest version) to index a file containing a large number (2 million) of lines of text using the StandardAnalyzer.

I am writing each line to the index as a single document; using the index generated I now want to get a dictionary of each word and its total count across all the documents.

What is the best way to achieve this?

Thanks in advance.

Was it helpful?

Solution

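Luke.NET does this. Here's the function from its source that should help get you on your way. Note that it ranks terms by document frequency (the number of documents that contain the term), which equals the total occurrence count only when a word never appears more than once in the same line: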

/// <summary>
/// Scans every term of the index stored in <paramref name="dir"/> and collects
/// the terms with the highest document frequency.
/// </summary>
/// <param name="dir">Index directory to open read-only. When null, an empty array is returned.</param>
/// <param name="junkWords">
/// Optional table keyed by term text; a term whose text maps to a non-null
/// entry is skipped. May be null to disable this filter.
/// </param>
/// <param name="numTerms">
/// Capacity handed to the ranking queue. The queue is trimmed as soon as it
/// holds <paramref name="numTerms"/> entries, so at most
/// <c>numTerms - 1</c> terms are returned. Values of zero or less yield an
/// empty array.
/// </param>
/// <param name="fields">
/// Field names to restrict the scan to. An empty array means "all fields";
/// null yields an empty array.
/// </param>
/// <returns>
/// The collected terms. The array is filled from the back with successive
/// pops, so if the queue pops its lowest-frequency entry first (as the
/// in-code comments indicate) the result is ordered from most to least frequent.
/// </returns>
public static TermInfo[] GetHighFreqTerms(Directory dir,
                                          Hashtable junkWords,
                                          int numTerms,
                                          String[] fields)
{
    if (dir == null || fields == null || numTerms <= 0) return new TermInfo[0];

    // using-blocks guarantee the reader and the term enumerator are released
    // even when enumeration throws part-way through.
    using (IndexReader reader = IndexReader.Open(dir, true))
    using (TermEnum terms = reader.Terms())
    {
        TermInfoQueue tiq = new TermInfoQueue(numTerms);
        int minFreq = 0;

        while (terms.Next())
        {
            // An empty field list means no field restriction.
            if (fields.Length > 0 && Array.IndexOf(fields, terms.Term.Field) < 0)
                continue;

            if (junkWords != null && junkWords[terms.Term.Text] != null)
                continue;

            int docFreq = terms.DocFreq();
            if (docFreq <= minFreq)
                continue;

            tiq.Add(new TermInfo(terms.Term, docFreq));
            if (tiq.Size() >= numTerms)              // queue reached capacity
            {
                tiq.Pop();                           // remove lowest in tiq

                // The queue can be empty here (numTerms == 1); only raise the
                // threshold when there is a remaining lowest entry to read.
                TermInfo lowest = (TermInfo)tiq.Top();
                if (lowest != null)
                    minFreq = lowest.DocFreq;
            }
        }

        TermInfo[] res = new TermInfo[tiq.Size()];

        // Fill back-to-front so the first element is the last one popped.
        for (int i = res.Length - 1; i >= 0; i--)
        {
            res[i] = (TermInfo)tiq.Pop();
        }

        return res;
    }
}
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top