I have a project which uses Lucene.net (latest version) to index a file containing a large number (2 million) of lines of text using the StandardAnalyzer.

I am writing each line to the index as a single document; using the index generated I now want to get a dictionary of each word and its total count across all the documents.

What is the best way to achieve this?

Thanks in advance.

有帮助吗?

解决方案

Luke.NET does this. Here's the function from its source that should help get you on your way:

/// <summary>
/// Collects the terms with the highest document frequency from the index stored in
/// <paramref name="dir"/>. The frequency used is <c>TermEnum.DocFreq()</c>, i.e. the
/// per-term value reported by the term enumerator, not a sum computed here.
/// </summary>
/// <param name="dir">Index directory to read. When null, an empty array is returned.</param>
/// <param name="junkWords">
/// Optional table keyed by term text; a term whose text maps to a non-null value is skipped.
/// May be null.
/// </param>
/// <param name="numTerms">
/// Maximum number of terms to return. Values below 1 yield an empty array.
/// </param>
/// <param name="fields">
/// Field names to restrict the scan to. When null, an empty array is returned; when empty,
/// terms from every field are considered.
/// </param>
/// <returns>
/// At most <paramref name="numTerms"/> entries. Entries are written back-to-front in pop
/// order, so the result is highest-frequency-first provided TermInfoQueue pops its
/// lowest-frequency entry first (as the original "remove lowest" comment indicates).
/// </returns>
public static TermInfo[] GetHighFreqTerms(Directory dir,
                                          Hashtable junkWords,
                                          int numTerms,
                                          String[] fields)
{
    // numTerms < 1 previously ended in a null dereference on tiq.Top(); nothing to collect.
    if (dir == null || fields == null || numTerms < 1) return new TermInfo[0];

    TermInfoQueue tiq = new TermInfoQueue(numTerms);

    // Both the reader and the term enumerator are released even if enumeration throws.
    using (IndexReader reader = IndexReader.Open(dir, true))
    using (TermEnum terms = reader.Terms())
    {
        int minFreq = 0;

        while (terms.Next())
        {
            String field = terms.Term.Field;

            // An empty field list means "no restriction".
            if (fields.Length > 0 && Array.IndexOf(fields, field) < 0)
            {
                continue;
            }

            if (junkWords != null && junkWords[terms.Term.Text] != null)
            {
                continue;
            }

            int docFreq = terms.DocFreq();

            if (docFreq > minFreq)
            {
                // InsertWithOverflow keeps the queue at numTerms entries by evicting the
                // lowest one itself; the former Add + Pop-at-">= numTerms" sequence capped
                // the queue at numTerms - 1 and broke when numTerms was 1.
                tiq.InsertWithOverflow(new TermInfo(terms.Term, docFreq));

                if (tiq.Size() >= numTerms)
                {
                    // Queue is full: anything not above the current lowest cannot get in.
                    minFreq = ((TermInfo)tiq.Top()).DocFreq;
                }
            }
        }
    }

    TermInfo[] res = new TermInfo[tiq.Size()];

    // Fill from the end so the last entry popped lands at index 0.
    for (int i = 0; i < res.Length; i++)
    {
        res[res.Length - i - 1] = (TermInfo)tiq.Pop();
    }

    return res;
}
许可以下: CC-BY-SA归因
不隶属于 StackOverflow
scroll top