Question

What's the best and most efficient way to count keywords in JavaScript? Basically, I'd like to take a string and get the top N words or phrases that occur in the string, mainly for the use of suggesting tags. I'm looking more for conceptual hints or links to real-life examples than actual code, but I certainly wouldn't mind if you'd like to share code as well. If there are particular functions that would help, I'd also appreciate that.

Right now I think I'm at using the split() function to separate the string by spaces and then cleaning punctuation out with a regular expression. I'd also want it to be case-insensitive.

Was it helpful?

Solution

Cut, paste + execute demo:

var text = "Text to be examined to determine which n words are used the most";

// Find 'em!
var wordRegExp = /\w+(?:'\w{1,2})?/g;
var words = {};
var matches;
while ((matches = wordRegExp.exec(text)) != null)
{
    var word = matches[0].toLowerCase();
    if (typeof words[word] == "undefined")
    {
        words[word] = 1;
    }
    else
    {
        words[word]++;
    }
}

// Sort 'em!
var wordList = [];
for (var word in words)
{
    if (words.hasOwnProperty(word))
    {
        wordList.push([word, words[word]]);
    }
}
wordList.sort(function(a, b) { return b[1] - a[1]; });

// Come back any time, straaanger!
var n = 10;
var message = ["The top " + n + " words are:"];
for (var i = 0; i < n; i++)
{
    message.push(wordList[i][0] + " - " + wordList[i][1] + " occurance" +
                 (wordList[i][1] == 1 ? "" : "s"));
}
alert(message.join("\n"));

Reusable function:

function getTopNWords(text, n)
{
    var wordRegExp = /\w+(?:'\w{1,2})?/g;
    var words = {};
    var matches;
    while ((matches = wordRegExp.exec(text)) != null)
    {
        var word = matches[0].toLowerCase();
        if (typeof words[word] == "undefined")
        {
            words[word] = 1;
        }
        else
        {
            words[word]++;
        }
    }

    var wordList = [];
    for (var word in words)
    {
        if (words.hasOwnProperty(word))
        {
            wordList.push([word, words[word]]);
        }
    }
    wordList.sort(function(a, b) { return b[1] - a[1]; });

    var topWords = [];
    for (var i = 0; i < n; i++)
    {
        topWords.push(wordList[i][0]);
    }
    return topWords;
}

OTHER TIPS

Once you have that array of words cleaned up, and let's say you call it wordArray:

var keywordRegistry = {};

for(var i = 0; i < wordArray.length; i++) {
   if(keywordRegistry.hasOwnProperty(wordArray[i]) == false) {
      keywordRegistry[wordArray[i]] = 0;
   }
   keywordRegistry[wordArray[i]] = keywordRegistry[wordArray[i]] + 1;
}

// now keywordRegistry will have, as properties, all of the 
// words in your word array with their respective counts 

// this will alert (choose something better than alert) all words and their counts
for(var keyword in keywordRegistry) {
  alert("The keyword '" + keyword + "' occurred " + keywordRegistry[keyword] + " times");
}

That should give you the basics of doing this part of the work.

Try to split you string on words and count the resulting words, then sort on the counts.

This builds upon a previous answer by insin by only having one loop:

function top_words(text, n) {
    // Split text on non word characters
    var words = text.toLowerCase().split(/\W+/)
    var positions = new Array()
    var word_counts = new Array()
    for (var i=0; i<words.length; i++) {
        var word = words[i]
        if (!word) {
            continue
        }

        if (typeof positions[word] == 'undefined') {
            positions[word] = word_counts.length
            word_counts.push([word, 1])
        } else {
            word_counts[positions[word]][1]++
        }
    }
    // Put most frequent words at the beginning.
    word_counts.sort(function (a, b) {return b[1] - a[1]})
    // Return the first n items
    return word_counts.slice(0, n)
}

// Let's see if it works.
var text = "Words in here are repeated. Are repeated, repeated!"
alert(top_words(text, 3))

The result of the example is: [['repeated',3], ['are',2], ['words', 1]]

I would do exactly what you have mentioned above to isolate each word. I would then probably add each word as the index of an array with the number of occurrences as the value.

For example:

var a = new Array;
a[word] = a[word]?a[word]+1:1;

Now you know how many unique words there are (a.length) and how many occurrences of each word existed (a[word]).

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top