Question

I'm designing system that should be able to process millions of documents and report on them in different ways. mongoDb map\reduce task is what I'm trying to implement (currently doing some investigation on that). The very basic document structure is

db.test.insert(
{
        "_id" : ObjectId("4f6063601caf46303c36eb27"),
        "verbId" : NumberLong(1506281),
        "sentences" : [
                {
                        "sId" : NumberLong(2446630),
                        "sentiment" : 2,
                        "categories" : [
                                NumberLong(3257),
                                NumberLong(3221),
                                NumberLong(3291)
                        ]
                },
                {
                        "sId" : NumberLong(2446631),
                        "sentiment" : 0,
                        "categories" : [
                                NumberLong(2785),
                                NumberLong(2762),
                                NumberLong(2928),
                                NumberLong(2952)
                        ]
                },
                {
                        "sId" : NumberLong(2446632),
                        "sentiment" : 0,
                        "categories" : [
                                NumberLong(-2393)
                        ]
                },
                {
                        "sId" : NumberLong(2446633),
                        "sentiment" : 0,
                        "categories" : [
                                NumberLong(-2393)
                        ]
                }
        ]
})

So that each document contains sentences, that could belong to different categories. The report I'm trying to get is number of sentences in category (with percent of verbatims).

I'm doing next map-reduce jobs with finalize method to count different averages.

var map = function() {
        var docCategories = new Array();
        var catValues = new Array();
        for (var i = 0; i < this.sentences.length; i++) { //iterate over sentences.
            sentence = this.sentences[i];
            for (var j = 0; j < sentence.categories.length; j++) {//iterate over categories
                catId= sentence.categories[j].toNumber();
                if (docCategories.indexOf(catId) < 0) {
                    docCategories.push(catId);
                    catValues.push({sentiment : sentence.sentiment, sentenceCnt: 1});
                } else {
                    categoryIdx = docCategories.indexOf(catId);
                    catValue = catValues[categoryIdx];
                    catValue.sentiment = catValue.sentiment + sentence.sentiment;
                    catValue.sentenceCnt = catValue.sentenceCnt + 1;
                }
            }

        }
        totalCount++; //here we do try to count distinctCases see scope.
        for (var i = 0; i < docCategories.length; i ++) {
            emit(docCategories[i], {count: 1, sentenceCnt: catValues[i].sentenceCnt, sentiment: catValues[i].sentiment, totalCnt : totalCount});
        }

    };

var reduce = function(key, values) {
    var res = {count : 0, sentenceCnt : 0, sentiment : 0};
    for ( var i = 0; i < values.length; i ++ ) {
        res.count += values[i].count;
        res.sentenceCnt += values[i].sentenceCnt;
        res.sentiment += values[i].sentiment;
    }

    return res;
};

var finalize = function(category, values) {
    values.sentimentAvg = values.sentiment / values.sentenceCnt; 
    values.percentOfVerbatim = values.count / totalCount //scope variable (global)
    return values;
};


var res = db.runCommand( { mapreduce:'test',
                  map:map,
                  reduce:reduce,
                  out: 'cat_volume',
                  finalize:finalize,
                  scope:{totalCount : 0},
                });

The most interesting part here is that I'm using totalCount - to count number of verbatims I'm emitting. totalCount is the scope (global) variable. Everything went well on One mongoDb installation, but when going to a shard instances I'm getting "Infinity" for percentOfVerbatim.

Actually in that case totalCount would be just db.test.count() (number of documents) but in future I'm going to add different conditions for documents to be count. Doing any other query is very undesirable since db is very heavy.

Are there any other approaches to using global (scope) variables on multi-instance mongodb installation? Or should I use something else?

Was it helpful?

Solution

The scope variables are not shared among the shards. You can treat it as a global constant. Updates to the value won't be visible to map or reduce functions running on different shards.

OTHER TIPS

Finally I've found the way how to count number of documents I'm emitting. The only way that worked for me is emitting documentId, and puting ids into the array on reduce. On client side (I'm writing java program) I have to count just all distinct Ids. So, while doing map I do emit

emit(docCategories[i], {verbIds : [this.verbId.toNumber()], count: 1, sentenceCnt: catValues[i].sentenceCnt, sentiment: catValues[i].sentiment, totalCnt : totalCount});

Reduce function is the following:

var reduce = function(key, values) {
    var res = {verbIds : [], count : 0, sentenceCnt : 0, sentiment : 0};
    for ( var i = 0; i < values.length; i ++ ) {
//      res.verbIds = res.verbIds.concat(values[i].verbIds); //works slow
        for ( var j = 0; j < values[i].verbIds.length; j ++ ) {
            res.verbIds.push(values[i].verbIds[j]);
        }
        res.count += values[i].count;
        res.sentenceCnt += values[i].sentenceCnt;
        res.sentiment += values[i].sentiment;
    }

    return res;
};

Java side program just count distinct Ids over all of the results.

Actually for 1.1M documents execution slows down significantly

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top