I'd go with your second approach, using `collections.Counter` and `set`.
>>> from collections import Counter
>>> sum((Counter(set(x)) for x in docDict.itervalues()), Counter())
Counter({'c': 2, 'b': 2, 'a': 1, 'd': 1})
Update 1:
>>> c = sum((Counter(set(x)) for x in docDict.itervalues()), Counter())
>>> {k: {k1:c[k1] for k1 in set(v)} for k, v in docDict.iteritems()}
{'alpha': {'a': 1, 'c': 2, 'b': 2}, 'bravo': {'c': 2, 'b': 2, 'd': 1}}
Update 2:
If performance is a concern, then don't use `Counter` with `sum` (every addition builds a brand-new `Counter`); here is another way to do it. Note that, unlike @user2931409's answer, I am not keeping a set per word in memory just to get its length, so this is much more memory efficient but slightly slower than their answer.
result = Counter()
for v in docDict.itervalues():
result.update(set(v))
return result
Timing comparison:
def func1(doc_dict=None):
    """Count, for every word, the number of documents that contain it.

    Approach taken from http://stackoverflow.com/a/22787509/846892: collect
    the set of document keys seen for each word, then take each set's size.

    Args:
        doc_dict: Mapping of document key to an iterable of words.  When
            omitted, the module-level ``docDict`` is used, which is what
            the original zero-argument version always did.

    Returns:
        A plain ``dict`` mapping each word to its document count.
    """
    if doc_dict is None:
        # Backward-compatible fallback for callers that pass nothing.
        doc_dict = docDict
    docs_by_word = defaultdict(set)
    for key, words in doc_dict.items():
        for word in words:
            # A set absorbs repeated words within the same document.
            docs_by_word[word].add(key)
    return {word: len(keys) for word, keys in docs_by_word.items()}
def func2(doc_dict=None):
    """Count, for every word, the number of documents that contain it.

    Each document's word list is de-duplicated with ``set`` and fed to a
    single ``Counter`` that is updated in place.

    Args:
        doc_dict: Mapping of document key to an iterable of words.  When
            omitted, the module-level ``docDict`` is used, which is what
            the original zero-argument version always did.

    Returns:
        A ``collections.Counter`` mapping each word to its document count.
    """
    if doc_dict is None:
        # Backward-compatible fallback for callers that pass nothing.
        doc_dict = docDict
    counts = Counter()
    # ``.values()`` rather than the Python 2-only ``.itervalues()`` so the
    # function runs unchanged on both Python 2 and Python 3.
    for words in doc_dict.values():
        # De-duplicate first so a word counts once per document.
        counts.update(set(words))
    return counts
In [94]: docDict = {''.join(random.choice(lis) for _ in xrange(8)): random.sample(lis, 25)
...: for _ in xrange(70000)}
In [95]: %timeit func1(docDict)
1 loops, best of 3: 380 ms per loop
In [96]: %timeit func2(docDict)
1 loops, best of 3: 591 ms per loop
In [97]: docDict = {''.join(random.choice(lis) for _ in xrange(8)): random.sample(lis, 25)
...: for _ in xrange(10**5)}
In [98]: %timeit func1(docDict)
1 loops, best of 3: 529 ms per loop
In [99]: %timeit func2(docDict)
1 loops, best of 3: 848 ms per loop
In [101]: func1(docDict) == func2(docDict)
Out[101]: True