No, I do not know of any predefined function that will identify the common prefixes for the right side of a production rule. The following function seems to work fairly well, however. I use izip_longest from the itertools module. You may be able to optimize or improve this function even further:
from itertools import izip_longest
def find_prefixes(strings):
zipped = izip_longest(*strings, fillvalue='')
for index, letters in enumerate(zipped):
if index == 0:
prefixes = letters # assumes there will always be a prefix
else:
poss_prefixes = [prefix + letters[i] for i, prefix in enumerate(prefixes)]
prefixes = [prefix if poss_prefixes.count(prefix) == letters.count(prefix) # changed > 1 to == letters.count(prefix)
else prefixes[i] for i, prefix in enumerate(poss_prefixes)]
return set(prefixes)
Then, if you want, you can also find the suffixes:
def find_prefix_suffixes(strings, prefixes):
prefix_suffix = dict()
for s in strings:
for prefix in sorted(list(prefixes), key=lambda x: len(x), reverse=True):
if s.startswith(prefix):
if prefix in prefix_suffix:
prefix_suffix[prefix].add(s.replace(prefix, '', 1))
else:
prefix_suffix[prefix] = set([s.replace(prefix, '', 1)])
return prefix_suffix
Here is the result:
>>> s = ['ab', 'abc', 'abcd', 'acd', 'bc', 'bd', 'd']
>>> find_prefixes(s)
set(['a', 'b', 'd'])
>>> prefixes = find_prefixes(s)
>>> find_prefix_suffixes(s, prefixes)
{'a': set(['bcd', 'cd', 'b', 'bc']), 'b': set(['c', 'd']), 'd': set([''])}