Recursion is definitely needed to solve this. In pyparsing, you define a recursive grammar using the Forward
class. See the annotations in this code sample:
from pyparsing import (Suppress, Word, nums, alphas, Regex, Forward, Group,
Optional, OneOrMore, ParseResults)
from collections import defaultdict
"""
BNF for simple chemical formula (no nesting)
integer :: '0'..'9'+
element :: 'A'..'Z' 'a'..'z'*
term :: element [integer]
formula :: term+
BNF for nested chemical formula
integer :: '0'..'9'+
element :: 'A'..'Z' 'a'..'z'*
term :: (element | '(' formula ')') [integer]
formula :: term+
"""
# parentheses delimit subgroups but are suppressed from the parsed results
LPAR, RPAR = map(Suppress, "()")
integer = Word(nums)
# add parse action to convert integers to ints, to support doing addition
# and multiplication at parse time
integer.setParseAction(lambda t: int(t[0]))
# an element symbol is one uppercase letter followed by zero or more
# lowercase letters
element = Word(alphas.upper(), alphas.lower())
# or if you want to be more specific, use this Regex
# element = Regex(r"A[cglmrstu]|B[aehikr]?|C[adeflmorsu]?|D[bsy]|E[rsu]|F[emr]?|"
#                 "G[ade]|H[efgos]?|I[nr]?|Kr?|L[airu]|M[dgnot]|N[abdeiop]?|"
#                 "Os?|P[abdmortu]?|R[abefghnu]|S[bcegimnr]?|T[abcehilm]|"
#                 "Uu[bhopqst]|U|V|W|Xe|Yb?|Z[nr]")
# forward declare 'formula' so it can be used in definition of 'term'
formula = Forward()
# a term is an element or a parenthesized sub-formula (named "subgroup"),
# followed by an optional multiplier (named "mult") that defaults to 1
term = Group((element | Group(LPAR + formula + RPAR)("subgroup")) +
             Optional(integer, default=1)("mult"))
# define contents of a formula as one or more terms; '<<=' is used instead
# of a bare '<<' expression so the assignment cannot be broken by operator
# precedence if alternatives are ever added with '|'
formula <<= OneOrMore(term)
# add parse actions for parse-time processing
# parse action to multiply out subgroups
def multiplyContents(tokens):
    """Scale the counts inside a parenthesized subgroup by its multiplier.

    Args:
        tokens: parse results for one matched term; ``tokens[0]`` is the
            grouped term, exposing ``subgroup`` (the nested
            ``[element, count]`` pairs, falsy when the term is a plain
            element) and ``mult`` (the multiplier).

    Returns:
        The subgroup's pairs, with every count multiplied in place by
        ``mult``, when the term is a subgroup; otherwise ``None`` so the
        original tokens are kept unchanged.
    """
    t = tokens[0]
    # if these tokens contain a subgroup, then use multiplier to
    # extend counts of all elements in the subgroup
    if t.subgroup:
        mult = t.mult
        # loop variable is deliberately not called 'term', to avoid
        # shadowing the module-level grammar expression of that name
        for entry in t.subgroup:
            entry[1] *= mult
        return t.subgroup
    # plain element term: nothing to multiply out
    return None
# register multiplyContents as the parse action for 'term'
term.setParseAction(multiplyContents)
# add parse action to sum up multiple references to the same element
def sumByElement(tokens):
    """Combine repeated element names into a single entry per element.

    Args:
        tokens: sequence of ``[element, count]`` pairs.

    Returns:
        ``None`` when every element name in ``tokens`` is distinct (the
        tokens are left as they are); otherwise a new nested
        ``ParseResults`` holding one ``[element, total]`` pair per
        distinct element name.
    """
    elementsList = [t[0] for t in tokens]
    # construct set to see if there are duplicates
    duplicates = len(elementsList) > len(set(elementsList))
    if not duplicates:
        return None
    # there are duplicate element names: sum up by element and
    # return a new nested ParseResults
    ctr = defaultdict(int)
    for t in tokens:
        ctr[t[0]] += t[1]
    return ParseResults([ParseResults([k, v]) for k, v in ctr.items()])
# register sumByElement as the parse action for 'formula'
formula.setParseAction(sumByElement)
# run some tests
tests = """\
H
NaCl
HO
H2O
HOH
(H2O)2
(H2O)2OH
((H2O)2OH)12
C6H5OH
""".splitlines()
for t in tests:
    # skip blank lines
    if t.strip():
        results = formula.parseString(t)
        # a single pre-formatted string prints the same text whether print
        # is the Python 2 statement or the Python 3 function
        print("%s -> %s" % (t, dict(results.asList())))
Prints out:
H -> {'H': 1}
NaCl -> {'Na': 1, 'Cl': 1}
HO -> {'H': 1, 'O': 1}
H2O -> {'H': 2, 'O': 1}
HOH -> {'H': 2, 'O': 1}
(H2O)2 -> {'H': 4, 'O': 2}
(H2O)2OH -> {'H': 5, 'O': 3}
((H2O)2OH)12 -> {'H': 60, 'O': 36}
C6H5OH -> {'H': 6, 'C': 6, 'O': 1}