If you need to match a recursive pattern with Python's standard re module, you can do what I did for the nested comments in a CSS preprocessor I built recently.
In general, use re only to split the text into tokens, and then loop over those tokens with a nesting-level variable to recognize the nested syntax. Here is my code:
# Delimiter tokens relevant to comment removal: "//" opens a line comment,
# a newline closes it, and "/*" / "*/" open and close a block comment.
# The pattern is compiled in verbose mode, so the layout whitespace inside
# it is not part of the match.
COMMENTsRe = re.compile(
    r"""
// |
\n |
/\* |
\*/
""",
    flags=re.VERBOSE,
)
def rm_comments(cut):
    """Find every comment in *cut* and pass the spans to ``cut.rm_and_save``.

    Two comment styles are recognised:

    * ``//`` line comments, which run up to (but not including) the next
      newline, or to the end of the text when no newline follows;
    * ``/* ... */`` block comments, which may be nested; a block ends at
      the ``*/`` that balances its opening ``/*``.

    The ``re`` module cannot match nested structures on its own, so
    ``COMMENTsRe`` is used only to find the delimiter tokens and the
    nesting is tracked here with a depth counter.

    A block comment that is still open at the end of the text is not
    reported.

    Args:
        cut: Object whose ``str()`` is the text to scan and which provides
            ``rm_and_save(spans)``. ``spans`` is a list of ``(start, end)``
            offsets into ``str(cut)``, in order of appearance, where
            ``end`` is one past the last character of the comment.
    """
    outside, in_block, in_line = range(3)

    text = str(cut)
    mode = outside
    depth = 0  # nesting level of block comments
    start = 0  # offset where the currently open comment began
    spans = []

    for match in COMMENTsRe.finditer(text):
        token = match.group()
        if mode == in_line:
            # Only a newline ends a line comment; the newline itself is
            # kept, so the span stops where the newline starts.
            if token == "\n":
                spans.append((start, match.start()))
                mode = outside
        elif mode == in_block:
            if token == "/*":
                depth += 1
            elif token == "*/":
                depth -= 1
                if depth == 0:
                    spans.append((start, match.end()))
                    mode = outside
        elif token == "//":
            start = match.start()
            mode = in_line
        elif token == "/*":
            start = match.start()
            mode = in_block
            depth = 1

    # A line comment on the last line has no newline to close it; it runs
    # to the end of the text.
    if mode == in_line:
        spans.append((start, len(text)))

    cut.rm_and_save(spans)