There is an undocumented tool in the `re` module, `re.Scanner`, which may be
helpful here. You could use it like this:
import re
import sys
def section(scanner, token):
    """Build a SECTION token from the scanner's current match.

    Args:
        scanner: The scanner invoking this callback; its ``match`` attribute
            is read to obtain capture group 1.
        token: The text that was matched (unused here).

    Returns:
        A ``("SECTION", <group 1 text>)`` tuple.
    """
    return "SECTION", scanner.match.group(1)
def some_line(scanner, token):
    """Build a SOME LINE token carrying the matched text.

    Args:
        scanner: The scanner invoking this callback (unused here).
        token: The text that was matched.

    Returns:
        A ``("SOME LINE", token)`` tuple.
    """
    return "SOME LINE", token
def garbage(scanner, token):
    """Abort the program when unrecognised input is encountered.

    Args:
        scanner: The scanner invoking this callback (unused here).
        token: The offending text, included in the exit message.

    Raises:
        SystemExit: Always, with a message naming the unrecognised text.
    """
    sys.exit('Found garbage: {}'.format(token))
# The scanner attempts to match these patterns in the order listed.
# When a pattern matches, its paired callable is invoked with
# (scanner, matched_text); a None action discards the match.
scanner = re.Scanner(
    [
        (r"section (\d+)$", section),
        (r"some line$", some_line),
        (r"\s+", None),  # skip whitespace
        (r".+", garbage),  # if you get here it's garbage
    ],
    flags=re.MULTILINE,  # let "$" anchor at the end of each line
)
# Scan the sample input. The string body stays at column 0 because it is
# literal text; `remainder` receives the second value returned by scan().
tokens, remainder = scanner.scan('''\
section 1
some line
''')
# Print each token produced by the callbacks, one per line.
for token in tokens:
    print(token)
Running this yields:
('SECTION', '1')
('SOME LINE', 'some line')