If the input is very consistent (as shown), you could probably get by with `re`.
For anything more complicated, you might want to look at a more robust parser like `pyparsing`.
Edit: Here is a very simple finite-state-machine parser using regular expressions; it handles blank lines, unnested `select;` and `end;` statements, and initial/successive `when`s. I don't handle `label`s because I'm not sure what they do - rename the V variable back to X?
import re
class SasTranslator:
    """
    Line-at-a-time translator from SAS ``select; when(...) ...; end;``
    blocks to Python ``if`` / ``elif`` chains.

    The instance is a small finite-state machine: call it once per input
    line, in order.  Each call returns

    * ``""``   for a blank line,
    * a two-line ``if``/``elif`` snippet for a ``when(...)`` clause,
    * ``None`` for ``select;`` and ``end;`` lines, and also for any line
      that no pattern recognises.

    NOTE(review): because ``select;`` / ``end;`` and unrecognised lines all
    yield ``None``, a caller cannot tell them apart from the return value
    alone.

    Nested ``select;`` blocks are rejected with ``ValueError``.
    """

    def __init__(self):
        # modes:
        #   0   not in START..END
        #   1   in START..END, no CASE seen yet
        #   2   in START..END, CASE already found
        self.mode = 0
        # input line #; starts at -1 and is incremented before each line is
        # handled, so the first line is reported as line 0
        self.offset = -1

    def handle_blank(self, match):
        """Blank line: emit an empty string, state is unchanged."""
        return ""

    def handle_start(self, match):
        """
        ``select;`` line: enter a block.

        Raises ValueError if a block is already open (no nesting).
        """
        if self.mode == 0:
            self.mode = 1
            return None
        else:
            raise ValueError("Found 'select;' in select block, line {}".format(self.offset))

    def handle_end(self, match):
        """
        ``end;`` line: leave the current block.

        Raises ValueError if no block is open, or if the block contained
        no ``when`` clause.
        """
        if self.mode == 0:
            raise ValueError("Found 'end;' with no opening 'select;', line {}".format(self.offset))
        elif self.mode == 1:
            raise ValueError("Found empty 'select;' .. 'end;', line {}".format(self.offset))
        elif self.mode == 2:
            self.mode = 0
            return None

    def handle_case(self, match):
        """
        ``when(var op value) target = value;`` line.

        The first clause of a block becomes ``if``, later ones ``elif``.
        Raises ValueError if the clause appears outside a block.
        """
        if self.mode == 0:
            raise ValueError("Found 'when' clause outside 'select;' .. 'end;', line {}".format(self.offset))
        elif self.mode == 1:
            test = "if"
            self.mode = 2
            # note: code continues after if..else block
        elif self.mode == 2:
            test = "elif"
            # note: code continues after if..else block

        test_var, op, test_val, assign_var, assign_val = match.groups()

        # SAS spells the equality comparison with a single '='; emitted
        # unchanged it would be an assignment inside a Python condition,
        # which is a syntax error.
        if op == "=":
            op = "=="

        return (
            "{test} {test_var} {op} {test_val}:\n"
            " {assign_var} = {assign_val}".format(
                test = test,
                test_var = test_var,
                op = op,
                test_val = test_val,
                assign_var = assign_var,
                assign_val = assign_val
            )
        )

    #
    # Build a dispatch table for the handlers
    #
    # Raw strings keep the regex escapes (\s, \d, \() out of Python's own
    # string-escape processing.  START and END tolerate leading whitespace
    # so that indented blocks are recognised the same way CASE already is.
    BLANK = re.compile(r"\s*$")
    START = re.compile(r"\s*select;\s*$")
    END = re.compile(r"\s*end;\s*$")
    CASE = re.compile(r"\s*when\((\w+)\s*([<>=]+)\s*([\d.-]+)\s*\)\s*(\w+)\s*=\s*([\d.-]+)\s*;\s*$")

    # Tried in order; the first pattern that matches wins.  The handlers are
    # stored as plain functions, so __call__ passes self explicitly.
    dispatch_table = [
        (BLANK, handle_blank),
        (START, handle_start),
        (END, handle_end),
        (CASE, handle_case)
    ]

    def __call__(self, line):
        """
        Translate a single line of input

        Returns the handler's result for the first matching pattern, or
        None when no pattern matches.
        """
        self.offset += 1
        for test, handler in SasTranslator.dispatch_table:
            match = test.match(line)
            if match is not None:
                return handler(self, match)
        # nothing matched!
        return None
def main(path="my_file.sas"):
    """
    Translate the SAS file at *path* and print the result to stdout.

    Each input line is fed to a single SasTranslator in order.  Whatever
    the translator returns is printed; lines for which it returns None are
    echoed with a ``***unknown***`` prefix so they are easy to spot.

    path -- input file to read; defaults to the original hard-coded
            "my_file.sas" so existing ``main()`` calls keep working.
    """
    trans = SasTranslator()
    with open(path) as inf:
        for line in inf:
            result = trans(line)
            if result is not None:
                print(result)
            else:
                print("***unknown*** {}".format(line.rstrip()))
# Run the translation only when this file is executed as a script,
# not when it is imported as a module.
if __name__=="__main__":
    main()
and run against your sample input it produces
if X_1 <= 6.7278:
V_1 = -0.0594
elif X_1 <= 19.5338:
V_1 = 0.0604
elif X_1 <= 45.1458:
V_1 = 0.1755
elif X_1 <= 83.5638:
V_1 = 0.2867
elif X_1 <= 203.0878:
V_1 = 0.395
elif X_1 > 203.0878:
V_1 = 0.5011
***unknown*** label V_1 ="X_1 ";
if X_2 <= 0.0836:
V_2 = 0.0562
elif X_2 <= 0.1826:
V_2 = 0.07
elif X_2 <= 0.2486:
V_2 = 0.0836
elif X_2 <= 0.3146:
V_2 = 0.0969
elif X_2 <= 0.3806:
V_2 = 0.1095
elif X_2 <= 0.4466:
V_2 = 0.1212
elif X_2 <= 0.5126:
V_2 = 0.132
elif X_2 <= 0.5786:
V_2 = 0.1419
elif X_2 <= 0.6446:
V_2 = 0.1511
elif X_2 <= 0.7106:
V_2 = 0.1596
elif X_2 <= 0.8526:
V_2 = 0.1679
elif X_2 > 0.8526:
V_2 = 0.176
***unknown*** label V_2 ="X_2 ";
Depending on how often you use this, it might be worth making a binary-search lookup function using `bisect` and translating the `select;` .. `end;` blocks into that form instead (although you would want to be very careful that the comparison operators are what you expect!) - something like
# Illustration only: index_into is not defined in this answer; it stands for
# the suggested bisect-based lookup helper.
# The first list holds the 5 breakpoints taken from the when-clauses, the
# second the 6 values to assign (one per resulting interval).
# NOTE(review): the when-clauses use `<=`, so the helper presumably needs
# bisect_left-style boundary handling - confirm before relying on it.
V_1 = index_into(
    X_1,
    [ 6.7278, 19.5338, 45.1458, 83.5638, 203.0878 ],
    [-0.0594, 0.0604, 0.1755, 0.2867, 0.395, 0.5011]
)
It could be significantly faster-running (especially as the number of options goes up) and much easier to comprehend and maintain.