import sys # # ParserInput # # This class represents the input data and the current # position in the data. # # Brief note about 'max_position': # # When running a complex parser, the current position moves forward # when a particular sub-parser matches and moves backward when # it doesn't match. The 'max_position' field tracks the farthest position # over the entire execution of a complex parser. # When the complex parser fails, the farthest position is a good indicator # of the location of the error. # # The purpose of the 'max_position' field is to report # the error location when the main parser doesn't match. # class ParserInput(object) : def __init__(self, data, position) : self._data = data self._position = position self._max_position = position def data(self) : return self._data def position(self) : return self._position def remaining(self) : return len(self._data) - self._position def read(self) : i = self._position return self._data[i:i+1] def inc_position(self) : self._position += 1 self._max_position = max(self._max_position, self._position) def set_position(self, position) : self._position = position self._max_position = max(self._max_position, self._position) def max_position(self) : return self._max_position # # Match, NoMatch # # A parser object returns either a Match or NoMatch object. # The Match object contains a 'value' property that holds # the result of parsing. # class Match(object) : def __init__(self, value) : self.value = value def __repr__(self) : return "Match(%s)" % (self.value,) class NoMatch(object) : def __repr__(self) : return "NoMatch()" def is_match(result) : return isinstance(result, Match) # # Parser # # This is the base-class representing parser objects. # The 'parse' method is the method that does the parsing. # It accepts a ParserInput object and returns either a Match # or NoMatch object. # # In the case of a match, the parsed result is returned # in the 'value' property of the Match object and the # parser input position is advanced by the length of the input # that was parsed. # # In the case of no-match, the parser input position must # remain unchanged from what it was when the 'parse' method # was called. If the input position was advanced during # intermediate stages, then it should be restored back to # what it was when the 'parse' method was called. # # Note on the 'match' method and 'modifier' attribute: # # The 'modifier' attribute can be set to a function that # modifies the 'value' property of the Match object in the event # of a successful match. The 'match' method is a helper method # that applies the 'modifier' if it is present. # class Parser(object) : def match(self, value) : modifier = getattr(self, "modifier", None) if modifier is not None : value = self.modifier(value) return Match(value) def parse(self, parser_input) : raise NotImplementedError # # Char # # The Char parser parses a single character matching the character # provided in the constructor if it is found at the current position # and fails if it's not found or if there are no more characters left. # class Char(Parser) : def __init__(self, char) : self._char = char def parse(self, parser_input) : if parser_input.remaining() == 0 : return NoMatch() c = parser_input.read() if c != self._char : return NoMatch() parser_input.inc_position() return self.match(c) # # CharSet # # The CharSet parser parses a single character matching any one # of the characters passed in the constructor. The input to the # constructor can be a string, a list of characters, or a set of # characters. # class CharSet(Parser) : def __init__(self, chars) : self._chars = chars def parse(self, parser_input) : if parser_input.remaining() == 0 : return NoMatch() c = parser_input.read() if c not in self._chars : return NoMatch() parser_input.inc_position() return self.match(c) # # ZeroOrMore # # ZeroOrMore takes another parser as input and applies it repeatedly # until it doesn't match. All the results of the repeated application # are collected in a list and returned as the result of parsing. # # This parser never fails to match because it simply returns an # empty list as a match if the input parser doesn't match even once. # class ZeroOrMore(Parser) : def __init__(self, parser) : self._parser = parser def parse(self, parser_input) : values = [] while True : result = self._parser.parse(parser_input) if not is_match(result) : break values.append(result.value) return self.match(values) # # OneOrMore # # OneOrMore takes another parser as input and applies it repeatedly # until it doesn't match. All the results of the repeated application # are collected in a list and returned as the result of parsing. # # This parser matches only if the input parser matches atleast once. # class OneOrMore(Parser) : def __init__(self, parser) : self._parser = parser def parse(self, parser_input) : values = [] while True : result = self._parser.parse(parser_input) if not is_match(result) : break values.append(result.value) if len(values) == 0 : return NoMatch() return self.match(values) # # Sequence # # The Sequence parser takes N parsers as input and applies them # one after another. Sequence succeeds in matching if all the input # parsers succeed and doesn't match even if one of input parsers don't # match. In the event of a successful match, the results of the input # parsers are collected in a list and returned as the result of parsing. # class Sequence(Parser) : def __init__(self, *parsers) : self._parsers = parsers def parse(self, parser_input) : values = [] saved = parser_input.position() for parser in self._parsers : result = parser.parse(parser_input) if not is_match(result) : parser_input.set_position(saved) return NoMatch() values.append(result.value) return self.match(values) # # Choice # # The Choice parser takes N parsers as input. It applies the first one # and if it succeeds then the result is returned as a successful match. # If the first parser doesn't match, then the second parser is attempted. # This continues until one of the parsers matches. The result of the # first matching parser is returned as the match result of # the Choice parser. The Choice parser doesn't match if all the # input parsers fail to match. # class Choice(Parser) : def __init__(self, *parsers) : self._parsers = parsers def parse(self, parser_input) : for parser in self._parsers : result = parser.parse(parser_input) if is_match(result) : return self.match(result.value) return NoMatch() # # Optional # # The Optional parser takes a parser as input and always succeeds # in matching. If the input parser succeeds in matching, then its # result is returned as the result of a successful match. If the # input parser doesn't match, then None is returned as a successful # match. # class Optional(Parser) : def __init__(self, parser) : self._parser = parser def parse(self, parser_input) : result = self._parser.parse(parser_input) if not is_match(result) : return self.match(None) return self.match(result.value) # # Wrapper # # Wrapper is a simple direct wrapper around another parser. # The input parser is available in the 'parser' property. # It can be leftout during construction and can be set directly # set later. This allows for forward declaring a parser that needs # to call itself recursively. # class Wrapper(Parser) : def __init__(self, parser=None) : self.parser = parser def parse(self, parser_input) : result = self.parser.parse(parser_input) if not is_match(result) : return result return self.match(result.value) # # EOF # # The EOF parser succeeds in matching if the input posiition # is at the end. It fails to match otherwise. # class EOF(Parser) : def parse(self, parser_input) : if parser_input.remaining() == 0 : return self.match(None) else : return NoMatch() # # Example 1 # # Parse one or more digits # def example1() : digit = CharSet("0123456789") digits = OneOrMore(digit) return digits # # Example 2 # # Parse one or more digits and convert to integer # def example2() : # convert single digit to integer def char_to_digit(c) : return ord(c) - ord("0") # convert list of digit values to integer def digits_to_number(a) : result = 0 for x in a : result = result*10 + x return result digit = CharSet("0123456789") digit.modifier = char_to_digit digits = OneOrMore(digit) digits.modifier = digits_to_number return digits # # Example 3 # # An expression parser. # def example3() : # Helpers def char_to_digit(c) : return ord(c) - ord("0") def digits_to_number(a) : result = 0 for x in a : result = result*10 + x return result def bracket_inner(a) : return a[1] def apply_sign(a) : sign = a[0] if sign is not None : return [sign, a[1]] else : return a[1] def make_operation_tree(a) : value = a[0] for operator, value2 in a[1] : value = [operator, value, value2] return value # parsers whitespace = ZeroOrMore(Char(" ")) def eat_space(parser) : parser2 = Sequence(whitespace, parser) parser2.modifier = lambda a : a[1] return Wrapper(parser2) digit = CharSet("0123456789") digit.modifier = char_to_digit number = eat_space(OneOrMore(digit)) number.modifier = digits_to_number expr = Wrapper() left_parenthesis = eat_space(Char("(")) right_parenthesis = eat_space(Char(")")) bracketed = Sequence(left_parenthesis, expr, right_parenthesis) bracketed.modifier = bracket_inner sign = eat_space(CharSet("+-")) atomic = Sequence(Optional(sign), Choice(number, bracketed)) atomic.modifier = apply_sign term_operator = eat_space(CharSet("*/")) term_operation = Sequence(term_operator, atomic) term = Sequence(atomic, ZeroOrMore(term_operation)) term.modifier = make_operation_tree expr_operator = eat_space(CharSet("+-")) expr_operation = Sequence(expr_operator, term) expr.parser = Sequence(term, ZeroOrMore(expr_operation)) expr.modifier = make_operation_tree return expr # # main # main_parser = None # # Uncomment one of the following lines. # # main_parser = example1() # main_parser = example2() # main_parser = example3() def main() : if main_parser is None : print "Please uncomment one of the example initializer lines" return if len(sys.argv) != 2 : print "%s <input>" % (sys.argv[0],) return s = sys.argv[1] parser_input = ParserInput(s, 0) result = main_parser.parse(parser_input) if not is_match(result) : print "ERROR: %s" % (s,) i = len("ERROR: ") + parser_input.max_position() print "%s^" % ("-"*i,) else : print "RESULT:", result.value i = parser_input.position() data = parser_input.data() if i < len(data) : print "UNPARSED LEFTOVER:", data[i:] if __name__ == "__main__" : main()