Skip to content

Instantly share code, notes, and snippets.

@ponyta
Created August 8, 2018 14:39
Show Gist options
  • Save ponyta/a373eb2f96a35efbca5202fd539e874e to your computer and use it in GitHub Desktop.
Save ponyta/a373eb2f96a35efbca5202fd539e874e to your computer and use it in GitHub Desktop.
lisp parser
#!/usr/bin/env python3
from enum import Enum, auto
# Represents an AST Node
class Node:
def __init__(self, key, children=None):
self.key = key
self.children = children
def __str__(self):
return str(self.key) + "[ " + ",".join(map(str, self.children)) + " ]"
class TokenType(Enum):
OPEN_PAREN = auto()
CLOSE_PAREN = auto()
NUMBER = auto()
FUNCTION = auto()
class Token:
def __init__(self, token_type, value=None):
self.type = token_type
if token_type is TokenType.OPEN_PAREN:
self.value = '('
elif token_type is TokenType.CLOSE_PAREN:
self.value = ')'
else:
if value is None:
raise AssertionError('NUMBER or FUNCTION tokens must supply a value')
self.value = value
def __str__(self):
return self.type.name + " " + self.value
def __repr__(self):
return self.type.name + " " + self.value
def is_whitespace(char):
if char is '\t' or char is '\n' or char is ' ':
return True
return False
# For now, we are only parsing natural numbers. Also we allow arbitrary leading
# zeros.
# TODO: implement ^[+|-]?[1-9][0-9]*\.?[0-9]*$ for arbitrary numbers
def is_number(char):
if char <= '9' and char >= '0':
return True
return False
# Valid letters for an identifier (for a function, since there are no variables yet)
def is_letter(char):
if ('a' <= char <= 'z' or 'A' <= char <= 'Z'
or char is '+' or char is '-' or char is '*' or char is '/'):
return True
return False
# Turns the input string into an array of tokens
def tokenize(string):
tokens = []
i = 0
while i < len(string):
if is_whitespace(string[i]): # just ignore any whitespace
while is_whitespace(string[i]):
i += 1
i -= 1
elif string[i] == '(':
tokens.append(Token(TokenType.OPEN_PAREN))
elif string[i] == ')':
tokens.append(Token(TokenType.CLOSE_PAREN))
elif is_number(string[i]):
num_start = i
while is_number(string[i]):
i += 1
tokens.append(Token(TokenType.NUMBER, string[num_start:i]))
i -= 1
elif is_letter(string[i]):
id_start = i
while is_letter(string[i]):
i += 1
tokens.append(Token(TokenType.FUNCTION, string[id_start:i]))
i -= 1
i += 1
return tokens
"""
My basic LISP language
s-expr -> atom
| OPEN_PAREN FUNCTION {s-expr} CLOSE_PAREN
atom -> NUMBER
"""
def parse_sexpr(tokens):
print("parsing sexp for: ", tokens)
if len(tokens) == 0:
raise AssertionError("Expected tokens, not nothing")
if tokens[0].type is TokenType.OPEN_PAREN:
if len(tokens) < 3:
raise AssertionError("Failed to parse s-expr rule for tokens: " + str(tokens))
if tokens[1].type is not TokenType.FUNCTION:
raise AssertionError("Expected token with type FUNCTION, got " + str(tokens[1]) + " instead.")
tokens.pop(0) # pop the open paren
parent = Node(tokens.pop(0))
children = []
while tokens[0].type is not TokenType.CLOSE_PAREN:
children.append(parse_sexpr(tokens))
parent.children = children
tokens.pop(0) # pop off the close paren
return parent
elif tokens[0].type is TokenType.NUMBER:
return tokens.pop(0) # an atomic token is a node in an AST
else:
raise AssertionError("Could not parse tokens: " + str(tokens))
test = "(first (list 1 (+ 2 34) 9))"
print(parse_sexpr(tokenize(test)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment