Last active
May 27, 2020 21:31
-
-
Save selimslab/7eae7fed17db07beb311f585f75a781f to your computer and use it in GitHub Desktop.
a simple interpreter in python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Here be dragons | |
This is a basic interpreter | |
a computer program is just text. Compilers and interpreters translate this text for the machine | |
let's say our program is 2 * 7 + 3 | |
1. First step is lexical analysis, a fancy term for tokenizing | |
Lexer creates tokens [2, *, 7, +, 3] | |
2. Second is the syntax analysis or parsing | |
Parser takes the tokens and produces and intermediate representation or IR | |
abstract syntax tree (AST) is a very simple IR, there are only values and operators, no metadata | |
here is the AST for 2 * 7 + 3 | |
+ | |
/ \ | |
* 3 | |
/ \ | |
2 7 | |
3. Interpreter evaluates the AST | |
one way to implement an interpreter is visitor pattern and it makes it easy to add new operations later | |
just visit the nodes and execute the operations | |
program text -> Tokenizer -> tokens -> Parser -> abstract syntax tree -> Interpreter -> result | |
""" | |
# LEXER or Tokenizer | |
from enum import Enum | |
class TokenType(Enum): | |
PLUS = '+' | |
MINUS = "-" | |
INTEGER = 'INTEGER' | |
EOF = 'EOF' | |
MUL = "*" | |
DIV = "/" | |
LPAREN = "(" | |
RPAREN = ")" | |
class Token: | |
def __init__(self, type: TokenType, value): | |
self.type = type | |
self.value = value | |
class LexerError(Exception): | |
pass | |
class Lexer: | |
""" | |
The process is called lexical analysis and the part of the interpreter | |
that does it is called a lexical analyzer, lexer, scanner, or tokenizer. | |
Here is our own lexer from the ground up without using regular expressions or any other tools like Lex. | |
""" | |
def __init__(self, text): | |
self.text = text | |
self.pos = 0 | |
self.current_char = self.text[self.pos] | |
def advance(self): | |
"""Advance the 'pos' pointer and set the 'current_char' variable.""" | |
self.pos += 1 | |
if self.pos > len(self.text) - 1: | |
self.current_char = None # Indicates end of input | |
else: | |
self.current_char = self.text[self.pos] | |
def skip_whitespace(self): | |
while self.current_char is not None and self.current_char.isspace(): | |
self.advance() | |
def multidigit(self): | |
"""Return a (multidigit) integer consumed from the input.""" | |
result = '' | |
while self.current_char is not None and self.current_char.isdigit(): | |
result += self.current_char | |
self.advance() | |
return result | |
def get_next_token(self): | |
while self.current_char is not None: | |
if self.current_char.isspace(): | |
self.skip_whitespace() | |
continue | |
if self.current_char.isdigit(): | |
value = self.multidigit() | |
return Token(TokenType.INTEGER, int(value)) | |
try: | |
# TokenType('+') --> TokenType.PLUS | |
token_type = TokenType(self.current_char) | |
self.advance() | |
return Token(token_type, token_type.value) | |
except ValueError: | |
raise | |
return Token(TokenType.EOF, None) | |
# AST | |
class ASTNode: | |
pass | |
class BinaryOperator(ASTNode): | |
def __init__(self, left, op, right): | |
self.left = left | |
self.token = self.op = op | |
self.right = right | |
class Integer(ASTNode): | |
def __init__(self, token): | |
self.token = token | |
self.value = token.value | |
# PARSER | |
class ParserError(Exception): | |
pass | |
class Parser: | |
""" | |
How to recognize a phrase in the stream of tokens ? | |
Finding structure in the stream of tokens is called parsing or syntax analysis. | |
Here is the parser or syntax analyzer. | |
""" | |
def __init__(self, tokenizer): | |
self.tokenizer = tokenizer | |
self.current_token = self.tokenizer.get_next_token() | |
def eat(self, token_type): | |
if self.current_token.type == token_type: | |
self.current_token = self.tokenizer.get_next_token() | |
else: | |
raise ParserError(f"invalid input, {self.current_token.type, token_type}") | |
def factor(self): | |
""" | |
factor : int or (expr) | |
""" | |
token = self.current_token | |
if token.type == TokenType.INTEGER: | |
self.eat(TokenType.INTEGER) | |
return Integer(token) | |
elif token.type == TokenType.LPAREN: | |
self.eat(TokenType.LPAREN) | |
node = self.expr() | |
self.eat(TokenType.RPAREN) | |
return node | |
def term(self): | |
""" | |
term : factor (* or / factor).. | |
""" | |
node = self.factor() | |
while self.current_token.type in (TokenType.MUL, TokenType.DIV): | |
token = self.current_token | |
if token.type == TokenType.MUL: | |
self.eat(TokenType.MUL) | |
elif token.type == TokenType.DIV: | |
self.eat(TokenType.DIV) | |
node = BinaryOperator(left=node, op=token, right=self.factor()) | |
return node | |
def expr(self): | |
""" | |
expr : term ( + or - term).. | |
term : factor (* or / factor).. | |
factor : int or (expr) | |
""" | |
node = self.term() | |
while self.current_token.type in (TokenType.PLUS, TokenType.MINUS): | |
token = self.current_token | |
if token.type == TokenType.PLUS: | |
self.eat(TokenType.PLUS) | |
elif token.type == TokenType.MINUS: | |
self.eat(TokenType.MINUS) | |
node = BinaryOperator(left=node, op=token, right=self.term()) | |
return node | |
def program(self): | |
return self.expr() | |
def parse(self): | |
return self.program() | |
# INTERPRETER | |
class NodeVisitorError(Exception): | |
pass | |
class InterpreterError(Exception): | |
pass | |
class NodeVisitor: | |
def visit(self, node): | |
method_name = 'visit_' + type(node).__name__ | |
visitor = getattr(self, method_name, self.generic_visit) | |
return visitor(node) | |
def generic_visit(self, node): | |
raise NodeVisitorError(f'No method to visit {type(node).__name__}') | |
class Interpreter(NodeVisitor): | |
def __init__(self, parser): | |
self.parser = parser | |
def visit_BinaryOperator(self, node): | |
left = self.visit(node.left) | |
right = self.visit(node.right) | |
if node.op.type == TokenType.PLUS: | |
return left + right | |
elif node.op.type == TokenType.MINUS: | |
return left - right | |
elif node.op.type == TokenType.MUL: | |
return left * right | |
elif node.op.type == TokenType.DIV: | |
return left // right | |
def visit_Integer(self, node): | |
return node.value | |
def eval(self, ast:ASTNode): | |
return self.visit(ast) | |
def eval_program(program:str): | |
lexer = Lexer(program) | |
parser = Parser(lexer) | |
ast = parser.parse() | |
interpreter = Interpreter(parser) | |
result = interpreter.eval(ast) | |
return result | |
# REPL | |
def repl(): | |
while True: | |
program = input('repl> ') | |
print(program) | |
if program: | |
try: | |
result = eval_program(program) | |
print(result) | |
except (ParserError, LexerError, NodeVisitorError) as e: | |
print(e) | |
def test_paspy(): | |
test_cases = [ | |
("2+2",4), | |
(" 23432 * 423 ", 9911736), | |
(" (1024/16)+36*2 ", 136) | |
] | |
for case, expected_answer in test_cases: | |
assert eval_program(case) == expected_answer | |
print("OK") | |
if __name__ == '__main__': | |
test_paspy() | |
repl() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment