Skip to content

Instantly share code, notes, and snippets.

@bistcuite
Created December 9, 2021 11:59
Show Gist options
  • Save bistcuite/a3179118a2a6eff7e8430dda872db301 to your computer and use it in GitHub Desktop.
Save bistcuite/a3179118a2a6eff7e8430dda872db301 to your computer and use it in GitHub Desktop.
def tokenizer(input_program):
    """Split a Lisp-style program string into a flat list of tokens.

    The input is scanned left to right. Each token is a dict with two
    keys, ``'type'`` and ``'value'``:

    * ``'lparen'`` / ``'rparen'`` -- a single ``(`` or ``)``.
    * ``'number'`` -- a maximal run of the digits ``0-9``, kept as a string.
    * ``'name'``   -- a maximal run of letters ``a-z`` (case-insensitive).

    Whitespace separates tokens but is never emitted as a token itself.

    Args:
        input_program: The source text to tokenize.

    Returns:
        A list of token dicts, in the order they appear in the input.
        An empty input yields an empty list.

    Raises:
        ValueError: If a character is encountered that does not start any
            of the token kinds above and is not whitespace.

    Example:
        >>> tokenizer("(add 2 4)")[:2]
        [{'type': 'lparen', 'value': '('}, {'type': 'name', 'value': 'add'}]
    """
    # Imported here so the function works even when the enclosing module
    # has no top-level `import re`.
    import re

    # Compiled once per call, outside the scanning loop. The `+` quantifier
    # lets a single match consume a whole run of digits/letters.
    whitespace_re = re.compile(r"\s")
    number_re = re.compile(r"[0-9]+")
    name_re = re.compile(r"[a-z]+", re.I)

    # Single-character tokens, mapped to their token type.
    paren_types = {'(': 'lparen', ')': 'rparen'}

    # `current` is the cursor into `input_program`; `tokens` collects output.
    current = 0
    tokens = []
    program_length = len(input_program)

    while current < program_length:
        char = input_program[current]

        # Parentheses: one character, one token.
        if char in paren_types:
            tokens.append({
                'type': paren_types[char],
                'value': char,
            })
            current += 1
            continue

        # Whitespace only separates tokens; skip it.
        if whitespace_re.match(char):
            current += 1
            continue

        # Numbers: capture the entire digit run as one token, e.g.
        #
        #   (add 123 456)
        #        ^^^ ^^^
        #
        # Matching against the whole string from `current` stops cleanly at
        # the end of the input, so a program that ends in a digit no longer
        # reads past the last character.
        number_match = number_re.match(input_program, current)
        if number_match:
            tokens.append({
                'type': 'number',
                'value': number_match.group(),
            })
            current = number_match.end()
            continue

        # Names: a run of letters, e.g. the `add` in `(add 2 4)`. Same
        # end-of-input safety as for numbers.
        name_match = name_re.match(input_program, current)
        if name_match:
            tokens.append({
                'type': 'name',
                'value': name_match.group(),
            })
            current = name_match.end()
            continue

        # Nothing matched: the character is not part of the language.
        raise ValueError('I dont know what this character is: ' + char)

    return tokens
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment