Created
May 19, 2012 13:13
-
-
Save osa1/2730819 to your computer and use it in GitHub Desktop.
hand-written lexer for EtuLang (a hypothetical PL for a PL course) in Haskell
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# OPTIONS_GHC -Wall #-} | |
module Lexer where | |
import Prelude hiding (lex) | |
import Control.Monad.State | |
import qualified Data.Set as S | |
import Char (toLower) | |
data LexemeClass | |
= LLParen | |
| LRParen | |
| LAssign | |
| LAdd | |
| LSub | |
| LDiv | |
| LMult | |
| LSemicolon | |
| LNoteq | |
| LGreater | |
| LLess | |
| LGreq | |
| LLesseq | |
| LComma | |
| LColon | |
| LId | |
| LInt | |
| LKeyword | |
| LEOF | |
deriving (Show, Eq) | |
data Token = Token LexemeClass Pos String deriving (Show) | |
data Pos = Pos { line :: Int , col :: Int } deriving (Show) | |
data LexerState = LexerState | |
{ currentPos :: Pos | |
, rest :: !String | |
, curLexeme :: String | |
} deriving (Show) | |
type LexerError = (String, Pos) | |
type Lexer = State LexerState (Either LexerError Token) | |
keywords :: S.Set String | |
keywords = S.fromList [ | |
"begin", "end", "if", "then", "else", "while", "program", "integer" | |
] | |
initLexerState :: String -> LexerState | |
initLexerState s = LexerState (Pos 0 0) s [] | |
runLexer :: Lexer -> LexerState -> (Either LexerError Token, LexerState) | |
runLexer = runState | |
evalLexer :: Lexer -> LexerState -> Either LexerError Token | |
evalLexer = evalState | |
skipComment :: LexerState -> Either LexerError LexerState | |
skipComment (LexerState p [] _) = Left $ ("EOF while reading comment", p) | |
skipComment (LexerState (Pos l c) (f:r) _) | |
| f == '%' = Right $ LexerState (Pos l (c+1)) r "" | |
| f == '\n' = skipComment $ LexerState (Pos (l+1) 0) r "" | |
| otherwise = skipComment $ LexerState (Pos l (c+1)) r "" | |
readToken :: [Char] -> LexemeClass -> Lexer | |
readToken chars cls = do | |
s <- get | |
case s of | |
(LexerState p@(Pos l c) [] lexeme) -> do | |
put (LexerState p [] "") | |
return $ Right $ Token cls (Pos l (c-(length lexeme))) lexeme | |
(LexerState (Pos l c) str@(f:r) lexeme) | |
| toLower f `elem` chars -> do | |
put (LexerState (Pos l (c+1)) r (lexeme++[f])) | |
readToken chars cls | |
| otherwise -> do | |
let pos' = if f == '\n' | |
then (Pos (l+1) 0) | |
else (Pos l c) | |
put $ LexerState pos' str "" | |
return $ Right $ Token cls (Pos l (c-(length lexeme))) lexeme | |
readId :: Lexer | |
readId = do | |
t <- readToken (['0'..'9'] ++ ['a'..'z']) LId | |
case t of | |
Left err -> return $ Left err | |
Right tok@(Token _ p s) -> | |
if S.member s keywords | |
then return $ Right (Token LKeyword p s) | |
else return $ Right tok | |
readInt :: Lexer | |
readInt = readToken ['0'..'9'] LInt | |
incCol :: LexerState -> LexerState | |
incCol (LexerState (Pos l c) (_:r) lm) = LexerState (Pos l (c+1)) r lm | |
lex' :: Lexer | |
lex' = do | |
st@(LexerState p s _) <- get | |
put $ incCol st | |
case s of | |
[] -> return $ Right $ Token LEOF p "" | |
(f:r) | toLower f `elem` ['a'..'z'] -> readId | |
| f `elem` ['0'..'9'] -> readInt | |
| f == '(' -> return $ Right $ Token LLParen p "(" | |
| f == ')' -> return $ Right $ Token LRParen p ")" | |
| f == ':' -> case r of | |
('=':_) -> do put $ incCol st | |
return $ Right $ Token LAssign p ":=" | |
_ -> return $ Right $ Token LColon p ":" | |
| f == '+' -> return $ Right $ Token LAdd p "+" | |
| f == '-' -> return $ Right $ Token LSub p "-" | |
| f == '/' -> return $ Right $ Token LDiv p "/" | |
| f == '*' -> return $ Right $ Token LMult p "*" | |
| f == ';' -> return $ Right $ Token LSemicolon p ";" | |
| f == '<' -> case r of | |
('>':_) -> do put $ incCol st | |
return $ Right $ Token LNoteq p "<>" | |
_ -> return $ Right $ Token LLess p "<" | |
| f == '>' -> case r of | |
('=':_) -> do put $ incCol st | |
return $ Right $ Token LGreq p ">=" | |
_ -> return $ Right $ Token LGreater p ">" | |
| f == ',' -> return $ Right $ Token LComma p "," | |
| f == '%' -> do let st' = skipComment st | |
either (\e -> return $ Left e) | |
(\ns -> (put ns) >> lex') | |
st' | |
| f == ' ' -> (put $ incCol st) >> lex' | |
| f == '\n' -> (put $ incCol st) >> lex' | |
| otherwise -> return $ Left ("Unrecognized character.", p) | |
lex :: String -> Either LexerError [Token] | |
lex s = lexA (initLexerState s) [] | |
where lexA :: LexerState -> [Token] -> Either LexerError [Token] | |
lexA state tokens = | |
let (r,s) = runLexer lex' state | |
in case r of | |
Left err -> Left err | |
Right t@(Token c _ _) -> | |
if c == LEOF | |
then Right $ tokens ++ [t] | |
else lexA s (tokens++[t]) | |
main :: IO () | |
main = do | |
input <- getContents | |
let tokens = lex input | |
putStrLn $ show tokens |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment