module HsLexerPass1 where import HsLex(haskellLex) import HsLexUtils import HsLayoutPre(layoutPre,Pos,PosToken) import List(mapAccumL) default(Int)
The function lexerPass1 handles the part of lexical analysis that can be done independently of the parser, i.e., the tokenization and the addition of the extra layout tokens <n> and {n}, as specified in section 9.3 of the revised Haskell 98 Report.
-- The result of lexing: the list of tokens, each one carrying position
-- information (lexerPass0' pairs every token with its start position and text).
type LexerOutput = [PosToken]
-- A lexer turns the source text into its list of positioned tokens.
type Lexer = String -> LexerOutput
-- The complete first lexer pass: tokenize the source text with lexerPass0,
-- then post-process the resulting token stream with lexerPass1Only.
lexerPass1 :: Lexer
lexerPass1 source = lexerPass1Only (lexerPass0 source)

-- The part of pass 1 that works on an already tokenized input: discard the
-- whitespace and comment tokens (rmSpace), then hand the rest to layoutPre.
lexerPass1Only tokens = layoutPre (rmSpace tokens)
-- Keep only the entries whose token (the first component of each pair) is
-- neither whitespace nor one of the comment token classes.
rmSpace tokens = [ entry | entry@(tok, _) <- tokens, notWhite tok ]

-- True for every token class except Whitespace, Commentstart, Comment and
-- NestedComment.
notWhite tok = tok `notElem` [Whitespace, Commentstart, Comment, NestedComment]
-- Tokenize and add position information, counting positions from startPos.
lexerPass0 :: Lexer
lexerPass0 source = lexerPass0' startPos source

-- Like lexerPass0, but the first token is placed at the given position.
-- CR LF pairs are collapsed by rmcr before haskellLex sees the text.
lexerPass0' :: Pos -> Lexer
lexerPass0' pos0 source = snd (mapAccumL place pos0 (haskellLex (rmcr source)))
  where
    -- Label one token with the position at which it starts and advance the
    -- running position past its text.  The string paired with each token is
    -- reversed before use (presumably haskellLex delivers it back to front;
    -- confirm against HsLex).
    place here (tok, revText) = (nextPos here text, (tok, (here, text)))
      where
        text = reverse revText
-- Position of the first character of the input.
-- The first column is designated column 1, not 0 (and likewise the first line).
startPos :: Pos
startPos = (1, 1)
-- Advance a position past every character of a string, from left to right,
-- applying nextPos1 once per character.
nextPos :: Pos -> String -> Pos
nextPos start text = foldl nextPos1 start text
-- Advance a (line, column) position past a single character.
nextPos1 :: Pos -> Char -> Pos
nextPos1 (line, col) ch
  -- The characters newline, return, linefeed, and formfeed all start a new
  -- line.  ('\n' and '\LF' denote the same character, so three cases suffice.)
  | ch `elem` "\n\r\f" = (line + 1, 1)
  -- Tab stops are 8 characters apart.  A tab character moves the current
  -- position to the next tab stop; (not in the report) the first tab stop is
  -- column 1, so the stops are columns 1, 9, 17, ...
  | ch == '\t' = (line, col + 8 - ((col - 1) `mod` 8))
  -- Every other character occupies exactly one column.
  | otherwise = (line, col + 1)
Since nextPos examines one character at a time, it would increase the line number by 2 on seeing \CR\LF, which can happen when reading DOS files on a Unix-like system. Since the extra \CR characters can cause trouble later as well, we choose to simply remove them here, before tokenization.
-- Collapse every DOS line ending (a CR immediately followed by an LF) into a
-- single LF.  A CR that is not directly followed by an LF is kept unchanged.
rmcr :: String -> String
rmcr text =
  case text of
    '\CR' : '\LF' : rest -> '\LF' : rmcr rest
    ch : rest            -> ch : rmcr rest
    []                   -> []