Lexer.hs

{-

ToDo: Parsing floats is a *real* hack...
ToDo: FloatTok should have three parts (integer part, fraction, exponent)
ToDo: Use a lexical analyser generator (lx?)

Known bug: It seems that _x is a valid identifier name; our lexer, on the
other hand, produces two lexemes, the wildcard followed by x. Argh!!!!
e.p. FIXED e.p.

-}

module Lexer (Token(..), lexer, isSymbol) where


import ParseMonad
import LexUtil
import SrcLoc
import Char
import Ratio
--import IOExts(trace)
import Token


{-

The source location, (f, y, x), is the coordinates of the previous token.
col is the current column in the source file f.  If col is 0, we are
somewhere at the beginning of the line before the first token.

Setting col to 0 is used in two places: just after emitting a virtual
close brace due to layout, so that next time through we check whether
we also need to emit a semi-colon, and at the beginning of the file,
to kick off the lexer.

-}
lexer :: (Token -> PM a) -> PM a
lexer cont =
    PM $ \input (SrcLoc f y x) col ->
    if col == 0 then tab f y x   True  input  col
                else tab f y col False input  col -- throw away old x
    where
    -- move past whitespace and comments
    tab f y x bol []          col = (unPM $ cont EOF) [] (SrcLoc f y x) col
    tab f y x bol ('\t':s)    col = tab f y (nextTab x) bol s col
    tab f y x bol ('\n':s)    col = newLine f s y col
    tab f y x bol ('-':'-':s) col = newLine f (drop 1 (dropWhile (/= '\n') s))
				      y col
    tab f y x bol ('{':'-':s) col = nestedComment tab f y x bol s col
    tab f y x bol (c:s)       col
		 | isSpace c = tab f y (x + 1) bol s col
	         | otherwise =
		     if bol then
		         (unPM $ lexBOL cont)   (c:s) (SrcLoc f y x) x
		     else
		         (unPM $ lexToken cont) (c:s) (SrcLoc f y x) x

    newLine f s y col = tab f (y + 1) 1 True s col


nextTab x = x + (tab_length - (x - 1) `mod` tab_length)

{-

When we are lexing the first token of a line, check whether we need to
insert virtual semicolons or close braces due to layout.

-}
lexBOL :: (Token -> PM a) -> PM a
lexBOL cont =
    PM $ \ s loc@(SrcLoc f y x) col (state@ctx) ->
    if need_close_curly x ctx then 
        -- trace "layout: inserting '}'\n" $
	-- Set col to 0, indicating that we're still at the
	-- beginning of the line, in case we need a semi-colon too.
	-- Also pop the context here, so that we don't insert
	-- another close brace before the parser can pop it.
        (unPM $ cont VRightCurly) s loc 0 (tail ctx)
    else if need_semi_colon x ctx then
        -- trace "layout: inserting ';'\n" $
	(unPM $ cont SemiColon) s loc col state
    else
        (unPM $ lexToken cont)  s loc col state
    where
        need_close_curly x []    = False
	need_close_curly x (i:_) = case i of
				   NoLayout -> False
				   Layout n -> x < n

        need_semi_colon x []     = False
	need_semi_colon x (i:_)  = case i of
				   NoLayout -> False
				   Layout n -> x == n


lexToken :: (Token -> PM a) -> PM a
lexToken cont =
    PM lexToken'
    where
    lexToken' (c:s) loc@(SrcLoc f y x') x =
        -- trace ("lexer: y = " ++ show y ++ " x = " ++ show x ++ "\n") $ 
	case c of
	-- First the special symbols
        '(' -> special LeftParen
        ')' -> special RightParen
        ',' -> special Comma
        ';' -> special SemiColon
        '[' -> special LeftSquare
        ']' -> special RightSquare
        '`' -> special BackQuote
        '{' -> special LeftCurly . (NoLayout:) -- push context on '{'
        '}' -> \state ->
               case state of
               _:ctxt ->
	           special RightCurly ctxt -- pop context on '}'
               []       ->
		   (unPM $ parseError "parse error (possibly incorrect indentation)")
                   s loc x []

        '\'' -> (unPM $ lexChar cont)   s loc (x + 1)
        '\"' -> (unPM $ lexString cont) s loc (x + 1)


        c | isDigit c ->
	      case lexInt (c:s) of
	      Decimal (n, rest) ->
		  case rest of
		  ('.':c2:rest2) | isDigit c2 ->
				     case lexFloatRest (c2:rest2) of
				     Nothing -> (unPM $
						 parseError "illegal float.")
						s loc x
				     Just (n2,rest3) ->
					 let f = n ++ ('.':n2) in
				         forward (length f) (FloatTok f) rest3
		  _ -> forward (length n) (IntTok n) rest
	      Octal       (n,rest) -> forward (length n) (IntTok n) rest
	      Hexadecimal (n,rest) -> forward (length n) (IntTok n) rest

          | isLower_ c ->
	      let (vidtail, rest) = span isIdent s
		  vid             = c:vidtail
		  l_vid           = 1 + length vidtail
	      in
		  case lookup vid reserved_ids of
		  Just keyword -> forward l_vid keyword rest
		  Nothing      -> forward l_vid (VarId vid) rest

          | isUpper c ->
	      let (contail, rest) = span isIdent s
		  l_con           = 1 + length contail
		  con             = c:contail
	      in
		  case lookup con reserved_ids of
		  Just keyword -> forward l_con keyword rest
		  Nothing      -> lexQual l_con con rest
                                      (forward l_con (ConId con) rest)
		  
          | isSymbol c ->
	      let (symtail, rest) = span isSymbol s
		  sym             = c : symtail
		  l_sym           = 1 + length symtail
	      in
	          case lookup sym reserved_ops of
		  Just t  -> forward l_sym t rest
		  Nothing -> case c of
			     ':' -> forward l_sym (ConSym sym) rest
                             '.' | l_sym == 1 -> special Period
                                 | otherwise  ->
                                    forward l_sym (VarSym sym) rest
			     _   -> forward l_sym (VarSym sym) rest

          | otherwise ->
	      (unPM $
	       parseError ("illegal character \'" ++
			   showLitChar c "" ++ "\'\n"))
	      s loc x
		  
      where
      special t = forward 1 t s
      forward n t s = (unPM $ cont t) s loc (x + n)
 
      lexFloatRest r = case span isDigit r of
		       (r2, 'e':r3) -> lexFloatExp (r2 ++ "e") r3
		       (r2, 'E':r3) -> lexFloatExp (r2 ++ "e") r3
		       f@(r2,   r3) -> Just f
 
      lexFloatExp r1 ('-':r2) = lexFloatExp2 (r1 ++ "-") r2
      lexFloatExp r1 ('+':r2) = lexFloatExp2 (r1 ++ "+") r2
      lexFloatExp r1      r2  = lexFloatExp2 r1          r2

      lexFloatExp2 r1 r2 = case span isDigit r2 of
			   ("", _ ) -> Nothing
			   (ds, r3) -> Just (r1++ds,r3)

      lexQual l_mod mod s lexConId =
          case s of
          '.':c:rest
           | isLower c ->	-- qualified varid?
	       let (vidtail, rest1) = span isIdent rest
		   vid              = c:vidtail
		   l_vid            = 1 + length vidtail
	       in
	       case lookup vid reserved_ids of -- cannot qualify reserved word
	       Just keyword -> lexConId
	       Nothing      -> forward (l_mod + l_vid + 1) -- + 1 for the '.'
				       (QVarId (mod, vid)) rest1

           | isUpper c ->	-- qualified conid?
	       let (con1, rest1) = span isIdent rest
		   qcon          = c : con1
		   l_con1        = 1 + length con1
	       in
		   forward (l_mod + l_con1 + 1) -- + 1 for the '.'
			   (QConId (mod, qcon)) rest1

           | isSymbol c ->	-- qualified symbol?
               let (symtail, rest1) = span isSymbol rest    
		   sym              = c : symtail
		   l_sym            = 1 + length symtail
               in
	       case lookup sym reserved_ops of
	       -- cannot qualify a reserved operator
	       Just _  -> lexConId
	       Nothing ->
		   case c of
	           ':' -> forward (l_mod + l_sym + 1) -- + 1 for the '.'
			          (QConSym (mod, sym)) rest1
		   _   -> forward (l_mod + l_sym + 1) -- + 1 for the '.'
			          (QVarSym (mod, sym)) rest1

           -- special case for M.[]; allows whitespace between '[' and ']'
	   -- (provided layout is not violated)
           | c == '[' ->
	       case snd $ span isSpace rest of
	       (']':_) -> forward l_mod (QModId mod) s
               _       -> lexConId

           -- special case for M.(), M.(,,,), etc.; allows whitespace between
	   -- '(' and ')', and '(' and ',' (provided layout is not violated)
           | c == '(' ->
	       case snd $ span isSpace rest of
	       (')':_) -> forward l_mod (QModId mod) s
               (',':_) -> forward l_mod (QModId mod) s
	       _       -> lexConId

	  _ -> lexConId -- not a qualified object
			      
    lexToken' _  _ _ =
	error "Lexer.lexToken: Internal error: empty input stream."


lexInt ('0':o:d:r) | toLower o == 'o' && isOctDigit d
    = let (ds, rs) = span isOctDigit r
      in
           Octal       ('0':'o':d:ds, rs)
lexInt ('0':x:d:r) | toLower x == 'x' && isHexDigit d
    = let (ds, rs) = span isHexDigit r
      in 
           Hexadecimal ('0':'x':d:ds, rs)
lexInt r = Decimal     (span isDigit r)


lexChar :: (Token -> PM a) -> PM a
lexChar cont = PM lexChar'
    where
    lexChar' s loc x =
	case s of
	'\\':s ->
	    let (e, s2, i) =
		  runPM (escapeChar s) "" loc x []
	    in
                charEnd e s2 loc (x + i)
	c:s  -> charEnd c s  loc (x + 1)
	[]   -> error "Lexer.lexChar: Internal error: empty list."

    charEnd c ('\'':s)   =
	\loc x -> (unPM $ cont (Character c)) s loc (x + 1)
    charEnd c s         =
	(unPM $ parseError "improperly terminated character constant.") s 


lexString :: (Token -> PM a) -> PM a
lexString cont = PM lexString'
    where
    lexString' s loc@(SrcLoc f y _) x = loop "" s x y
	where
	loop e s x y =
	    case s of
            '\\':'&':s  -> loop e s (x+2) y
	    '\\':c:s | isSpace c -> stringGap e s (x + 2) y
		     | otherwise ->
			 let (e', sr, i) =
			       runPM (escapeChar (c:s)) ""  loc x []
                         in  loop (e':e) sr (x+i) y
            '\"':s{-"-} -> (unPM $ cont (StringTok (reverse e))) s  loc (x + 1)
	    c:s	      -> loop (c:e) s (x + 1) y
	    []          -> (unPM $ parseError "improperly terminated string.")
			            s  loc x

	stringGap e s x y =
	    case s of
		'\n':s -> stringGap e s 1 (y + 1)
	        '\\':s -> loop e s (x + 1) y
	        c:s' | isSpace c -> stringGap e s' (x + 1) y
	             | otherwise ->
			 (unPM $ parseError "illegal character in string gap.")
			 s  loc x
	        []     -> error "Lexer.stringGap: Internal error: empty list."


escapeChar :: String -> PM (Char, String, Int)
escapeChar s = case s of
    -- Production charesc from section B.2 (Note: \& is handled by caller)
   'a':s 	 -> return ('\a', s, 2)
   'b':s 	 -> return ('\b', s, 2)
   'f':s 	 -> return ('\f', s, 2)
   'n':s 	 -> return ('\n', s, 2)
   'r':s 	 -> return ('\r', s, 2)
   't':s 	 -> return ('\t', s, 2)
   'v':s 	 -> return ('\v', s, 2)
   '\\':s        -> return ('\\', s, 2)
   '"':s         -> return ('\"', s, 2)
   '\'':s        -> return ('\'', s, 2)

    -- Production ascii from section B.2
   '^':x@(c:s)   -> cntrl x
   'N':'U':'L':s -> return ('\NUL', s, 4)
   'S':'O':'H':s -> return ('\SOH', s, 4)
   'S':'T':'X':s -> return ('\STX', s, 4)
   'E':'T':'X':s -> return ('\ETX', s, 4)
   'E':'O':'T':s -> return ('\EOT', s, 4)
   'E':'N':'Q':s -> return ('\ENQ', s, 4)
   'A':'C':'K':s -> return ('\ACK', s, 4)
   'B':'E':'L':s -> return ('\BEL', s, 4)
   'B':'S':s     -> return ('\BS',  s, 3)
   'H':'T':s  	 -> return ('\HT',  s, 3)
   'L':'F':s 	 -> return ('\LF',  s, 3)
   'V':'T':s 	 -> return ('\VT',  s, 3)
   'F':'F':s 	 -> return ('\FF',  s, 3)
   'C':'R':s 	 -> return ('\CR',  s, 3)
   'S':'O':s 	 -> return ('\SO',  s, 3)
   'S':'I':s 	 -> return ('\SI',  s, 3)
   'D':'L':'E':s -> return ('\DLE', s, 4)
   'D':'C':'1':s -> return ('\DC1', s, 4)
   'D':'C':'2':s -> return ('\DC2', s, 4)
   'D':'C':'3':s -> return ('\DC3', s, 4)
   'D':'C':'4':s -> return ('\DC4', s, 4)
   'N':'A':'K':s -> return ('\NAK', s, 4)
   'S':'Y':'N':s -> return ('\SYN', s, 4)
   'E':'T':'B':s -> return ('\ETB', s, 4)
   'C':'A':'N':s -> return ('\CAN', s, 4)
   'E':'M':s     -> return ('\EM',  s, 3)
   'S':'U':'B':s -> return ('\SUB', s, 4)
   'E':'S':'C':s -> return ('\ESC', s, 4)
   'F':'S':s     -> return ('\FS',  s, 3)
   'G':'S':s     -> return ('\GS',  s, 3)
   'R':'S':s     -> return ('\RS',  s, 3)
   'U':'S':s     -> return ('\US',  s, 3)
   'S':'P':s     -> return ('\SP',  s, 3)
   'D':'E':'L':s -> return ('\DEL', s, 4)


   -- Depending upon the compiler/interpreter's Char type, these yield either
   -- just 8-bit ISO-8859-1 or 2^16 UniCode.  The report says it should be the
   -- latter.

   -- Octal representation of a character
   'o':s           -> let (ds, s') = span isOctDigit s
			  n        = readNumber 8 ds
                      in 
		          numberToChar n s' (length ds + 1)

   -- Hexadecimal representation of a character
   'x':s           -> let (ds, s') = span isHexDigit s
			  n        = readNumber 16 ds
                      in
		          numberToChar n s' (length ds + 1)
 
   -- Base 10 representation of a charactef
   d:s | isDigit d -> let (ds, s') = span isDigit s
			  n        = readNumber 10 (d:ds)
                      in 
		          numberToChar n s' (length ds + 1)

   _               -> parseError "illegal escape sequence."

   where numberToChar n s l_n =
	     if toInteger (fromEnum (minBound :: Char))<=n &&
		n <= toInteger (fromEnum (maxBound :: Char))
	     then return (chr $ fromInteger n, s, l_n)
	     else parseError $ "illegal character literal (number "++
			       show n++" out of range)."
            
{-

Production cntrl from section B.2

-}
cntrl :: String -> PM (Char, String, Int)
cntrl (c   :s) | isUpper c = return (chr (ord c - ord 'A'), s, 2)
cntrl ('@' :s)             = return ('\^@', s, 2)
cntrl ('[' :s)             = return ('\^[', s, 2)
cntrl ('\\':s)             = return ('\^\', s, 2)
cntrl (']' :s)             = return ('\^]', s, 2)
cntrl ('^' :s)             = return ('\^^', s, 2)
cntrl ('_' :s)             = return ('\^_', s, 2)
cntrl _                    = parseError "illegal control character"


nestedComment cont f y x bol s  col =
    case s of
    '-':'}':s -> cont f y (x + 2) bol s  col
    '{':'-':s -> nestedComment (nestedComment cont) f y (x + 2) bol s  col
    '\t':s    -> nestedComment cont f y (nextTab x) bol s  col
    '\n':s    -> nestedComment cont f (y + 1) 1 True s  col
    c:s       -> nestedComment cont f y (x + 1) bol s  col
    []        -> error "Lexer.nestedComment: Internal error: empty list."


{-
getTokens 0       = return []
getTokens (n + 1) = lexer (\t -> do { l <- getTokens n
			            ; return (t:l)
				    }
			  )

tokens f s =
    (unPM $ getTokens (length s)) s (SrcLoc f 0 0) 0
        (error "Lexer.tokens: problem with initial infix environment.", [])

-}

{-
instance Printable Token where
    ppi (VarId v)        = text v
    ppi (QVarId (m, v))  = text m <> '.' <> text v
    ppi (ConId c)        = text c
    ppi (QConId (m, c))  = text m <> '.' <> text c
    ppi (VarSym v)       = text v
    ppi (ConSym c)       = text c
    ppi (QVarSym (m, v)) = text m <> '.' <> text v
    ppi (QConSym (m, c)) = text m <> '.' <> text c
    ppi (QModId m)       = text m
    ppi (IntTok i)       = text i
    ppi (FloatTok f)     = text f
    ppi (Character ch)   = char ch
    ppi (StringTok s)    = text s

    ppi LeftParen    = lparen
    ppi RightParen   = rparen
    ppi SemiColon    = semi
    ppi LeftCurly    = lcurly
    ppi RightCurly   = rcurly
    ppi VRightCurly  = text "virtual" <+> rcurly
    ppi LeftSquare   = lbrack
    ppi RightSquare  = rbrack
    ppi Comma        = comma
    ppi Underscore   = char '_'
    ppi BackQuote    = backQuote
    ppi Period       = char '.'

    ppi DotDot       = text ".."
    ppi DoubleColon  = text "::"
    ppi Equals       = text "=="
    ppi Backslash    = char '\\'
    ppi Bar          = char '|'
    ppi LeftArrow    = text "<-"
    ppi RightArrow   = text "->"
    ppi At           = char'@'
    ppi Tilde        = char '~'
    ppi DoubleArrow  = text "=>"
    ppi Minus        = char '-'
    ppi Exclamation  = char '!'

    ppi KW_As        = text "as"
    ppi KW_Case      = text "case"     
    ppi KW_Class     = text "class"    
    ppi KW_Data      = text "data"     
    ppi KW_Default   = text "default"  
    ppi KW_Deriving  = text "deriving" 
    ppi KW_Do        = text "do"       
    ppi KW_Else      = text "else"     
    ppi KW_If        = text "if"       
    ppi KW_Import    = text "import"   
    ppi KW_In        = text "in"       
    ppi KW_Infix     = text "infix"    
    ppi KW_InfixL    = text "infixl"   
    ppi KW_InfixR    = text "infixr"   
    ppi KW_Instance  = text "instance" 
    ppi KW_Let       = text "let"      
    ppi KW_Module    = text "module"   
    ppi KW_NewType   = text "newtype"  
    ppi KW_Of        = text "of"       
    ppi KW_Then      = text "then"     
    ppi KW_Type      = text "type"     
    ppi KW_Where     = text "where"    
    ppi KW_Qualified = text "qualified"
    ppi KW_Hiding    = text "hiding"
    ppi KW_Primitive = text "primitive"

    ppi KW_Property  = text "property"
    ppi KW_QAll      = text "All"
    ppi KW_QExists   = text "Ex"
    ppi KW_QAllDef   = text "AllDef"

    ppi EOF = text "EOF"
-}