forked from GitHub/gf-core
152 lines
4.4 KiB
Haskell
152 lines
4.4 KiB
Haskell
module Tokenize where
|
|
|
|
import Operations
|
|
---- import UseGrammar (isLiteral,identC)
|
|
import CFIdent
|
|
|
|
import Char
|
|
|
|
-- lexers = tokenizers, to prepare input for GF grammars. AR 4/1/2002
|
|
-- an entry for each is included in Custom.customTokenizer
|
|
|
|
-- just words
|
|
|
|
tokWords :: String -> [CFTok]
|
|
tokWords = map tS . words
|
|
|
|
tokLits :: String -> [CFTok]
|
|
tokLits = map mkCFTok . words
|
|
|
|
tokVars :: String -> [CFTok]
|
|
tokVars = map mkCFTokVar . words
|
|
|
|
mkCFTok :: String -> CFTok
|
|
mkCFTok s = case s of
|
|
'"' :cs@(_:_) -> tL $ init cs
|
|
'\'':cs@(_:_) -> tL $ init cs --- 's Gravenhage
|
|
_:_ | all isDigit s -> tI s
|
|
_ -> tS s
|
|
|
|
mkCFTokVar :: String -> CFTok
|
|
mkCFTokVar s = case s of
|
|
'?':_:_ -> tM s --- "?" --- compat with prCF
|
|
'x':'_':_ -> tV s
|
|
'x':[] -> tV s
|
|
'$':xs@(_:_) -> if last s == '$' then tV (init xs) else tS s
|
|
_ -> tS s
|
|
|
|
mkTokVars :: (String -> [CFTok]) -> String -> [CFTok]
|
|
mkTokVars tok = map tv . tok where
|
|
tv (TS s) = mkCFTokVar s
|
|
tv t = t
|
|
|
|
mkLit :: String -> CFTok
|
|
mkLit s = if (all isDigit s) then (tI s) else (tL s)
|
|
|
|
mkTL :: String -> CFTok
|
|
mkTL s = if (all isDigit s) then (tI s) else (tL ("'" ++ s ++ "'"))
|
|
|
|
|
|
-- Haskell lexer, usable for much code
|
|
|
|
lexHaskell :: String -> [CFTok]
|
|
lexHaskell ss = case lex ss of
|
|
[(w@(_:_),ws)] -> tS w : lexHaskell ws
|
|
_ -> []
|
|
|
|
-- somewhat shaky text lexer
|
|
|
|
lexText :: String -> [CFTok]
|
|
lexText = uncap . lx where
|
|
|
|
lx s = case s of
|
|
p : cs | isMPunct p -> tS [p] : uncap (lx cs)
|
|
p : cs | isPunct p -> tS [p] : lx cs
|
|
s : cs | isSpace s -> lx cs
|
|
_ : _ -> getWord s
|
|
_ -> []
|
|
|
|
getWord s = tS w : lx ws where (w,ws) = span isNotSpec s
|
|
isMPunct c = elem c ".!?"
|
|
isPunct c = elem c ",:;()\""
|
|
isNotSpec c = not (isMPunct c || isPunct c || isSpace c)
|
|
uncap (TS (c:cs) : ws) = tC (c:cs) : ws
|
|
uncap s = s
|
|
|
|
-- lexer for C--, a mini variant of C
|
|
|
|
lexC2M :: String -> [CFTok]
|
|
lexC2M = lexC2M' False
|
|
|
|
lexC2M' :: Bool -> String -> [CFTok]
|
|
lexC2M' isHigherOrder s = case s of
|
|
'#':cs -> lexC $ dropWhile (/='\n') cs
|
|
'/':'*':cs -> lexC $ dropComment cs
|
|
c:cs | isSpace c -> lexC cs
|
|
c:cs | isAlpha c -> getId s
|
|
c:cs | isDigit c -> getLit s
|
|
c:d:cs | isSymb [c,d] -> tS [c,d] : lexC cs
|
|
c:cs | isSymb [c] -> tS [c] : lexC cs
|
|
_ -> [] --- covers end of file and unknown characters
|
|
where
|
|
lexC = lexC2M' isHigherOrder
|
|
getId s = mkT i : lexC cs where (i,cs) = span isIdChar s
|
|
getLit s = tI i : lexC cs where (i,cs) = span isDigit s
|
|
isIdChar c = isAlpha c || isDigit c || elem c "'_"
|
|
isSymb = reservedAnsiCSymbol
|
|
dropComment s = case s of
|
|
'*':'/':cs -> cs
|
|
_:cs -> dropComment cs
|
|
_ -> []
|
|
mkT i = if (isRes i) then (tS i) else
|
|
if isHigherOrder then (tV i) else (tL ("'" ++ i ++ "'"))
|
|
isRes = reservedAnsiC
|
|
|
|
|
|
reservedAnsiCSymbol s = case lookupTree show s ansiCtree of
|
|
Ok True -> True
|
|
_ -> False
|
|
|
|
reservedAnsiC s = case lookupTree show s ansiCtree of
|
|
Ok False -> True
|
|
_ -> False
|
|
|
|
-- for an efficient lexer: precompile this!
|
|
ansiCtree = buildTree $ [(s,True) | s <- reservedAnsiCSymbols] ++
|
|
[(s,False) | s <- reservedAnsiCWords]
|
|
|
|
reservedAnsiCSymbols = words $
|
|
"<<= >>= << >> ++ -- == <= >= *= += -= %= /= &= ^= |= " ++
|
|
"^ { } = , ; + * - ( ) < > & % ! ~"
|
|
|
|
reservedAnsiCWords = words $
|
|
"auto break case char const continue default " ++
|
|
"do double else enum extern float for goto if int " ++
|
|
"long register return short signed sizeof static struct switch typedef " ++
|
|
"union unsigned void volatile while " ++
|
|
"main printin putchar" --- these are not ansi-C
|
|
|
|
-- turn unknown tokens into string literals; not recursively for literals 123, 'foo'
|
|
|
|
unknown2string :: (String -> Bool) -> [CFTok] -> [CFTok]
|
|
unknown2string isKnown = map mkOne where
|
|
mkOne t@(TS s) = if isKnown s then t else mkTL s
|
|
mkOne t@(TC s) = if isKnown s then t else mkTL s
|
|
mkOne t = t
|
|
|
|
unknown2var :: (String -> Bool) -> [CFTok] -> [CFTok]
|
|
unknown2var isKnown = map mkOne where
|
|
mkOne t@(TS "??") = if isKnown "??" then t else tM "??"
|
|
mkOne t@(TS s) = if isKnown s then t else tV s
|
|
mkOne t@(TC s) = if isKnown s then t else tV s
|
|
mkOne t = t
|
|
|
|
lexTextLiteral isKnown = unknown2string (eitherUpper isKnown) . lexText
|
|
lexHaskellLiteral isKnown = unknown2string isKnown . lexHaskell
|
|
|
|
lexHaskellVar isKnown = unknown2var isKnown . lexHaskell
|
|
|
|
eitherUpper isKnown w@(c:cs) = isKnown (toLower c : cs) || isKnown (toUpper c : cs)
|
|
eitherUpper isKnown w = isKnown w
|
|
|