diff --git a/gf.cabal b/gf.cabal index 3e823f26e..625865b1b 100644 --- a/gf.cabal +++ b/gf.cabal @@ -99,6 +99,7 @@ Library PGF.Optimize PGF.Printer PGF.Utilities + PGF.Lexing other-modules: PGF.CId PGF.Expr diff --git a/src/compiler/GF/Text/Lexing.hs b/src/compiler/GF/Text/Lexing.hs index 87d6ba4f7..29647a786 100644 --- a/src/compiler/GF/Text/Lexing.hs +++ b/src/compiler/GF/Text/Lexing.hs @@ -1,12 +1,12 @@ +-- | Lexers and unlexers - they work on space-separated word strings module GF.Text.Lexing (stringOp,opInEnv) where import GF.Text.Transliterations +import PGF.Lexing -import Data.Char +import Data.Char (isSpace) import Data.List (intersperse) --- lexers and unlexers - they work on space-separated word strings - stringOp :: String -> Maybe (String -> String) stringOp name = case name of "chars" -> Just $ appLexer (filter (not . all isSpace) . map return) @@ -51,84 +51,3 @@ appUnlexer f = f . words wrapHTML :: String -> String wrapHTML = unlines . tag . intersperse "
" . lines where tag ss = "":"":"":"":"" : ss ++ ["",""] - -lexText :: String -> [String] -lexText = uncap . lext where - lext s = case s of - c:cs | isMajorPunct c -> [c] : uncap (lext cs) - c:cs | isMinorPunct c -> [c] : lext cs - c:cs | isSpace c -> lext cs - _:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lext cs - _ -> [s] - uncap s = case s of - (c:cs):ws -> (toLower c : cs):ws - _ -> s - --- | Haskell lexer, usable for much code -lexCode :: String -> [String] -lexCode ss = case lex ss of - [(w@(_:_),ws)] -> w : lexCode ws - _ -> [] - --- | LaTeX style lexer, with "math" environment using Code between $...$ -lexMixed :: String -> [String] -lexMixed = concat . alternate False where - alternate env s = case s of - _:_ -> case break (=='$') s of - (t,[]) -> lex env t : [] - (t,c:m) -> lex env t : [[c]] : alternate (not env) m - _ -> [] - lex env = if env then lexCode else lexText - -bindTok :: [String] -> String -bindTok ws = case ws of - w:"&+":ws2 -> w ++ bindTok ws2 - w:[] -> w - w:ws2 -> w ++ " " ++ bindTok ws2 - [] -> "" - -unlexText :: [String] -> String -unlexText = unlext where - unlext s = case s of - w:[] -> w - w:[c]:[] | isPunct c -> w ++ [c] - w:[c]:cs | isMajorPunct c -> w ++ [c] ++ " " ++ capitInit (unlext cs) - w:[c]:cs | isMinorPunct c -> w ++ [c] ++ " " ++ unlext cs - w:ws -> w ++ " " ++ unlext ws - _ -> [] - --- capitalize first letter -capitInit s = case s of - c:cs -> toUpper c : cs - _ -> s - --- unquote each string of form "foo" -unquote = map unq where - unq s = case s of - '"':cs@(_:_) | last cs == '"' -> init cs - _ -> s - -unlexCode :: [String] -> String -unlexCode s = case s of - w:[] -> w - [c]:cs | isParen c -> [c] ++ unlexCode cs - w:cs@([c]:_) | isClosing c -> w ++ unlexCode cs - w:ws -> w ++ " " ++ unlexCode ws - _ -> [] - - -unlexMixed :: [String] -> String -unlexMixed = concat . alternate False where - alternate env s = case s of - _:_ -> case break (=="$") s of - (t,[]) -> unlex env t : [] - (t,c:m) -> unlex env t : sep env c : alternate (not env) m - _ -> [] - unlex env = if env then unlexCode else unlexText - sep env c = if env then c ++ " " else " " ++ c - -isPunct = flip elem ".?!,:;" -isMajorPunct = flip elem ".?!" -isMinorPunct = flip elem ",:;" -isParen = flip elem "()[]{}" -isClosing = flip elem ")]}" diff --git a/src/runtime/haskell/PGF/Lexing.hs b/src/runtime/haskell/PGF/Lexing.hs new file mode 100644 index 000000000..808a2af6f --- /dev/null +++ b/src/runtime/haskell/PGF/Lexing.hs @@ -0,0 +1,91 @@ +module PGF.Lexing where +import Data.Char(isSpace,toLower,toUpper) + +-- * Text lexing +lexText :: String -> [String] +lexText = uncap . lext where + lext s = case s of + c:cs | isMajorPunct c -> [c] : uncap (lext cs) + c:cs | isMinorPunct c -> [c] : lext cs + c:cs | isSpace c -> lext cs + _:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lext cs + _ -> [s] + uncap s = case s of + (c:cs):ws -> (toLower c : cs):ws + _ -> s + +unlexText :: [String] -> String +unlexText = unlext where + unlext s = case s of + w:[] -> w + w:[c]:[] | isPunct c -> w ++ [c] + w:[c]:cs | isMajorPunct c -> w ++ [c] ++ " " ++ capitInit (unlext cs) + w:[c]:cs | isMinorPunct c -> w ++ [c] ++ " " ++ unlext cs + w:ws -> w ++ " " ++ unlext ws + _ -> [] + +-- | Bind tokens separated by Prelude.BIND, i.e. &+ +bindTok :: [String] -> String +bindTok ws = case ws of + w:"&+":ws2 -> w ++ bindTok ws2 + w:[] -> w + w:ws2 -> w ++ " " ++ bindTok ws2 + [] -> "" + +-- * Code lexing + +-- | Haskell lexer, usable for much code +lexCode :: String -> [String] +lexCode ss = case lex ss of + [(w@(_:_),ws)] -> w : lexCode ws + _ -> [] + +unlexCode :: [String] -> String +unlexCode s = case s of + w:[] -> w + [c]:cs | isParen c -> [c] ++ unlexCode cs + w:cs@([c]:_) | isClosing c -> w ++ unlexCode cs + w:ws -> w ++ " " ++ unlexCode ws + _ -> [] + + +-- * Mixed lexing + +-- | LaTeX style lexer, with "math" environment using Code between $...$ +lexMixed :: String -> [String] +lexMixed = concat . alternate False where + alternate env s = case s of + _:_ -> case break (=='$') s of + (t,[]) -> lex env t : [] + (t,c:m) -> lex env t : [[c]] : alternate (not env) m + _ -> [] + lex env = if env then lexCode else lexText + +unlexMixed :: [String] -> String +unlexMixed = concat . alternate False where + alternate env s = case s of + _:_ -> case break (=="$") s of + (t,[]) -> unlex env t : [] + (t,c:m) -> unlex env t : sep env c : alternate (not env) m + _ -> [] + unlex env = if env then unlexCode else unlexText + sep env c = if env then c ++ " " else " " ++ c + +-- * Additional lexing uitilties + +-- | Capitalize first letter +capitInit s = case s of + c:cs -> toUpper c : cs + _ -> s + +-- | Unquote each string wrapped in double quotes +unquote = map unq where + unq s = case s of + '"':cs@(_:_) | last cs == '"' -> init cs + _ -> s + +isPunct = flip elem ".?!,:;" +isMajorPunct = flip elem ".?!" +isMinorPunct = flip elem ",:;" +isParen = flip elem "()[]{}" +isClosing = flip elem ")]}"