mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-09 04:59:31 -06:00
Move basic lexing functions from GF.Text.Lexing to the new module PGF.Lexing
They are thus part of the PGF Run-Time Library, making it possible to add lexing functionality in PGF service in a natural way.
This commit is contained in:
1
gf.cabal
1
gf.cabal
@@ -99,6 +99,7 @@ Library
|
||||
PGF.Optimize
|
||||
PGF.Printer
|
||||
PGF.Utilities
|
||||
PGF.Lexing
|
||||
other-modules:
|
||||
PGF.CId
|
||||
PGF.Expr
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
-- | Lexers and unlexers - they work on space-separated word strings
|
||||
module GF.Text.Lexing (stringOp,opInEnv) where
|
||||
|
||||
import GF.Text.Transliterations
|
||||
import PGF.Lexing
|
||||
|
||||
import Data.Char
|
||||
import Data.Char (isSpace)
|
||||
import Data.List (intersperse)
|
||||
|
||||
-- lexers and unlexers - they work on space-separated word strings
|
||||
|
||||
stringOp :: String -> Maybe (String -> String)
|
||||
stringOp name = case name of
|
||||
"chars" -> Just $ appLexer (filter (not . all isSpace) . map return)
|
||||
@@ -51,84 +51,3 @@ appUnlexer f = f . words
|
||||
wrapHTML :: String -> String
|
||||
wrapHTML = unlines . tag . intersperse "<br>" . lines where
|
||||
tag ss = "<html>":"<head>":"<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />":"</head>":"<body>" : ss ++ ["</body>","</html>"]
|
||||
|
||||
lexText :: String -> [String]
|
||||
lexText = uncap . lext where
|
||||
lext s = case s of
|
||||
c:cs | isMajorPunct c -> [c] : uncap (lext cs)
|
||||
c:cs | isMinorPunct c -> [c] : lext cs
|
||||
c:cs | isSpace c -> lext cs
|
||||
_:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lext cs
|
||||
_ -> [s]
|
||||
uncap s = case s of
|
||||
(c:cs):ws -> (toLower c : cs):ws
|
||||
_ -> s
|
||||
|
||||
-- | Haskell lexer, usable for much code
|
||||
lexCode :: String -> [String]
|
||||
lexCode ss = case lex ss of
|
||||
[(w@(_:_),ws)] -> w : lexCode ws
|
||||
_ -> []
|
||||
|
||||
-- | LaTeX style lexer, with "math" environment using Code between $...$
|
||||
lexMixed :: String -> [String]
|
||||
lexMixed = concat . alternate False where
|
||||
alternate env s = case s of
|
||||
_:_ -> case break (=='$') s of
|
||||
(t,[]) -> lex env t : []
|
||||
(t,c:m) -> lex env t : [[c]] : alternate (not env) m
|
||||
_ -> []
|
||||
lex env = if env then lexCode else lexText
|
||||
|
||||
bindTok :: [String] -> String
|
||||
bindTok ws = case ws of
|
||||
w:"&+":ws2 -> w ++ bindTok ws2
|
||||
w:[] -> w
|
||||
w:ws2 -> w ++ " " ++ bindTok ws2
|
||||
[] -> ""
|
||||
|
||||
unlexText :: [String] -> String
|
||||
unlexText = unlext where
|
||||
unlext s = case s of
|
||||
w:[] -> w
|
||||
w:[c]:[] | isPunct c -> w ++ [c]
|
||||
w:[c]:cs | isMajorPunct c -> w ++ [c] ++ " " ++ capitInit (unlext cs)
|
||||
w:[c]:cs | isMinorPunct c -> w ++ [c] ++ " " ++ unlext cs
|
||||
w:ws -> w ++ " " ++ unlext ws
|
||||
_ -> []
|
||||
|
||||
-- capitalize first letter
|
||||
capitInit s = case s of
|
||||
c:cs -> toUpper c : cs
|
||||
_ -> s
|
||||
|
||||
-- unquote each string of form "foo"
|
||||
unquote = map unq where
|
||||
unq s = case s of
|
||||
'"':cs@(_:_) | last cs == '"' -> init cs
|
||||
_ -> s
|
||||
|
||||
unlexCode :: [String] -> String
|
||||
unlexCode s = case s of
|
||||
w:[] -> w
|
||||
[c]:cs | isParen c -> [c] ++ unlexCode cs
|
||||
w:cs@([c]:_) | isClosing c -> w ++ unlexCode cs
|
||||
w:ws -> w ++ " " ++ unlexCode ws
|
||||
_ -> []
|
||||
|
||||
|
||||
unlexMixed :: [String] -> String
|
||||
unlexMixed = concat . alternate False where
|
||||
alternate env s = case s of
|
||||
_:_ -> case break (=="$") s of
|
||||
(t,[]) -> unlex env t : []
|
||||
(t,c:m) -> unlex env t : sep env c : alternate (not env) m
|
||||
_ -> []
|
||||
unlex env = if env then unlexCode else unlexText
|
||||
sep env c = if env then c ++ " " else " " ++ c
|
||||
|
||||
isPunct = flip elem ".?!,:;"
|
||||
isMajorPunct = flip elem ".?!"
|
||||
isMinorPunct = flip elem ",:;"
|
||||
isParen = flip elem "()[]{}"
|
||||
isClosing = flip elem ")]}"
|
||||
|
||||
91
src/runtime/haskell/PGF/Lexing.hs
Normal file
91
src/runtime/haskell/PGF/Lexing.hs
Normal file
@@ -0,0 +1,91 @@
|
||||
module PGF.Lexing where
|
||||
import Data.Char(isSpace,toLower,toUpper)
|
||||
|
||||
-- * Text lexing
|
||||
lexText :: String -> [String]
|
||||
lexText = uncap . lext where
|
||||
lext s = case s of
|
||||
c:cs | isMajorPunct c -> [c] : uncap (lext cs)
|
||||
c:cs | isMinorPunct c -> [c] : lext cs
|
||||
c:cs | isSpace c -> lext cs
|
||||
_:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lext cs
|
||||
_ -> [s]
|
||||
uncap s = case s of
|
||||
(c:cs):ws -> (toLower c : cs):ws
|
||||
_ -> s
|
||||
|
||||
unlexText :: [String] -> String
|
||||
unlexText = unlext where
|
||||
unlext s = case s of
|
||||
w:[] -> w
|
||||
w:[c]:[] | isPunct c -> w ++ [c]
|
||||
w:[c]:cs | isMajorPunct c -> w ++ [c] ++ " " ++ capitInit (unlext cs)
|
||||
w:[c]:cs | isMinorPunct c -> w ++ [c] ++ " " ++ unlext cs
|
||||
w:ws -> w ++ " " ++ unlext ws
|
||||
_ -> []
|
||||
|
||||
-- | Bind tokens separated by Prelude.BIND, i.e. &+
|
||||
bindTok :: [String] -> String
|
||||
bindTok ws = case ws of
|
||||
w:"&+":ws2 -> w ++ bindTok ws2
|
||||
w:[] -> w
|
||||
w:ws2 -> w ++ " " ++ bindTok ws2
|
||||
[] -> ""
|
||||
|
||||
-- * Code lexing
|
||||
|
||||
-- | Haskell lexer, usable for much code
|
||||
lexCode :: String -> [String]
|
||||
lexCode ss = case lex ss of
|
||||
[(w@(_:_),ws)] -> w : lexCode ws
|
||||
_ -> []
|
||||
|
||||
unlexCode :: [String] -> String
|
||||
unlexCode s = case s of
|
||||
w:[] -> w
|
||||
[c]:cs | isParen c -> [c] ++ unlexCode cs
|
||||
w:cs@([c]:_) | isClosing c -> w ++ unlexCode cs
|
||||
w:ws -> w ++ " " ++ unlexCode ws
|
||||
_ -> []
|
||||
|
||||
|
||||
-- * Mixed lexing
|
||||
|
||||
-- | LaTeX style lexer, with "math" environment using Code between $...$
|
||||
lexMixed :: String -> [String]
|
||||
lexMixed = concat . alternate False where
|
||||
alternate env s = case s of
|
||||
_:_ -> case break (=='$') s of
|
||||
(t,[]) -> lex env t : []
|
||||
(t,c:m) -> lex env t : [[c]] : alternate (not env) m
|
||||
_ -> []
|
||||
lex env = if env then lexCode else lexText
|
||||
|
||||
unlexMixed :: [String] -> String
|
||||
unlexMixed = concat . alternate False where
|
||||
alternate env s = case s of
|
||||
_:_ -> case break (=="$") s of
|
||||
(t,[]) -> unlex env t : []
|
||||
(t,c:m) -> unlex env t : sep env c : alternate (not env) m
|
||||
_ -> []
|
||||
unlex env = if env then unlexCode else unlexText
|
||||
sep env c = if env then c ++ " " else " " ++ c
|
||||
|
||||
-- * Additional lexing uitilties
|
||||
|
||||
-- | Capitalize first letter
|
||||
capitInit s = case s of
|
||||
c:cs -> toUpper c : cs
|
||||
_ -> s
|
||||
|
||||
-- | Unquote each string wrapped in double quotes
|
||||
unquote = map unq where
|
||||
unq s = case s of
|
||||
'"':cs@(_:_) | last cs == '"' -> init cs
|
||||
_ -> s
|
||||
|
||||
isPunct = flip elem ".?!,:;"
|
||||
isMajorPunct = flip elem ".?!"
|
||||
isMinorPunct = flip elem ",:;"
|
||||
isParen = flip elem "()[]{}"
|
||||
isClosing = flip elem ")]}"
|
||||
Reference in New Issue
Block a user