mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-05-20 16:42:51 -06:00
Move basic lexing functions from GF.Text.Lexing to the new module PGF.Lexing
They are thus part of the PGF Run-Time Library, making it possible to add lexing functionality in PGF service in a natural way.
This commit is contained in:
1
gf.cabal
1
gf.cabal
@@ -99,6 +99,7 @@ Library
|
|||||||
PGF.Optimize
|
PGF.Optimize
|
||||||
PGF.Printer
|
PGF.Printer
|
||||||
PGF.Utilities
|
PGF.Utilities
|
||||||
|
PGF.Lexing
|
||||||
other-modules:
|
other-modules:
|
||||||
PGF.CId
|
PGF.CId
|
||||||
PGF.Expr
|
PGF.Expr
|
||||||
|
|||||||
@@ -1,12 +1,12 @@
|
|||||||
|
-- | Lexers and unlexers - they work on space-separated word strings
|
||||||
module GF.Text.Lexing (stringOp,opInEnv) where
|
module GF.Text.Lexing (stringOp,opInEnv) where
|
||||||
|
|
||||||
import GF.Text.Transliterations
|
import GF.Text.Transliterations
|
||||||
|
import PGF.Lexing
|
||||||
|
|
||||||
import Data.Char
|
import Data.Char (isSpace)
|
||||||
import Data.List (intersperse)
|
import Data.List (intersperse)
|
||||||
|
|
||||||
-- lexers and unlexers - they work on space-separated word strings
|
|
||||||
|
|
||||||
stringOp :: String -> Maybe (String -> String)
|
stringOp :: String -> Maybe (String -> String)
|
||||||
stringOp name = case name of
|
stringOp name = case name of
|
||||||
"chars" -> Just $ appLexer (filter (not . all isSpace) . map return)
|
"chars" -> Just $ appLexer (filter (not . all isSpace) . map return)
|
||||||
@@ -51,84 +51,3 @@ appUnlexer f = f . words
|
|||||||
wrapHTML :: String -> String
|
wrapHTML :: String -> String
|
||||||
wrapHTML = unlines . tag . intersperse "<br>" . lines where
|
wrapHTML = unlines . tag . intersperse "<br>" . lines where
|
||||||
tag ss = "<html>":"<head>":"<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />":"</head>":"<body>" : ss ++ ["</body>","</html>"]
|
tag ss = "<html>":"<head>":"<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />":"</head>":"<body>" : ss ++ ["</body>","</html>"]
|
||||||
|
|
||||||
lexText :: String -> [String]
|
|
||||||
lexText = uncap . lext where
|
|
||||||
lext s = case s of
|
|
||||||
c:cs | isMajorPunct c -> [c] : uncap (lext cs)
|
|
||||||
c:cs | isMinorPunct c -> [c] : lext cs
|
|
||||||
c:cs | isSpace c -> lext cs
|
|
||||||
_:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lext cs
|
|
||||||
_ -> [s]
|
|
||||||
uncap s = case s of
|
|
||||||
(c:cs):ws -> (toLower c : cs):ws
|
|
||||||
_ -> s
|
|
||||||
|
|
||||||
-- | Haskell lexer, usable for much code
|
|
||||||
lexCode :: String -> [String]
|
|
||||||
lexCode ss = case lex ss of
|
|
||||||
[(w@(_:_),ws)] -> w : lexCode ws
|
|
||||||
_ -> []
|
|
||||||
|
|
||||||
-- | LaTeX style lexer, with "math" environment using Code between $...$
|
|
||||||
lexMixed :: String -> [String]
|
|
||||||
lexMixed = concat . alternate False where
|
|
||||||
alternate env s = case s of
|
|
||||||
_:_ -> case break (=='$') s of
|
|
||||||
(t,[]) -> lex env t : []
|
|
||||||
(t,c:m) -> lex env t : [[c]] : alternate (not env) m
|
|
||||||
_ -> []
|
|
||||||
lex env = if env then lexCode else lexText
|
|
||||||
|
|
||||||
bindTok :: [String] -> String
|
|
||||||
bindTok ws = case ws of
|
|
||||||
w:"&+":ws2 -> w ++ bindTok ws2
|
|
||||||
w:[] -> w
|
|
||||||
w:ws2 -> w ++ " " ++ bindTok ws2
|
|
||||||
[] -> ""
|
|
||||||
|
|
||||||
unlexText :: [String] -> String
|
|
||||||
unlexText = unlext where
|
|
||||||
unlext s = case s of
|
|
||||||
w:[] -> w
|
|
||||||
w:[c]:[] | isPunct c -> w ++ [c]
|
|
||||||
w:[c]:cs | isMajorPunct c -> w ++ [c] ++ " " ++ capitInit (unlext cs)
|
|
||||||
w:[c]:cs | isMinorPunct c -> w ++ [c] ++ " " ++ unlext cs
|
|
||||||
w:ws -> w ++ " " ++ unlext ws
|
|
||||||
_ -> []
|
|
||||||
|
|
||||||
-- capitalize first letter
|
|
||||||
capitInit s = case s of
|
|
||||||
c:cs -> toUpper c : cs
|
|
||||||
_ -> s
|
|
||||||
|
|
||||||
-- unquote each string of form "foo"
|
|
||||||
unquote = map unq where
|
|
||||||
unq s = case s of
|
|
||||||
'"':cs@(_:_) | last cs == '"' -> init cs
|
|
||||||
_ -> s
|
|
||||||
|
|
||||||
unlexCode :: [String] -> String
|
|
||||||
unlexCode s = case s of
|
|
||||||
w:[] -> w
|
|
||||||
[c]:cs | isParen c -> [c] ++ unlexCode cs
|
|
||||||
w:cs@([c]:_) | isClosing c -> w ++ unlexCode cs
|
|
||||||
w:ws -> w ++ " " ++ unlexCode ws
|
|
||||||
_ -> []
|
|
||||||
|
|
||||||
|
|
||||||
unlexMixed :: [String] -> String
|
|
||||||
unlexMixed = concat . alternate False where
|
|
||||||
alternate env s = case s of
|
|
||||||
_:_ -> case break (=="$") s of
|
|
||||||
(t,[]) -> unlex env t : []
|
|
||||||
(t,c:m) -> unlex env t : sep env c : alternate (not env) m
|
|
||||||
_ -> []
|
|
||||||
unlex env = if env then unlexCode else unlexText
|
|
||||||
sep env c = if env then c ++ " " else " " ++ c
|
|
||||||
|
|
||||||
isPunct = flip elem ".?!,:;"
|
|
||||||
isMajorPunct = flip elem ".?!"
|
|
||||||
isMinorPunct = flip elem ",:;"
|
|
||||||
isParen = flip elem "()[]{}"
|
|
||||||
isClosing = flip elem ")]}"
|
|
||||||
|
|||||||
91
src/runtime/haskell/PGF/Lexing.hs
Normal file
91
src/runtime/haskell/PGF/Lexing.hs
Normal file
@@ -0,0 +1,91 @@
|
|||||||
|
module PGF.Lexing where
|
||||||
|
import Data.Char(isSpace,toLower,toUpper)
|
||||||
|
|
||||||
|
-- * Text lexing
|
||||||
|
lexText :: String -> [String]
|
||||||
|
lexText = uncap . lext where
|
||||||
|
lext s = case s of
|
||||||
|
c:cs | isMajorPunct c -> [c] : uncap (lext cs)
|
||||||
|
c:cs | isMinorPunct c -> [c] : lext cs
|
||||||
|
c:cs | isSpace c -> lext cs
|
||||||
|
_:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lext cs
|
||||||
|
_ -> [s]
|
||||||
|
uncap s = case s of
|
||||||
|
(c:cs):ws -> (toLower c : cs):ws
|
||||||
|
_ -> s
|
||||||
|
|
||||||
|
unlexText :: [String] -> String
|
||||||
|
unlexText = unlext where
|
||||||
|
unlext s = case s of
|
||||||
|
w:[] -> w
|
||||||
|
w:[c]:[] | isPunct c -> w ++ [c]
|
||||||
|
w:[c]:cs | isMajorPunct c -> w ++ [c] ++ " " ++ capitInit (unlext cs)
|
||||||
|
w:[c]:cs | isMinorPunct c -> w ++ [c] ++ " " ++ unlext cs
|
||||||
|
w:ws -> w ++ " " ++ unlext ws
|
||||||
|
_ -> []
|
||||||
|
|
||||||
|
-- | Bind tokens separated by Prelude.BIND, i.e. &+
|
||||||
|
bindTok :: [String] -> String
|
||||||
|
bindTok ws = case ws of
|
||||||
|
w:"&+":ws2 -> w ++ bindTok ws2
|
||||||
|
w:[] -> w
|
||||||
|
w:ws2 -> w ++ " " ++ bindTok ws2
|
||||||
|
[] -> ""
|
||||||
|
|
||||||
|
-- * Code lexing
|
||||||
|
|
||||||
|
-- | Haskell lexer, usable for much code
|
||||||
|
lexCode :: String -> [String]
|
||||||
|
lexCode ss = case lex ss of
|
||||||
|
[(w@(_:_),ws)] -> w : lexCode ws
|
||||||
|
_ -> []
|
||||||
|
|
||||||
|
unlexCode :: [String] -> String
|
||||||
|
unlexCode s = case s of
|
||||||
|
w:[] -> w
|
||||||
|
[c]:cs | isParen c -> [c] ++ unlexCode cs
|
||||||
|
w:cs@([c]:_) | isClosing c -> w ++ unlexCode cs
|
||||||
|
w:ws -> w ++ " " ++ unlexCode ws
|
||||||
|
_ -> []
|
||||||
|
|
||||||
|
|
||||||
|
-- * Mixed lexing
|
||||||
|
|
||||||
|
-- | LaTeX style lexer, with "math" environment using Code between $...$
|
||||||
|
lexMixed :: String -> [String]
|
||||||
|
lexMixed = concat . alternate False where
|
||||||
|
alternate env s = case s of
|
||||||
|
_:_ -> case break (=='$') s of
|
||||||
|
(t,[]) -> lex env t : []
|
||||||
|
(t,c:m) -> lex env t : [[c]] : alternate (not env) m
|
||||||
|
_ -> []
|
||||||
|
lex env = if env then lexCode else lexText
|
||||||
|
|
||||||
|
unlexMixed :: [String] -> String
|
||||||
|
unlexMixed = concat . alternate False where
|
||||||
|
alternate env s = case s of
|
||||||
|
_:_ -> case break (=="$") s of
|
||||||
|
(t,[]) -> unlex env t : []
|
||||||
|
(t,c:m) -> unlex env t : sep env c : alternate (not env) m
|
||||||
|
_ -> []
|
||||||
|
unlex env = if env then unlexCode else unlexText
|
||||||
|
sep env c = if env then c ++ " " else " " ++ c
|
||||||
|
|
||||||
|
-- * Additional lexing uitilties
|
||||||
|
|
||||||
|
-- | Capitalize first letter
|
||||||
|
capitInit s = case s of
|
||||||
|
c:cs -> toUpper c : cs
|
||||||
|
_ -> s
|
||||||
|
|
||||||
|
-- | Unquote each string wrapped in double quotes
|
||||||
|
unquote = map unq where
|
||||||
|
unq s = case s of
|
||||||
|
'"':cs@(_:_) | last cs == '"' -> init cs
|
||||||
|
_ -> s
|
||||||
|
|
||||||
|
isPunct = flip elem ".?!,:;"
|
||||||
|
isMajorPunct = flip elem ".?!"
|
||||||
|
isMinorPunct = flip elem ",:;"
|
||||||
|
isParen = flip elem "()[]{}"
|
||||||
|
isClosing = flip elem ")]}"
|
||||||
Reference in New Issue
Block a user