mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-09 13:09:33 -06:00
eliminate modules PGF.Lexing, PGF.LexingAGreek. Make PGF.Utilities an internal module in the runtime. These are not really part of the core runtime.
This commit is contained in:
4
gf.cabal
4
gf.cabal
@@ -109,9 +109,6 @@ Library
|
||||
exposed-modules:
|
||||
PGF
|
||||
PGF.Internal
|
||||
PGF.Lexing
|
||||
PGF.LexingAGreek
|
||||
PGF.Utilities
|
||||
PGF.Haskell
|
||||
|
||||
other-modules:
|
||||
@@ -137,6 +134,7 @@ Library
|
||||
PGF.VisualizeTree
|
||||
PGF.ByteCode
|
||||
PGF.OldBinary
|
||||
PGF.Utilities
|
||||
|
||||
if flag(c-runtime)
|
||||
exposed-modules: PGF2
|
||||
|
||||
@@ -219,7 +219,7 @@ commonCommands = fmap (mapCommandExec liftSIO) $ Map.fromList [
|
||||
|
||||
stringOps menv opts s = foldr (menvop . app) s (reverse opts)
|
||||
where
|
||||
app f = maybe id id (stringOp f)
|
||||
app f = maybe id id (stringOp (const False) f)
|
||||
menvop op = maybe op (\ (b,e) -> opInEnv b e op) menv
|
||||
|
||||
envFlag fs =
|
||||
|
||||
@@ -12,12 +12,12 @@
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
|
||||
module GF.Data.Utilities(module GF.Data.Utilities, module PGF.Utilities) where
|
||||
module GF.Data.Utilities(module GF.Data.Utilities) where
|
||||
|
||||
import Data.Maybe
|
||||
import Data.List
|
||||
import Control.Monad (MonadPlus(..),liftM,when)
|
||||
import PGF.Utilities
|
||||
import qualified Data.Set as Set
|
||||
|
||||
-- * functions on lists
|
||||
|
||||
@@ -190,3 +190,18 @@ joinS glue = concatS . intersperse (showString glue)
|
||||
|
||||
|
||||
|
||||
-- | Like 'Data.List.nub', but O(n log n) instead of O(n^2), since it uses a set to lookup previous things.
|
||||
-- The result list is stable (the elements are returned in the order they occur), and lazy.
|
||||
-- Requires that the list elements can be compared by Ord.
|
||||
-- Code ruthlessly taken from <http://hpaste.org/54411>
|
||||
nub' :: Ord a => [a] -> [a]
|
||||
nub' = loop Set.empty
|
||||
where loop _ [] = []
|
||||
loop seen (x : xs)
|
||||
| Set.member x seen = loop seen xs
|
||||
| otherwise = x : loop (Set.insert x seen) xs
|
||||
|
||||
|
||||
-- | Replace all occurences of an element by another element.
|
||||
replace :: Eq a => a -> a -> [a] -> [a]
|
||||
replace x y = map (\z -> if z == x then y else z)
|
||||
|
||||
@@ -2,30 +2,30 @@
|
||||
module GF.Text.Lexing (stringOp,opInEnv) where
|
||||
|
||||
import GF.Text.Transliterations
|
||||
import PGF.Lexing
|
||||
import PGF.LexingAGreek(lexAGreek,unlexAGreek,lexAGreek2) -- HL 20.2.2016
|
||||
|
||||
import Data.Char (isSpace)
|
||||
import Data.Char (isSpace,toUpper,toLower)
|
||||
import Data.List (intersperse)
|
||||
|
||||
stringOp :: String -> Maybe (String -> String)
|
||||
stringOp name = case name of
|
||||
stringOp :: (String -> Bool) -> String -> Maybe (String -> String)
|
||||
stringOp good name = case name of
|
||||
"chars" -> Just $ appLexer (filter (not . all isSpace) . map return)
|
||||
"lextext" -> Just $ appLexer lexText
|
||||
"lextext" -> Just $ appLexer (lexText good)
|
||||
"lexcode" -> Just $ appLexer lexCode
|
||||
"lexmixed" -> Just $ appLexer lexMixed
|
||||
"lexmixed" -> Just $ appLexer (lexMixed good)
|
||||
"lexgreek" -> Just $ appLexer lexAGreek
|
||||
"lexgreek2" -> Just $ appLexer lexAGreek2
|
||||
"words" -> Just $ appLexer words
|
||||
"bind" -> Just $ appUnlexer (unwords . bindTok)
|
||||
"unchars" -> Just $ appUnlexer concat
|
||||
"unlextext" -> Just $ appUnlexer (unlexText . unquote)
|
||||
"unlextext" -> Just $ appUnlexer (unlexText . unquote . bindTok)
|
||||
"unlexcode" -> Just $ appUnlexer unlexCode
|
||||
"unlexmixed" -> Just $ appUnlexer (unlexMixed . unquote)
|
||||
"unlexmixed" -> Just $ appUnlexer (unlexMixed good . unquote . bindTok)
|
||||
"unlexgreek" -> Just $ appUnlexer unlexAGreek
|
||||
"unlexnone" -> Just id
|
||||
"unlexid" -> Just id
|
||||
"unwords" -> Just $ appUnlexer unwords
|
||||
"to_html" -> Just wrapHTML
|
||||
_ -> transliterate name
|
||||
_ -> transliterate name
|
||||
|
||||
-- perform op in environments beg--end, t.ex. between "--"
|
||||
--- suboptimal implementation
|
||||
@@ -55,3 +55,395 @@ appUnlexer f = f . words
|
||||
wrapHTML :: String -> String
|
||||
wrapHTML = unlines . tag . intersperse "<br>" . lines where
|
||||
tag ss = "<html>":"<head>":"<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />":"</head>":"<body>" : ss ++ ["</body>","</html>"]
|
||||
|
||||
|
||||
-- * Text lexing
|
||||
-- | Text lexing with standard word capitalization of the first word of every sentence
|
||||
lexText :: (String -> Bool) -> String -> [String]
|
||||
lexText good = lexText' (uncapitInit good)
|
||||
|
||||
-- | Text lexing with custom treatment of the first word of every sentence.
|
||||
lexText' :: (String->String) -> String -> [String]
|
||||
lexText' uncap1 = uncap . lext where
|
||||
lext s = case s of
|
||||
c:cs | isMajorPunct c -> [c] : uncap (lext cs)
|
||||
c:cs | isMinorPunct c -> [c] : lext cs
|
||||
c:cs | isSpace c -> lext cs
|
||||
_:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lext cs
|
||||
_ -> [s]
|
||||
uncap s = case s of
|
||||
w:ws -> uncap1 w:ws
|
||||
_ -> s
|
||||
|
||||
unlexText :: [String] -> String
|
||||
unlexText = capitInit . unlext where
|
||||
unlext s = case s of
|
||||
w:[] -> w
|
||||
w:[c]:[] | isPunct c -> w ++ [c]
|
||||
w:[c]:cs | isMajorPunct c -> w ++ [c] ++ " " ++ capitInit (unlext cs)
|
||||
w:[c]:cs | isMinorPunct c -> w ++ [c] ++ " " ++ unlext cs
|
||||
w:ws -> w ++ " " ++ unlext ws
|
||||
_ -> []
|
||||
|
||||
-- | Bind tokens separated by Prelude.BIND, i.e. &+
|
||||
bindTok :: [String] -> [String]
|
||||
bindTok ws = case ws of
|
||||
w1:"&+":w2:ws -> bindTok ((w1++w2):ws)
|
||||
"&+":ws -> bindTok ws
|
||||
"&|":(c:cs):ws-> bindTok ((toUpper c:cs) : ws)
|
||||
"&|":ws -> bindTok ws
|
||||
w:ws -> w:bindTok ws
|
||||
[] -> []
|
||||
|
||||
-- * Code lexing
|
||||
|
||||
-- | Haskell lexer, usable for much code
|
||||
lexCode :: String -> [String]
|
||||
lexCode ss = case lex ss of
|
||||
[(w@(_:_),ws)] -> w : lexCode ws
|
||||
_ -> []
|
||||
|
||||
|
||||
-- * Ancient Greek lexing
|
||||
|
||||
lexTextAGreek :: String -> [String]
|
||||
lexTextAGreek s = lext s where
|
||||
lext s = case s of
|
||||
c:cs | isAGreekPunct c -> [c] : (lext cs)
|
||||
c:cs | isSpace c -> lext cs
|
||||
_:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s
|
||||
in w : lext cs
|
||||
[] -> []
|
||||
|
||||
-- Philological greek text may use vowel length indicators. Then '.' is not a sentence
|
||||
-- separator, nor is 'v. ' for vowel v. Sentence ends at 'v..' or 'c. ' with non-vowel c.
|
||||
|
||||
lexTextAGreek2 :: String -> [String]
|
||||
lexTextAGreek2 s = lext s where
|
||||
lext s = case s of
|
||||
c:cs | isAGreekPunct c -> [c] : (lext cs)
|
||||
c:cs | isSpace c -> lext cs
|
||||
_:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s
|
||||
in case cs of
|
||||
'.':'.':d:ds | isSpace d
|
||||
-> (w++['.']) : lext ('.':d:ds)
|
||||
'.':d:ds | isAGreekPunct d || isSpace d
|
||||
-> (w++['.']) : lext (d:ds)
|
||||
'.':d:ds | not (isSpace d)
|
||||
-> case lext (d:ds) of
|
||||
e:es -> (w++['.']++e) : es
|
||||
es -> (w++['.']) : es
|
||||
'.':[] -> (w++['.']) : []
|
||||
_ -> w : lext cs
|
||||
[] -> []
|
||||
|
||||
unlexTextAGreek :: [String] -> String
|
||||
unlexTextAGreek = unlext where
|
||||
unlext s = case s of
|
||||
w:[] -> w
|
||||
w:[c]:[] | isAGreekPunct c -> w ++ [c]
|
||||
w:[c]:cs | isAGreekPunct c -> w ++ [c] ++ " " ++ unlext cs
|
||||
w:ws -> w ++ " " ++ unlext ws
|
||||
[] -> []
|
||||
|
||||
isAGreekPunct = flip elem ".,;··" -- colon: first version · not in charset,
|
||||
-- second version · = 00B7 standard code point
|
||||
|
||||
-- * Text lexing and unlexing for Ancient Greek:
|
||||
-- 1. no capitalization of initial word,
|
||||
-- 2. grave/acute accent switch on final syllables of words not followed by punctuation,
|
||||
-- 3. accent move from/to support word to/from following clitic words (iterated).
|
||||
|
||||
lexAGreek :: String -> [String]
|
||||
lexAGreek = fromAGreek . lexTextAGreek
|
||||
|
||||
lexAGreek2 :: String -> [String]
|
||||
lexAGreek2 = fromAGreek . lexTextAGreek2
|
||||
|
||||
unlexAGreek :: [String] -> String
|
||||
unlexAGreek = unlexTextAGreek . toAGreek
|
||||
|
||||
-- Note: unlexAGreek does not glue punctuation with the previous word, so that short
|
||||
-- vowel indication (like a.) differs from sentence end (a .).
|
||||
|
||||
-- | normalize = change grave accent on sentence internal words to acute,
|
||||
-- and shift inherited acutes to the following enclitic (where they are
|
||||
-- visible only as shown in the list of enclitics above)
|
||||
|
||||
normalize :: String -> String
|
||||
normalize = (unlexTextAGreek . fromAGreek . lexTextAGreek)
|
||||
|
||||
fromAGreek :: [String] -> [String]
|
||||
fromAGreek s = case s of
|
||||
w:[]:vs -> w:[]:(fromAGreek vs)
|
||||
w:(v:vs) | isAGreekPunct (head v) -> w:v:(fromAGreek vs)
|
||||
w:v:vs | wasEnclitic v && wasEnclitic w ->
|
||||
getEnclitic w : fromAGreek (v:vs)
|
||||
w:v:vs | wasEnclitic v && wasProclitic w -> -- "ei)' tines*"
|
||||
getProclitic w : getEnclitic v : fromAGreek vs
|
||||
w:v:vs | wasEnclitic v && (hasEndCircum w ||
|
||||
(hasEndAcute w && hasSingleAccent w)) ->
|
||||
w : getEnclitic v : fromAGreek vs -- ok "sofoi' tines*"
|
||||
w:v:vs | wasEnclitic v && hasPrefinalAcute w ->
|
||||
w : getEnclitic v : fromAGreek vs
|
||||
w:v:vs | wasEnclitic v && hasEndAcute w -> -- ok "a)'nvrwpoi' tines*"
|
||||
dropLastAccent w : getEnclitic v : fromAGreek vs
|
||||
w:v:vs | wasEnclitic w ->
|
||||
getEnclitic w : fromAGreek (v:vs)
|
||||
w:ws -> (toAcute w) : (fromAGreek ws)
|
||||
ws -> ws
|
||||
|
||||
-- | de-normalize = change acute accent of end syllables in sentence internal
|
||||
-- (non-enclitic) words to grave accent, and move accents of enclitics to the
|
||||
-- previous word to produce ordinary ancient greek
|
||||
|
||||
denormalize :: String -> String
|
||||
denormalize = (unlexTextAGreek . toAGreek . lexTextAGreek)
|
||||
|
||||
toAGreek :: [String] -> [String]
|
||||
toAGreek s = case s of
|
||||
w:[]:vs -> w:[]:(toAGreek vs)
|
||||
w:v:vs | isAGreekPunct (head v) -> w:[]:v:(toAGreek vs) -- w:[] for following -to_ancientgreek
|
||||
w:v:vs | isEnclitic v && isEnclitic w ->
|
||||
addAcute w : toAGreek (dropAccent v:vs) -- BR 11 Anm.2
|
||||
w:v:vs | isEnclitic v && isProclitic w -> -- BR 11 a.beta
|
||||
addAcute w: (toAGreek (dropAccent v:vs))
|
||||
w:v:vs | isEnclitic v && (hasEndCircum w || hasEndAcute w) ->
|
||||
w:(toAGreek (dropAccent v:vs)) -- BR 11 a.alpha,beta
|
||||
w:v:vs | isEnclitic v && hasPrefinalAcute w ->
|
||||
w:v: toAGreek vs -- bisyllabic v keeps its accent BR 11 b.
|
||||
w:v:vs | isEnclitic v ->
|
||||
(addAcute w):(toAGreek (dropAccent v:vs)) -- BR 11 a.gamma
|
||||
w:v:vs | isEnclitic w -> w:(toAGreek (v:vs))
|
||||
w:ws -> (toGrave w) : (toAGreek ws)
|
||||
ws -> ws
|
||||
|
||||
-- | Change accent on the final syllable of a word
|
||||
|
||||
toGrave :: String -> String
|
||||
toGrave = reverse . grave . reverse where
|
||||
grave s = case s of
|
||||
'\'':cs -> '`':cs
|
||||
c:cs | isAGreekVowel c -> c:cs
|
||||
c:cs -> c: grave cs
|
||||
_ -> s
|
||||
|
||||
toAcute :: String -> String
|
||||
toAcute = reverse . acute . reverse where
|
||||
acute s = case s of
|
||||
'`':cs -> '\'':cs
|
||||
c:cs | isAGreekVowel c -> c:cs
|
||||
c:cs -> c: acute cs
|
||||
_ -> s
|
||||
|
||||
isAGreekVowel = flip elem "aeioyhw"
|
||||
|
||||
-- | Accent moves for enclitics and proclitics (atona)
|
||||
|
||||
enclitics = [
|
||||
"moy","moi","me", -- personal pronouns
|
||||
"soy","soi","se",
|
||||
"oy(","oi(","e(",
|
||||
"tis*","ti","tina'", -- indefinite pronoun
|
||||
"tino's*","tini'",
|
||||
"tine's*","tina's*",
|
||||
"tinw~n","tisi'","tisi'n",
|
||||
"poy","poi", -- indefinite adverbs
|
||||
"pove'n","pws*",
|
||||
"ph|","pote'",
|
||||
"ge","te","toi", -- particles
|
||||
"nyn","per","pw"
|
||||
-- suffix -"de"
|
||||
-- praes.indik. of fhmi', ei)mi' (except fh's*, ei)~)
|
||||
] -- and more, BR 11
|
||||
|
||||
proclitics = [
|
||||
"o(","h(","oi(","ai(", -- articles
|
||||
"e)n","ei)s*","e)x","e)k", -- prepositions
|
||||
"ei)","w(s*", -- conjunctions
|
||||
"oy)","oy)k","oy)c" -- negation
|
||||
]
|
||||
|
||||
isEnclitic = flip elem enclitics
|
||||
isProclitic = flip elem proclitics
|
||||
|
||||
-- Check if a word is an enclitic or accented enclitic and extract the enclitic
|
||||
|
||||
wasEnclitic = let unaccented = (filter (not . hasAccent) enclitics)
|
||||
++ (map dropAccent (filter hasAccent enclitics))
|
||||
accented = (filter hasAccent enclitics)
|
||||
++ map addAcute (filter (not . hasAccent) enclitics)
|
||||
in flip elem (accented ++ unaccented)
|
||||
|
||||
wasProclitic = flip elem (map addAcute proclitics)
|
||||
|
||||
getEnclitic =
|
||||
let pairs = zip (enclitics ++ (map dropAccent (filter hasAccent enclitics))
|
||||
++ (map addAcute (filter (not . hasAccent) enclitics)))
|
||||
(enclitics ++ (filter hasAccent enclitics)
|
||||
++ (filter (not . hasAccent) enclitics))
|
||||
find = \v -> lookup v pairs
|
||||
in \v -> case (find v) of
|
||||
Just x -> x
|
||||
_ -> v
|
||||
getProclitic =
|
||||
let pairs = zip (map addAcute proclitics) proclitics
|
||||
find = \v -> lookup v pairs
|
||||
in \v -> case (find v) of
|
||||
Just x -> x
|
||||
_ -> v
|
||||
|
||||
-- | Accent manipulation
|
||||
|
||||
dropAccent = reverse . drop . reverse where
|
||||
drop s = case s of
|
||||
[] -> []
|
||||
'\'':cs -> cs
|
||||
'`':cs -> cs
|
||||
'~':cs -> cs
|
||||
c:cs -> c:drop cs
|
||||
|
||||
dropLastAccent = reverse . drop . reverse where
|
||||
drop s = case s of
|
||||
[] -> []
|
||||
'\'':cs -> cs
|
||||
'`':cs -> cs
|
||||
'~':cs -> cs
|
||||
c:cs -> c:drop cs
|
||||
|
||||
addAcute :: String -> String
|
||||
addAcute = reverse . acu . reverse where
|
||||
acu w = case w of
|
||||
c:cs | c == '\'' -> c:cs
|
||||
c:cs | c == '(' -> '\'':c:cs
|
||||
c:cs | c == ')' -> '\'':c:cs
|
||||
c:cs | isAGreekVowel c -> '\'':c:cs
|
||||
c:cs -> c : acu cs
|
||||
_ -> w
|
||||
|
||||
-- | Accent checking on end syllables
|
||||
|
||||
hasEndAcute = find . reverse where
|
||||
find s = case s of
|
||||
[] -> False
|
||||
'\'':cs -> True
|
||||
'`':cs -> False
|
||||
'~':cs -> False
|
||||
c:cs | isAGreekVowel c -> False
|
||||
_:cs -> find cs
|
||||
|
||||
hasEndCircum = find . reverse where
|
||||
find s = case s of
|
||||
[] -> False
|
||||
'\'':cs -> False
|
||||
'`':cs -> False
|
||||
'~':cs -> True
|
||||
c:cs | isAGreekVowel c -> False
|
||||
_:cs -> find cs
|
||||
|
||||
hasPrefinalAcute = find . reverse where
|
||||
find s = case s of
|
||||
[] -> False
|
||||
'\'':cs -> False -- final acute
|
||||
'`':cs -> False
|
||||
'~':cs -> False
|
||||
c:d:cs | isAGreekVowel c && isAGreekVowel d -> findNext cs
|
||||
c:cs | isAGreekVowel c -> findNext cs
|
||||
_:cs -> find cs where
|
||||
findNext s = case s of
|
||||
[] -> False
|
||||
'\'':cs -> True -- prefinal acute
|
||||
'`':cs -> False
|
||||
'~':cs -> False
|
||||
c:cs | isAGreekVowel c -> False
|
||||
_:cs -> findNext cs where
|
||||
|
||||
hasSingleAccent v =
|
||||
hasAccent v && not (hasAccent (dropLastAccent v))
|
||||
|
||||
hasAccent v = case v of
|
||||
[] -> False
|
||||
c:cs -> elem c ['\'','`','~'] || hasAccent cs
|
||||
|
||||
{- Tests:
|
||||
|
||||
-- denormalization. Examples in BR 11 work:
|
||||
-}
|
||||
enclitics_expls = -- normalized
|
||||
"sofw~n tis*":"sofw~n tine's*":"sof~n tinw~n": -- a.alpha
|
||||
"sofo's tis*":"sofoi' tine's*": -- a.beta
|
||||
"ei) tis*":"ei) tine's*":
|
||||
"a)'nvrwpos* tis*":"a)'nvrwpoi tine's*": -- a.gamma
|
||||
"doy~los* tis*":"doy~loi tine's*":
|
||||
"lo'gos* tis*":"lo'goi tine's*":"lo'gwn tinw~n": -- b.
|
||||
"ei) poy tis* tina' i)'doi": -- Anm. 2.
|
||||
[]
|
||||
|
||||
|
||||
unlexCode :: [String] -> String
|
||||
unlexCode s = case s of
|
||||
w:[] -> w
|
||||
[c]:cs | isParen c -> [c] ++ unlexCode cs
|
||||
w:cs@([c]:_) | isClosing c -> w ++ unlexCode cs
|
||||
w:ws -> w ++ " " ++ unlexCode ws
|
||||
_ -> []
|
||||
|
||||
|
||||
-- | LaTeX lexer in the math mode: \ should not be separated from the next word
|
||||
|
||||
lexLatexCode :: String -> [String]
|
||||
lexLatexCode = restoreBackslash . lexCode where --- quick hack: postprocess Haskell's lex
|
||||
restoreBackslash ws = case ws of
|
||||
"\\":w:ww -> ("\\" ++ w) : restoreBackslash ww
|
||||
w:ww -> w:restoreBackslash ww
|
||||
_ -> ws
|
||||
|
||||
-- * Mixed lexing
|
||||
|
||||
-- | LaTeX style lexer, with "math" environment using Code between $...$
|
||||
lexMixed :: (String -> Bool) -> String -> [String]
|
||||
lexMixed good = concat . alternate False [] where
|
||||
alternate env t s = case s of
|
||||
'$':cs -> lex env (reverse t) : ["$"] : alternate (not env) [] cs
|
||||
'\\':c:cs | elem c "()[]" -> lex env (reverse t) : [['\\',c]] : alternate (not env) [] cs
|
||||
c:cs -> alternate env (c:t) cs
|
||||
_ -> [lex env (reverse t)]
|
||||
lex env = if env then lexLatexCode else lexText good
|
||||
|
||||
unlexMixed :: (String -> Bool) -> [String] -> String
|
||||
unlexMixed good = capitInit . concat . alternate False where
|
||||
alternate env s = case s of
|
||||
_:_ -> case break (flip elem ["$","\\[","\\]","\\(","\\)"]) s of
|
||||
(t,[]) -> unlex env t : []
|
||||
(t,c:m) -> unlex env t : sep env c m : alternate (not env) m
|
||||
_ -> []
|
||||
unlex env = if env then unlexCode else (uncapitInit good . unlexText)
|
||||
sep env c m = case (m,env) of
|
||||
([p]:_,True) | isPunct p -> c -- closing $ glued to next punct
|
||||
(_, True) -> c ++ " " -- closing $ otherwise separated by space from what follows
|
||||
_ -> " " ++ c -- put space before opening $
|
||||
|
||||
-- * Additional lexing uitilties
|
||||
|
||||
-- | Capitalize first letter
|
||||
capitInit s = case s of
|
||||
c:cs -> toUpper c : cs
|
||||
_ -> s
|
||||
|
||||
-- | Uncapitalize first letter
|
||||
uncapitInit good s =
|
||||
case s of
|
||||
c:cs | not (good s) -> toLower c : cs
|
||||
_ -> s
|
||||
|
||||
-- | Unquote each string wrapped in double quotes
|
||||
unquote = map unq where
|
||||
unq s = case s of
|
||||
'"':cs@(_:_) | last cs == '"' -> init cs
|
||||
_ -> s
|
||||
|
||||
isPunct = flip elem ".?!,:;"
|
||||
isMajorPunct = flip elem ".?!"
|
||||
isMinorPunct = flip elem ",:;"
|
||||
isParen = flip elem "()[]{}"
|
||||
isClosing = flip elem ")]}"
|
||||
|
||||
3
src/runtime/haskell-bind/PGF.hs
Normal file
3
src/runtime/haskell-bind/PGF.hs
Normal file
@@ -0,0 +1,3 @@
|
||||
module PGF(module PGF2) where
|
||||
|
||||
import PGF2
|
||||
1
src/runtime/haskell-bind/PGF/Internal.hs
Normal file
1
src/runtime/haskell-bind/PGF/Internal.hs
Normal file
@@ -0,0 +1 @@
|
||||
module PGF.Internal where
|
||||
@@ -1,61 +0,0 @@
|
||||
-- | pgf-hsbind-trans: A simple batch translator to illustrate the use of the Haskell binding
|
||||
-- to the C implementation of the PGF run-time system.
|
||||
--
|
||||
-- AR April 2015 modified from pgf-shell
|
||||
|
||||
import PGF2
|
||||
import PGF.Lexing (lexText')
|
||||
|
||||
import Data.Char(isSpace,toLower)
|
||||
import Data.List (nub)
|
||||
import System.Environment
|
||||
import qualified Data.Map as Map
|
||||
|
||||
maxNumTrees :: Int
|
||||
maxNumTrees = 1
|
||||
|
||||
maxNumVariants :: Int
|
||||
maxNumVariants = 1
|
||||
|
||||
main = getPGF =<< getArgs
|
||||
|
||||
getPGF args = case args of
|
||||
[path,from,to,cat,mxt,mxv] -> pgfTrans from to (Just cat) (read mxt, read mxv) =<< readPGF path
|
||||
[path,from,to] -> pgfTrans from to Nothing (maxNumTrees,maxNumVariants) =<< readPGF path
|
||||
_ -> putStrLn "Usage: pgf-hsbind-trans <path to pgf> <from-lang> <to-lang> [<cat> <#trees> <#variants>]"
|
||||
|
||||
pgfTrans from to mcat mx pgf = do
|
||||
cfrom <- getConcr' pgf from
|
||||
cto <- getConcr' pgf to
|
||||
let cat = maybe (startCat pgf) id mcat
|
||||
interact (unlines . map (translates pgf cfrom cto cat mx) . lines)
|
||||
|
||||
getConcr' pgf lang =
|
||||
maybe (fail $ "Concrete syntax not found: "++show lang) return $
|
||||
Map.lookup lang (languages pgf)
|
||||
|
||||
linearizeAndShow gr mxv (t,p) = [show t]++take mxv (nub (map unstar (linearizeAll gr t)))++[show p]
|
||||
where
|
||||
unstar s = case s of
|
||||
'*':' ':cs -> cs
|
||||
_ -> s
|
||||
|
||||
translates pgf cfrom cto cat (mxt,mxv) s0 =
|
||||
let s1 = lextext cfrom s0
|
||||
(s,p) = case reverse s1 of c:_ | elem c ".?!" -> (init s1,[c]) ; _ -> (s1,[]) -- separate final punctuation
|
||||
in
|
||||
case cparse pgf cfrom cat s of
|
||||
Left tok -> unlines [s,"Parse error: "++tok]
|
||||
Right ts -> unlines $ (("> "++ s):) $ take mxt $ map ((++p) . unlines . linearizeAndShow cto mxv) ts -- append punctuation
|
||||
|
||||
cparse pgf concr cat input = parseWithHeuristics concr cat input (-1) callbacks where
|
||||
callbacks = maybe [] cb $ lookup "App" literalCallbacks
|
||||
cb fs = [(cat,f pgf ("TranslateEng",concr) input)|(cat,f)<-fs]
|
||||
|
||||
lextext cnc = unwords . lexText' (\w -> case lookupMorpho cnc w of
|
||||
_:_ -> w
|
||||
_ -> case lookupMorpho cnc (uncapitInit w) of
|
||||
[] -> w
|
||||
_ -> uncapitInit w
|
||||
)
|
||||
where uncapitInit (c:cs) = toLower c : cs
|
||||
@@ -14,7 +14,9 @@ extra-source-files: README
|
||||
cabal-version: >=1.10
|
||||
|
||||
library
|
||||
exposed-modules: PGF2, SG
|
||||
exposed-modules: PGF2, SG,
|
||||
-- backwards compatibility API:
|
||||
PGF, PGF.Internal
|
||||
other-modules: PGF2.FFI, PGF2.Expr, PGF2.Type, SG.FFI
|
||||
build-depends: base >=4.3, bytestring >=0.9,
|
||||
containers, pretty
|
||||
|
||||
@@ -1,115 +0,0 @@
|
||||
module PGF.Lexing where
|
||||
import Data.Char(isSpace,toLower,toUpper)
|
||||
|
||||
-- * Text lexing
|
||||
-- | Text lexing with standard word capitalization of the first word of every sentence
|
||||
lexText :: String -> [String]
|
||||
lexText = lexText' uncapitInit
|
||||
|
||||
-- | Text lexing with custom treatment of the first word of every sentence.
|
||||
lexText' :: (String->String) -> String -> [String]
|
||||
lexText' uncap1 = uncap . lext where
|
||||
lext s = case s of
|
||||
c:cs | isMajorPunct c -> [c] : uncap (lext cs)
|
||||
c:cs | isMinorPunct c -> [c] : lext cs
|
||||
c:cs | isSpace c -> lext cs
|
||||
_:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lext cs
|
||||
_ -> [s]
|
||||
uncap s = case s of
|
||||
w:ws -> uncap1 w:ws
|
||||
_ -> s
|
||||
|
||||
unlexText :: [String] -> String
|
||||
unlexText = capitInit . unlext where
|
||||
unlext s = case s of
|
||||
w:[] -> w
|
||||
w:[c]:[] | isPunct c -> w ++ [c]
|
||||
w:[c]:cs | isMajorPunct c -> w ++ [c] ++ " " ++ capitInit (unlext cs)
|
||||
w:[c]:cs | isMinorPunct c -> w ++ [c] ++ " " ++ unlext cs
|
||||
w:ws -> w ++ " " ++ unlext ws
|
||||
_ -> []
|
||||
|
||||
-- | Bind tokens separated by Prelude.BIND, i.e. &+
|
||||
bindTok :: [String] -> [String]
|
||||
bindTok ws = case ws of
|
||||
w1:"&+":w2:ws -> bindTok ((w1++w2):ws)
|
||||
"&+":ws -> bindTok ws
|
||||
"&|":(c:cs):ws-> bindTok ((toUpper c:cs) : ws)
|
||||
"&|":ws -> bindTok ws
|
||||
w:ws -> w:bindTok ws
|
||||
[] -> []
|
||||
|
||||
-- * Code lexing
|
||||
|
||||
-- | Haskell lexer, usable for much code
|
||||
lexCode :: String -> [String]
|
||||
lexCode ss = case lex ss of
|
||||
[(w@(_:_),ws)] -> w : lexCode ws
|
||||
_ -> []
|
||||
|
||||
unlexCode :: [String] -> String
|
||||
unlexCode s = case s of
|
||||
w:[] -> w
|
||||
[c]:cs | isParen c -> [c] ++ unlexCode cs
|
||||
w:cs@([c]:_) | isClosing c -> w ++ unlexCode cs
|
||||
w:ws -> w ++ " " ++ unlexCode ws
|
||||
_ -> []
|
||||
|
||||
|
||||
-- | LaTeX lexer in the math mode: \ should not be separated from the next word
|
||||
|
||||
lexLatexCode :: String -> [String]
|
||||
lexLatexCode = restoreBackslash . lexCode where --- quick hack: postprocess Haskell's lex
|
||||
restoreBackslash ws = case ws of
|
||||
"\\":w:ww -> ("\\" ++ w) : restoreBackslash ww
|
||||
w:ww -> w:restoreBackslash ww
|
||||
_ -> ws
|
||||
|
||||
-- * Mixed lexing
|
||||
|
||||
-- | LaTeX style lexer, with "math" environment using Code between $...$
|
||||
lexMixed :: String -> [String]
|
||||
lexMixed = concat . alternate False [] where
|
||||
alternate env t s = case s of
|
||||
'$':cs -> lex env (reverse t) : ["$"] : alternate (not env) [] cs
|
||||
'\\':c:cs | elem c "()[]" -> lex env (reverse t) : [['\\',c]] : alternate (not env) [] cs
|
||||
c:cs -> alternate env (c:t) cs
|
||||
_ -> [lex env (reverse t)]
|
||||
lex env = if env then lexLatexCode else lexText
|
||||
|
||||
unlexMixed :: [String] -> String
|
||||
unlexMixed = capitInit . concat . alternate False where
|
||||
alternate env s = case s of
|
||||
_:_ -> case break (flip elem ["$","\\[","\\]","\\(","\\)"]) s of
|
||||
(t,[]) -> unlex env t : []
|
||||
(t,c:m) -> unlex env t : sep env c m : alternate (not env) m
|
||||
_ -> []
|
||||
unlex env = if env then unlexCode else (uncapitInit . unlexText)
|
||||
sep env c m = case (m,env) of
|
||||
([p]:_,True) | isPunct p -> c -- closing $ glued to next punct
|
||||
(_, True) -> c ++ " " -- closing $ otherwise separated by space from what follows
|
||||
_ -> " " ++ c -- put space before opening $
|
||||
|
||||
-- * Additional lexing uitilties
|
||||
|
||||
-- | Capitalize first letter
|
||||
capitInit s = case s of
|
||||
c:cs -> toUpper c : cs
|
||||
_ -> s
|
||||
|
||||
-- | Uncapitalize first letter
|
||||
uncapitInit s = case s of
|
||||
c:cs -> toLower c : cs
|
||||
_ -> s
|
||||
|
||||
-- | Unquote each string wrapped in double quotes
|
||||
unquote = map unq where
|
||||
unq s = case s of
|
||||
'"':cs@(_:_) | last cs == '"' -> init cs
|
||||
_ -> s
|
||||
|
||||
isPunct = flip elem ".?!,:;"
|
||||
isMajorPunct = flip elem ".?!"
|
||||
isMinorPunct = flip elem ",:;"
|
||||
isParen = flip elem "()[]{}"
|
||||
isClosing = flip elem ")]}"
|
||||
@@ -1,310 +0,0 @@
|
||||
module PGF.LexingAGreek where -- HL 2a1.2.2016
|
||||
import Data.Char(isSpace)
|
||||
|
||||
-- * Text lexing without word capitalization of the first word of every sentence.
|
||||
-- Greek sentences in (transliterated) texts don't start with capital character.
|
||||
|
||||
-- Ordinary greek text does not have vowel length indicators. We then use '.' as
|
||||
-- a sentence separator.
|
||||
|
||||
lexTextAGreek :: String -> [String]
|
||||
lexTextAGreek s = lext s where
|
||||
lext s = case s of
|
||||
c:cs | isAGreekPunct c -> [c] : (lext cs)
|
||||
c:cs | isSpace c -> lext cs
|
||||
_:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s
|
||||
in w : lext cs
|
||||
[] -> []
|
||||
|
||||
-- Philological greek text may use vowel length indicators. Then '.' is not a sentence
|
||||
-- separator, nor is 'v. ' for vowel v. Sentence ends at 'v..' or 'c. ' with non-vowel c.
|
||||
|
||||
lexTextAGreek2 :: String -> [String]
|
||||
lexTextAGreek2 s = lext s where
|
||||
lext s = case s of
|
||||
c:cs | isAGreekPunct c -> [c] : (lext cs)
|
||||
c:cs | isSpace c -> lext cs
|
||||
_:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s
|
||||
in case cs of
|
||||
'.':'.':d:ds | isSpace d
|
||||
-> (w++['.']) : lext ('.':d:ds)
|
||||
'.':d:ds | isAGreekPunct d || isSpace d
|
||||
-> (w++['.']) : lext (d:ds)
|
||||
'.':d:ds | not (isSpace d)
|
||||
-> case lext (d:ds) of
|
||||
e:es -> (w++['.']++e) : es
|
||||
es -> (w++['.']) : es
|
||||
'.':[] -> (w++['.']) : []
|
||||
_ -> w : lext cs
|
||||
[] -> []
|
||||
|
||||
unlexTextAGreek :: [String] -> String
|
||||
unlexTextAGreek = unlext where
|
||||
unlext s = case s of
|
||||
w:[] -> w
|
||||
w:[c]:[] | isAGreekPunct c -> w ++ [c]
|
||||
w:[c]:cs | isAGreekPunct c -> w ++ [c] ++ " " ++ unlext cs
|
||||
w:ws -> w ++ " " ++ unlext ws
|
||||
[] -> []
|
||||
|
||||
isAGreekPunct = flip elem ".,;··" -- colon: first version · not in charset,
|
||||
-- second version · = 00B7 standard code point
|
||||
|
||||
-- * Text lexing and unlexing for Ancient Greek:
|
||||
-- 1. no capitalization of initial word,
|
||||
-- 2. grave/acute accent switch on final syllables of words not followed by punctuation,
|
||||
-- 3. accent move from/to support word to/from following clitic words (iterated).
|
||||
|
||||
lexAGreek :: String -> [String]
|
||||
lexAGreek = fromAGreek . lexTextAGreek
|
||||
|
||||
lexAGreek2 :: String -> [String]
|
||||
lexAGreek2 = fromAGreek . lexTextAGreek2
|
||||
|
||||
unlexAGreek :: [String] -> String
|
||||
unlexAGreek = unlexTextAGreek . toAGreek
|
||||
|
||||
-- Note: unlexAGreek does not glue punctuation with the previous word, so that short
|
||||
-- vowel indication (like a.) differs from sentence end (a .).
|
||||
|
||||
-- | normalize = change grave accent on sentence internal words to acute,
|
||||
-- and shift inherited acutes to the following enclitic (where they are
|
||||
-- visible only as shown in the list of enclitics above)
|
||||
|
||||
normalize :: String -> String
|
||||
normalize = (unlexTextAGreek . fromAGreek . lexTextAGreek)
|
||||
|
||||
fromAGreek :: [String] -> [String]
|
||||
fromAGreek s = case s of
|
||||
w:[]:vs -> w:[]:(fromAGreek vs)
|
||||
w:(v:vs) | isAGreekPunct (head v) -> w:v:(fromAGreek vs)
|
||||
w:v:vs | wasEnclitic v && wasEnclitic w ->
|
||||
getEnclitic w : fromAGreek (v:vs)
|
||||
w:v:vs | wasEnclitic v && wasProclitic w -> -- "ei)' tines*"
|
||||
getProclitic w : getEnclitic v : fromAGreek vs
|
||||
w:v:vs | wasEnclitic v && (hasEndCircum w ||
|
||||
(hasEndAcute w && hasSingleAccent w)) ->
|
||||
w : getEnclitic v : fromAGreek vs -- ok "sofoi' tines*"
|
||||
w:v:vs | wasEnclitic v && hasPrefinalAcute w ->
|
||||
w : getEnclitic v : fromAGreek vs
|
||||
w:v:vs | wasEnclitic v && hasEndAcute w -> -- ok "a)'nvrwpoi' tines*"
|
||||
dropLastAccent w : getEnclitic v : fromAGreek vs
|
||||
w:v:vs | wasEnclitic w ->
|
||||
getEnclitic w : fromAGreek (v:vs)
|
||||
w:ws -> (toAcute w) : (fromAGreek ws)
|
||||
ws -> ws
|
||||
|
||||
-- | de-normalize = change acute accent of end syllables in sentence internal
|
||||
-- (non-enclitic) words to grave accent, and move accents of enclitics to the
|
||||
-- previous word to produce ordinary ancient greek
|
||||
|
||||
denormalize :: String -> String
|
||||
denormalize = (unlexTextAGreek . toAGreek . lexTextAGreek)
|
||||
|
||||
toAGreek :: [String] -> [String]
|
||||
toAGreek s = case s of
|
||||
w:[]:vs -> w:[]:(toAGreek vs)
|
||||
w:v:vs | isAGreekPunct (head v) -> w:[]:v:(toAGreek vs) -- w:[] for following -to_ancientgreek
|
||||
w:v:vs | isEnclitic v && isEnclitic w ->
|
||||
addAcute w : toAGreek (dropAccent v:vs) -- BR 11 Anm.2
|
||||
w:v:vs | isEnclitic v && isProclitic w -> -- BR 11 a.beta
|
||||
addAcute w: (toAGreek (dropAccent v:vs))
|
||||
w:v:vs | isEnclitic v && (hasEndCircum w || hasEndAcute w) ->
|
||||
w:(toAGreek (dropAccent v:vs)) -- BR 11 a.alpha,beta
|
||||
w:v:vs | isEnclitic v && hasPrefinalAcute w ->
|
||||
w:v: toAGreek vs -- bisyllabic v keeps its accent BR 11 b.
|
||||
w:v:vs | isEnclitic v ->
|
||||
(addAcute w):(toAGreek (dropAccent v:vs)) -- BR 11 a.gamma
|
||||
w:v:vs | isEnclitic w -> w:(toAGreek (v:vs))
|
||||
w:ws -> (toGrave w) : (toAGreek ws)
|
||||
ws -> ws
|
||||
|
||||
-- | Change accent on the final syllable of a word
|
||||
|
||||
toGrave :: String -> String
|
||||
toGrave = reverse . grave . reverse where
|
||||
grave s = case s of
|
||||
'\'':cs -> '`':cs
|
||||
c:cs | isAGreekVowel c -> c:cs
|
||||
c:cs -> c: grave cs
|
||||
_ -> s
|
||||
|
||||
toAcute :: String -> String
|
||||
toAcute = reverse . acute . reverse where
|
||||
acute s = case s of
|
||||
'`':cs -> '\'':cs
|
||||
c:cs | isAGreekVowel c -> c:cs
|
||||
c:cs -> c: acute cs
|
||||
_ -> s
|
||||
|
||||
isAGreekVowel = flip elem "aeioyhw"
|
||||
|
||||
-- | Accent moves for enclitics and proclitics (atona)
|
||||
|
||||
enclitics = [
|
||||
"moy","moi","me", -- personal pronouns
|
||||
"soy","soi","se",
|
||||
"oy(","oi(","e(",
|
||||
"tis*","ti","tina'", -- indefinite pronoun
|
||||
"tino's*","tini'",
|
||||
"tine's*","tina's*",
|
||||
"tinw~n","tisi'","tisi'n",
|
||||
"poy","poi", -- indefinite adverbs
|
||||
"pove'n","pws*",
|
||||
"ph|","pote'",
|
||||
"ge","te","toi", -- particles
|
||||
"nyn","per","pw"
|
||||
-- suffix -"de"
|
||||
-- praes.indik. of fhmi', ei)mi' (except fh's*, ei)~)
|
||||
] -- and more, BR 11
|
||||
|
||||
proclitics = [
|
||||
"o(","h(","oi(","ai(", -- articles
|
||||
"e)n","ei)s*","e)x","e)k", -- prepositions
|
||||
"ei)","w(s*", -- conjunctions
|
||||
"oy)","oy)k","oy)c" -- negation
|
||||
]
|
||||
|
||||
isEnclitic = flip elem enclitics
|
||||
isProclitic = flip elem proclitics
|
||||
|
||||
-- Check if a word is an enclitic or accented enclitic and extract the enclitic
|
||||
|
||||
wasEnclitic = let unaccented = (filter (not . hasAccent) enclitics)
|
||||
++ (map dropAccent (filter hasAccent enclitics))
|
||||
accented = (filter hasAccent enclitics)
|
||||
++ map addAcute (filter (not . hasAccent) enclitics)
|
||||
in flip elem (accented ++ unaccented)
|
||||
|
||||
wasProclitic = flip elem (map addAcute proclitics)
|
||||
|
||||
getEnclitic =
|
||||
let pairs = zip (enclitics ++ (map dropAccent (filter hasAccent enclitics))
|
||||
++ (map addAcute (filter (not . hasAccent) enclitics)))
|
||||
(enclitics ++ (filter hasAccent enclitics)
|
||||
++ (filter (not . hasAccent) enclitics))
|
||||
find = \v -> lookup v pairs
|
||||
in \v -> case (find v) of
|
||||
Just x -> x
|
||||
_ -> v
|
||||
getProclitic =
|
||||
let pairs = zip (map addAcute proclitics) proclitics
|
||||
find = \v -> lookup v pairs
|
||||
in \v -> case (find v) of
|
||||
Just x -> x
|
||||
_ -> v
|
||||
|
||||
-- | Accent manipulation
|
||||
|
||||
dropAccent = reverse . drop . reverse where
|
||||
drop s = case s of
|
||||
[] -> []
|
||||
'\'':cs -> cs
|
||||
'`':cs -> cs
|
||||
'~':cs -> cs
|
||||
c:cs -> c:drop cs
|
||||
|
||||
dropLastAccent = reverse . drop . reverse where
|
||||
drop s = case s of
|
||||
[] -> []
|
||||
'\'':cs -> cs
|
||||
'`':cs -> cs
|
||||
'~':cs -> cs
|
||||
c:cs -> c:drop cs
|
||||
|
||||
addAcute :: String -> String
|
||||
addAcute = reverse . acu . reverse where
|
||||
acu w = case w of
|
||||
c:cs | c == '\'' -> c:cs
|
||||
c:cs | c == '(' -> '\'':c:cs
|
||||
c:cs | c == ')' -> '\'':c:cs
|
||||
c:cs | isAGreekVowel c -> '\'':c:cs
|
||||
c:cs -> c : acu cs
|
||||
_ -> w
|
||||
|
||||
-- | Accent checking on end syllables
|
||||
|
||||
hasEndAcute = find . reverse where
|
||||
find s = case s of
|
||||
[] -> False
|
||||
'\'':cs -> True
|
||||
'`':cs -> False
|
||||
'~':cs -> False
|
||||
c:cs | isAGreekVowel c -> False
|
||||
_:cs -> find cs
|
||||
|
||||
hasEndCircum = find . reverse where
|
||||
find s = case s of
|
||||
[] -> False
|
||||
'\'':cs -> False
|
||||
'`':cs -> False
|
||||
'~':cs -> True
|
||||
c:cs | isAGreekVowel c -> False
|
||||
_:cs -> find cs
|
||||
|
||||
hasPrefinalAcute = find . reverse where
|
||||
find s = case s of
|
||||
[] -> False
|
||||
'\'':cs -> False -- final acute
|
||||
'`':cs -> False
|
||||
'~':cs -> False
|
||||
c:d:cs | isAGreekVowel c && isAGreekVowel d -> findNext cs
|
||||
c:cs | isAGreekVowel c -> findNext cs
|
||||
_:cs -> find cs where
|
||||
findNext s = case s of
|
||||
[] -> False
|
||||
'\'':cs -> True -- prefinal acute
|
||||
'`':cs -> False
|
||||
'~':cs -> False
|
||||
c:cs | isAGreekVowel c -> False
|
||||
_:cs -> findNext cs where
|
||||
|
||||
hasSingleAccent v =
|
||||
hasAccent v && not (hasAccent (dropLastAccent v))
|
||||
|
||||
hasAccent v = case v of
|
||||
[] -> False
|
||||
c:cs -> elem c ['\'','`','~'] || hasAccent cs
|
||||
|
||||
{- Tests:
|
||||
|
||||
-- denormalization. Examples in BR 11 work:
|
||||
-}
|
||||
enclitics_expls = -- normalized
|
||||
"sofw~n tis*":"sofw~n tine's*":"sof~n tinw~n": -- a.alpha
|
||||
"sofo's tis*":"sofoi' tine's*": -- a.beta
|
||||
"ei) tis*":"ei) tine's*":
|
||||
"a)'nvrwpos* tis*":"a)'nvrwpoi tine's*": -- a.gamma
|
||||
"doy~los* tis*":"doy~loi tine's*":
|
||||
"lo'gos* tis*":"lo'goi tine's*":"lo'gwn tinw~n": -- b.
|
||||
"ei) poy tis* tina' i)'doi": -- Anm. 2.
|
||||
[]
|
||||
{-
|
||||
test = map denormalize enclitics_expls
|
||||
|
||||
*PGF.LexingAGreek> test
|
||||
["sofw~n tis*","sofw~n tines*","sof~n tinwn",
|
||||
"sofo's tis*","sofoi' tines*",
|
||||
"ei)' tis*","ei)' tines*",
|
||||
"a)'nvrwpo's* tis*","a)'nvrwpoi' tines*",
|
||||
"doy~lo's* tis*","doy~loi' tines*",
|
||||
"lo'gos* tis* ","lo'goi tine's*","lo'gwn tinw~n ",
|
||||
"ei)' poy' ti's* tina i)'doi"]
|
||||
|
||||
-- normalization:
|
||||
|
||||
*PGF.LexingAGreek> map normalize test
|
||||
["sofw~n tis*","sofw~n tine's*","sof~n tinw~n",
|
||||
"sofo's tis*","sofoi' tine's*",
|
||||
"ei) tis*","ei) tine's*",
|
||||
"a)'nvrwpos* tis*","a)'nvrwpoi tine's*",
|
||||
"doy~los* tis*","doy~loi tine's*",
|
||||
"lo'gos* tis*","lo'goi tine's*","lo'gwn tinw~n",
|
||||
"ei) poy tis* tina' i)'doi"]
|
||||
|
||||
*PGF.LexingAGreek> map (normalize . denormalize) enclitics_expls == enclitics_expls
|
||||
True
|
||||
*PGF.LexingAGreek> map (denormalize . normalize) test == test
|
||||
True
|
||||
|
||||
-}
|
||||
@@ -50,9 +50,6 @@ Library
|
||||
exposed-modules:
|
||||
PGF
|
||||
PGF.Internal
|
||||
PGF.Lexing
|
||||
PGF.LexingAGreek
|
||||
PGF.Utilities
|
||||
PGF.Haskell
|
||||
|
||||
other-modules:
|
||||
@@ -78,3 +75,4 @@ Library
|
||||
PGF.VisualizeTree
|
||||
PGF.ByteCode
|
||||
PGF.OldBinary
|
||||
PGF.Utilities
|
||||
|
||||
@@ -4,8 +4,8 @@ module PGFService(cgiMain,cgiMain',getPath,
|
||||
Caches,pgfCache,newPGFCache,flushPGFCache,listPGFCache) where
|
||||
|
||||
import PGF (PGF,Labels,CncLabels)
|
||||
import GF.Text.Lexing
|
||||
import qualified PGF
|
||||
import PGF.Lexing
|
||||
import Cache
|
||||
import CGIUtils(outputJSONP,outputPlain,outputHTML,outputText,
|
||||
outputBinary,outputBinary',
|
||||
@@ -272,8 +272,11 @@ cpgfMain qsem command (t,(pgf,pc)) =
|
||||
maybe (Left ("["++w++"]")) Right $
|
||||
msum [parse1 w,parse1 ow,morph w,morph ow]
|
||||
where
|
||||
ow = if w==lw then capitInit w else lw
|
||||
lw = uncapitInit w
|
||||
ow = case w of
|
||||
c:cs | isLower c -> toUpper c : cs
|
||||
| isUpper c -> toLower c : cs
|
||||
s -> s
|
||||
|
||||
parse1 = either (const Nothing) (fmap fst . listToMaybe) .
|
||||
C.parse concr cat
|
||||
morph w = listToMaybe
|
||||
@@ -293,7 +296,7 @@ cpgfMain qsem command (t,(pgf,pc)) =
|
||||
from1 = maybe (missing "from") return =<< from'
|
||||
from' = getLang "from"
|
||||
|
||||
to = (,) # getLangs "to" % unlexerC
|
||||
to = (,) # getLangs "to" % unlexer (const False)
|
||||
|
||||
getLangs = getLangs' readLang
|
||||
getLang = getLang' readLang
|
||||
@@ -308,8 +311,7 @@ cpgfMain qsem command (t,(pgf,pc)) =
|
||||
let t = C.readExpr s
|
||||
maybe (badRequest "bad tree" s) return t
|
||||
|
||||
--c_lexer concr = lexer
|
||||
c_lexer concr = ilexer (not . null . C.lookupMorpho concr)
|
||||
c_lexer concr = lexer (not . null . C.lookupMorpho concr)
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
@@ -338,62 +340,29 @@ instance ToATree C.Expr where
|
||||
--------------------------------------------------------------------------------
|
||||
-- * Lexing
|
||||
|
||||
-- | Lexers with a text lexer that tries to be a more clever with the first word
|
||||
ilexer good = lexer' uncap
|
||||
where
|
||||
uncap s = case span isUpper s of
|
||||
([c],r) | not (good s) -> toLower c:r
|
||||
_ -> s
|
||||
|
||||
-- | Standard lexers
|
||||
lexer = lexer' uncapitInit
|
||||
|
||||
lexer' uncap = maybe (return id) lexerfun =<< getInput "lexer"
|
||||
lexer good = maybe (return id) lexerfun =<< getInput "lexer"
|
||||
where
|
||||
lexerfun name =
|
||||
case name of
|
||||
"text" -> return (unwords . lexText' uncap)
|
||||
"code" -> return (unwords . lexCode)
|
||||
"mixed" -> return (unwords . lexMixed)
|
||||
_ -> badRequest "Unknown lexer" name
|
||||
case stringOp good ("lex"++name) of
|
||||
Just fn -> return fn
|
||||
Nothing -> badRequest "Unknown lexer" name
|
||||
|
||||
|
||||
type Unlexer = String->String
|
||||
|
||||
-- | Unlexing for the C runtime system, &+ is already applied
|
||||
unlexerC :: CGI Unlexer
|
||||
unlexerC = maybe (return id) unlexerfun =<< getInput "unlexer"
|
||||
unlexer :: (String -> Bool) -> CGI Unlexer
|
||||
unlexer good = maybe (return id) unlexerfun =<< getInput "unlexer"
|
||||
where
|
||||
unlexerfun name =
|
||||
case name of
|
||||
"text" -> return (unlexText' . words)
|
||||
"code" -> return (unlexCode . words)
|
||||
"mixed" -> return (unlexMixed . words)
|
||||
"none" -> return id
|
||||
"id" -> return id
|
||||
_ -> badRequest "Unknown lexer" name
|
||||
|
||||
-- | Unlex text, skipping the quality marker used by the App grammar
|
||||
unlexText' ("+":ws) = "+ "++unlexText ws
|
||||
unlexText' ("*":ws) = "* "++unlexText ws
|
||||
unlexText' ws = unlexText ws
|
||||
|
||||
-- | Unlexing for the Haskell run-time, applying the &+ operator first
|
||||
unlexerH :: CGI Unlexer
|
||||
unlexerH = maybe (return doBind) unlexerfun =<< getInput "unlexer"
|
||||
where
|
||||
unlexerfun name =
|
||||
case name of
|
||||
"text" -> return (unlexText' . bind)
|
||||
"code" -> return (unlexCode . bind)
|
||||
"mixed" -> return (unlexMixed . bind)
|
||||
"none" -> return id
|
||||
"id" -> return id
|
||||
"bind" -> return doBind
|
||||
_ -> badRequest "Unknown lexer" name
|
||||
|
||||
doBind = unwords . bind
|
||||
bind = bindTok . words
|
||||
case stringOp good ("unlex"++name) of
|
||||
Just fn -> return (fn . cleanMarker)
|
||||
Nothing -> badRequest "Unknown unlexer" name
|
||||
|
||||
cleanMarker ('+':cs) = cs
|
||||
cleanMarker ('*':cs) = cs
|
||||
cleanMarker cs = cs
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
-- * Haskell run-time functionality
|
||||
@@ -431,8 +400,8 @@ pgfMain lcs@(alc,clc) path command tpgf@(t,pgf) =
|
||||
inp <- textInput
|
||||
return (fr,lex inp)
|
||||
|
||||
mlexer Nothing = lexer
|
||||
mlexer (Just lang) = ilexer (PGF.isInMorpho morpho)
|
||||
mlexer Nothing = lexer (const False)
|
||||
mlexer (Just lang) = lexer (PGF.isInMorpho morpho)
|
||||
where morpho = PGF.buildMorpho pgf lang
|
||||
|
||||
tree :: CGI PGF.Tree
|
||||
@@ -489,7 +458,7 @@ pgfMain lcs@(alc,clc) path command tpgf@(t,pgf) =
|
||||
from = getLang "from"
|
||||
|
||||
to1 = maybe (missing "to") return =<< getLang "to"
|
||||
to = (,) # getLangs "to" % unlexerH
|
||||
to = (,) # getLangs "to" % unlexer (const False)
|
||||
|
||||
getLangs = getLangs' readLang
|
||||
getLang = getLang' readLang
|
||||
|
||||
Reference in New Issue
Block a user