eliminate modules PGF.Lexing, PGF.LexingAGreek. Make PGF.Utilities an internal module in the runtime. These are not really part of the core runtime.

This commit is contained in:
Krasimir Angelov
2017-09-04 11:43:37 +02:00
parent cae52bb9af
commit 1f908fa7bf
12 changed files with 453 additions and 561 deletions

View File

@@ -0,0 +1,3 @@
module PGF(module PGF2) where
import PGF2

View File

@@ -0,0 +1 @@
module PGF.Internal where

View File

@@ -1,61 +0,0 @@
-- | pgf-hsbind-trans: A simple batch translator to illustrate the use of the Haskell binding
-- to the C implementation of the PGF run-time system.
--
-- AR April 2015 modified from pgf-shell
import PGF2
import PGF.Lexing (lexText')
import Data.Char(isSpace,toLower)
import Data.List (nub)
import System.Environment
import qualified Data.Map as Map
maxNumTrees :: Int
maxNumTrees = 1
maxNumVariants :: Int
maxNumVariants = 1
main = getPGF =<< getArgs
getPGF args = case args of
[path,from,to,cat,mxt,mxv] -> pgfTrans from to (Just cat) (read mxt, read mxv) =<< readPGF path
[path,from,to] -> pgfTrans from to Nothing (maxNumTrees,maxNumVariants) =<< readPGF path
_ -> putStrLn "Usage: pgf-hsbind-trans <path to pgf> <from-lang> <to-lang> [<cat> <#trees> <#variants>]"
pgfTrans from to mcat mx pgf = do
cfrom <- getConcr' pgf from
cto <- getConcr' pgf to
let cat = maybe (startCat pgf) id mcat
interact (unlines . map (translates pgf cfrom cto cat mx) . lines)
getConcr' pgf lang =
maybe (fail $ "Concrete syntax not found: "++show lang) return $
Map.lookup lang (languages pgf)
linearizeAndShow gr mxv (t,p) = [show t]++take mxv (nub (map unstar (linearizeAll gr t)))++[show p]
where
unstar s = case s of
'*':' ':cs -> cs
_ -> s
translates pgf cfrom cto cat (mxt,mxv) s0 =
let s1 = lextext cfrom s0
(s,p) = case reverse s1 of c:_ | elem c ".?!" -> (init s1,[c]) ; _ -> (s1,[]) -- separate final punctuation
in
case cparse pgf cfrom cat s of
Left tok -> unlines [s,"Parse error: "++tok]
Right ts -> unlines $ (("> "++ s):) $ take mxt $ map ((++p) . unlines . linearizeAndShow cto mxv) ts -- append punctuation
cparse pgf concr cat input = parseWithHeuristics concr cat input (-1) callbacks where
callbacks = maybe [] cb $ lookup "App" literalCallbacks
cb fs = [(cat,f pgf ("TranslateEng",concr) input)|(cat,f)<-fs]
lextext cnc = unwords . lexText' (\w -> case lookupMorpho cnc w of
_:_ -> w
_ -> case lookupMorpho cnc (uncapitInit w) of
[] -> w
_ -> uncapitInit w
)
where uncapitInit (c:cs) = toLower c : cs

View File

@@ -14,7 +14,9 @@ extra-source-files: README
cabal-version: >=1.10
library
exposed-modules: PGF2, SG
exposed-modules: PGF2, SG,
-- backwards compatibility API:
PGF, PGF.Internal
other-modules: PGF2.FFI, PGF2.Expr, PGF2.Type, SG.FFI
build-depends: base >=4.3, bytestring >=0.9,
containers, pretty

View File

@@ -1,115 +0,0 @@
module PGF.Lexing where
import Data.Char(isSpace,toLower,toUpper)
-- * Text lexing
-- | Text lexing with standard word capitalization of the first word of every sentence
lexText :: String -> [String]
lexText = lexText' uncapitInit
-- | Text lexing with custom treatment of the first word of every sentence.
lexText' :: (String->String) -> String -> [String]
lexText' uncap1 = uncap . lext where
lext s = case s of
c:cs | isMajorPunct c -> [c] : uncap (lext cs)
c:cs | isMinorPunct c -> [c] : lext cs
c:cs | isSpace c -> lext cs
_:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lext cs
_ -> [s]
uncap s = case s of
w:ws -> uncap1 w:ws
_ -> s
unlexText :: [String] -> String
unlexText = capitInit . unlext where
unlext s = case s of
w:[] -> w
w:[c]:[] | isPunct c -> w ++ [c]
w:[c]:cs | isMajorPunct c -> w ++ [c] ++ " " ++ capitInit (unlext cs)
w:[c]:cs | isMinorPunct c -> w ++ [c] ++ " " ++ unlext cs
w:ws -> w ++ " " ++ unlext ws
_ -> []
-- | Bind tokens separated by Prelude.BIND, i.e. &+
bindTok :: [String] -> [String]
bindTok ws = case ws of
w1:"&+":w2:ws -> bindTok ((w1++w2):ws)
"&+":ws -> bindTok ws
"&|":(c:cs):ws-> bindTok ((toUpper c:cs) : ws)
"&|":ws -> bindTok ws
w:ws -> w:bindTok ws
[] -> []
-- * Code lexing
-- | Haskell lexer, usable for much code
lexCode :: String -> [String]
lexCode ss = case lex ss of
[(w@(_:_),ws)] -> w : lexCode ws
_ -> []
unlexCode :: [String] -> String
unlexCode s = case s of
w:[] -> w
[c]:cs | isParen c -> [c] ++ unlexCode cs
w:cs@([c]:_) | isClosing c -> w ++ unlexCode cs
w:ws -> w ++ " " ++ unlexCode ws
_ -> []
-- | LaTeX lexer in the math mode: \ should not be separated from the next word
lexLatexCode :: String -> [String]
lexLatexCode = restoreBackslash . lexCode where --- quick hack: postprocess Haskell's lex
restoreBackslash ws = case ws of
"\\":w:ww -> ("\\" ++ w) : restoreBackslash ww
w:ww -> w:restoreBackslash ww
_ -> ws
-- * Mixed lexing
-- | LaTeX style lexer, with "math" environment using Code between $...$
lexMixed :: String -> [String]
lexMixed = concat . alternate False [] where
alternate env t s = case s of
'$':cs -> lex env (reverse t) : ["$"] : alternate (not env) [] cs
'\\':c:cs | elem c "()[]" -> lex env (reverse t) : [['\\',c]] : alternate (not env) [] cs
c:cs -> alternate env (c:t) cs
_ -> [lex env (reverse t)]
lex env = if env then lexLatexCode else lexText
unlexMixed :: [String] -> String
unlexMixed = capitInit . concat . alternate False where
alternate env s = case s of
_:_ -> case break (flip elem ["$","\\[","\\]","\\(","\\)"]) s of
(t,[]) -> unlex env t : []
(t,c:m) -> unlex env t : sep env c m : alternate (not env) m
_ -> []
unlex env = if env then unlexCode else (uncapitInit . unlexText)
sep env c m = case (m,env) of
([p]:_,True) | isPunct p -> c -- closing $ glued to next punct
(_, True) -> c ++ " " -- closing $ otherwise separated by space from what follows
_ -> " " ++ c -- put space before opening $
-- * Additional lexing uitilties
-- | Capitalize first letter
capitInit s = case s of
c:cs -> toUpper c : cs
_ -> s
-- | Uncapitalize first letter
uncapitInit s = case s of
c:cs -> toLower c : cs
_ -> s
-- | Unquote each string wrapped in double quotes
unquote = map unq where
unq s = case s of
'"':cs@(_:_) | last cs == '"' -> init cs
_ -> s
isPunct = flip elem ".?!,:;"
isMajorPunct = flip elem ".?!"
isMinorPunct = flip elem ",:;"
isParen = flip elem "()[]{}"
isClosing = flip elem ")]}"

View File

@@ -1,310 +0,0 @@
module PGF.LexingAGreek where -- HL 2a1.2.2016
import Data.Char(isSpace)
-- * Text lexing without word capitalization of the first word of every sentence.
-- Greek sentences in (transliterated) texts don't start with capital character.
-- Ordinary greek text does not have vowel length indicators. We then use '.' as
-- a sentence separator.
lexTextAGreek :: String -> [String]
lexTextAGreek s = lext s where
lext s = case s of
c:cs | isAGreekPunct c -> [c] : (lext cs)
c:cs | isSpace c -> lext cs
_:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s
in w : lext cs
[] -> []
-- Philological greek text may use vowel length indicators. Then '.' is not a sentence
-- separator, nor is 'v. ' for vowel v. Sentence ends at 'v..' or 'c. ' with non-vowel c.
lexTextAGreek2 :: String -> [String]
lexTextAGreek2 s = lext s where
lext s = case s of
c:cs | isAGreekPunct c -> [c] : (lext cs)
c:cs | isSpace c -> lext cs
_:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s
in case cs of
'.':'.':d:ds | isSpace d
-> (w++['.']) : lext ('.':d:ds)
'.':d:ds | isAGreekPunct d || isSpace d
-> (w++['.']) : lext (d:ds)
'.':d:ds | not (isSpace d)
-> case lext (d:ds) of
e:es -> (w++['.']++e) : es
es -> (w++['.']) : es
'.':[] -> (w++['.']) : []
_ -> w : lext cs
[] -> []
unlexTextAGreek :: [String] -> String
unlexTextAGreek = unlext where
unlext s = case s of
w:[] -> w
w:[c]:[] | isAGreekPunct c -> w ++ [c]
w:[c]:cs | isAGreekPunct c -> w ++ [c] ++ " " ++ unlext cs
w:ws -> w ++ " " ++ unlext ws
[] -> []
isAGreekPunct = flip elem ".,;··" -- colon: first version · not in charset,
-- second version · = 00B7 standard code point
-- * Text lexing and unlexing for Ancient Greek:
-- 1. no capitalization of initial word,
-- 2. grave/acute accent switch on final syllables of words not followed by punctuation,
-- 3. accent move from/to support word to/from following clitic words (iterated).
lexAGreek :: String -> [String]
lexAGreek = fromAGreek . lexTextAGreek
lexAGreek2 :: String -> [String]
lexAGreek2 = fromAGreek . lexTextAGreek2
unlexAGreek :: [String] -> String
unlexAGreek = unlexTextAGreek . toAGreek
-- Note: unlexAGreek does not glue punctuation with the previous word, so that short
-- vowel indication (like a.) differs from sentence end (a .).
-- | normalize = change grave accent on sentence internal words to acute,
-- and shift inherited acutes to the following enclitic (where they are
-- visible only as shown in the list of enclitics above)
normalize :: String -> String
normalize = (unlexTextAGreek . fromAGreek . lexTextAGreek)
fromAGreek :: [String] -> [String]
fromAGreek s = case s of
w:[]:vs -> w:[]:(fromAGreek vs)
w:(v:vs) | isAGreekPunct (head v) -> w:v:(fromAGreek vs)
w:v:vs | wasEnclitic v && wasEnclitic w ->
getEnclitic w : fromAGreek (v:vs)
w:v:vs | wasEnclitic v && wasProclitic w -> -- "ei)' tines*"
getProclitic w : getEnclitic v : fromAGreek vs
w:v:vs | wasEnclitic v && (hasEndCircum w ||
(hasEndAcute w && hasSingleAccent w)) ->
w : getEnclitic v : fromAGreek vs -- ok "sofoi' tines*"
w:v:vs | wasEnclitic v && hasPrefinalAcute w ->
w : getEnclitic v : fromAGreek vs
w:v:vs | wasEnclitic v && hasEndAcute w -> -- ok "a)'nvrwpoi' tines*"
dropLastAccent w : getEnclitic v : fromAGreek vs
w:v:vs | wasEnclitic w ->
getEnclitic w : fromAGreek (v:vs)
w:ws -> (toAcute w) : (fromAGreek ws)
ws -> ws
-- | de-normalize = change acute accent of end syllables in sentence internal
-- (non-enclitic) words to grave accent, and move accents of enclitics to the
-- previous word to produce ordinary ancient greek
denormalize :: String -> String
denormalize = (unlexTextAGreek . toAGreek . lexTextAGreek)
toAGreek :: [String] -> [String]
toAGreek s = case s of
w:[]:vs -> w:[]:(toAGreek vs)
w:v:vs | isAGreekPunct (head v) -> w:[]:v:(toAGreek vs) -- w:[] for following -to_ancientgreek
w:v:vs | isEnclitic v && isEnclitic w ->
addAcute w : toAGreek (dropAccent v:vs) -- BR 11 Anm.2
w:v:vs | isEnclitic v && isProclitic w -> -- BR 11 a.beta
addAcute w: (toAGreek (dropAccent v:vs))
w:v:vs | isEnclitic v && (hasEndCircum w || hasEndAcute w) ->
w:(toAGreek (dropAccent v:vs)) -- BR 11 a.alpha,beta
w:v:vs | isEnclitic v && hasPrefinalAcute w ->
w:v: toAGreek vs -- bisyllabic v keeps its accent BR 11 b.
w:v:vs | isEnclitic v ->
(addAcute w):(toAGreek (dropAccent v:vs)) -- BR 11 a.gamma
w:v:vs | isEnclitic w -> w:(toAGreek (v:vs))
w:ws -> (toGrave w) : (toAGreek ws)
ws -> ws
-- | Change accent on the final syllable of a word
toGrave :: String -> String
toGrave = reverse . grave . reverse where
grave s = case s of
'\'':cs -> '`':cs
c:cs | isAGreekVowel c -> c:cs
c:cs -> c: grave cs
_ -> s
toAcute :: String -> String
toAcute = reverse . acute . reverse where
acute s = case s of
'`':cs -> '\'':cs
c:cs | isAGreekVowel c -> c:cs
c:cs -> c: acute cs
_ -> s
isAGreekVowel = flip elem "aeioyhw"
-- | Accent moves for enclitics and proclitics (atona)
enclitics = [
"moy","moi","me", -- personal pronouns
"soy","soi","se",
"oy(","oi(","e(",
"tis*","ti","tina'", -- indefinite pronoun
"tino's*","tini'",
"tine's*","tina's*",
"tinw~n","tisi'","tisi'n",
"poy","poi", -- indefinite adverbs
"pove'n","pws*",
"ph|","pote'",
"ge","te","toi", -- particles
"nyn","per","pw"
-- suffix -"de"
-- praes.indik. of fhmi', ei)mi' (except fh's*, ei)~)
] -- and more, BR 11
proclitics = [
"o(","h(","oi(","ai(", -- articles
"e)n","ei)s*","e)x","e)k", -- prepositions
"ei)","w(s*", -- conjunctions
"oy)","oy)k","oy)c" -- negation
]
isEnclitic = flip elem enclitics
isProclitic = flip elem proclitics
-- Check if a word is an enclitic or accented enclitic and extract the enclitic
wasEnclitic = let unaccented = (filter (not . hasAccent) enclitics)
++ (map dropAccent (filter hasAccent enclitics))
accented = (filter hasAccent enclitics)
++ map addAcute (filter (not . hasAccent) enclitics)
in flip elem (accented ++ unaccented)
wasProclitic = flip elem (map addAcute proclitics)
getEnclitic =
let pairs = zip (enclitics ++ (map dropAccent (filter hasAccent enclitics))
++ (map addAcute (filter (not . hasAccent) enclitics)))
(enclitics ++ (filter hasAccent enclitics)
++ (filter (not . hasAccent) enclitics))
find = \v -> lookup v pairs
in \v -> case (find v) of
Just x -> x
_ -> v
getProclitic =
let pairs = zip (map addAcute proclitics) proclitics
find = \v -> lookup v pairs
in \v -> case (find v) of
Just x -> x
_ -> v
-- | Accent manipulation
dropAccent = reverse . drop . reverse where
drop s = case s of
[] -> []
'\'':cs -> cs
'`':cs -> cs
'~':cs -> cs
c:cs -> c:drop cs
dropLastAccent = reverse . drop . reverse where
drop s = case s of
[] -> []
'\'':cs -> cs
'`':cs -> cs
'~':cs -> cs
c:cs -> c:drop cs
addAcute :: String -> String
addAcute = reverse . acu . reverse where
acu w = case w of
c:cs | c == '\'' -> c:cs
c:cs | c == '(' -> '\'':c:cs
c:cs | c == ')' -> '\'':c:cs
c:cs | isAGreekVowel c -> '\'':c:cs
c:cs -> c : acu cs
_ -> w
-- | Accent checking on end syllables
hasEndAcute = find . reverse where
find s = case s of
[] -> False
'\'':cs -> True
'`':cs -> False
'~':cs -> False
c:cs | isAGreekVowel c -> False
_:cs -> find cs
hasEndCircum = find . reverse where
find s = case s of
[] -> False
'\'':cs -> False
'`':cs -> False
'~':cs -> True
c:cs | isAGreekVowel c -> False
_:cs -> find cs
hasPrefinalAcute = find . reverse where
find s = case s of
[] -> False
'\'':cs -> False -- final acute
'`':cs -> False
'~':cs -> False
c:d:cs | isAGreekVowel c && isAGreekVowel d -> findNext cs
c:cs | isAGreekVowel c -> findNext cs
_:cs -> find cs where
findNext s = case s of
[] -> False
'\'':cs -> True -- prefinal acute
'`':cs -> False
'~':cs -> False
c:cs | isAGreekVowel c -> False
_:cs -> findNext cs where
hasSingleAccent v =
hasAccent v && not (hasAccent (dropLastAccent v))
hasAccent v = case v of
[] -> False
c:cs -> elem c ['\'','`','~'] || hasAccent cs
{- Tests:
-- denormalization. Examples in BR 11 work:
-}
enclitics_expls = -- normalized
"sofw~n tis*":"sofw~n tine's*":"sof~n tinw~n": -- a.alpha
"sofo's tis*":"sofoi' tine's*": -- a.beta
"ei) tis*":"ei) tine's*":
"a)'nvrwpos* tis*":"a)'nvrwpoi tine's*": -- a.gamma
"doy~los* tis*":"doy~loi tine's*":
"lo'gos* tis*":"lo'goi tine's*":"lo'gwn tinw~n": -- b.
"ei) poy tis* tina' i)'doi": -- Anm. 2.
[]
{-
test = map denormalize enclitics_expls
*PGF.LexingAGreek> test
["sofw~n tis*","sofw~n tines*","sof~n tinwn",
"sofo's tis*","sofoi' tines*",
"ei)' tis*","ei)' tines*",
"a)'nvrwpo's* tis*","a)'nvrwpoi' tines*",
"doy~lo's* tis*","doy~loi' tines*",
"lo'gos* tis* ","lo'goi tine's*","lo'gwn tinw~n ",
"ei)' poy' ti's* tina i)'doi"]
-- normalization:
*PGF.LexingAGreek> map normalize test
["sofw~n tis*","sofw~n tine's*","sof~n tinw~n",
"sofo's tis*","sofoi' tine's*",
"ei) tis*","ei) tine's*",
"a)'nvrwpos* tis*","a)'nvrwpoi tine's*",
"doy~los* tis*","doy~loi tine's*",
"lo'gos* tis*","lo'goi tine's*","lo'gwn tinw~n",
"ei) poy tis* tina' i)'doi"]
*PGF.LexingAGreek> map (normalize . denormalize) enclitics_expls == enclitics_expls
True
*PGF.LexingAGreek> map (denormalize . normalize) test == test
True
-}

View File

@@ -50,9 +50,6 @@ Library
exposed-modules:
PGF
PGF.Internal
PGF.Lexing
PGF.LexingAGreek
PGF.Utilities
PGF.Haskell
other-modules:
@@ -78,3 +75,4 @@ Library
PGF.VisualizeTree
PGF.ByteCode
PGF.OldBinary
PGF.Utilities