Lexer by need.

This commit is contained in:
aarne
2003-11-17 15:17:53 +00:00
parent 9d55f72d7a
commit 70c9f7b365
14 changed files with 81 additions and 24 deletions

View File

@@ -45,6 +45,7 @@ Highlights, preliminary version
<li> New syntax alternatives for local definitions: <tt>let</tt> without <li> New syntax alternatives for local definitions: <tt>let</tt> without
braces and <tt>where</tt>. braces and <tt>where</tt>.
<li> Pattern variables can be used on lhs's of <tt>oper</tt> definitions. <li> Pattern variables can be used on lhs's of <tt>oper</tt> definitions.
<li> New Unicode transliterations (by Harad Hammarström).
<h4>New parser (forthcoming)</h4> <h4>New parser (forthcoming)</h4>
@@ -62,7 +63,8 @@ Highlights, preliminary version
<li> Haskell source code organized into subdirectories. <li> Haskell source code organized into subdirectories.
<li> BNF Converter used for defining the languages GF and GFC, which also <li> BNF Converter used for defining the languages GF and GFC, which also
give reliable LaTeX documentation. give reliable LaTeX documentation.
<li> Lexican rules sorted out by option <tt>-cflexer</tt> for efficient
parsing with large lexica.
<!-- NEW --> <!-- NEW -->

View File

@@ -507,7 +507,7 @@ oper
DitransVerb = TransVerb ** {s3 : Preposition} ; DitransVerb = TransVerb ** {s3 : Preposition} ;
mkDitransVerb : Verb -> Preposition -> Preposition -> DitransVerb = \v,p1,p2 -> mkDitransVerb : Verb -> Preposition -> Preposition -> DitransVerb = \v,p1,p2 ->
v ** {s3 = p1 ; s3 = p2} ; v ** {s2 = p1 ; s3 = p2} ;
complDitransVerb : complDitransVerb :
Bool -> DitransVerb -> NounPhrase -> NounPhrase -> VerbPhrase = Bool -> DitransVerb -> NounPhrase -> NounPhrase -> VerbPhrase =
@@ -538,7 +538,7 @@ oper
s3 = \\g,n => spelar.s3 ! g ! n ++ (if_then_else Str postp bra.s []) s3 = \\g,n => spelar.s3 ! g ! n ++ (if_then_else Str postp bra.s [])
} ; } ;
advAdjPhrase : Adverb -> AdjPhrase -> AdjPhrase = \mycket, dyr -> advAdjPhrase : SS -> AdjPhrase -> AdjPhrase = \mycket, dyr ->
{s = \\a,c => mycket.s ++ dyr.s ! a ! c ; {s = \\a,c => mycket.s ++ dyr.s ! a ! c ;
p = dyr.p p = dyr.p
} ; } ;
@@ -903,7 +903,7 @@ oper
-- This class covers adverbials such as "annars", "därför", which are prefixed -- This class covers adverbials such as "annars", "därför", which are prefixed
-- to a sentence to form a phrase. -- to a sentence to form a phrase.
advSentence : Adverb -> Sentence -> Utterance = \annars,soverhan -> advSentence : SS -> Sentence -> Utterance = \annars,soverhan ->
ss (annars.s ++ soverhan.s ! Inv ++ ".") ; ss (annars.s ++ soverhan.s ! Inv ++ ".") ;

View File

@@ -15,8 +15,9 @@ import Char (isUpper, isLower, toUpper, toLower)
-- abstract type CF. -- abstract type CF.
-- Invariant: each category has all its rules grouped with it -- Invariant: each category has all its rules grouped with it
-- also: the list is never empty (the category is just missing then) -- also: the list is never empty (the category is just missing then)
newtype CF = CF ([(CFCat,[CFRule])], CFPredef) newtype CF = CF ([CFRuleGroup], CFPredef)
type CFRule = (CFFun, (CFCat, [CFItem])) type CFRule = (CFFun, (CFCat, [CFItem]))
type CFRuleGroup = (CFCat,[CFRule])
-- CFPredef is a hack for variable symbols and literals; normally = const [] -- CFPredef is a hack for variable symbols and literals; normally = const []
data CFItem = CFTerm RegExp | CFNonterm CFCat deriving (Eq, Ord,Show) data CFItem = CFTerm RegExp | CFNonterm CFCat deriving (Eq, Ord,Show)

View File

@@ -3,7 +3,9 @@ module CFIdent where
import Operations import Operations
import GFC import GFC
import Ident import Ident
import Values (cPredefAbs)
import AbsGFC import AbsGFC
import Macros (ident2label)
import PrGrammar import PrGrammar
import Str import Str
import Char (toLower, toUpper) import Char (toLower, toUpper)
@@ -48,6 +50,10 @@ newtype CFFun = CFFun (Atom, Profile) deriving (Eq,Show)
type Profile = [([[Int]],[Int])] type Profile = [([[Int]],[Int])]
wordsCFTok :: CFTok -> [String]
wordsCFTok t = case t of
TC (c:cs) -> [c':cs | c' <- [toUpper c, toLower c]]
_ -> [prCFTok t]
-- the following functions should be used instead of constructors -- the following functions should be used instead of constructors
@@ -68,6 +74,9 @@ stringCFFun = mkCFFun . AS
intCFFun :: Int -> CFFun intCFFun :: Int -> CFFun
intCFFun = mkCFFun . AI . toInteger intCFFun = mkCFFun . AI . toInteger
dummyCFFun :: CFFun
dummyCFFun = varCFFun $ identC "_" --- used in lexer-by-need rules
cfFun2String :: CFFun -> String cfFun2String :: CFFun -> String
cfFun2String (CFFun (f,_)) = prt f cfFun2String (CFFun (f,_)) = prt f
@@ -105,8 +114,8 @@ cat2CFCat :: (Ident,Ident) -> CFCat
cat2CFCat = uncurry idents2CFCat cat2CFCat = uncurry idents2CFCat
---- literals ---- literals
cfCatString = string2CFCat "Predef" "String" cfCatString = string2CFCat (prt cPredefAbs) "String"
cfCatInt = string2CFCat "Predef" "Int" cfCatInt = string2CFCat (prt cPredefAbs) "Int"
@@ -122,6 +131,9 @@ moduleOfCFCat (CFCat (CIQ m _, _)) = m
cfCat2Cat :: CFCat -> (Ident,Ident) cfCat2Cat :: CFCat -> (Ident,Ident)
cfCat2Cat (CFCat (CIQ m c,_)) = (m,c) cfCat2Cat (CFCat (CIQ m c,_)) = (m,c)
lexCFCat :: CFCat -> CFCat
lexCFCat cat = ident2CFCat (uncurry CIQ (cfCat2Cat cat)) (identC "*")
-- to construct CF tokens -- to construct CF tokens
string2CFTok :: String -> CFTok string2CFTok :: String -> CFTok

View File

@@ -5,12 +5,14 @@ import Option
import Ident import Ident
import AbsGFC import AbsGFC
import GFC import GFC
import Values (isPredefCat,cPredefAbs)
import PrGrammar import PrGrammar
import CMacros import CMacros
import qualified Modules as M import qualified Modules as M
import CF import CF
import CFIdent import CFIdent
import List (nub) import Morphology
import List (nub,partition)
import Monad import Monad
-- AR 27/1/2000 -- 3/12/2001 -- 8/6/2003 -- AR 27/1/2000 -- 3/12/2001 -- 8/6/2003
@@ -28,8 +30,8 @@ canon2cf opts gr c = do
rules0 <- liftM concat $ mapM (uncurry (cnc2cfCond opts)) mms rules0 <- liftM concat $ mapM (uncurry (cnc2cfCond opts)) mms
let rules = filter (not . isCircularCF) rules0 ---- temporarily here let rules = filter (not . isCircularCF) rules0 ---- temporarily here
let grules = groupCFRules rules let grules = groupCFRules rules
let predef = mkCFPredef $ map fst grules let predef = mkCFPredef opts grules
return $ CF (grules, predef) return $ CF predef
cnc2cfCond :: Options -> Ident -> [(Ident,Info)] -> Err [CFRule] cnc2cfCond :: Options -> Ident -> [(Ident,Info)] -> Err [CFRule]
cnc2cfCond opts m gr = cnc2cfCond opts m gr =
@@ -138,16 +140,40 @@ term2CFItems m t = errIn "forming cf items" $ case t of
tryMkCFTerm itss = return itss tryMkCFTerm itss = return itss
extrR arg lab = case (arg,lab) of extrR arg lab = case (arg,lab) of
(Arg (A cat pos), l@(L _)) -> return [[PNonterm (CIQ m cat) pos l True]] (Arg (A cat pos), l@(L _)) -> return [[PNonterm (cIQ cat) pos l True]]
(Arg (A cat pos), l@(LV _)) -> return [[PNonterm (CIQ m cat) pos l False]] (Arg (A cat pos), l@(LV _)) -> return [[PNonterm (cIQ cat) pos l False]]
(Arg (AB cat pos b), l@(L _)) -> return [[PNonterm (CIQ m cat) pos l True]] (Arg (AB cat pos b), l@(L _)) -> return [[PNonterm (cIQ cat) pos l True]]
(Arg (AB cat pos b), l@(LV _)) -> return [[PNonterm (CIQ m cat) pos l False]] (Arg (AB cat pos b), l@(LV _)) -> return [[PNonterm (cIQ cat) pos l False]]
---- ?? ---- ??
_ -> prtBad "cannot extract record field from" arg _ -> prtBad "cannot extract record field from" arg
cIQ c = if isPredefCat c then CIQ cPredefAbs c else CIQ m c
mkCFPredef :: [CFCat] -> CFPredef mkCFPredef :: Options -> [CFRuleGroup] -> ([CFRuleGroup],CFPredef)
mkCFPredef cats s = mkCFPredef opts rules = (ruls, \s -> preds0 s ++ look s) where
(ruls,preds) = if oElem lexerByNeed opts -- option -cflexer
then predefLexer rules
else (rules,NT)
preds0 s =
[(cat, metaCFFun) | TM _ _ <- [s], cat <- cats] ++ [(cat, metaCFFun) | TM _ _ <- [s], cat <- cats] ++
[(cat, varCFFun x) | TV x <- [s], cat <- cats] ++ [(cat, varCFFun x) | TV x <- [s], cat <- cats] ++
[(cfCatString, stringCFFun t) | TL t <- [s]] ++ [(cfCatString, stringCFFun t) | TL t <- [s]] ++
[(cfCatInt, intCFFun t) | TI t <- [s]] [(cfCatInt, intCFFun t) | TI t <- [s]]
cats = map fst rules
look s = errVal [] $ liftM concat $
mapM (flip justLookupTree preds . tS) $ wordsCFTok s --- for TC tokens
--- TODO: use trie instead of bintree; integrate with morphology
predefLexer :: [CFRuleGroup] -> ([CFRuleGroup],BinTree (CFTok,[(CFCat, CFFun)]))
predefLexer groups = (reverse ruls, sorted2tree $ sortAssocs preds) where
(ruls,preds) = foldr mkOne ([],[]) groups
mkOne group@(cat,rules) (rs,ps) = (rule:rs,pre ++ ps) where
(rule,pre) = case partition isLexical rules of
([],_) -> (group,[])
(ls,rest) -> ((cat,rest), concatMap mkLexRule ls) --- useLexRule cat : rest
isLexical (f,(c,its)) = case its of
[CFTerm (RegAlts ws)] -> True
_ -> False
-- useLexRule cat = (dummyCFFun,(cat,[CFNonterm (lexCFCat cat)])) -- not needed
mkLexRule r = case r of
(fun,(cat,[CFTerm (RegAlts ws)])) -> [(tS w, (cat,fun)) | w <- ws]
_ -> []

View File

@@ -33,6 +33,7 @@ type BindVs = [[I.Ident]]
-- (2) term2trm: restore Bindings from Binds -- (2) term2trm: restore Bindings from Binds
tree2term :: CFTree -> Err ITerm tree2term :: CFTree -> Err ITerm
-- tree2term (CFTree (f,(_,[t]))) | f == dummyCFFun = tree2term t -- not used
tree2term (CFTree (cff@(CFFun (fun,pro)), (_,trees))) = case fun of tree2term (CFTree (cff@(CFFun (fun,pro)), (_,trees))) = case fun of
AM _ -> return IMeta AM _ -> return IMeta
_ -> do _ -> do

View File

@@ -191,4 +191,5 @@ evalPrintname gr c ppr lin =
C x y -> C (oneBranch x) (oneBranch y) C x y -> C (oneBranch x) (oneBranch y)
S x _ -> oneBranch x S x _ -> oneBranch x
P x _ -> oneBranch x P x _ -> oneBranch x
Alts (d,_) -> oneBranch d
_ -> t _ -> t

View File

@@ -88,6 +88,7 @@ stateGrammarST = grammar
stateCF = cf stateCF = cf
stateMorpho = morpho stateMorpho = morpho
stateOptions = loptions stateOptions = loptions
stateGrammarWords = map fst . tree2list . stateMorpho
cncModuleIdST = stateGrammarST cncModuleIdST = stateGrammarST

View File

@@ -32,12 +32,21 @@ type Ss = [String]
matchPrefix :: Ss -> [(Ss,[String])] -> [String] -> Ss matchPrefix :: Ss -> [(Ss,[String])] -> [String] -> Ss
matchPrefix s vs t = matchPrefix s vs t =
head ([u | (u,as) <- vs, any (\c -> isPrefixOf c (concat t)) as] ++ [s]) head $ [u |
(u,as) <- vs,
any (\c -> isPrefixOf c (concat (unmarkup t))) as
] ++ [s]
matchSuffix :: String -> Ss -> [(Ss,[String])] -> Ss matchSuffix :: String -> Ss -> [(Ss,[String])] -> Ss
matchSuffix t s vs = matchSuffix t s vs =
head ([u | (u,as) <- vs, any (\c -> isSuffixOf c t) as] ++ [s]) head ([u | (u,as) <- vs, any (\c -> isSuffixOf c t) as] ++ [s])
unmarkup :: [String] -> [String]
unmarkup = filter (not . isXMLtag) where
isXMLtag s = case s of
'<':cs@(_:_) -> last cs == '>'
_ -> False
str2strings :: Str -> Ss str2strings :: Str -> Ss
str2strings (Str st) = alls st where str2strings (Str st) = alls st where
alls st = case st of alls st = case st of

View File

@@ -44,6 +44,8 @@ cPredefAbs = identC "PredefAbs"
cInt = identC "Int" cInt = identC "Int"
cString = identC "String" cString = identC "String"
isPredefCat c = elem c [cInt,cString]
eType :: Exp eType :: Exp
eType = Sort "Type" eType = Sort "Type"

View File

@@ -125,6 +125,7 @@ newParser = iOpt "new"
noCF = iOpt "nocf" noCF = iOpt "nocf"
checkCirc = iOpt "nocirc" checkCirc = iOpt "nocirc"
noCheckCirc = iOpt "nocheckcirc" noCheckCirc = iOpt "nocheckcirc"
lexerByNeed = iOpt "cflexer"
-- linearization -- linearization
allLin = iOpt "all" allLin = iOpt "all"

View File

@@ -148,6 +148,7 @@ customGrammarPrinter =
,(strCI "lbnf", prLBNF . stateCF) ,(strCI "lbnf", prLBNF . stateCF)
,(strCI "morpho", prMorpho . stateMorpho) ,(strCI "morpho", prMorpho . stateMorpho)
,(strCI "opts", prOpts . stateOptions) ,(strCI "opts", prOpts . stateOptions)
,(strCI "words", unwords . stateGrammarWords)
{- ---- {- ----
(strCI "gf", prt . st2grammar . stateGrammarST) -- DEFAULT (strCI "gf", prt . st2grammar . stateGrammarST) -- DEFAULT
,(strCI "canon", showCanon "Lang" . stateGrammarST) ,(strCI "canon", showCanon "Lang" . stateGrammarST)

View File

@@ -92,10 +92,10 @@ tokens2trms opts sg cn parser as = do
verb = oElem beVerbose opts verb = oElem beVerbose opts
forgive = oElem forgiveParse opts forgive = oElem forgiveParse opts
unknown ts = case filter noMatch ts of unknown ts = case filter noMatch [t | t@(TS _) <- ts] of
[] -> "where all words are known" [] -> "where all words are known"
us -> "with the unknown tokens" +++ show us --- needs to be fixed for literals us -> "with the unknown tokens" +++ show us --- needs to be fixed for literals
terminals = map TS $ cfTokens $ stateCF sg terminals = map TS $ stateGrammarWords sg
noMatch t = all (not . compatTok t) terminals noMatch t = all (not . compatTok t) terminals

View File

@@ -1 +1 @@
module Today where today = "Fri Nov 14 14:45:24 CET 2003" module Today where today = "Mon Nov 17 17:04:28 CET 2003"