diff --git a/doc/gf2-highlights.html b/doc/gf2-highlights.html
index 57cd45044..036ca55f3 100644
--- a/doc/gf2-highlights.html
+++ b/doc/gf2-highlights.html
@@ -45,6 +45,7 @@ Highlights, preliminary version
New syntax alternatives for local definitions: let without
braces and where.
Pattern variables can be used on lhs's of oper definitions.
+ New Unicode transliterations (by Harad Hammarström).
New parser (forthcoming)
@@ -62,7 +63,8 @@ Highlights, preliminary version
Haskell source code organized into subdirectories.
BNF Converter used for defining the languages GF and GFC, which also
give reliable LaTeX documentation.
-
+ Lexican rules sorted out by option -cflexer for efficient
+ parsing with large lexica.
diff --git a/grammars/resource/swedish/SyntaxSwe.gf b/grammars/resource/swedish/SyntaxSwe.gf
index c26a07bc8..9b9897ee1 100644
--- a/grammars/resource/swedish/SyntaxSwe.gf
+++ b/grammars/resource/swedish/SyntaxSwe.gf
@@ -507,7 +507,7 @@ oper
DitransVerb = TransVerb ** {s3 : Preposition} ;
mkDitransVerb : Verb -> Preposition -> Preposition -> DitransVerb = \v,p1,p2 ->
- v ** {s3 = p1 ; s3 = p2} ;
+ v ** {s2 = p1 ; s3 = p2} ;
complDitransVerb :
Bool -> DitransVerb -> NounPhrase -> NounPhrase -> VerbPhrase =
@@ -538,7 +538,7 @@ oper
s3 = \\g,n => spelar.s3 ! g ! n ++ (if_then_else Str postp bra.s [])
} ;
- advAdjPhrase : Adverb -> AdjPhrase -> AdjPhrase = \mycket, dyr ->
+ advAdjPhrase : SS -> AdjPhrase -> AdjPhrase = \mycket, dyr ->
{s = \\a,c => mycket.s ++ dyr.s ! a ! c ;
p = dyr.p
} ;
@@ -903,7 +903,7 @@ oper
-- This class covers adverbials such as "annars", "därför", which are prefixed
-- to a sentence to form a phrase.
- advSentence : Adverb -> Sentence -> Utterance = \annars,soverhan ->
+ advSentence : SS -> Sentence -> Utterance = \annars,soverhan ->
ss (annars.s ++ soverhan.s ! Inv ++ ".") ;
diff --git a/src/GF/CF/CF.hs b/src/GF/CF/CF.hs
index 0cff68b97..7c0013548 100644
--- a/src/GF/CF/CF.hs
+++ b/src/GF/CF/CF.hs
@@ -15,8 +15,9 @@ import Char (isUpper, isLower, toUpper, toLower)
-- abstract type CF.
-- Invariant: each category has all its rules grouped with it
-- also: the list is never empty (the category is just missing then)
-newtype CF = CF ([(CFCat,[CFRule])], CFPredef)
+newtype CF = CF ([CFRuleGroup], CFPredef)
type CFRule = (CFFun, (CFCat, [CFItem]))
+type CFRuleGroup = (CFCat,[CFRule])
-- CFPredef is a hack for variable symbols and literals; normally = const []
data CFItem = CFTerm RegExp | CFNonterm CFCat deriving (Eq, Ord,Show)
diff --git a/src/GF/CF/CFIdent.hs b/src/GF/CF/CFIdent.hs
index 02343bfb7..99ab711e4 100644
--- a/src/GF/CF/CFIdent.hs
+++ b/src/GF/CF/CFIdent.hs
@@ -3,7 +3,9 @@ module CFIdent where
import Operations
import GFC
import Ident
+import Values (cPredefAbs)
import AbsGFC
+import Macros (ident2label)
import PrGrammar
import Str
import Char (toLower, toUpper)
@@ -48,6 +50,10 @@ newtype CFFun = CFFun (Atom, Profile) deriving (Eq,Show)
type Profile = [([[Int]],[Int])]
+wordsCFTok :: CFTok -> [String]
+wordsCFTok t = case t of
+ TC (c:cs) -> [c':cs | c' <- [toUpper c, toLower c]]
+ _ -> [prCFTok t]
-- the following functions should be used instead of constructors
@@ -68,6 +74,9 @@ stringCFFun = mkCFFun . AS
intCFFun :: Int -> CFFun
intCFFun = mkCFFun . AI . toInteger
+dummyCFFun :: CFFun
+dummyCFFun = varCFFun $ identC "_" --- used in lexer-by-need rules
+
cfFun2String :: CFFun -> String
cfFun2String (CFFun (f,_)) = prt f
@@ -105,8 +114,8 @@ cat2CFCat :: (Ident,Ident) -> CFCat
cat2CFCat = uncurry idents2CFCat
---- literals
-cfCatString = string2CFCat "Predef" "String"
-cfCatInt = string2CFCat "Predef" "Int"
+cfCatString = string2CFCat (prt cPredefAbs) "String"
+cfCatInt = string2CFCat (prt cPredefAbs) "Int"
@@ -122,6 +131,9 @@ moduleOfCFCat (CFCat (CIQ m _, _)) = m
cfCat2Cat :: CFCat -> (Ident,Ident)
cfCat2Cat (CFCat (CIQ m c,_)) = (m,c)
+lexCFCat :: CFCat -> CFCat
+lexCFCat cat = ident2CFCat (uncurry CIQ (cfCat2Cat cat)) (identC "*")
+
-- to construct CF tokens
string2CFTok :: String -> CFTok
diff --git a/src/GF/CF/CanonToCF.hs b/src/GF/CF/CanonToCF.hs
index 6651b0100..0950d6244 100644
--- a/src/GF/CF/CanonToCF.hs
+++ b/src/GF/CF/CanonToCF.hs
@@ -5,12 +5,14 @@ import Option
import Ident
import AbsGFC
import GFC
+import Values (isPredefCat,cPredefAbs)
import PrGrammar
import CMacros
import qualified Modules as M
import CF
import CFIdent
-import List (nub)
+import Morphology
+import List (nub,partition)
import Monad
-- AR 27/1/2000 -- 3/12/2001 -- 8/6/2003
@@ -28,8 +30,8 @@ canon2cf opts gr c = do
rules0 <- liftM concat $ mapM (uncurry (cnc2cfCond opts)) mms
let rules = filter (not . isCircularCF) rules0 ---- temporarily here
let grules = groupCFRules rules
- let predef = mkCFPredef $ map fst grules
- return $ CF (grules, predef)
+ let predef = mkCFPredef opts grules
+ return $ CF predef
cnc2cfCond :: Options -> Ident -> [(Ident,Info)] -> Err [CFRule]
cnc2cfCond opts m gr =
@@ -138,16 +140,40 @@ term2CFItems m t = errIn "forming cf items" $ case t of
tryMkCFTerm itss = return itss
extrR arg lab = case (arg,lab) of
- (Arg (A cat pos), l@(L _)) -> return [[PNonterm (CIQ m cat) pos l True]]
- (Arg (A cat pos), l@(LV _)) -> return [[PNonterm (CIQ m cat) pos l False]]
- (Arg (AB cat pos b), l@(L _)) -> return [[PNonterm (CIQ m cat) pos l True]]
- (Arg (AB cat pos b), l@(LV _)) -> return [[PNonterm (CIQ m cat) pos l False]]
+ (Arg (A cat pos), l@(L _)) -> return [[PNonterm (cIQ cat) pos l True]]
+ (Arg (A cat pos), l@(LV _)) -> return [[PNonterm (cIQ cat) pos l False]]
+ (Arg (AB cat pos b), l@(L _)) -> return [[PNonterm (cIQ cat) pos l True]]
+ (Arg (AB cat pos b), l@(LV _)) -> return [[PNonterm (cIQ cat) pos l False]]
---- ??
_ -> prtBad "cannot extract record field from" arg
+ cIQ c = if isPredefCat c then CIQ cPredefAbs c else CIQ m c
-mkCFPredef :: [CFCat] -> CFPredef
-mkCFPredef cats s =
- [(cat, metaCFFun) | TM _ _ <- [s], cat <- cats] ++
- [(cat, varCFFun x) | TV x <- [s], cat <- cats] ++
- [(cfCatString, stringCFFun t) | TL t <- [s]] ++
- [(cfCatInt, intCFFun t) | TI t <- [s]]
+mkCFPredef :: Options -> [CFRuleGroup] -> ([CFRuleGroup],CFPredef)
+mkCFPredef opts rules = (ruls, \s -> preds0 s ++ look s) where
+ (ruls,preds) = if oElem lexerByNeed opts -- option -cflexer
+ then predefLexer rules
+ else (rules,NT)
+ preds0 s =
+ [(cat, metaCFFun) | TM _ _ <- [s], cat <- cats] ++
+ [(cat, varCFFun x) | TV x <- [s], cat <- cats] ++
+ [(cfCatString, stringCFFun t) | TL t <- [s]] ++
+ [(cfCatInt, intCFFun t) | TI t <- [s]]
+ cats = map fst rules
+ look s = errVal [] $ liftM concat $
+ mapM (flip justLookupTree preds . tS) $ wordsCFTok s --- for TC tokens
+
+--- TODO: use trie instead of bintree; integrate with morphology
+predefLexer :: [CFRuleGroup] -> ([CFRuleGroup],BinTree (CFTok,[(CFCat, CFFun)]))
+predefLexer groups = (reverse ruls, sorted2tree $ sortAssocs preds) where
+ (ruls,preds) = foldr mkOne ([],[]) groups
+ mkOne group@(cat,rules) (rs,ps) = (rule:rs,pre ++ ps) where
+ (rule,pre) = case partition isLexical rules of
+ ([],_) -> (group,[])
+ (ls,rest) -> ((cat,rest), concatMap mkLexRule ls) --- useLexRule cat : rest
+ isLexical (f,(c,its)) = case its of
+ [CFTerm (RegAlts ws)] -> True
+ _ -> False
+-- useLexRule cat = (dummyCFFun,(cat,[CFNonterm (lexCFCat cat)])) -- not needed
+ mkLexRule r = case r of
+ (fun,(cat,[CFTerm (RegAlts ws)])) -> [(tS w, (cat,fun)) | w <- ws]
+ _ -> []
diff --git a/src/GF/CF/Profile.hs b/src/GF/CF/Profile.hs
index 6dbb5f85a..1b821d53a 100644
--- a/src/GF/CF/Profile.hs
+++ b/src/GF/CF/Profile.hs
@@ -33,6 +33,7 @@ type BindVs = [[I.Ident]]
-- (2) term2trm: restore Bindings from Binds
tree2term :: CFTree -> Err ITerm
+-- tree2term (CFTree (f,(_,[t]))) | f == dummyCFFun = tree2term t -- not used
tree2term (CFTree (cff@(CFFun (fun,pro)), (_,trees))) = case fun of
AM _ -> return IMeta
_ -> do
diff --git a/src/GF/Compile/Optimize.hs b/src/GF/Compile/Optimize.hs
index 77cbcdbdf..ceec2c1b6 100644
--- a/src/GF/Compile/Optimize.hs
+++ b/src/GF/Compile/Optimize.hs
@@ -191,4 +191,5 @@ evalPrintname gr c ppr lin =
C x y -> C (oneBranch x) (oneBranch y)
S x _ -> oneBranch x
P x _ -> oneBranch x
+ Alts (d,_) -> oneBranch d
_ -> t
diff --git a/src/GF/Compile/ShellState.hs b/src/GF/Compile/ShellState.hs
index 4e84bd248..17ea2cc9a 100644
--- a/src/GF/Compile/ShellState.hs
+++ b/src/GF/Compile/ShellState.hs
@@ -88,6 +88,7 @@ stateGrammarST = grammar
stateCF = cf
stateMorpho = morpho
stateOptions = loptions
+stateGrammarWords = map fst . tree2list . stateMorpho
cncModuleIdST = stateGrammarST
diff --git a/src/GF/Data/Str.hs b/src/GF/Data/Str.hs
index 0c1ecf7c9..6433e5355 100644
--- a/src/GF/Data/Str.hs
+++ b/src/GF/Data/Str.hs
@@ -32,12 +32,21 @@ type Ss = [String]
matchPrefix :: Ss -> [(Ss,[String])] -> [String] -> Ss
matchPrefix s vs t =
- head ([u | (u,as) <- vs, any (\c -> isPrefixOf c (concat t)) as] ++ [s])
+ head $ [u |
+ (u,as) <- vs,
+ any (\c -> isPrefixOf c (concat (unmarkup t))) as
+ ] ++ [s]
matchSuffix :: String -> Ss -> [(Ss,[String])] -> Ss
matchSuffix t s vs =
head ([u | (u,as) <- vs, any (\c -> isSuffixOf c t) as] ++ [s])
+unmarkup :: [String] -> [String]
+unmarkup = filter (not . isXMLtag) where
+ isXMLtag s = case s of
+ '<':cs@(_:_) -> last cs == '>'
+ _ -> False
+
str2strings :: Str -> Ss
str2strings (Str st) = alls st where
alls st = case st of
diff --git a/src/GF/Grammar/Values.hs b/src/GF/Grammar/Values.hs
index 4d787488d..015f9ffb3 100644
--- a/src/GF/Grammar/Values.hs
+++ b/src/GF/Grammar/Values.hs
@@ -44,6 +44,8 @@ cPredefAbs = identC "PredefAbs"
cInt = identC "Int"
cString = identC "String"
+isPredefCat c = elem c [cInt,cString]
+
eType :: Exp
eType = Sort "Type"
diff --git a/src/GF/Infra/Option.hs b/src/GF/Infra/Option.hs
index 3bdf4dc0f..ac2f46b7e 100644
--- a/src/GF/Infra/Option.hs
+++ b/src/GF/Infra/Option.hs
@@ -125,6 +125,7 @@ newParser = iOpt "new"
noCF = iOpt "nocf"
checkCirc = iOpt "nocirc"
noCheckCirc = iOpt "nocheckcirc"
+lexerByNeed = iOpt "cflexer"
-- linearization
allLin = iOpt "all"
diff --git a/src/GF/UseGrammar/Custom.hs b/src/GF/UseGrammar/Custom.hs
index 4d5eb8122..10446413a 100644
--- a/src/GF/UseGrammar/Custom.hs
+++ b/src/GF/UseGrammar/Custom.hs
@@ -148,6 +148,7 @@ customGrammarPrinter =
,(strCI "lbnf", prLBNF . stateCF)
,(strCI "morpho", prMorpho . stateMorpho)
,(strCI "opts", prOpts . stateOptions)
+ ,(strCI "words", unwords . stateGrammarWords)
{- ----
(strCI "gf", prt . st2grammar . stateGrammarST) -- DEFAULT
,(strCI "canon", showCanon "Lang" . stateGrammarST)
diff --git a/src/GF/UseGrammar/Parsing.hs b/src/GF/UseGrammar/Parsing.hs
index ab379cec3..b5b587c91 100644
--- a/src/GF/UseGrammar/Parsing.hs
+++ b/src/GF/UseGrammar/Parsing.hs
@@ -92,10 +92,10 @@ tokens2trms opts sg cn parser as = do
verb = oElem beVerbose opts
forgive = oElem forgiveParse opts
- unknown ts = case filter noMatch ts of
+ unknown ts = case filter noMatch [t | t@(TS _) <- ts] of
[] -> "where all words are known"
us -> "with the unknown tokens" +++ show us --- needs to be fixed for literals
- terminals = map TS $ cfTokens $ stateCF sg
+ terminals = map TS $ stateGrammarWords sg
noMatch t = all (not . compatTok t) terminals
diff --git a/src/Today.hs b/src/Today.hs
index 4c966d529..7d6ec50b2 100644
--- a/src/Today.hs
+++ b/src/Today.hs
@@ -1 +1 @@
-module Today where today = "Fri Nov 14 14:45:24 CET 2003"
+module Today where today = "Mon Nov 17 17:04:28 CET 2003"