diff --git a/doc/gf2-highlights.html b/doc/gf2-highlights.html index 57cd45044..036ca55f3 100644 --- a/doc/gf2-highlights.html +++ b/doc/gf2-highlights.html @@ -45,6 +45,7 @@ Highlights, preliminary version
  • New syntax alternatives for local definitions: let without braces and where.
  • Pattern variables can be used on lhs's of oper definitions. +
  • New Unicode transliterations (by Harad Hammarström).

    New parser (forthcoming)

    @@ -62,7 +63,8 @@ Highlights, preliminary version
  • Haskell source code organized into subdirectories.
  • BNF Converter used for defining the languages GF and GFC, which also give reliable LaTeX documentation. - +
  • Lexican rules sorted out by option -cflexer for efficient + parsing with large lexica. diff --git a/grammars/resource/swedish/SyntaxSwe.gf b/grammars/resource/swedish/SyntaxSwe.gf index c26a07bc8..9b9897ee1 100644 --- a/grammars/resource/swedish/SyntaxSwe.gf +++ b/grammars/resource/swedish/SyntaxSwe.gf @@ -507,7 +507,7 @@ oper DitransVerb = TransVerb ** {s3 : Preposition} ; mkDitransVerb : Verb -> Preposition -> Preposition -> DitransVerb = \v,p1,p2 -> - v ** {s3 = p1 ; s3 = p2} ; + v ** {s2 = p1 ; s3 = p2} ; complDitransVerb : Bool -> DitransVerb -> NounPhrase -> NounPhrase -> VerbPhrase = @@ -538,7 +538,7 @@ oper s3 = \\g,n => spelar.s3 ! g ! n ++ (if_then_else Str postp bra.s []) } ; - advAdjPhrase : Adverb -> AdjPhrase -> AdjPhrase = \mycket, dyr -> + advAdjPhrase : SS -> AdjPhrase -> AdjPhrase = \mycket, dyr -> {s = \\a,c => mycket.s ++ dyr.s ! a ! c ; p = dyr.p } ; @@ -903,7 +903,7 @@ oper -- This class covers adverbials such as "annars", "därför", which are prefixed -- to a sentence to form a phrase. - advSentence : Adverb -> Sentence -> Utterance = \annars,soverhan -> + advSentence : SS -> Sentence -> Utterance = \annars,soverhan -> ss (annars.s ++ soverhan.s ! Inv ++ ".") ; diff --git a/src/GF/CF/CF.hs b/src/GF/CF/CF.hs index 0cff68b97..7c0013548 100644 --- a/src/GF/CF/CF.hs +++ b/src/GF/CF/CF.hs @@ -15,8 +15,9 @@ import Char (isUpper, isLower, toUpper, toLower) -- abstract type CF. -- Invariant: each category has all its rules grouped with it -- also: the list is never empty (the category is just missing then) -newtype CF = CF ([(CFCat,[CFRule])], CFPredef) +newtype CF = CF ([CFRuleGroup], CFPredef) type CFRule = (CFFun, (CFCat, [CFItem])) +type CFRuleGroup = (CFCat,[CFRule]) -- CFPredef is a hack for variable symbols and literals; normally = const [] data CFItem = CFTerm RegExp | CFNonterm CFCat deriving (Eq, Ord,Show) diff --git a/src/GF/CF/CFIdent.hs b/src/GF/CF/CFIdent.hs index 02343bfb7..99ab711e4 100644 --- a/src/GF/CF/CFIdent.hs +++ b/src/GF/CF/CFIdent.hs @@ -3,7 +3,9 @@ module CFIdent where import Operations import GFC import Ident +import Values (cPredefAbs) import AbsGFC +import Macros (ident2label) import PrGrammar import Str import Char (toLower, toUpper) @@ -48,6 +50,10 @@ newtype CFFun = CFFun (Atom, Profile) deriving (Eq,Show) type Profile = [([[Int]],[Int])] +wordsCFTok :: CFTok -> [String] +wordsCFTok t = case t of + TC (c:cs) -> [c':cs | c' <- [toUpper c, toLower c]] + _ -> [prCFTok t] -- the following functions should be used instead of constructors @@ -68,6 +74,9 @@ stringCFFun = mkCFFun . AS intCFFun :: Int -> CFFun intCFFun = mkCFFun . AI . toInteger +dummyCFFun :: CFFun +dummyCFFun = varCFFun $ identC "_" --- used in lexer-by-need rules + cfFun2String :: CFFun -> String cfFun2String (CFFun (f,_)) = prt f @@ -105,8 +114,8 @@ cat2CFCat :: (Ident,Ident) -> CFCat cat2CFCat = uncurry idents2CFCat ---- literals -cfCatString = string2CFCat "Predef" "String" -cfCatInt = string2CFCat "Predef" "Int" +cfCatString = string2CFCat (prt cPredefAbs) "String" +cfCatInt = string2CFCat (prt cPredefAbs) "Int" @@ -122,6 +131,9 @@ moduleOfCFCat (CFCat (CIQ m _, _)) = m cfCat2Cat :: CFCat -> (Ident,Ident) cfCat2Cat (CFCat (CIQ m c,_)) = (m,c) +lexCFCat :: CFCat -> CFCat +lexCFCat cat = ident2CFCat (uncurry CIQ (cfCat2Cat cat)) (identC "*") + -- to construct CF tokens string2CFTok :: String -> CFTok diff --git a/src/GF/CF/CanonToCF.hs b/src/GF/CF/CanonToCF.hs index 6651b0100..0950d6244 100644 --- a/src/GF/CF/CanonToCF.hs +++ b/src/GF/CF/CanonToCF.hs @@ -5,12 +5,14 @@ import Option import Ident import AbsGFC import GFC +import Values (isPredefCat,cPredefAbs) import PrGrammar import CMacros import qualified Modules as M import CF import CFIdent -import List (nub) +import Morphology +import List (nub,partition) import Monad -- AR 27/1/2000 -- 3/12/2001 -- 8/6/2003 @@ -28,8 +30,8 @@ canon2cf opts gr c = do rules0 <- liftM concat $ mapM (uncurry (cnc2cfCond opts)) mms let rules = filter (not . isCircularCF) rules0 ---- temporarily here let grules = groupCFRules rules - let predef = mkCFPredef $ map fst grules - return $ CF (grules, predef) + let predef = mkCFPredef opts grules + return $ CF predef cnc2cfCond :: Options -> Ident -> [(Ident,Info)] -> Err [CFRule] cnc2cfCond opts m gr = @@ -138,16 +140,40 @@ term2CFItems m t = errIn "forming cf items" $ case t of tryMkCFTerm itss = return itss extrR arg lab = case (arg,lab) of - (Arg (A cat pos), l@(L _)) -> return [[PNonterm (CIQ m cat) pos l True]] - (Arg (A cat pos), l@(LV _)) -> return [[PNonterm (CIQ m cat) pos l False]] - (Arg (AB cat pos b), l@(L _)) -> return [[PNonterm (CIQ m cat) pos l True]] - (Arg (AB cat pos b), l@(LV _)) -> return [[PNonterm (CIQ m cat) pos l False]] + (Arg (A cat pos), l@(L _)) -> return [[PNonterm (cIQ cat) pos l True]] + (Arg (A cat pos), l@(LV _)) -> return [[PNonterm (cIQ cat) pos l False]] + (Arg (AB cat pos b), l@(L _)) -> return [[PNonterm (cIQ cat) pos l True]] + (Arg (AB cat pos b), l@(LV _)) -> return [[PNonterm (cIQ cat) pos l False]] ---- ?? _ -> prtBad "cannot extract record field from" arg + cIQ c = if isPredefCat c then CIQ cPredefAbs c else CIQ m c -mkCFPredef :: [CFCat] -> CFPredef -mkCFPredef cats s = - [(cat, metaCFFun) | TM _ _ <- [s], cat <- cats] ++ - [(cat, varCFFun x) | TV x <- [s], cat <- cats] ++ - [(cfCatString, stringCFFun t) | TL t <- [s]] ++ - [(cfCatInt, intCFFun t) | TI t <- [s]] +mkCFPredef :: Options -> [CFRuleGroup] -> ([CFRuleGroup],CFPredef) +mkCFPredef opts rules = (ruls, \s -> preds0 s ++ look s) where + (ruls,preds) = if oElem lexerByNeed opts -- option -cflexer + then predefLexer rules + else (rules,NT) + preds0 s = + [(cat, metaCFFun) | TM _ _ <- [s], cat <- cats] ++ + [(cat, varCFFun x) | TV x <- [s], cat <- cats] ++ + [(cfCatString, stringCFFun t) | TL t <- [s]] ++ + [(cfCatInt, intCFFun t) | TI t <- [s]] + cats = map fst rules + look s = errVal [] $ liftM concat $ + mapM (flip justLookupTree preds . tS) $ wordsCFTok s --- for TC tokens + +--- TODO: use trie instead of bintree; integrate with morphology +predefLexer :: [CFRuleGroup] -> ([CFRuleGroup],BinTree (CFTok,[(CFCat, CFFun)])) +predefLexer groups = (reverse ruls, sorted2tree $ sortAssocs preds) where + (ruls,preds) = foldr mkOne ([],[]) groups + mkOne group@(cat,rules) (rs,ps) = (rule:rs,pre ++ ps) where + (rule,pre) = case partition isLexical rules of + ([],_) -> (group,[]) + (ls,rest) -> ((cat,rest), concatMap mkLexRule ls) --- useLexRule cat : rest + isLexical (f,(c,its)) = case its of + [CFTerm (RegAlts ws)] -> True + _ -> False +-- useLexRule cat = (dummyCFFun,(cat,[CFNonterm (lexCFCat cat)])) -- not needed + mkLexRule r = case r of + (fun,(cat,[CFTerm (RegAlts ws)])) -> [(tS w, (cat,fun)) | w <- ws] + _ -> [] diff --git a/src/GF/CF/Profile.hs b/src/GF/CF/Profile.hs index 6dbb5f85a..1b821d53a 100644 --- a/src/GF/CF/Profile.hs +++ b/src/GF/CF/Profile.hs @@ -33,6 +33,7 @@ type BindVs = [[I.Ident]] -- (2) term2trm: restore Bindings from Binds tree2term :: CFTree -> Err ITerm +-- tree2term (CFTree (f,(_,[t]))) | f == dummyCFFun = tree2term t -- not used tree2term (CFTree (cff@(CFFun (fun,pro)), (_,trees))) = case fun of AM _ -> return IMeta _ -> do diff --git a/src/GF/Compile/Optimize.hs b/src/GF/Compile/Optimize.hs index 77cbcdbdf..ceec2c1b6 100644 --- a/src/GF/Compile/Optimize.hs +++ b/src/GF/Compile/Optimize.hs @@ -191,4 +191,5 @@ evalPrintname gr c ppr lin = C x y -> C (oneBranch x) (oneBranch y) S x _ -> oneBranch x P x _ -> oneBranch x + Alts (d,_) -> oneBranch d _ -> t diff --git a/src/GF/Compile/ShellState.hs b/src/GF/Compile/ShellState.hs index 4e84bd248..17ea2cc9a 100644 --- a/src/GF/Compile/ShellState.hs +++ b/src/GF/Compile/ShellState.hs @@ -88,6 +88,7 @@ stateGrammarST = grammar stateCF = cf stateMorpho = morpho stateOptions = loptions +stateGrammarWords = map fst . tree2list . stateMorpho cncModuleIdST = stateGrammarST diff --git a/src/GF/Data/Str.hs b/src/GF/Data/Str.hs index 0c1ecf7c9..6433e5355 100644 --- a/src/GF/Data/Str.hs +++ b/src/GF/Data/Str.hs @@ -32,12 +32,21 @@ type Ss = [String] matchPrefix :: Ss -> [(Ss,[String])] -> [String] -> Ss matchPrefix s vs t = - head ([u | (u,as) <- vs, any (\c -> isPrefixOf c (concat t)) as] ++ [s]) + head $ [u | + (u,as) <- vs, + any (\c -> isPrefixOf c (concat (unmarkup t))) as + ] ++ [s] matchSuffix :: String -> Ss -> [(Ss,[String])] -> Ss matchSuffix t s vs = head ([u | (u,as) <- vs, any (\c -> isSuffixOf c t) as] ++ [s]) +unmarkup :: [String] -> [String] +unmarkup = filter (not . isXMLtag) where + isXMLtag s = case s of + '<':cs@(_:_) -> last cs == '>' + _ -> False + str2strings :: Str -> Ss str2strings (Str st) = alls st where alls st = case st of diff --git a/src/GF/Grammar/Values.hs b/src/GF/Grammar/Values.hs index 4d787488d..015f9ffb3 100644 --- a/src/GF/Grammar/Values.hs +++ b/src/GF/Grammar/Values.hs @@ -44,6 +44,8 @@ cPredefAbs = identC "PredefAbs" cInt = identC "Int" cString = identC "String" +isPredefCat c = elem c [cInt,cString] + eType :: Exp eType = Sort "Type" diff --git a/src/GF/Infra/Option.hs b/src/GF/Infra/Option.hs index 3bdf4dc0f..ac2f46b7e 100644 --- a/src/GF/Infra/Option.hs +++ b/src/GF/Infra/Option.hs @@ -125,6 +125,7 @@ newParser = iOpt "new" noCF = iOpt "nocf" checkCirc = iOpt "nocirc" noCheckCirc = iOpt "nocheckcirc" +lexerByNeed = iOpt "cflexer" -- linearization allLin = iOpt "all" diff --git a/src/GF/UseGrammar/Custom.hs b/src/GF/UseGrammar/Custom.hs index 4d5eb8122..10446413a 100644 --- a/src/GF/UseGrammar/Custom.hs +++ b/src/GF/UseGrammar/Custom.hs @@ -148,6 +148,7 @@ customGrammarPrinter = ,(strCI "lbnf", prLBNF . stateCF) ,(strCI "morpho", prMorpho . stateMorpho) ,(strCI "opts", prOpts . stateOptions) + ,(strCI "words", unwords . stateGrammarWords) {- ---- (strCI "gf", prt . st2grammar . stateGrammarST) -- DEFAULT ,(strCI "canon", showCanon "Lang" . stateGrammarST) diff --git a/src/GF/UseGrammar/Parsing.hs b/src/GF/UseGrammar/Parsing.hs index ab379cec3..b5b587c91 100644 --- a/src/GF/UseGrammar/Parsing.hs +++ b/src/GF/UseGrammar/Parsing.hs @@ -92,10 +92,10 @@ tokens2trms opts sg cn parser as = do verb = oElem beVerbose opts forgive = oElem forgiveParse opts - unknown ts = case filter noMatch ts of + unknown ts = case filter noMatch [t | t@(TS _) <- ts] of [] -> "where all words are known" us -> "with the unknown tokens" +++ show us --- needs to be fixed for literals - terminals = map TS $ cfTokens $ stateCF sg + terminals = map TS $ stateGrammarWords sg noMatch t = all (not . compatTok t) terminals diff --git a/src/Today.hs b/src/Today.hs index 4c966d529..7d6ec50b2 100644 --- a/src/Today.hs +++ b/src/Today.hs @@ -1 +1 @@ -module Today where today = "Fri Nov 14 14:45:24 CET 2003" +module Today where today = "Mon Nov 17 17:04:28 CET 2003"