From 82fbc184b6cdb939e5630477d0839786cc19fb5e Mon Sep 17 00:00:00 2001 From: aarne Date: Fri, 31 Mar 2006 16:30:44 +0000 Subject: [PATCH] added some generation facilities --- doc/gf-history.html | 21 ++++ src/GF/Grammar/SGrammar.hs | 169 ++++++++++++++++++++++++++ src/GF/Probabilistic/Probabilistic.hs | 74 ++--------- src/GF/Shell.hs | 4 + src/GF/Shell/HelpFile.hs | 12 +- src/GF/Shell/ShellCommands.hs | 6 +- src/GF/UseGrammar/Generate.hs | 107 +--------------- src/HelpFile | 12 +- 8 files changed, 224 insertions(+), 181 deletions(-) create mode 100644 src/GF/Grammar/SGrammar.hs diff --git a/doc/gf-history.html b/doc/gf-history.html index 1e91321db..d92146cd2 100644 --- a/doc/gf-history.html +++ b/doc/gf-history.html @@ -14,6 +14,27 @@ Changes in functionality since May 17, 2005, release of GF Version 2.2

+31/3 (AR) Added flags and options to some commands, to help generation: +

+ +

+ +


+ +22/12 Release of GF 2.5. + +

+ 16/3 (AR) Added two flag values to pt -transform=X: nodup which excludes terms where a constant is duplicated, and diff --git a/src/GF/Grammar/SGrammar.hs b/src/GF/Grammar/SGrammar.hs new file mode 100644 index 000000000..e0c001b6b --- /dev/null +++ b/src/GF/Grammar/SGrammar.hs @@ -0,0 +1,169 @@ +---------------------------------------------------------------------- +-- | +-- Module : SGrammar +-- Maintainer : AR +-- Stability : (stable) +-- Portability : (portable) +-- +-- +-- A simple format for context-free abstract syntax used e.g. in +-- generation. AR 31\/3\/2006 +-- +-- (c) Aarne Ranta 2004 under GNU GPL +-- +-- Purpose: to generate corpora. We use simple types and don't +-- guarantee the correctness of bindings\/dependences. +----------------------------------------------------------------------------- + +module GF.Grammar.SGrammar where + +import GF.Canon.GFC +import GF.Grammar.LookAbs +import GF.Grammar.PrGrammar +import GF.Grammar.Macros +import GF.Grammar.Values +import GF.Grammar.Grammar +import GF.Infra.Ident (Ident) + +import GF.Data.Operations +import GF.Data.Zipper +import GF.Infra.Option + +import Data.List + +-- (c) Aarne Ranta 2006 under GNU GPL + + +type SGrammar = BinTree SCat [SRule] +type SIdent = String +type SRule = (SFun,SType) +type SType = ([SCat],SCat) +type SCat = SIdent +type SFun = (Double,SIdent) + +allRules gr = concat [rs | (c,rs) <- tree2list gr] + +data STree = + SApp (SFun,[STree]) + | SMeta SCat + | SString String + | SInt Integer + | SFloat Double + deriving (Show,Eq) + +depth :: STree -> Int +depth t = case t of + SApp (_,ts@(_:_)) -> maximum (map depth ts) + 1 + _ -> 1 + +type Probs = BinTree Ident Double + +emptyProbs :: Probs +emptyProbs = emptyBinTree + +prProbs :: Probs -> String +prProbs = unlines . map pr . tree2list where + pr (f,p) = prt f ++ "\t" ++ show p + +------------------------------------------ +-- translate grammar to simpler form and generated trees back + +gr2sgr :: Options -> Probs -> GFCGrammar -> SGrammar +gr2sgr opts probs gr = buildTree [(c,norm (noexp c rs)) | rs@((_,(_,c)):_) <- rules] where + noe = maybe [] (chunks ',') $ getOptVal opts (aOpt "noexpand") + only = maybe [] (chunks ',') $ getOptVal opts (aOpt "doexpand") + un = getOptInt opts (aOpt "atoms") + rules = + prune $ + groupBy (\x y -> scat x == scat y) $ + sortBy (\x y -> compare (scat x) (scat y)) $ + [(trId f, ty') | (f,ty) <- funRulesOf gr, ty' <- trTy ty] + trId (_,f) = let f' = prt f in case lookupTree prt f probs of + Ok p -> (p,f') + _ -> (2.0, f') + trTy ty = case catSkeleton ty of + Ok (mcs,mc) -> [(map trCat mcs, trCat mc)] + _ -> [] + trCat (m,c) = prt c --- + scat (_,(_,c)) = c + + prune rs = maybe rs (\n -> map (onlyAtoms n) rs) $ un + + norm = fillProb + + onlyAtoms n rs = + let (rs1,rs2) = partition atom rs + in take n rs1 ++ rs2 + atom = null . fst . snd + + noexp c rs + | null only = if elem c noe then [((2.0,'?':c),([],c))] else rs + | otherwise = if elem c only then rs else [((2.0,'?':c),([],c))] + +-- for cases where explicit probability is not given (encoded as +-- p > 1) divide the remaining mass by the number of such cases + +fillProb :: [SRule] -> [SRule] +fillProb rs = [((defa p,f),ty) | ((p,f),ty) <- rs] where + defa p = if p > 1.0 then def else p + def = (1 - sum given) / genericLength nope + (nope,given) = partition (> 1.0) [p | ((p,_),_) <- rs] + +-- str2tr :: STree -> Exp +str2tr t = case t of + SApp ((_,'?':c),[]) -> mkMeta 0 -- from noexpand=c + SApp ((_,f),ts) -> mkApp (trId f) (map str2tr ts) + SMeta _ -> mkMeta 0 + SString s -> K s + SInt i -> EInt i + SFloat i -> EFloat i + where + trId = cn . zIdent + +-- tr2str :: Tree -> STree +tr2str (Tr (N (_,at,val,_,_),ts)) = case (at,val) of + (AtC (_,f), _) -> SApp ((2.0,prt_ f),map tr2str ts) + (AtM _, v) -> SMeta (catOf v) + (AtL s, _) -> SString s + (AtI i, _) -> SInt i + (AtF i, _) -> SFloat i + _ -> SMeta "FAILED_TO_GENERATE" ---- err monad! + where + catOf v = case v of + VApp w _ -> catOf w + VCn (_,c) -> prt_ c + _ -> "FAILED_TO_GENERATE_FROM_META" + + +------------------------------------------ +-- to test + +prSTree t = case t of + SApp ((_,f),ts) -> f ++ concat (map pr1 ts) + SMeta c -> '?':c + SString s -> prQuotedString s + SInt i -> show i + SFloat i -> show i + where + pr1 t@(SApp (_,ts)) = ' ' : (if null ts then id else prParenth) (prSTree t) + pr1 t = prSTree t + +pSRule :: String -> SRule +pSRule s = case words s of + f : _ : cs -> ((2.0,f),(init cs', last cs')) + where cs' = [cs !! i | i <- [0,2..length cs - 1]] + _ -> error $ "not a rule" +++ s + +exSgr = map pSRule [ + "Pred : NP -> VP -> S" + ,"Compl : TV -> NP -> VP" + ,"PredVV : VV -> VP -> VP" + ,"DefCN : CN -> NP" + ,"ModCN : AP -> CN -> CN" + ,"john : NP" + ,"walk : VP" + ,"love : TV" + ,"try : VV" + ,"girl : CN" + ,"big : AP" + ] diff --git a/src/GF/Probabilistic/Probabilistic.hs b/src/GF/Probabilistic/Probabilistic.hs index 935175ed9..25258db52 100644 --- a/src/GF/Probabilistic/Probabilistic.hs +++ b/src/GF/Probabilistic/Probabilistic.hs @@ -34,7 +34,8 @@ import GF.Grammar.LookAbs import GF.Grammar.PrGrammar import GF.Grammar.Macros import GF.Grammar.Values -import GF.Grammar.Grammar -- (Cat,EInt,K) +import GF.Grammar.Grammar +import GF.Grammar.SGrammar import GF.Infra.Ident import GF.Data.Zipper @@ -54,13 +55,13 @@ timeout = 99 generateRandomTreesProb :: Options -> StdGen -> GFCGrammar -> Probs -> Cat -> [Exp] generateRandomTreesProb opts gen gr probs cat = map str2tr $ randomTrees gen gr' cat' where - gr' = gr2sgr gr probs + gr' = gr2sgr opts probs gr cat' = prt $ snd cat -- | check that probabilities attached to a grammar make sense checkGrammarProbs :: GFCGrammar -> Probs -> Err Probs checkGrammarProbs gr probs = - err Bad (return . gr2probs) $ checkSGrammar $ gr2sgr gr probs where + err Bad (return . gr2probs) $ checkSGrammar $ gr2sgr noOptions probs gr where gr2probs sgr = buildTree [(zIdent f,p) | (_,rs) <- tree2list sgr, ((p,f),_) <- rs] -- | compute the probability of a given tree @@ -95,61 +96,9 @@ pProb s = case words s of readD :: String -> Double readD = read -type Probs = BinTree Ident Double - -emptyProbs :: Probs -emptyProbs = emptyBinTree - -prProbs :: Probs -> String -prProbs = unlines . map pr . tree2list where - pr (f,p) = prt f ++ "\t" ++ show p - ------------------------------------------ -- translate grammar to simpler form and generated trees back -gr2sgr :: GFCGrammar -> Probs -> SGrammar -gr2sgr gr probs = buildTree [(c,fillProb rs) | rs@((_,(_,c)):_) <- rules] where - rules = - groupBy (\x y -> scat x == scat y) $ - sortBy (\x y -> compare (scat x) (scat y)) - [(trId f, ty') | (f,ty) <- funRulesOf gr, ty' <- trTy ty] - trId (_,f) = let f' = prt f in case lookupTree prt f probs of - Ok p -> (p,f') - _ -> (2.0, f') - trTy ty = case catSkeleton ty of - Ok (mcs,mc) -> [(map trCat mcs, trCat mc)] - _ -> [] - trCat (m,c) = prt c --- - scat (_,(_,c)) = c - -str2tr :: STree -> Exp -str2tr t = case t of - SApp ((_,f),ts) -> mkApp (trId f) (map str2tr ts) - SMeta _ -> mkMeta 0 - SString s -> K s - SInt i -> EInt i - SFloat i -> EFloat i - where - trId = cn . zIdent - -type SGrammar = BinTree SCat [SRule] -type SIdent = String -type SRule = (SFun,SType) -type SType = ([SCat],SCat) -type SCat = SIdent -type SFun = (Double,SIdent) - -allRules gr = concat [rs | (c,rs) <- tree2list gr] - -data STree = - SApp (SFun,[STree]) --- | SAppN (SIdent,[STree]) -- no probability given - | SMeta SCat - | SString String - | SInt Integer - | SFloat Double - deriving (Show,Eq) - probTree :: STree -> Double probTree t = case t of SApp ((p,_),ts) -> p * product (map probTree ts) @@ -204,16 +153,8 @@ checkSGrammar = mapMTree chCat where Bad $ "illegal probability sum " ++ show s ++ " in " ++ c _ -> return (c,rs) --- for cases where explicit probability is not given (encoded as --- p > 1) divide the remaining mass by the number of such cases - -fillProb :: [SRule] -> [SRule] -fillProb rs = [((defa p,f),ty) | ((p,f),ty) <- rs] where - defa p = if p > 1.0 then def else p - def = (1 - sum given) / genericLength nope - (nope,given) = partition (> 1.0) [p | ((p,_),_) <- rs] - +{- ------------------------------------------ -- to test outside GF @@ -246,7 +187,7 @@ pSRule s = case words s of where cs' = [cs !! i | i <- [0,2..length cs - 1]] _ -> error $ "not a rule" +++ s -exSgr = mkSGrammar $ map pSRule [ +expSgr = mkSGrammar $ map pSRule [ "0.8 a : A" ,"0.2 b : A" ,"0.2 n : A -> S -> S" @@ -257,3 +198,6 @@ ex1 :: IO () ex1 = do g <- newStdGen mapM_ (putStrLn . prSTree) $ randomTrees g exSgr "S" + +-} + diff --git a/src/GF/Shell.hs b/src/GF/Shell.hs index 3af343bb2..bdbf6d62c 100644 --- a/src/GF/Shell.hs +++ b/src/GF/Shell.hs @@ -249,6 +249,10 @@ execC co@(comm, opts0) sa@(sh@(st,(h,_,_,_)),a) = checkOptions st co >> case com let p = optParseArgErrMsg opts gro x case p of Ok (ts,msg) + | oElem (iOpt "fail") opts && null ts -> do + putStrLnFlush ("#FAIL:" +++ x) >> changeArg (const $ ATrms ts) sa + | oElem (iOpt "ambiguous") opts && length ts > 1 -> do + putStrLnFlush ("#AMBIGUOUS:" +++ x) >> changeArg (const $ ATrms ts) sa | oElem (iOpt "prob") opts -> do let probs = stateProbs gro let tps = rankByScore [(t,computeProbTree probs t) | t <- ts] diff --git a/src/GF/Shell/HelpFile.hs b/src/GF/Shell/HelpFile.hs index dfb01da08..57692b493 100644 --- a/src/GF/Shell/HelpFile.hs +++ b/src/GF/Shell/HelpFile.hs @@ -199,10 +199,12 @@ txtHelpFile = "\n grammar (overridden by the -lang flag), in the category S (overridden" ++ "\n by the -cat flag)." ++ "\n options for batch input:" ++ - "\n -lines parse each line of input separately, ignoring empty lines" ++ - "\n -all as -lines, but also parse empty lines" ++ - "\n -prob rank results by probability" ++ - "\n -cut stop after first lexing result leading to parser success" ++ + "\n -lines parse each line of input separately, ignoring empty lines" ++ + "\n -all as -lines, but also parse empty lines" ++ + "\n -prob rank results by probability" ++ + "\n -cut stop after first lexing result leading to parser success" ++ + "\n -fail show strings whose parse fails prefixed by #FAIL" ++ + "\n -ambiguous show strings that have more than one parse prefixed by #AMBIGUOUS" ++ "\n options for selecting parsing method:" ++ "\n (default)parse using an overgenerating CFG" ++ "\n -cfg parse using a much less overgenerating CFG" ++ @@ -344,11 +346,13 @@ txtHelpFile = "\n -lang use the abstract syntax of this grammar" ++ "\n -number generate (at most) this number of trees" ++ "\n -noexpand don't expand these categories (comma-separated, e.g. -noexpand=V,CN)" ++ + "\n -doexpand only expand these categories (comma-separated, e.g. -doexpand=V,CN)" ++ "\n examples:" ++ "\n gt -depth=10 -cat=NP -- generate all NP's to depth 10 " ++ "\n gt (PredVP ? (NegVG ?)) -- generate all trees of this form" ++ "\n gt -cat=S -tr | l -- generate and linearize" ++ "\n gt -noexpand=NP | l -mark=metacat -- the only NP is meta, linearized \"?0 +NP\"" ++ + "\n gt | l | p -lines -ambiguous | grep \"#AMBIGUOUS\" -- show ambiguous strings" ++ "\n" ++ "\nma, morphologically_analyse: ma String" ++ "\n Runs morphological analysis on each word in String and displays" ++ diff --git a/src/GF/Shell/ShellCommands.hs b/src/GF/Shell/ShellCommands.hs index d6209cffa..b93335416 100644 --- a/src/GF/Shell/ShellCommands.hs +++ b/src/GF/Shell/ShellCommands.hs @@ -180,11 +180,11 @@ optionsOfCommand co = case co of CTransformGrammar _ -> flags "printer" CConvertLatex _ -> none CLinearize _ -> both "utf8 table struct record all multi" "lang number unlexer mark" - CParse -> both "cut new newer cfg mcfg n ign raw v lines all prob" + CParse -> both "ambiguous fail cut new newer cfg mcfg n ign raw v lines all prob" "cat lang lexer parser number rawtrees" CTranslate _ _ -> opts "cat lexer parser" - CGenerateRandom -> both "cf prob" "cat lang number depth" - CGenerateTrees -> both "metas" "atoms depth alts cat lang number noexpand" + CGenerateRandom -> both "cf prob" "cat lang number depth atoms noexpand doexpand" + CGenerateTrees -> both "metas" "atoms depth alts cat lang number noexpand doexpand" CPutTerm -> flags "transform number" CTreeBank -> opts "c xml trees" CLookupTreebank -> both "assocs raw strings trees" "treebank" diff --git a/src/GF/UseGrammar/Generate.hs b/src/GF/UseGrammar/Generate.hs index c96bb5e40..d368056d4 100644 --- a/src/GF/UseGrammar/Generate.hs +++ b/src/GF/UseGrammar/Generate.hs @@ -25,7 +25,7 @@ import GF.Grammar.PrGrammar import GF.Grammar.Macros import GF.Grammar.Values import GF.Grammar.Grammar (Cat) - +import GF.Grammar.SGrammar import GF.Data.Operations import GF.Data.Zipper import GF.Infra.Option @@ -43,59 +43,11 @@ import Data.List generateTrees :: Options -> GFCGrammar -> Cat -> Int -> Maybe Int -> Maybe Tree -> [Exp] generateTrees opts gr cat n mn mt = map str2tr $ generate gr' ifm cat' n mn mt' where - gr' = gr2sgr noe ats gr + gr' = gr2sgr opts emptyProbs gr cat' = prt $ snd cat mt' = maybe Nothing (return . tr2str) mt ifm = oElem withMetas opts - ats = getOptInt opts (aOpt "atoms") - noe = maybe [] (chunks ',') $ getOptVal opts (aOpt "noexpand") ------------------------------------------- --- translate grammar to simpler form and generated trees back - -gr2sgr :: [SIdent] -> Maybe Int -> GFCGrammar -> SGrammar -gr2sgr noe un gr = buildTree [(c,noexp c rs) | rs@((_,(_,c)):_) <- prune rules] where - rules = - groupBy (\x y -> scat x == scat y) $ - sortBy (\x y -> compare (scat x) (scat y)) - [(trId f, ty') | (f,ty) <- funRulesOf gr, ty' <- trTy ty] - trId = prt . snd - trTy ty = case catSkeleton ty of - Ok (mcs,mc) -> [(map trCat mcs, trCat mc)] - _ -> [] - trCat (m,c) = prt c --- - scat (_,(_,c)) = c - - prune rs = maybe rs (\n -> map (onlyAtoms n) rs) $ un - onlyAtoms n rs = - let (rs1,rs2) = partition atom rs - in take n rs1 ++ rs2 - atom = null . fst . snd - - noexp c rs = if elem c noe then [('?':c,([],c))] else rs - --- str2tr :: STree -> Exp -str2tr t = case t of - SApp ('?':c,[]) -> mkMeta 0 -- from noexpand=c - SApp (f,ts) -> mkApp (trId f) (map str2tr ts) - SMeta _ -> mkMeta 0 ----- SString s -> K s - where - trId = cn . zIdent - --- tr2str :: Tree -> STree -tr2str (Tr (N (_,at,val,_,_),ts)) = case (at,val) of - (AtC (_,f), _) -> SApp (prt_ f,map tr2str ts) - (AtM _, v) -> SMeta (catOf v) - (AtL s, _) -> SString s - (AtI i, _) -> SInt i - (AtF i, _) -> SFloat i - _ -> SMeta "FAILED_TO_GENERATE" ---- err monad! - where - catOf v = case v of - VApp w _ -> catOf w - VCn (_,c) -> prt_ c - _ -> "FAILED_TO_GENERATE_FROM_META" ------------------------------------------ -- do the main thing with a simpler data structure @@ -139,58 +91,3 @@ generate gr ifm cat i mn mt = case mt of SApp (f,ts) -> [SApp (f,ts') | ts' <- combinations (map genM ts)] SMeta k -> gen k _ -> [t] - -type SGrammar = BinTree SCat [SRule] -type SIdent = String -type SRule = (SFun,SType) -type SType = ([SCat],SCat) -type SCat = SIdent -type SFun = SIdent - -allRules gr = concat [rs | (c,rs) <- tree2list gr] - -data STree = - SApp (SFun,[STree]) - | SMeta SCat - | SString String - | SInt Integer - | SFloat Double - deriving (Show,Eq) - -depth :: STree -> Int -depth t = case t of - SApp (_,ts@(_:_)) -> maximum (map depth ts) + 1 - _ -> 1 - ------------------------------------------- --- to test - -prSTree t = case t of - SApp (f,ts) -> f ++ concat (map pr1 ts) - SMeta c -> '?':c - SString s -> prQuotedString s - SInt i -> show i - SFloat i -> show i - where - pr1 t@(SApp (_,ts)) = ' ' : (if null ts then id else prParenth) (prSTree t) - pr1 t = prSTree t - -pSRule :: String -> SRule -pSRule s = case words s of - f : _ : cs -> (f,(init cs', last cs')) - where cs' = [cs !! i | i <- [0,2..length cs - 1]] - _ -> error $ "not a rule" +++ s - -exSgr = map pSRule [ - "Pred : NP -> VP -> S" - ,"Compl : TV -> NP -> VP" - ,"PredVV : VV -> VP -> VP" - ,"DefCN : CN -> NP" - ,"ModCN : AP -> CN -> CN" - ,"john : NP" - ,"walk : VP" - ,"love : TV" - ,"try : VV" - ,"girl : CN" - ,"big : AP" - ] diff --git a/src/HelpFile b/src/HelpFile index 1070a800e..97ce04186 100644 --- a/src/HelpFile +++ b/src/HelpFile @@ -170,10 +170,12 @@ p, parse: p String grammar (overridden by the -lang flag), in the category S (overridden by the -cat flag). options for batch input: - -lines parse each line of input separately, ignoring empty lines - -all as -lines, but also parse empty lines - -prob rank results by probability - -cut stop after first lexing result leading to parser success + -lines parse each line of input separately, ignoring empty lines + -all as -lines, but also parse empty lines + -prob rank results by probability + -cut stop after first lexing result leading to parser success + -fail show strings whose parse fails prefixed by #FAIL + -ambiguous show strings that have more than one parse prefixed by #AMBIGUOUS options for selecting parsing method: (default)parse using an overgenerating CFG -cfg parse using a much less overgenerating CFG @@ -315,11 +317,13 @@ gt, generate_trees: gt Tree? -lang use the abstract syntax of this grammar -number generate (at most) this number of trees -noexpand don't expand these categories (comma-separated, e.g. -noexpand=V,CN) + -doexpand only expand these categories (comma-separated, e.g. -doexpand=V,CN) examples: gt -depth=10 -cat=NP -- generate all NP's to depth 10 gt (PredVP ? (NegVG ?)) -- generate all trees of this form gt -cat=S -tr | l -- generate and linearize gt -noexpand=NP | l -mark=metacat -- the only NP is meta, linearized "?0 +NP" + gt | l | p -lines -ambiguous | grep "#AMBIGUOUS" -- show ambiguous strings ma, morphologically_analyse: ma String Runs morphological analysis on each word in String and displays