added some generation facilities

2026-07-14 01:22:46 -06:00 · 2006-03-31 16:30:44 +00:00
parent cb046fea18
commit 82fbc184b6
8 changed files with 224 additions and 181 deletions
@@ -14,6 +14,27 @@ Changes in functionality since May 17, 2005, release of GF Version 2.2

 <p>

+31/3 (AR) Added flags and options to some commands, to help generation:
+<ul>
+<li> <tt>gt -noexpand=NP,V,TV</tt> does not expand these categories,
+but only generates metavariables for them.
+<li> <tt>gt -doexpand=NP,V,TV</tt> only expands these categories,
+and generates metavariables for others.
+<li> <tt>gr -cf</tt> has the same flags.
+<li> <tt>l -mark=metacat</tt> marks the metavariables with their categories.
+<li> <tt>p -fail</tt> marks with <tt>#FAIL</tt> strings that have no parse.
+<li> <tt>p -ambiguous</tt> marks as <tt>#AMBIGUOUS</tt>
+     strings that have more than one parse.
+</ul>
+
+<p>
+
+<hr>
+
+22/12 <b>Release of GF 2.5</b>.
+
+<p>
+
 16/3 (AR) Added two flag values to <tt>pt -transform=X</tt>:
 <tt>nodup</tt> which excludes terms where a constant is duplicated,
 and
@@ -0,0 +1,169 @@
+----------------------------------------------------------------------
+-- |
+-- Module      : SGrammar
+-- Maintainer  : AR
+-- Stability   : (stable)
+-- Portability : (portable)
+--
+--
+-- A simple format for context-free abstract syntax used e.g. in
+-- generation. AR 31\/3\/2006
+--
+-- (c) Aarne Ranta 2004 under GNU GPL
+--
+-- Purpose: to generate corpora. We use simple types and don't
+-- guarantee the correctness of bindings\/dependences.
+-----------------------------------------------------------------------------
+
+module GF.Grammar.SGrammar where
+
+import GF.Canon.GFC
+import GF.Grammar.LookAbs
+import GF.Grammar.PrGrammar
+import GF.Grammar.Macros
+import GF.Grammar.Values
+import GF.Grammar.Grammar
+import GF.Infra.Ident (Ident)
+
+import GF.Data.Operations
+import GF.Data.Zipper
+import GF.Infra.Option
+
+import Data.List
+
+-- (c) Aarne Ranta 2006 under GNU GPL
+
+
+type SGrammar = BinTree SCat [SRule]
+type SIdent = String
+type SRule = (SFun,SType)
+type SType = ([SCat],SCat)
+type SCat = SIdent
+type SFun = (Double,SIdent)
+
+allRules gr = concat [rs  | (c,rs) <- tree2list gr]
+
+data STree = 
+    SApp (SFun,[STree]) 
+  | SMeta SCat
+  | SString String
+  | SInt Integer
+  | SFloat Double
+   deriving (Show,Eq)
+
+depth :: STree -> Int
+depth t = case t of
+  SApp (_,ts@(_:_)) -> maximum (map depth ts) + 1
+  _ -> 1
+
+type Probs = BinTree Ident Double
+
+emptyProbs :: Probs
+emptyProbs = emptyBinTree
+
+prProbs :: Probs -> String
+prProbs = unlines . map pr . tree2list where
+  pr (f,p) = prt f ++ "\t" ++ show p
+
+------------------------------------------
+-- translate grammar to simpler form and generated trees back
+
+gr2sgr :: Options -> Probs -> GFCGrammar -> SGrammar
+gr2sgr opts probs gr = buildTree [(c,norm (noexp c rs)) | rs@((_,(_,c)):_) <- rules] where
+  noe = maybe [] (chunks ',') $ getOptVal opts (aOpt "noexpand")
+  only = maybe [] (chunks ',') $ getOptVal opts (aOpt "doexpand")
+  un  = getOptInt opts (aOpt "atoms")
+  rules =
+     prune $
+       groupBy (\x y -> scat x == scat y) $
+        sortBy (\x y -> compare (scat x) (scat y)) $
+          [(trId f, ty') | (f,ty) <- funRulesOf gr, ty' <- trTy ty]
+  trId (_,f) = let f' = prt f in case lookupTree prt f probs of
+    Ok p -> (p,f')
+    _ -> (2.0, f')
+  trTy ty = case catSkeleton ty of
+    Ok (mcs,mc) -> [(map trCat mcs, trCat mc)]
+    _ -> []
+  trCat (m,c) = prt c ---
+  scat (_,(_,c)) = c
+
+  prune rs = maybe rs (\n -> map (onlyAtoms n) rs) $ un
+
+  norm = fillProb
+
+  onlyAtoms n rs = 
+    let (rs1,rs2) = partition atom rs 
+    in take n rs1 ++ rs2
+  atom = null . fst . snd
+
+  noexp c rs 
+    | null only = if elem c noe then [((2.0,'?':c),([],c))] else rs
+    | otherwise = if elem c only then rs else [((2.0,'?':c),([],c))]
+
+-- for cases where explicit probability is not given (encoded as
+-- p > 1) divide the remaining mass by the number of such cases
+
+fillProb :: [SRule] -> [SRule]
+fillProb rs = [((defa p,f),ty) | ((p,f),ty) <- rs] where
+  defa p = if p > 1.0 then def else p
+  def = (1 - sum given) / genericLength nope 
+  (nope,given) = partition (> 1.0) [p | ((p,_),_) <- rs]
+
+-- str2tr :: STree -> Exp
+str2tr t = case t of
+  SApp ((_,'?':c),[]) -> mkMeta 0 -- from noexpand=c
+  SApp ((_,f),ts) -> mkApp (trId f) (map str2tr ts) 
+  SMeta _     -> mkMeta 0
+  SString s   -> K s
+  SInt i      -> EInt i
+  SFloat i    -> EFloat i
+ where
+   trId = cn . zIdent
+
+-- tr2str :: Tree -> STree
+tr2str (Tr (N (_,at,val,_,_),ts)) = case (at,val) of
+  (AtC (_,f), _)         -> SApp ((2.0,prt_ f),map tr2str ts)
+  (AtM _,     v)         -> SMeta (catOf v)
+  (AtL s,     _)         -> SString s
+  (AtI i,     _)         -> SInt i
+  (AtF i,     _)         -> SFloat i
+  _ -> SMeta "FAILED_TO_GENERATE" ---- err monad!
+ where
+   catOf v = case v of
+     VApp w _  -> catOf w
+     VCn (_,c) -> prt_ c
+     _ -> "FAILED_TO_GENERATE_FROM_META"
+
+
+------------------------------------------
+-- to test
+
+prSTree t = case t of
+  SApp ((_,f),ts) -> f ++ concat (map pr1 ts)
+  SMeta c -> '?':c
+  SString s -> prQuotedString s
+  SInt i -> show i 
+  SFloat i -> show i 
+ where
+  pr1 t@(SApp (_,ts)) = ' ' : (if null ts then id else prParenth) (prSTree t)
+  pr1 t = prSTree t
+
+pSRule :: String -> SRule
+pSRule s = case words s of
+  f : _ : cs -> ((2.0,f),(init cs', last cs')) 
+    where cs' = [cs !! i | i <- [0,2..length cs - 1]]
+  _ -> error $ "not a rule" +++ s
+
+exSgr = map pSRule [
+   "Pred   : NP -> VP -> S"
+  ,"Compl  : TV -> NP -> VP" 
+  ,"PredVV : VV -> VP -> VP"
+  ,"DefCN  : CN -> NP"
+  ,"ModCN  : AP -> CN -> CN" 
+  ,"john   : NP"
+  ,"walk   : VP"
+  ,"love   : TV"
+  ,"try    : VV"
+  ,"girl   : CN"
+  ,"big    : AP"
+  ]
@@ -34,7 +34,8 @@ import GF.Grammar.LookAbs
 import GF.Grammar.PrGrammar
 import GF.Grammar.Macros
 import GF.Grammar.Values
-import GF.Grammar.Grammar -- (Cat,EInt,K)
+import GF.Grammar.Grammar
+import GF.Grammar.SGrammar

 import GF.Infra.Ident
 import GF.Data.Zipper
@@ -54,13 +55,13 @@ timeout = 99
 generateRandomTreesProb :: Options -> StdGen -> GFCGrammar -> Probs -> Cat -> [Exp]
 generateRandomTreesProb opts gen gr probs cat = 
  map str2tr $ randomTrees gen gr' cat' where
-    gr'  = gr2sgr gr probs
+    gr'  = gr2sgr opts probs gr
    cat' = prt $ snd cat

 -- | check that probabilities attached to a grammar make sense
 checkGrammarProbs :: GFCGrammar -> Probs -> Err Probs
 checkGrammarProbs gr probs = 
-  err Bad (return . gr2probs) $ checkSGrammar $ gr2sgr gr probs where
+  err Bad (return . gr2probs) $ checkSGrammar $ gr2sgr noOptions probs gr where
    gr2probs sgr = buildTree [(zIdent f,p) | (_,rs) <- tree2list sgr, ((p,f),_) <- rs]

 -- | compute the probability of a given tree
@@ -95,61 +96,9 @@ pProb s = case words s of
   readD :: String -> Double
   readD = read

-type Probs = BinTree Ident Double
-
-emptyProbs :: Probs
-emptyProbs = emptyBinTree
-
-prProbs :: Probs -> String
-prProbs = unlines . map pr . tree2list where
-  pr (f,p) = prt f ++ "\t" ++ show p
-
 ------------------------------------------
 -- translate grammar to simpler form and generated trees back

-gr2sgr :: GFCGrammar -> Probs -> SGrammar
-gr2sgr gr probs = buildTree [(c,fillProb rs) | rs@((_,(_,c)):_) <- rules] where
-  rules =
-    groupBy (\x y -> scat x == scat y) $
-      sortBy (\x y -> compare (scat x) (scat y)) 
-        [(trId f, ty') | (f,ty) <- funRulesOf gr, ty' <- trTy ty]
-  trId (_,f) = let f' = prt f in case lookupTree prt f probs of
-    Ok p -> (p,f')
-    _ -> (2.0, f')
-  trTy ty = case catSkeleton ty of
-    Ok (mcs,mc) -> [(map trCat mcs, trCat mc)]
-    _ -> []
-  trCat (m,c) = prt c ---
-  scat (_,(_,c)) = c
-
-str2tr :: STree -> Exp
-str2tr t = case t of
-  SApp ((_,f),ts) -> mkApp (trId f) (map str2tr ts) 
-  SMeta _     -> mkMeta 0
-  SString s   -> K s
-  SInt i      -> EInt i
-  SFloat i    -> EFloat i
- where
-   trId = cn . zIdent
-
-type SGrammar = BinTree SCat [SRule]
-type SIdent = String
-type SRule = (SFun,SType)
-type SType = ([SCat],SCat)
-type SCat = SIdent
-type SFun = (Double,SIdent)
-
-allRules gr = concat [rs  | (c,rs) <- tree2list gr]
-
-data STree = 
-    SApp (SFun,[STree]) 
--  | SAppN (SIdent,[STree])  -- no probability given 
-  | SMeta SCat
-  | SString String
-  | SInt Integer
-  | SFloat Double
-   deriving (Show,Eq)
-
 probTree :: STree -> Double
 probTree t = case t of
  SApp ((p,_),ts) -> p * product (map probTree ts)
@@ -204,16 +153,8 @@ checkSGrammar = mapMTree chCat where
      Bad $ "illegal probability sum " ++ show s ++ " in " ++ c
    _ -> return (c,rs)

-- for cases where explicit probability is not given (encoded as
-- p > 1) divide the remaining mass by the number of such cases
-
-fillProb :: [SRule] -> [SRule]
-fillProb rs = [((defa p,f),ty) | ((p,f),ty) <- rs] where
-  defa p = if p > 1.0 then def else p
-  def = (1 - sum given) / genericLength nope 
-  (nope,given) = partition (> 1.0) [p | ((p,_),_) <- rs]
-

+{-
 ------------------------------------------
 -- to test outside GF

@@ -246,7 +187,7 @@ pSRule s = case words s of
     where cs' = [cs !! i | i <- [0,2..length cs - 1]]
  _ -> error $ "not a rule" +++ s

-exSgr = mkSGrammar $ map pSRule [
+expSgr = mkSGrammar $ map pSRule [
  "0.8 a : A"
 ,"0.2 b : A"
 ,"0.2 n : A -> S -> S"
@@ -257,3 +198,6 @@ ex1 :: IO ()
 ex1 = do
  g <- newStdGen
  mapM_ (putStrLn . prSTree) $ randomTrees g exSgr "S"
+
+-}
+
@@ -249,6 +249,10 @@ execC co@(comm, opts0) sa@(sh@(st,(h,_,_,_)),a) = checkOptions st co >> case com
       let p = optParseArgErrMsg opts gro x
       case p of
         Ok (ts,msg) 
+           | oElem (iOpt "fail") opts && null ts -> do
+                putStrLnFlush ("#FAIL:" +++ x) >> changeArg (const $ ATrms ts) sa
+           | oElem (iOpt "ambiguous") opts && length ts > 1 -> do
+                putStrLnFlush ("#AMBIGUOUS:" +++ x) >> changeArg (const $ ATrms ts) sa
           | oElem (iOpt "prob") opts -> do
                let probs = stateProbs gro 
                let tps = rankByScore [(t,computeProbTree probs t) | t <- ts]
@@ -199,10 +199,12 @@ txtHelpFile =
  "\n      grammar (overridden by the -lang flag), in the category S (overridden" ++
  "\n      by the -cat flag)." ++
  "\n  options for batch input:" ++
-  "\n      -lines   parse each line of input separately, ignoring empty lines" ++
-  "\n      -all     as -lines, but also parse empty lines" ++
-  "\n      -prob    rank results by probability" ++
-  "\n      -cut     stop after first lexing result leading to parser success" ++
+  "\n      -lines     parse each line of input separately, ignoring empty lines" ++
+  "\n      -all       as -lines, but also parse empty lines" ++
+  "\n      -prob      rank results by probability" ++
+  "\n      -cut       stop after first lexing result leading to parser success" ++
+  "\n      -fail      show strings whose parse fails prefixed by #FAIL" ++
+  "\n      -ambiguous show strings that have more than one parse prefixed by #AMBIGUOUS" ++
  "\n  options for selecting parsing method:" ++
  "\n      (default)parse using an overgenerating CFG" ++
  "\n      -cfg     parse using a much less overgenerating CFG" ++
@@ -344,11 +346,13 @@ txtHelpFile =
  "\n      -lang     use the abstract syntax of this grammar" ++
  "\n      -number   generate (at most) this number of trees" ++
  "\n      -noexpand don't expand these categories (comma-separated, e.g. -noexpand=V,CN)" ++
+  "\n      -doexpand only expand these categories (comma-separated, e.g. -doexpand=V,CN)" ++
  "\n  examples:" ++
  "\n      gt -depth=10 -cat=NP              -- generate all NP's to depth 10 " ++
  "\n      gt (PredVP ? (NegVG ?))           -- generate all trees of this form" ++
  "\n      gt -cat=S -tr | l                 -- generate and linearize" ++
  "\n      gt -noexpand=NP | l -mark=metacat -- the only NP is meta, linearized \"?0 +NP\"" ++
+  "\n      gt | l | p -lines -ambiguous | grep \"#AMBIGUOUS\" -- show ambiguous strings" ++
  "\n" ++
  "\nma, morphologically_analyse: ma String" ++
  "\n      Runs morphological analysis on each word in String and displays" ++
@@ -180,11 +180,11 @@ optionsOfCommand co = case co of
  CTransformGrammar _ -> flags "printer"
  CConvertLatex _ -> none
  CLinearize _ -> both "utf8 table struct record all multi" "lang number unlexer mark"
-  CParse -> both "cut new newer cfg mcfg n ign raw v lines all prob" 
+  CParse -> both "ambiguous fail cut new newer cfg mcfg n ign raw v lines all prob" 
                 "cat lang lexer parser number rawtrees"
  CTranslate _ _ -> opts "cat lexer parser"
-  CGenerateRandom -> both "cf prob" "cat lang number depth"
-  CGenerateTrees -> both "metas" "atoms depth alts cat lang number noexpand"
+  CGenerateRandom -> both "cf prob" "cat lang number depth atoms noexpand doexpand"
+  CGenerateTrees -> both "metas" "atoms depth alts cat lang number noexpand doexpand"
  CPutTerm -> flags "transform number"
  CTreeBank -> opts "c xml trees"
  CLookupTreebank -> both "assocs raw strings trees" "treebank"
@@ -25,7 +25,7 @@ import GF.Grammar.PrGrammar
 import GF.Grammar.Macros
 import GF.Grammar.Values
 import GF.Grammar.Grammar (Cat)
-
+import GF.Grammar.SGrammar
 import GF.Data.Operations
 import GF.Data.Zipper
 import GF.Infra.Option
@@ -43,59 +43,11 @@ import Data.List
 generateTrees :: Options -> GFCGrammar -> Cat -> Int -> Maybe Int -> Maybe Tree -> [Exp]
 generateTrees opts gr cat n mn mt = map str2tr $ generate gr' ifm cat' n mn mt'
  where
-    gr'  = gr2sgr noe ats gr
+    gr'  = gr2sgr opts emptyProbs gr
    cat' = prt $ snd cat
    mt'  = maybe Nothing (return . tr2str) mt
    ifm  = oElem withMetas opts
-    ats  = getOptInt opts (aOpt "atoms")
-    noe  = maybe [] (chunks ',') $ getOptVal opts (aOpt "noexpand")

------------------------------------------
-- translate grammar to simpler form and generated trees back
-
-gr2sgr :: [SIdent] -> Maybe Int -> GFCGrammar -> SGrammar
-gr2sgr noe un gr = buildTree [(c,noexp c rs) | rs@((_,(_,c)):_) <- prune rules] where
-  rules =
-    groupBy (\x y -> scat x == scat y) $
-      sortBy (\x y -> compare (scat x) (scat y)) 
-        [(trId f, ty') | (f,ty) <- funRulesOf gr, ty' <- trTy ty]
-  trId = prt . snd
-  trTy ty = case catSkeleton ty of
-    Ok (mcs,mc) -> [(map trCat mcs, trCat mc)]
-    _ -> []
-  trCat (m,c) = prt c ---
-  scat (_,(_,c)) = c
-
-  prune rs = maybe rs (\n -> map (onlyAtoms n) rs) $ un
-  onlyAtoms n rs = 
-    let (rs1,rs2) = partition atom rs 
-    in take n rs1 ++ rs2
-  atom = null . fst . snd
-
-  noexp c rs = if elem c noe then [('?':c,([],c))] else rs
-
-- str2tr :: STree -> Exp
-str2tr t = case t of
-  SApp ('?':c,[]) -> mkMeta 0 -- from noexpand=c
-  SApp (f,ts) -> mkApp (trId f) (map str2tr ts) 
-  SMeta _     -> mkMeta 0
----  SString s   -> K s
- where
-   trId = cn . zIdent
-
-- tr2str :: Tree -> STree
-tr2str (Tr (N (_,at,val,_,_),ts)) = case (at,val) of
-  (AtC (_,f), _)         -> SApp (prt_ f,map tr2str ts)
-  (AtM _,     v)         -> SMeta (catOf v)
-  (AtL s,     _)         -> SString s
-  (AtI i,     _)         -> SInt i
-  (AtF i,     _)         -> SFloat i
-  _ -> SMeta "FAILED_TO_GENERATE" ---- err monad!
- where
-   catOf v = case v of
-     VApp w _  -> catOf w
-     VCn (_,c) -> prt_ c
-     _ -> "FAILED_TO_GENERATE_FROM_META"

 ------------------------------------------
 -- do the main thing with a simpler data structure
@@ -139,58 +91,3 @@ generate gr ifm cat i mn mt = case mt of
    SApp (f,ts) -> [SApp (f,ts') | ts' <- combinations (map genM ts)]
    SMeta k     -> gen k 
    _ -> [t]
-
-type SGrammar = BinTree SCat [SRule]
-type SIdent = String
-type SRule = (SFun,SType)
-type SType = ([SCat],SCat)
-type SCat = SIdent
-type SFun = SIdent
-
-allRules gr = concat [rs  | (c,rs) <- tree2list gr]
-
-data STree = 
-    SApp (SFun,[STree]) 
-  | SMeta SCat
-  | SString String
-  | SInt Integer
-  | SFloat Double
-   deriving (Show,Eq)
-
-depth :: STree -> Int
-depth t = case t of
-  SApp (_,ts@(_:_)) -> maximum (map depth ts) + 1
-  _ -> 1
-
------------------------------------------
-- to test
-
-prSTree t = case t of
-  SApp (f,ts) -> f ++ concat (map pr1 ts)
-  SMeta c -> '?':c
-  SString s -> prQuotedString s
-  SInt i -> show i 
-  SFloat i -> show i 
- where
-  pr1 t@(SApp (_,ts)) = ' ' : (if null ts then id else prParenth) (prSTree t)
-  pr1 t = prSTree t
-
-pSRule :: String -> SRule
-pSRule s = case words s of
-  f : _ : cs -> (f,(init cs', last cs')) 
-    where cs' = [cs !! i | i <- [0,2..length cs - 1]]
-  _ -> error $ "not a rule" +++ s
-
-exSgr = map pSRule [
-   "Pred   : NP -> VP -> S"
-  ,"Compl  : TV -> NP -> VP" 
-  ,"PredVV : VV -> VP -> VP"
-  ,"DefCN  : CN -> NP"
-  ,"ModCN  : AP -> CN -> CN" 
-  ,"john   : NP"
-  ,"walk   : VP"
-  ,"love   : TV"
-  ,"try    : VV"
-  ,"girl   : CN"
-  ,"big    : AP"
-  ]
@@ -170,10 +170,12 @@ p,  parse: p String
      grammar (overridden by the -lang flag), in the category S (overridden
      by the -cat flag).
  options for batch input:
-      -lines   parse each line of input separately, ignoring empty lines
-      -all     as -lines, but also parse empty lines
-      -prob    rank results by probability
-      -cut     stop after first lexing result leading to parser success
+      -lines     parse each line of input separately, ignoring empty lines
+      -all       as -lines, but also parse empty lines
+      -prob      rank results by probability
+      -cut       stop after first lexing result leading to parser success
+      -fail      show strings whose parse fails prefixed by #FAIL
+      -ambiguous show strings that have more than one parse prefixed by #AMBIGUOUS
  options for selecting parsing method:
      (default)parse using an overgenerating CFG
      -cfg     parse using a much less overgenerating CFG
@@ -315,11 +317,13 @@ gt, generate_trees: gt Tree?
      -lang     use the abstract syntax of this grammar
      -number   generate (at most) this number of trees
      -noexpand don't expand these categories (comma-separated, e.g. -noexpand=V,CN)
+      -doexpand only expand these categories (comma-separated, e.g. -doexpand=V,CN)
  examples:
      gt -depth=10 -cat=NP              -- generate all NP's to depth 10 
      gt (PredVP ? (NegVG ?))           -- generate all trees of this form
      gt -cat=S -tr | l                 -- generate and linearize
      gt -noexpand=NP | l -mark=metacat -- the only NP is meta, linearized "?0 +NP"
+      gt | l | p -lines -ambiguous | grep "#AMBIGUOUS" -- show ambiguous strings

 ma, morphologically_analyse: ma String
      Runs morphological analysis on each word in String and displays