treebanks in shell state: i -treebank and lt

This commit is contained in:
aarne
2006-03-02 17:19:36 +00:00
parent 8a6da89104
commit bf872d018e
8 changed files with 77 additions and 12 deletions

View File

@@ -238,14 +238,16 @@ compatCF (CFCat (CIQ _ c, l)) (CFCat (CIQ _ c', l')) = c==c' && l==l'
-- instead of break -- instead of break
wordsLits [] = [] wordsLits [] = []
wordsLits (c:cs) | isSpace c = wordsLits (dropWhile isSpace cs) wordsLits (c:cs) | isSpace c = wordsLits (dropWhile isSpace cs)
| c == '\'' || c == '"' | isQuote c
= let (l,rs) = breaks (==c) cs = let (l,rs) = breaks (==c) cs
rs' = drop 1 rs rs' = drop 1 rs
in ([c]++l++[c]):wordsLits rs' in ([c]++l++[c]):wordsLits rs'
| otherwise = let (w,rs) = break isSpace cs | otherwise = let (w,rs) = break isSpaceQ cs
in (c:w):wordsLits rs in (c:w):wordsLits rs
where where
breaks c cs = case break c cs of breaks c cs = case break c cs of
(l@(_:_),d:rs) | last l == '\\' -> (l@(_:_),d:rs) | last l == '\\' ->
let (r,ts) = breaks c rs in (l++[d]++r, ts) let (r,ts) = breaks c rs in (l++[d]++r, ts)
v -> v v -> v
isQuote c = elem c "\"'"
isSpaceQ c = isSpace c ---- || isQuote c

View File

@@ -44,6 +44,7 @@ import qualified GF.Conversion.GFC as Cnv
import qualified GF.Parsing.GFC as Prs import qualified GF.Parsing.GFC as Prs
import Data.List (nub,nubBy) import Data.List (nub,nubBy)
import qualified Data.Map as Map
-- AR 11/11/2001 -- 17/6/2003 (for modules) ---- unfinished -- AR 11/11/2001 -- 17/6/2003 (for modules) ---- unfinished
@@ -61,6 +62,7 @@ data ShellState = ShSt {
-- (large, with parameters, no-so overgenerating) -- (large, with parameters, no-so overgenerating)
pInfos :: [(Ident, Prs.PInfo)], -- ^ parsing information (compiled mcfg&cfg grammars) pInfos :: [(Ident, Prs.PInfo)], -- ^ parsing information (compiled mcfg&cfg grammars)
morphos :: [(Ident,Morpho)], -- ^ morphologies morphos :: [(Ident,Morpho)], -- ^ morphologies
treebanks :: [(Ident,Treebank)], -- ^ treebanks
probss :: [(Ident,Probs)], -- ^ probability distributions probss :: [(Ident,Probs)], -- ^ probability distributions
gloptions :: Options, -- ^ global options gloptions :: Options, -- ^ global options
readFiles :: [(FilePath,ModTime)],-- ^ files read readFiles :: [(FilePath,ModTime)],-- ^ files read
@@ -73,6 +75,8 @@ data ShellState = ShSt {
transfers :: [(Ident,T.Env)] -- ^ transfer modules transfers :: [(Ident,T.Env)] -- ^ transfer modules
} }
type Treebank = Map.Map String [(String,String)] -- lang, tree
actualConcretes :: ShellState -> [((Ident,Ident),Bool)] actualConcretes :: ShellState -> [((Ident,Ident),Bool)]
actualConcretes sh = nub [((c,c),b) | actualConcretes sh = nub [((c,c),b) |
Just a <- [abstract sh], Just a <- [abstract sh],
@@ -102,6 +106,7 @@ emptyShellState = ShSt {
cfgs = [], cfgs = [],
pInfos = [], pInfos = [],
morphos = [], morphos = [],
treebanks = [],
probss = [], probss = [],
gloptions = noOptions, gloptions = noOptions,
readFiles = [], readFiles = [],
@@ -249,6 +254,7 @@ updateShellState opts ign mcnc sh ((_,sgr,gr),rts) = do
cfgs = zip concrs cfgs, cfgs = zip concrs cfgs,
pInfos = zip concrs pInfos, pInfos = zip concrs pInfos,
morphos = zip concrs morphos, morphos = zip concrs morphos,
treebanks = treebanks sh,
probss = zip concrs probss, probss = zip concrs probss,
gloptions = gloptions sh, --- opts, -- this would be command-line options gloptions = gloptions sh, --- opts, -- this would be command-line options
readFiles = [ft | ft@(f,_) <- readFiles sh, notInrts f] ++ rts, readFiles = [ft | ft@(f,_) <- readFiles sh, notInrts f] ++ rts,
@@ -298,6 +304,7 @@ purgeShellState sh = ShSt {
cfgs = cfgs sh, cfgs = cfgs sh,
pInfos = pInfos sh, pInfos = pInfos sh,
morphos = morphos sh, morphos = morphos sh,
treebanks = treebanks sh,
probss = probss sh, probss = probss sh,
gloptions = gloptions sh, gloptions = gloptions sh,
readFiles = [], readFiles = [],
@@ -314,17 +321,17 @@ purgeShellState sh = ShSt {
acncs = maybe [] singleton abstr ++ map (snd . fst) (actualConcretes sh) acncs = maybe [] singleton abstr ++ map (snd . fst) (actualConcretes sh)
changeMain :: Maybe Ident -> ShellState -> Err ShellState changeMain :: Maybe Ident -> ShellState -> Err ShellState
changeMain Nothing (ShSt _ _ cs ms ss cfs old_pis mcfgs cfgs pinfos mos pbs os rs acs s trs) = changeMain Nothing (ShSt _ _ cs ms ss cfs old_pis mcfgs cfgs pinfos mos tbs pbs os rs acs s trs) =
return (ShSt Nothing Nothing [] ms ss cfs old_pis mcfgs cfgs pinfos mos pbs os rs acs s trs) return (ShSt Nothing Nothing [] ms ss cfs old_pis mcfgs cfgs pinfos mos tbs pbs os rs acs s trs)
changeMain changeMain
(Just c) st@(ShSt _ _ cs ms ss cfs old_pis mcfgs cfgs pinfos mos pbs os rs acs s trs) = (Just c) st@(ShSt _ _ cs ms ss cfs old_pis mcfgs cfgs pinfos mos tbs pbs os rs acs s trs) =
case lookup c (M.modules ms) of case lookup c (M.modules ms) of
Just _ -> do Just _ -> do
a <- M.abstractOfConcrete ms c a <- M.abstractOfConcrete ms c
let cas = M.allConcretes ms a let cas = M.allConcretes ms a
let cs' = [((c,c),True) | c <- cas] let cs' = [((c,c),True) | c <- cas]
return (ShSt (Just a) (Just c) cs' ms ss cfs old_pis mcfgs cfgs return (ShSt (Just a) (Just c) cs' ms ss cfs old_pis mcfgs cfgs
pinfos mos pbs os rs acs s trs) pinfos mos tbs pbs os rs acs s trs)
_ -> P.prtBad "The state has no concrete syntax named" c _ -> P.prtBad "The state has no concrete syntax named" c
-- | form just one state grammar, if unique, from a canonical grammar -- | form just one state grammar, if unique, from a canonical grammar
@@ -472,6 +479,13 @@ addTransfer :: (Ident,T.Env) -> ShellState -> ShellState
addTransfer it@(i,_) sh = addTransfer it@(i,_) sh =
sh {transfers = it : filter ((/= i) . fst) (transfers sh)} sh {transfers = it : filter ((/= i) . fst) (transfers sh)}
addTreebank :: (Ident,Treebank) -> ShellState -> ShellState
addTreebank it@(i,_) sh =
sh {treebanks = it : filter ((/= i) . fst) (treebanks sh)}
findTreebank :: ShellState -> Ident -> Err Treebank
findTreebank sh i = maybeErr "no treebank found" $ lookup i $ treebanks sh
-- modify state -- modify state
type ShellStateOper = ShellState -> ShellState type ShellStateOper = ShellState -> ShellState
@@ -496,8 +510,8 @@ changeOptions f sh = sh {gloptions = f (gloptions sh)}
changeModTimes :: [(FilePath,ModTime)] -> ShellStateOper changeModTimes :: [(FilePath,ModTime)] -> ShellStateOper
--- __________ this is OBSOLETE --- __________ this is OBSOLETE
changeModTimes mfs changeModTimes mfs
(ShSt a c cs can src cfs old_pinfos mcfgs cfgs pinfos ms pbs os ff ts ss trs) = (ShSt a c cs can src cfs old_pinfos mcfgs cfgs pinfos ms tbs pbs os ff ts ss trs) =
ShSt a c cs can src cfs old_pinfos mcfgs cfgs pinfos ms pbs os ff' ts ss trs ShSt a c cs can src cfs old_pinfos mcfgs cfgs pinfos ms tbs pbs os ff' ts ss trs
where where
ff' = mfs ++ [mf | mf@(f,_) <- ff, notElem f (map fst mfs)] ff' = mfs ++ [mf | mf@(f,_) <- ff, notElem f (map fst mfs)]

View File

@@ -181,6 +181,10 @@ execLine put (c@(co, os), arg, cs) (outps,st) = do
execC :: CommandOpt -> ShellIO execC :: CommandOpt -> ShellIO
execC co@(comm, opts0) sa@(sh@(st,(h,_,_,_)),a) = checkOptions st co >> case comm of execC co@(comm, opts0) sa@(sh@(st,(h,_,_,_)),a) = checkOptions st co >> case comm of
CImport file | oElem (iOpt "treebank") opts -> do
ss <- readFileIf file >>= return . lines
let tb = pre2treebank $ getTreebank ss
changeState (addTreebank (I.identC (takeWhile (/='.') file), tb)) sa
CImport file | oElem fromExamples opts -> do CImport file | oElem fromExamples opts -> do
es <- liftM nub $ getGFEFiles opts file es <- liftM nub $ getGFEFiles opts file
system $ "gf -examples" +++ unlines es system $ "gf -examples" +++ unlines es
@@ -303,6 +307,15 @@ execC co@(comm, opts0) sa@(sh@(st,(h,_,_,_)),a) = checkOptions st co >> case com
comm = "command" ---- comm = "command" ----
returnArg (AString $ unlines $ mkTreebank opts st comm ts) sa returnArg (AString $ unlines $ mkTreebank opts st comm ts) sa
CLookupTreebank -> do
case treebanks st of
[] -> returnArg (AError "no treebank") sa
(_,tb):_ -> do
let s = prCommandArg a
let tes = map (string2treeErr gro . snd) $ lookupTreebank tb s
terms = [t | Ok t <- tes]
returnArg (ATrms terms) sa
CShowTreeGraph | oElem emitCode opts -> do -- -o CShowTreeGraph | oElem emitCode opts -> do -- -o
returnArg (AString $ visualizeTrees opts $ strees $ s2t a) sa returnArg (AString $ visualizeTrees opts $ strees $ s2t a) sa
CShowTreeGraph -> do CShowTreeGraph -> do

View File

@@ -66,6 +66,7 @@ txtHelpFile =
"\n -o do emit code (default with new grammar format)" ++ "\n -o do emit code (default with new grammar format)" ++
"\n -ex preprocess .gfe files if needed" ++ "\n -ex preprocess .gfe files if needed" ++
"\n -prob read probabilities from top grammar file (format --# prob Fun Double)" ++ "\n -prob read probabilities from top grammar file (format --# prob Fun Double)" ++
"\n -treebank read a treebank file to memory (xml format)" ++
"\n flags:" ++ "\n flags:" ++
"\n -abs set the name used for abstract syntax (with -old option)" ++ "\n -abs set the name used for abstract syntax (with -old option)" ++
"\n -cnc set the name used for concrete syntax (with -old option)" ++ "\n -cnc set the name used for concrete syntax (with -old option)" ++
@@ -238,11 +239,18 @@ txtHelpFile =
"\n -c compare to existing xml-formatted treebank" ++ "\n -c compare to existing xml-formatted treebank" ++
"\n -trees return the trees of the treebank" ++ "\n -trees return the trees of the treebank" ++
"\n -xml wrap the treebank (or comparison results) with XML tags" ++ "\n -xml wrap the treebank (or comparison results) with XML tags" ++
"\n -mem write the treebank in memory instead of a file TODO" ++
"\n examples:" ++ "\n examples:" ++
"\n gr -cat=S -number=100 | tb -xml | wf tb.xml -- random treebank into file" ++ "\n gr -cat=S -number=100 | tb -xml | wf tb.xml -- random treebank into file" ++
"\n rf tb.xml | tb -c -- compare-test treebank from file" ++ "\n rf tb.xml | tb -c -- compare-test treebank from file" ++
"\n rf old.xml | tb -trees | tb -xml -- create new treebank from old" ++ "\n rf old.xml | tb -trees | tb -xml -- create new treebank from old" ++
"\n" ++ "\n" ++
"\nlt, lookup_treebank: lt String" ++
"\n Lookup a string in a treebank and return the resulting trees." ++
"\n Use 'tb' to create a treebank and 'i -treebank' to read it in memory." ++
"\n flag:" ++
"\n -treebank use this treebank (instead of the latest introduced one) TODO" ++
"\n" ++
"\ntt, test_tokenizer: tt String" ++ "\ntt, test_tokenizer: tt String" ++
"\n Show the token list sent to the parser when String is parsed." ++ "\n Show the token list sent to the parser when String is parsed." ++
"\n HINT: can be useful when debugging the parser." ++ "\n HINT: can be useful when debugging the parser." ++

View File

@@ -114,6 +114,7 @@ pCommand ws = case ws of
"cc" : s -> aUnit $ CComputeConcrete $ unwords s "cc" : s -> aUnit $ CComputeConcrete $ unwords s
"so" : s -> aUnit $ CShowOpers $ unwords s "so" : s -> aUnit $ CShowOpers $ unwords s
"tb" : [] -> aUnit CTreeBank "tb" : [] -> aUnit CTreeBank
"lt" : s -> aString CLookupTreebank s
"tq" : i:o:[] -> aUnit (CTranslationQuiz (language i) (language o)) "tq" : i:o:[] -> aUnit (CTranslationQuiz (language i) (language o))
"tl":i:o:[] -> aUnit (CTranslationList (language i) (language o)) "tl":i:o:[] -> aUnit (CTranslationList (language i) (language o))

View File

@@ -57,6 +57,8 @@ data Command =
| CComputeConcrete String | CComputeConcrete String
| CShowOpers String | CShowOpers String
| CLookupTreebank
| CTranslationQuiz Language Language | CTranslationQuiz Language Language
| CTranslationList Language Language | CTranslationList Language Language
| CMorphoQuiz | CMorphoQuiz
@@ -169,8 +171,9 @@ optionsOfCommand co = case co of
CSetFlag -> both "utf8 table struct record all multi" CSetFlag -> both "utf8 table struct record all multi"
"cat lang lexer parser number depth rawtrees unlexer optimize path conversion printer" "cat lang lexer parser number depth rawtrees unlexer optimize path conversion printer"
CImport _ -> both "old v s src gfc retain nocf nocheckcirc cflexer noemit o make ex prob" CImport _ ->
"abs cnc res path optimize conversion cat preproc probs noparse" both "old v s src gfc retain nocf nocheckcirc cflexer noemit o make ex prob treebank"
"abs cnc res path optimize conversion cat preproc probs noparse"
CRemoveLanguage _ -> none CRemoveLanguage _ -> none
CEmptyState -> none CEmptyState -> none
CStripState -> none CStripState -> none

View File

@@ -12,7 +12,14 @@
-- Purpose: to generate treebanks. -- Purpose: to generate treebanks.
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
module GF.UseGrammar.Treebank (mkTreebank,testTreebank,treesTreebank) where module GF.UseGrammar.Treebank (
mkTreebank,
testTreebank,
treesTreebank,
getTreebank,
lookupTreebank,
pre2treebank
) where
import GF.Compile.ShellState import GF.Compile.ShellState
import GF.UseGrammar.Linear (linTree2string) import GF.UseGrammar.Linear (linTree2string)
@@ -28,6 +35,7 @@ import GF.Grammar.Values (tree2exp)
import GF.Data.Operations import GF.Data.Operations
import GF.Infra.Option import GF.Infra.Option
import qualified GF.Grammar.Abstract as A import qualified GF.Grammar.Abstract as A
import qualified Data.Map as M
-- Generate a treebank with a multilingual grammar. AR 8/2/2006 -- Generate a treebank with a multilingual grammar. AR 8/2/2006
-- (c) Aarne Ranta 2006 under GNU GPL -- (c) Aarne Ranta 2006 under GNU GPL
@@ -74,11 +82,13 @@ puts = return -- putStrLn
ret = [] -- return () ret = [] -- return ()
-- --
type PreTreebank = [(String,[(String,String)])]
getTreebanks :: [String] -> [(String,String,String)] getTreebanks :: [String] -> [(String,String,String)]
getTreebanks = concatMap grps . getTreebank where getTreebanks = concatMap grps . getTreebank where
grps (t,lls) = [(t,x,y) | (x,y) <- lls] grps (t,lls) = [(t,x,y) | (x,y) <- lls]
getTreebank :: [String] -> [(String,[(String,String)])] getTreebank :: [String] -> PreTreebank
getTreebank ll = case ll of getTreebank ll = case ll of
l:ls@(_:_:_) -> l:ls@(_:_:_) ->
let (l1,l2) = getItem ls let (l1,l2) = getItem ls
@@ -97,6 +107,12 @@ getTreebank ll = case ll of
getLang = takeWhile (/='"') . tail . dropWhile (/='"') getLang = takeWhile (/='"') . tail . dropWhile (/='"')
lookupTreebank :: Treebank -> String -> [(String,String)]
lookupTreebank tb s = maybe [] id $ M.lookup s tb
pre2treebank :: PreTreebank -> Treebank
pre2treebank tb = M.fromListWith (++) [(s,[(l,t)]) | (t,ls) <- tb, (l,s) <- ls]
annot :: StateGrammar -> String -> A.Tree annot :: StateGrammar -> String -> A.Tree
annot gr s = errVal (error "illegal tree") $ do annot gr s = errVal (error "illegal tree") $ do
let t = tree2exp $ string2tree gr s let t = tree2exp $ string2tree gr s

View File

@@ -37,6 +37,7 @@ i, import: i File
-o do emit code (default with new grammar format) -o do emit code (default with new grammar format)
-ex preprocess .gfe files if needed -ex preprocess .gfe files if needed
-prob read probabilities from top grammar file (format --# prob Fun Double) -prob read probabilities from top grammar file (format --# prob Fun Double)
-treebank read a treebank file to memory (xml format)
flags: flags:
-abs set the name used for abstract syntax (with -old option) -abs set the name used for abstract syntax (with -old option)
-cnc set the name used for concrete syntax (with -old option) -cnc set the name used for concrete syntax (with -old option)
@@ -209,11 +210,18 @@ tb, tree_bank: tb
-c compare to existing xml-formatted treebank -c compare to existing xml-formatted treebank
-trees return the trees of the treebank -trees return the trees of the treebank
-xml wrap the treebank (or comparison results) with XML tags -xml wrap the treebank (or comparison results) with XML tags
-mem write the treebank in memory instead of a file TODO
examples: examples:
gr -cat=S -number=100 | tb -xml | wf tb.xml -- random treebank into file gr -cat=S -number=100 | tb -xml | wf tb.xml -- random treebank into file
rf tb.xml | tb -c -- compare-test treebank from file rf tb.xml | tb -c -- compare-test treebank from file
rf old.xml | tb -trees | tb -xml -- create new treebank from old rf old.xml | tb -trees | tb -xml -- create new treebank from old
lt, lookup_treebank: lt String
Lookup a string in a treebank and return the resulting trees.
Use 'tb' to create a treebank and 'i -treebank' to read it in memory.
flag:
-treebank use this treebank (instead of the latest introduced one) TODO
tt, test_tokenizer: tt String tt, test_tokenizer: tt String
Show the token list sent to the parser when String is parsed. Show the token list sent to the parser when String is parsed.
HINT: can be useful when debugging the parser. HINT: can be useful when debugging the parser.