treebanks in shell state: i -treebank and lt

This commit is contained in:
aarne
2006-03-02 17:19:36 +00:00
parent 35aac815db
commit 32508e37b3
8 changed files with 77 additions and 12 deletions

View File

@@ -238,14 +238,16 @@ compatCF (CFCat (CIQ _ c, l)) (CFCat (CIQ _ c', l')) = c==c' && l==l'
-- instead of break
wordsLits [] = []
wordsLits (c:cs) | isSpace c = wordsLits (dropWhile isSpace cs)
| c == '\'' || c == '"'
| isQuote c
= let (l,rs) = breaks (==c) cs
rs' = drop 1 rs
in ([c]++l++[c]):wordsLits rs'
| otherwise = let (w,rs) = break isSpace cs
| otherwise = let (w,rs) = break isSpaceQ cs
in (c:w):wordsLits rs
where
breaks c cs = case break c cs of
(l@(_:_),d:rs) | last l == '\\' ->
let (r,ts) = breaks c rs in (l++[d]++r, ts)
v -> v
isQuote c = elem c "\"'"
isSpaceQ c = isSpace c ---- || isQuote c

View File

@@ -44,6 +44,7 @@ import qualified GF.Conversion.GFC as Cnv
import qualified GF.Parsing.GFC as Prs
import Data.List (nub,nubBy)
import qualified Data.Map as Map
-- AR 11/11/2001 -- 17/6/2003 (for modules) ---- unfinished
@@ -61,6 +62,7 @@ data ShellState = ShSt {
-- (large, with parameters, no-so overgenerating)
pInfos :: [(Ident, Prs.PInfo)], -- ^ parsing information (compiled mcfg&cfg grammars)
morphos :: [(Ident,Morpho)], -- ^ morphologies
treebanks :: [(Ident,Treebank)], -- ^ treebanks
probss :: [(Ident,Probs)], -- ^ probability distributions
gloptions :: Options, -- ^ global options
readFiles :: [(FilePath,ModTime)],-- ^ files read
@@ -73,6 +75,8 @@ data ShellState = ShSt {
transfers :: [(Ident,T.Env)] -- ^ transfer modules
}
type Treebank = Map.Map String [(String,String)] -- lang, tree
actualConcretes :: ShellState -> [((Ident,Ident),Bool)]
actualConcretes sh = nub [((c,c),b) |
Just a <- [abstract sh],
@@ -102,6 +106,7 @@ emptyShellState = ShSt {
cfgs = [],
pInfos = [],
morphos = [],
treebanks = [],
probss = [],
gloptions = noOptions,
readFiles = [],
@@ -249,6 +254,7 @@ updateShellState opts ign mcnc sh ((_,sgr,gr),rts) = do
cfgs = zip concrs cfgs,
pInfos = zip concrs pInfos,
morphos = zip concrs morphos,
treebanks = treebanks sh,
probss = zip concrs probss,
gloptions = gloptions sh, --- opts, -- this would be command-line options
readFiles = [ft | ft@(f,_) <- readFiles sh, notInrts f] ++ rts,
@@ -298,6 +304,7 @@ purgeShellState sh = ShSt {
cfgs = cfgs sh,
pInfos = pInfos sh,
morphos = morphos sh,
treebanks = treebanks sh,
probss = probss sh,
gloptions = gloptions sh,
readFiles = [],
@@ -314,17 +321,17 @@ purgeShellState sh = ShSt {
acncs = maybe [] singleton abstr ++ map (snd . fst) (actualConcretes sh)
changeMain :: Maybe Ident -> ShellState -> Err ShellState
changeMain Nothing (ShSt _ _ cs ms ss cfs old_pis mcfgs cfgs pinfos mos pbs os rs acs s trs) =
return (ShSt Nothing Nothing [] ms ss cfs old_pis mcfgs cfgs pinfos mos pbs os rs acs s trs)
changeMain Nothing (ShSt _ _ cs ms ss cfs old_pis mcfgs cfgs pinfos mos tbs pbs os rs acs s trs) =
return (ShSt Nothing Nothing [] ms ss cfs old_pis mcfgs cfgs pinfos mos tbs pbs os rs acs s trs)
changeMain
(Just c) st@(ShSt _ _ cs ms ss cfs old_pis mcfgs cfgs pinfos mos pbs os rs acs s trs) =
(Just c) st@(ShSt _ _ cs ms ss cfs old_pis mcfgs cfgs pinfos mos tbs pbs os rs acs s trs) =
case lookup c (M.modules ms) of
Just _ -> do
a <- M.abstractOfConcrete ms c
let cas = M.allConcretes ms a
let cs' = [((c,c),True) | c <- cas]
return (ShSt (Just a) (Just c) cs' ms ss cfs old_pis mcfgs cfgs
pinfos mos pbs os rs acs s trs)
pinfos mos tbs pbs os rs acs s trs)
_ -> P.prtBad "The state has no concrete syntax named" c
-- | form just one state grammar, if unique, from a canonical grammar
@@ -472,6 +479,13 @@ addTransfer :: (Ident,T.Env) -> ShellState -> ShellState
addTransfer it@(i,_) sh =
sh {transfers = it : filter ((/= i) . fst) (transfers sh)}
addTreebank :: (Ident,Treebank) -> ShellState -> ShellState
addTreebank it@(i,_) sh =
sh {treebanks = it : filter ((/= i) . fst) (treebanks sh)}
findTreebank :: ShellState -> Ident -> Err Treebank
findTreebank sh i = maybeErr "no treebank found" $ lookup i $ treebanks sh
-- modify state
type ShellStateOper = ShellState -> ShellState
@@ -496,8 +510,8 @@ changeOptions f sh = sh {gloptions = f (gloptions sh)}
changeModTimes :: [(FilePath,ModTime)] -> ShellStateOper
--- __________ this is OBSOLETE
changeModTimes mfs
(ShSt a c cs can src cfs old_pinfos mcfgs cfgs pinfos ms pbs os ff ts ss trs) =
ShSt a c cs can src cfs old_pinfos mcfgs cfgs pinfos ms pbs os ff' ts ss trs
(ShSt a c cs can src cfs old_pinfos mcfgs cfgs pinfos ms tbs pbs os ff ts ss trs) =
ShSt a c cs can src cfs old_pinfos mcfgs cfgs pinfos ms tbs pbs os ff' ts ss trs
where
ff' = mfs ++ [mf | mf@(f,_) <- ff, notElem f (map fst mfs)]

View File

@@ -181,6 +181,10 @@ execLine put (c@(co, os), arg, cs) (outps,st) = do
execC :: CommandOpt -> ShellIO
execC co@(comm, opts0) sa@(sh@(st,(h,_,_,_)),a) = checkOptions st co >> case comm of
CImport file | oElem (iOpt "treebank") opts -> do
ss <- readFileIf file >>= return . lines
let tb = pre2treebank $ getTreebank ss
changeState (addTreebank (I.identC (takeWhile (/='.') file), tb)) sa
CImport file | oElem fromExamples opts -> do
es <- liftM nub $ getGFEFiles opts file
system $ "gf -examples" +++ unlines es
@@ -303,6 +307,15 @@ execC co@(comm, opts0) sa@(sh@(st,(h,_,_,_)),a) = checkOptions st co >> case com
comm = "command" ----
returnArg (AString $ unlines $ mkTreebank opts st comm ts) sa
CLookupTreebank -> do
case treebanks st of
[] -> returnArg (AError "no treebank") sa
(_,tb):_ -> do
let s = prCommandArg a
let tes = map (string2treeErr gro . snd) $ lookupTreebank tb s
terms = [t | Ok t <- tes]
returnArg (ATrms terms) sa
CShowTreeGraph | oElem emitCode opts -> do -- -o
returnArg (AString $ visualizeTrees opts $ strees $ s2t a) sa
CShowTreeGraph -> do

View File

@@ -66,6 +66,7 @@ txtHelpFile =
"\n -o do emit code (default with new grammar format)" ++
"\n -ex preprocess .gfe files if needed" ++
"\n -prob read probabilities from top grammar file (format --# prob Fun Double)" ++
"\n -treebank read a treebank file to memory (xml format)" ++
"\n flags:" ++
"\n -abs set the name used for abstract syntax (with -old option)" ++
"\n -cnc set the name used for concrete syntax (with -old option)" ++
@@ -238,11 +239,18 @@ txtHelpFile =
"\n -c compare to existing xml-formatted treebank" ++
"\n -trees return the trees of the treebank" ++
"\n -xml wrap the treebank (or comparison results) with XML tags" ++
"\n -mem write the treebank in memory instead of a file TODO" ++
"\n examples:" ++
"\n gr -cat=S -number=100 | tb -xml | wf tb.xml -- random treebank into file" ++
"\n rf tb.xml | tb -c -- compare-test treebank from file" ++
"\n rf old.xml | tb -trees | tb -xml -- create new treebank from old" ++
"\n" ++
"\nlt, lookup_treebank: lt String" ++
"\n Lookup a string in a treebank and return the resulting trees." ++
"\n Use 'tb' to create a treebank and 'i -treebank' to read it in memory." ++
"\n flag:" ++
"\n -treebank use this treebank (instead of the latest introduced one) TODO" ++
"\n" ++
"\ntt, test_tokenizer: tt String" ++
"\n Show the token list sent to the parser when String is parsed." ++
"\n HINT: can be useful when debugging the parser." ++

View File

@@ -114,6 +114,7 @@ pCommand ws = case ws of
"cc" : s -> aUnit $ CComputeConcrete $ unwords s
"so" : s -> aUnit $ CShowOpers $ unwords s
"tb" : [] -> aUnit CTreeBank
"lt" : s -> aString CLookupTreebank s
"tq" : i:o:[] -> aUnit (CTranslationQuiz (language i) (language o))
"tl":i:o:[] -> aUnit (CTranslationList (language i) (language o))

View File

@@ -57,6 +57,8 @@ data Command =
| CComputeConcrete String
| CShowOpers String
| CLookupTreebank
| CTranslationQuiz Language Language
| CTranslationList Language Language
| CMorphoQuiz
@@ -169,8 +171,9 @@ optionsOfCommand co = case co of
CSetFlag -> both "utf8 table struct record all multi"
"cat lang lexer parser number depth rawtrees unlexer optimize path conversion printer"
CImport _ -> both "old v s src gfc retain nocf nocheckcirc cflexer noemit o make ex prob"
"abs cnc res path optimize conversion cat preproc probs noparse"
CImport _ ->
both "old v s src gfc retain nocf nocheckcirc cflexer noemit o make ex prob treebank"
"abs cnc res path optimize conversion cat preproc probs noparse"
CRemoveLanguage _ -> none
CEmptyState -> none
CStripState -> none

View File

@@ -12,7 +12,14 @@
-- Purpose: to generate treebanks.
-----------------------------------------------------------------------------
module GF.UseGrammar.Treebank (mkTreebank,testTreebank,treesTreebank) where
module GF.UseGrammar.Treebank (
mkTreebank,
testTreebank,
treesTreebank,
getTreebank,
lookupTreebank,
pre2treebank
) where
import GF.Compile.ShellState
import GF.UseGrammar.Linear (linTree2string)
@@ -28,6 +35,7 @@ import GF.Grammar.Values (tree2exp)
import GF.Data.Operations
import GF.Infra.Option
import qualified GF.Grammar.Abstract as A
import qualified Data.Map as M
-- Generate a treebank with a multilingual grammar. AR 8/2/2006
-- (c) Aarne Ranta 2006 under GNU GPL
@@ -74,11 +82,13 @@ puts = return -- putStrLn
ret = [] -- return ()
--
type PreTreebank = [(String,[(String,String)])]
getTreebanks :: [String] -> [(String,String,String)]
getTreebanks = concatMap grps . getTreebank where
grps (t,lls) = [(t,x,y) | (x,y) <- lls]
getTreebank :: [String] -> [(String,[(String,String)])]
getTreebank :: [String] -> PreTreebank
getTreebank ll = case ll of
l:ls@(_:_:_) ->
let (l1,l2) = getItem ls
@@ -97,6 +107,12 @@ getTreebank ll = case ll of
getLang = takeWhile (/='"') . tail . dropWhile (/='"')
lookupTreebank :: Treebank -> String -> [(String,String)]
lookupTreebank tb s = maybe [] id $ M.lookup s tb
pre2treebank :: PreTreebank -> Treebank
pre2treebank tb = M.fromListWith (++) [(s,[(l,t)]) | (t,ls) <- tb, (l,s) <- ls]
annot :: StateGrammar -> String -> A.Tree
annot gr s = errVal (error "illegal tree") $ do
let t = tree2exp $ string2tree gr s

View File

@@ -37,6 +37,7 @@ i, import: i File
-o do emit code (default with new grammar format)
-ex preprocess .gfe files if needed
-prob read probabilities from top grammar file (format --# prob Fun Double)
-treebank read a treebank file to memory (xml format)
flags:
-abs set the name used for abstract syntax (with -old option)
-cnc set the name used for concrete syntax (with -old option)
@@ -209,11 +210,18 @@ tb, tree_bank: tb
-c compare to existing xml-formatted treebank
-trees return the trees of the treebank
-xml wrap the treebank (or comparison results) with XML tags
-mem write the treebank in memory instead of a file TODO
examples:
gr -cat=S -number=100 | tb -xml | wf tb.xml -- random treebank into file
rf tb.xml | tb -c -- compare-test treebank from file
rf old.xml | tb -trees | tb -xml -- create new treebank from old
lt, lookup_treebank: lt String
Lookup a string in a treebank and return the resulting trees.
Use 'tb' to create a treebank and 'i -treebank' to read it in memory.
flag:
-treebank use this treebank (instead of the latest introduced one) TODO
tt, test_tokenizer: tt String
Show the token list sent to the parser when String is parsed.
HINT: can be useful when debugging the parser.