Adding a basic lexicon-based tokenizer and the associated command in the GF shell

This commit is contained in:
gdetrez
2011-02-10 15:00:06 +00:00
parent 0c7676e6e9
commit 5be1b5d493
3 changed files with 37 additions and 0 deletions

View File

@@ -22,6 +22,7 @@ flag interrupt
library library
build-depends: base >= 4.2 && <5, build-depends: base >= 4.2 && <5,
array, array,
fst,
containers, containers,
bytestring, bytestring,
random, random,
@@ -42,6 +43,7 @@ library
PGF.Expr PGF.Expr
PGF.Type PGF.Type
PGF.Tree PGF.Tree
PGF.Tokenizer
PGF.Paraphrase PGF.Paraphrase
PGF.TypeCheck PGF.TypeCheck
PGF.Binary PGF.Binary
@@ -72,6 +74,7 @@ executable gf
containers, containers,
bytestring, bytestring,
filepath, filepath,
fst,
directory, directory,
random, random,
old-time, old-time,

View File

@@ -964,6 +964,13 @@ allCommands env@(pgf, mos) = Map.fromList [
], ],
flags = [("file","the output filename")] flags = [("file","the output filename")]
}), }),
-- Shell command "t" (long name "tokenize"); executed by execToktok.
("t", emptyCommandInfo {
     longname = "tokenize",
     synopsis = "Tokenize string using the vocabulary",
     exec = execToktok env,
     options = [],
     flags = [("lang","The name of the concrete to use")]
     }),
("ai", emptyCommandInfo { ("ai", emptyCommandInfo {
longname = "abstract_info", longname = "abstract_info",
syntax = "ai IDENTIFIER or ai EXPR", syntax = "ai IDENTIFIER or ai EXPR",
@@ -1251,3 +1258,26 @@ prMorphoAnalysis (w,lps) =
unlines (w:[showCId l ++ " : " ++ p | (l,p) <- lps]) unlines (w:[showCId l ++ " : " ++ p | (l,p) <- lps])
-- | Handler for the @t@ (@tokenize@) shell command.
--
-- The text to tokenize is taken from the argument list when it consists of
-- exactly one string literal; any other argument list is treated as the
-- empty string.
--
-- * Without a usable @lang@ flag (absent, or 'readLanguage' yields
--   'Nothing'), the text is run through the tokenizer of every language of
--   the grammar and all resulting tokens are concatenated.
-- * With a @lang@ flag naming a language of the grammar, only that
--   language's tokenizer is used.
-- * With a @lang@ flag naming a language the grammar does not contain, no
--   expressions are returned and the message reports the unknown language.
--
-- A tokenizer that returns 'Nothing' contributes no tokens. The tokens are
-- returned both as string-literal expressions and as one token per line.
execToktok :: PGFEnv -> [Option] -> [Expr] -> IO CommandOutput
execToktok (pgf, _) opts exprs =
  case requestedLang opts of
    Nothing -> respond (concatMap runOn (Map.elems byLang))
    Just lang ->
      case Map.lookup lang byLang of
        Nothing  -> return ([], "Unknown language: " ++ show lang)
        Just tok -> respond (runOn tok)
  where
    -- One tokenizer per language of the grammar, keyed by language.
    byLang = Map.fromList [(l, mkTokenizer pgf l) | l <- languages pgf]

    -- The text to tokenize.
    text :: String
    text = case exprs of
      [ELit (LStr s)] -> s
      _               -> ""

    -- Apply a tokenizer to the text; a failed tokenization gives no tokens.
    runOn :: (String -> Maybe [String]) -> [String]
    runOn tok = maybe [] id (tok text)

    -- Package the tokens as the command's result.
    respond toks = return (map (ELit . LStr) toks, unlines toks)

    -- The language named by the first @lang@ flag carrying an identifier.
    requestedLang :: [Option] -> Maybe Language
    requestedLang (OFlag "lang" (VId l) : _) = readLanguage l
    requestedLang (_ : rest)                 = requestedLang rest
    requestedLang []                         = Nothing

View File

@@ -109,6 +109,9 @@ module PGF(
-- ** Morphological Analysis -- ** Morphological Analysis
Lemma, Analysis, Morpho, Lemma, Analysis, Morpho,
lookupMorpho, buildMorpho, fullFormLexicon, lookupMorpho, buildMorpho, fullFormLexicon,
-- ** Tokenizing
mkTokenizer,
-- ** Visualizations -- ** Visualizations
graphvizAbstractTree, graphvizAbstractTree,
@@ -141,6 +144,7 @@ import PGF.Expr (Tree)
import PGF.Morphology import PGF.Morphology
import PGF.Data import PGF.Data
import PGF.Binary import PGF.Binary
import PGF.Tokenizer
import qualified PGF.Forest as Forest import qualified PGF.Forest as Forest
import qualified PGF.Parse as Parse import qualified PGF.Parse as Parse