Adding a basic lexicon-based tokenizer and the associated command in the GF shell

This commit is contained in:
gdetrez
2011-02-10 15:00:06 +00:00
parent d7ae73f1c7
commit 45ecae4b77
3 changed files with 37 additions and 0 deletions

View File

@@ -22,6 +22,7 @@ flag interrupt
library
build-depends: base >= 4.2 && <5,
array,
fst,
containers,
bytestring,
random,
@@ -42,6 +43,7 @@ library
PGF.Expr
PGF.Type
PGF.Tree
PGF.Tokenizer
PGF.Paraphrase
PGF.TypeCheck
PGF.Binary
@@ -72,6 +74,7 @@ executable gf
containers,
bytestring,
filepath,
fst,
directory,
random,
old-time,

View File

@@ -964,6 +964,13 @@ allCommands env@(pgf, mos) = Map.fromList [
],
flags = [("file","the output filename")]
}),
-- Shell command "t" (long name "tokenize"); its handler is execToktok.
("t", emptyCommandInfo {
     longname = "tokenize",
     synopsis = "Tokenize string using the vocabulary",
     exec = execToktok env,
     options = [],
     flags = [("lang","The name of the concrete to use")]
     }),
("ai", emptyCommandInfo {
longname = "abstract_info",
syntax = "ai IDENTIFIER or ai EXPR",
@@ -1251,3 +1258,26 @@ prMorphoAnalysis (w,lps) =
unlines (w:[showCId l ++ " : " ++ p | (l,p) <- lps])
-- | Handler for the @tokenize@ (@t@) shell command.
--
-- The text to tokenize is taken from the arguments only when they consist of
-- exactly one string literal; any other argument list is treated as the empty
-- string. When a @lang@ flag is given and 'readLanguage' accepts its value,
-- only the tokenizer of that language is applied, and a language that has no
-- tokenizer is reported as unknown. Otherwise the tokenizers of all languages
-- returned by 'languages' are applied and their results concatenated. Each
-- resulting token is returned both as a string-literal expression and as one
-- line of the textual output.
execToktok :: PGFEnv -> [Option] -> [Expr] -> IO CommandOutput
execToktok (pgf, _) opts exprs =
  case requestedLang of
    Nothing   -> emit (concatMap (tokensOf . snd) (Map.toList tokenizerTable))
    Just lang ->
      case Map.lookup lang tokenizerTable of
        Nothing       -> return ([], "Unknown language: " ++ show lang)
        Just tokenize -> emit (tokensOf tokenize)
  where
    -- One tokenizer per language returned by 'languages'.
    tokenizerTable = Map.fromList [ (l, mkTokenizer pgf l) | l <- languages pgf ]

    -- The string to tokenize; anything but a single string literal means "".
    text = case exprs of
      [ELit (LStr s)] -> s
      _               -> ""

    -- A tokenizer that yields 'Nothing' contributes no tokens.
    tokensOf :: (String -> Maybe [String]) -> [String]
    tokensOf tokenize = maybe [] id (tokenize text)

    -- Package the tokens as literal expressions plus one token per line.
    emit :: [String] -> IO CommandOutput
    emit toks = return (map (ELit . LStr) toks, unlines toks)

    -- Only the first @lang@ flag carrying an identifier value is considered.
    requestedLang :: Maybe Language
    requestedLang = case [ l | OFlag "lang" (VId l) <- opts ] of
      l : _ -> readLanguage l
      []    -> Nothing

View File

@@ -109,6 +109,9 @@ module PGF(
-- ** Morphological Analysis
Lemma, Analysis, Morpho,
lookupMorpho, buildMorpho, fullFormLexicon,
-- ** Tokenizing
mkTokenizer,
-- ** Visualizations
graphvizAbstractTree,
@@ -141,6 +144,7 @@ import PGF.Expr (Tree)
import PGF.Morphology
import PGF.Data
import PGF.Binary
import PGF.Tokenizer
import qualified PGF.Forest as Forest
import qualified PGF.Parse as Parse