From 45ecae4b774aee96dcc3e9f2c5f82307982faa08 Mon Sep 17 00:00:00 2001 From: gdetrez Date: Thu, 10 Feb 2011 15:00:06 +0000 Subject: [PATCH] Adding a basic lexicon-based tokenizer and the associated command in gf shell --- gf.cabal | 3 +++ src/compiler/GF/Command/Commands.hs | 30 +++++++++++++++++++++++++++++ src/runtime/haskell/PGF.hs | 4 ++++ 3 files changed, 37 insertions(+) diff --git a/gf.cabal b/gf.cabal index 9d23e0dde..acd1ac2e3 100644 --- a/gf.cabal +++ b/gf.cabal @@ -22,6 +22,7 @@ flag interrupt library build-depends: base >= 4.2 && <5, array, + fst, containers, bytestring, random, @@ -42,6 +43,7 @@ library PGF.Expr PGF.Type PGF.Tree + PGF.Tokenizer PGF.Paraphrase PGF.TypeCheck PGF.Binary @@ -72,6 +74,7 @@ executable gf containers, bytestring, filepath, + fst, directory, random, old-time, diff --git a/src/compiler/GF/Command/Commands.hs b/src/compiler/GF/Command/Commands.hs index 1290666cb..00d8e427a 100644 --- a/src/compiler/GF/Command/Commands.hs +++ b/src/compiler/GF/Command/Commands.hs @@ -964,6 +964,13 @@ allCommands env@(pgf, mos) = Map.fromList [ ], flags = [("file","the output filename")] }), + ("t", emptyCommandInfo { + longname = "tokenize", + synopsis = "Tokenize string usng the vocabulary", + exec = execToktok env, + options = [], + flags = [("lang","The name of the concrete to use")] + }), ("ai", emptyCommandInfo { longname = "abstract_info", syntax = "ai IDENTIFIER or ai EXPR", @@ -1251,3 +1258,26 @@ prMorphoAnalysis (w,lps) = unlines (w:[showCId l ++ " : " ++ p | (l,p) <- lps]) +-- This function is to be executed when the command 't' (tokenize) is parsed +execToktok :: PGFEnv -> [Option] -> [Expr] -> IO CommandOutput +execToktok (pgf, _) opts exprs = do + let tokenizers = Map.fromList [ (l, mkTokenizer pgf l) | l <- languages pgf] + case getLang opts of + Nothing -> do + let output = concatMap toStringList [t input | (_,t) <- Map.toList tokenizers] + return ([ELit $ LStr o | o <- output],unlines output) + Just lang -> case Map.lookup lang tokenizers of 
+ Just tok -> do + let output = toStringList $ tok input + return ([ELit $ LStr o | o <- output],unlines output) + Nothing -> return ([],"Unknown language: " ++ show lang) + where input = case exprs of + [ELit (LStr s)] -> s + _ -> "" + toStringList :: Maybe [String] -> [String] + toStringList Nothing = [] + toStringList (Just l) = l + getLang :: [Option] -> Maybe Language + getLang [] = Nothing + getLang (OFlag "lang" (VId l):_) = readLanguage l + getLang (_:os) = getLang os diff --git a/src/runtime/haskell/PGF.hs b/src/runtime/haskell/PGF.hs index 42ef8aaff..8530d9a71 100644 --- a/src/runtime/haskell/PGF.hs +++ b/src/runtime/haskell/PGF.hs @@ -109,6 +109,9 @@ module PGF( -- ** Morphological Analysis Lemma, Analysis, Morpho, lookupMorpho, buildMorpho, fullFormLexicon, + + -- ** Tokenizing + mkTokenizer, -- ** Visualizations graphvizAbstractTree, @@ -141,6 +144,7 @@ import PGF.Expr (Tree) import PGF.Morphology import PGF.Data import PGF.Binary +import PGF.Tokenizer import qualified PGF.Forest as Forest import qualified PGF.Parse as Parse