mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-09 04:59:31 -06:00
Adding a basic lexicon-based tokenizer and the associated command in gf shell
This commit is contained in:
3
gf.cabal
3
gf.cabal
@@ -22,6 +22,7 @@ flag interrupt
|
||||
library
|
||||
build-depends: base >= 4.2 && <5,
|
||||
array,
|
||||
fst,
|
||||
containers,
|
||||
bytestring,
|
||||
random,
|
||||
@@ -42,6 +43,7 @@ library
|
||||
PGF.Expr
|
||||
PGF.Type
|
||||
PGF.Tree
|
||||
PGF.Tokenizer
|
||||
PGF.Paraphrase
|
||||
PGF.TypeCheck
|
||||
PGF.Binary
|
||||
@@ -72,6 +74,7 @@ executable gf
|
||||
containers,
|
||||
bytestring,
|
||||
filepath,
|
||||
fst,
|
||||
directory,
|
||||
random,
|
||||
old-time,
|
||||
|
||||
@@ -964,6 +964,13 @@ allCommands env@(pgf, mos) = Map.fromList [
|
||||
],
|
||||
flags = [("file","the output filename")]
|
||||
}),
|
||||
-- Shell command "t" / "tokenize"; the work is done by execToktok.
("t", emptyCommandInfo {
   longname = "tokenize",
   synopsis = "Tokenize string using the vocabulary",
   exec = execToktok env,
   options = [],
   flags = [("lang","The name of the concrete to use")]
   }),
|
||||
("ai", emptyCommandInfo {
|
||||
longname = "abstract_info",
|
||||
syntax = "ai IDENTIFIER or ai EXPR",
|
||||
@@ -1251,3 +1258,26 @@ prMorphoAnalysis (w,lps) =
|
||||
unlines (w:[showCId l ++ " : " ++ p | (l,p) <- lps])
|
||||
|
||||
|
||||
-- | Executes the @t@ (@tokenize@) shell command.
--
-- The input is the string literal given as the single argument; any other
-- argument shape is treated as the empty string.
--
-- * Without a @-lang@ flag, the tokenizer of every language reported by
--   @languages pgf@ is applied to the input and the resulting token lists
--   are concatenated.
-- * With @-lang=L@, only the tokenizer for @L@ is used; if @L@ is not a
--   language of the grammar, no expressions are returned and the textual
--   output is an \"Unknown language\" message.
--
-- A tokenizer that yields 'Nothing' contributes no tokens. The result pairs
-- the tokens as string-literal expressions with the same tokens, one per
-- line, as text.
execToktok :: PGFEnv -> [Option] -> [Expr] -> IO CommandOutput
execToktok (pgf, _) opts exprs =
  case getLang opts of
    Nothing   -> respond (concatMap tokens (Map.elems tokenizers))
    Just lang -> case Map.lookup lang tokenizers of
      Just tok -> respond (tokens tok)
      Nothing  -> return ([], "Unknown language: " ++ show lang)
  where
    -- One tokenizer per concrete language of the grammar.
    tokenizers = Map.fromList [(l, mkTokenizer pgf l) | l <- languages pgf]

    -- Only a single string-literal argument is accepted as input.
    input = case exprs of
      [ELit (LStr s)] -> s
      _               -> ""

    -- Run one tokenizer on the input; a failed tokenization gives no tokens.
    tokens tok = maybe [] id (tok input)

    -- Package the tokens both as expressions and as printable text.
    respond output = return ([ELit (LStr o) | o <- output], unlines output)

    -- The first @-lang@ flag carrying an identifier decides the language.
    getLang :: [Option] -> Maybe Language
    getLang [] = Nothing
    getLang (OFlag "lang" (VId l) : _) = readLanguage l
    getLang (_ : os) = getLang os
|
||||
|
||||
@@ -109,6 +109,9 @@ module PGF(
|
||||
-- ** Morphological Analysis
|
||||
Lemma, Analysis, Morpho,
|
||||
lookupMorpho, buildMorpho, fullFormLexicon,
|
||||
|
||||
-- ** Tokenizing
|
||||
mkTokenizer,
|
||||
|
||||
-- ** Visualizations
|
||||
graphvizAbstractTree,
|
||||
@@ -141,6 +144,7 @@ import PGF.Expr (Tree)
|
||||
import PGF.Morphology
|
||||
import PGF.Data
|
||||
import PGF.Binary
|
||||
import PGF.Tokenizer
|
||||
import qualified PGF.Forest as Forest
|
||||
import qualified PGF.Parse as Parse
|
||||
|
||||
|
||||
Reference in New Issue
Block a user