forked from GitHub/gf-core
Adding a basic lexicon-based tokenizer and the associated command in gf shell
This commit is contained in:
3
gf.cabal
3
gf.cabal
@@ -22,6 +22,7 @@ flag interrupt
|
|||||||
library
|
library
|
||||||
build-depends: base >= 4.2 && <5,
|
build-depends: base >= 4.2 && <5,
|
||||||
array,
|
array,
|
||||||
|
fst,
|
||||||
containers,
|
containers,
|
||||||
bytestring,
|
bytestring,
|
||||||
random,
|
random,
|
||||||
@@ -42,6 +43,7 @@ library
|
|||||||
PGF.Expr
|
PGF.Expr
|
||||||
PGF.Type
|
PGF.Type
|
||||||
PGF.Tree
|
PGF.Tree
|
||||||
|
PGF.Tokenizer
|
||||||
PGF.Paraphrase
|
PGF.Paraphrase
|
||||||
PGF.TypeCheck
|
PGF.TypeCheck
|
||||||
PGF.Binary
|
PGF.Binary
|
||||||
@@ -72,6 +74,7 @@ executable gf
|
|||||||
containers,
|
containers,
|
||||||
bytestring,
|
bytestring,
|
||||||
filepath,
|
filepath,
|
||||||
|
fst,
|
||||||
directory,
|
directory,
|
||||||
random,
|
random,
|
||||||
old-time,
|
old-time,
|
||||||
|
|||||||
@@ -964,6 +964,13 @@ allCommands env@(pgf, mos) = Map.fromList [
|
|||||||
],
|
],
|
||||||
flags = [("file","the output filename")]
|
flags = [("file","the output filename")]
|
||||||
}),
|
}),
|
||||||
|
("t", emptyCommandInfo {
|
||||||
|
longname = "tokenize",
|
||||||
|
synopsis = "Tokenize string using the vocabulary",
|
||||||
|
exec = execToktok env,
|
||||||
|
options = [],
|
||||||
|
flags = [("lang","The name of the concrete to use")]
|
||||||
|
}),
|
||||||
("ai", emptyCommandInfo {
|
("ai", emptyCommandInfo {
|
||||||
longname = "abstract_info",
|
longname = "abstract_info",
|
||||||
syntax = "ai IDENTIFIER or ai EXPR",
|
syntax = "ai IDENTIFIER or ai EXPR",
|
||||||
@@ -1251,3 +1258,26 @@ prMorphoAnalysis (w,lps) =
|
|||||||
unlines (w:[showCId l ++ " : " ++ p | (l,p) <- lps])
|
unlines (w:[showCId l ++ " : " ++ p | (l,p) <- lps])
|
||||||
|
|
||||||
|
|
||||||
|
-- This function is to be executed when the command 'tok' is parsed
|
||||||
|
execToktok :: PGFEnv -> [Option] -> [Expr] -> IO CommandOutput
|
||||||
|
execToktok (pgf, _) opts exprs = do
|
||||||
|
let tokenizers = Map.fromList [ (l, mkTokenizer pgf l) | l <- languages pgf]
|
||||||
|
case getLang opts of
|
||||||
|
Nothing -> do
|
||||||
|
let output = concatMap toStringList [t input | (_,t) <- Map.toList tokenizers]
|
||||||
|
return ([ELit $ LStr o | o <- output],unlines output)
|
||||||
|
Just lang -> case Map.lookup lang tokenizers of
|
||||||
|
Just tok -> do
|
||||||
|
let output = toStringList $ tok input
|
||||||
|
return ([ELit $ LStr o | o <- output],unlines output)
|
||||||
|
Nothing -> return ([],"Unknown language: " ++ show lang)
|
||||||
|
where input = case exprs of
|
||||||
|
[ELit (LStr s)] -> s
|
||||||
|
_ -> ""
|
||||||
|
toStringList :: Maybe [String] -> [String]
|
||||||
|
toStringList Nothing = []
|
||||||
|
toStringList (Just l) = l
|
||||||
|
getLang :: [Option] -> Maybe Language
|
||||||
|
getLang [] = Nothing
|
||||||
|
getLang (OFlag "lang" (VId l):_) = readLanguage l
|
||||||
|
getLang (_:os) = getLang os
|
||||||
|
|||||||
@@ -109,6 +109,9 @@ module PGF(
|
|||||||
-- ** Morphological Analysis
|
-- ** Morphological Analysis
|
||||||
Lemma, Analysis, Morpho,
|
Lemma, Analysis, Morpho,
|
||||||
lookupMorpho, buildMorpho, fullFormLexicon,
|
lookupMorpho, buildMorpho, fullFormLexicon,
|
||||||
|
|
||||||
|
-- ** Tokenizing
|
||||||
|
mkTokenizer,
|
||||||
|
|
||||||
-- ** Visualizations
|
-- ** Visualizations
|
||||||
graphvizAbstractTree,
|
graphvizAbstractTree,
|
||||||
@@ -141,6 +144,7 @@ import PGF.Expr (Tree)
|
|||||||
import PGF.Morphology
|
import PGF.Morphology
|
||||||
import PGF.Data
|
import PGF.Data
|
||||||
import PGF.Binary
|
import PGF.Binary
|
||||||
|
import PGF.Tokenizer
|
||||||
import qualified PGF.Forest as Forest
|
import qualified PGF.Forest as Forest
|
||||||
import qualified PGF.Parse as Parse
|
import qualified PGF.Parse as Parse
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user