forked from GitHub/gf-core
drop the dependency to FST
This commit is contained in:
@@ -1120,13 +1120,6 @@ allCommands = Map.fromList [
|
||||
],
|
||||
flags = [("file","the output filename")]
|
||||
}),
|
||||
("t", emptyCommandInfo {
|
||||
longname = "tokenize",
|
||||
synopsis = "Tokenize string using the vocabulary",
|
||||
exec = execToktok,
|
||||
options = [],
|
||||
flags = [("lang","The name of the concrete to use")]
|
||||
}),
|
||||
("ai", emptyCommandInfo {
|
||||
longname = "abstract_info",
|
||||
syntax = "ai IDENTIFIER or ai EXPR",
|
||||
@@ -1426,32 +1419,6 @@ prMorphoAnalysis (w,lps) =
|
||||
unlines (w:[showCId l ++ " : " ++ p | (l,p) <- lps])
|
||||
|
||||
|
||||
-- | Executed when the tokenize command is parsed: tokenizes the input
-- string (the single string-literal argument, if any) using either the
-- concrete syntax named by the "lang" flag or, absent that flag, every
-- language of the loaded PGF in turn.
execToktok :: Monad m => PGFEnv -> [Option] -> [Expr] -> m CommandOutput
execToktok (pgf, _) opts exprs =
  case langFlag opts of
    Nothing ->
      -- No language requested: run every tokenizer and concatenate results.
      return $ fromStrings $
        concatMap (unpack . ($ input) . snd) (Map.toList tokenizers)
    Just lang ->
      case Map.lookup lang tokenizers of
        Just tok -> return (fromStrings (unpack (tok input)))
        Nothing  -> return (pipeMessage ("Unknown language: " ++ show lang))
  where
    -- One tokenizer per concrete language of the grammar.
    tokenizers = Map.fromList [ (l, mkTokenizer pgf l) | l <- languages pgf ]

    -- The string to tokenize; anything but a single string literal
    -- yields the empty string.
    input = case exprs of
              [ELit (LStr s)] -> s
              _               -> ""

    -- Flatten the tokenizer's optional result.
    unpack :: Maybe [String] -> [String]
    unpack = maybe [] id

    -- First "lang" flag with an identifier value, read as a Language
    -- (Nothing if absent or unreadable).
    langFlag :: [Option] -> Maybe Language
    langFlag os = case [ l | OFlag "lang" (VId l) <- os ] of
                    (l:_) -> readLanguage l
                    []    -> Nothing
|
||||
|
||||
|
||||
|
||||
trie = render . pptss . toTrie . map toATree
|
||||
where
|
||||
pptss [ts] = "*"<+>nest 2 (ppts ts)
|
||||
|
||||
@@ -119,9 +119,6 @@ module PGF(
|
||||
-- extra:
|
||||
morphoKnown, isInMorpho,
|
||||
|
||||
-- ** Tokenizing
|
||||
mkTokenizer,
|
||||
|
||||
-- ** Visualizations
|
||||
graphvizAbstractTree,
|
||||
graphvizParseTree,
|
||||
@@ -165,7 +162,6 @@ import PGF.Expr (Tree)
|
||||
import PGF.Morphology
|
||||
import PGF.Data
|
||||
import PGF.Binary()
|
||||
import PGF.Tokenizer
|
||||
import qualified PGF.Forest as Forest
|
||||
import qualified PGF.Parse as Parse
|
||||
import PGF.Utilities(replace)
|
||||
|
||||
@@ -1,113 +0,0 @@
|
||||
{-
   GF Tokenizer.

   This module implements functions that build an FST-based tokenizer
   from a concrete grammar.
-}
|
||||
{-# LANGUAGE CPP #-}
|
||||
module PGF.Tokenizer
|
||||
( mkTokenizer
|
||||
) where
|
||||
|
||||
#if MIN_VERSION_base(4,8,0)
|
||||
import Prelude hiding ((<*>))
|
||||
#endif
|
||||
--import Data.List (intercalate)
|
||||
--import Test.QuickCheck
|
||||
import FST.TransducerInterface
|
||||
import PGF.Morphology (fullFormLexicon, buildMorpho)
|
||||
import PGF.Data (PGF, Language)
|
||||
|
||||
|
||||
|
||||
-- | A lexicon entry: a single surface-form token.
data LexSymbol = Tok String
  deriving (Show, Read)

-- | A lexicon is simply the list of all known tokens.
type Lexicon = [LexSymbol]
|
||||
|
||||
-- | Build a tokenizer for one language of a PGF grammar: the language's
-- full-form lexicon is extracted and compiled into a tokenization
-- transducer, yielding a function from input strings to the possible
-- token sequences (Nothing when the input cannot be tokenized).
mkTokenizer :: PGF -> Language -> (String -> Maybe [String])
mkTokenizer pgf lang = mkTrans tokens
  where
    -- Only the surface forms are needed; analyses are discarded.
    tokens    = [ Tok w | (w, _) <- fullForms ]
    fullForms = fullFormLexicon (buildMorpho pgf lang)
|
||||
|
||||
-- | Turn a lexicon into a tokenizing function by compiling it to a
-- transducer and applying that transducer downward to the input.
mkTrans :: Lexicon -> (String -> Maybe [String])
mkTrans lexicon = applyDown (lexiconTrans lexicon)
|
||||
|
||||
-- | Compile a lexicon into a tokenizing transducer: one lexicon word,
-- followed by any number of further words each preceded by either a
-- literal space or an inserted " &+ " glue marker.
-- NOTE(review): the transducer alphabet is hard-coded to lowercase
-- ASCII letters plus space — inputs outside it won't tokenize; confirm
-- this restriction is intended.
lexiconTrans :: Lexicon -> Transducer Char
lexiconTrans lexicon = compile tokenizer "abcdefghijklmnopqrstuvwxyz "
  where
    tokenizer = wordRR |> star ((spaces <|> glue) |> wordRR)
    -- Union of all lexicon words as identity relations.
    wordRR    = foldr (<|>) empty (map tokToRR lexicon)
    -- Emit the glue marker " &+ " from empty input.
    glue      = eps <*> stringReg " &+ "
|
||||
|
||||
-- | A regular expression matching exactly the given string: the
-- concatenation of its characters, ending in the empty word.
stringReg :: String -> Reg Char
stringReg = foldr ((|>) . s) eps
|
||||
|
||||
-- | Lift a lexicon token to an identity relation: each character maps
-- to itself, concatenated in order.
tokToRR :: LexSymbol -> RReg Char
tokToRR (Tok str) = foldr (\c acc -> idR (s c) |> acc) (idR eps) str
|
||||
|
||||
-- | The identity relation on a single space character.
spaces :: RReg Char
spaces = idR (s ' ')
|
||||
|
||||
|
||||
|
||||
-- TESTING
|
||||
|
||||
-- very small test lexicon
|
||||
-- testLexicon :: Lexicon
|
||||
-- testLexicon
|
||||
-- = [ Tok "car"
|
||||
-- , Tok "elf"
|
||||
-- ]
|
||||
|
||||
-- myTrans :: String -> Maybe [String]
|
||||
-- myTrans = mkTrans testLexicon
|
||||
|
||||
-- data TestCase = TestCase String String
|
||||
-- deriving (Show, Read)
|
||||
|
||||
-- instance Arbitrary TestCase where
|
||||
-- arbitrary = arbitraryTestCase
|
||||
-- --coarbitrary c = variant (ord c `rem` 4)
|
||||
|
||||
-- arbitraryTestCase:: Gen TestCase
|
||||
-- arbitraryTestCase = do
|
||||
-- words <- listOf1 $ elements [t | Tok t <- testLexicon]
|
||||
-- tokens <- intercalateSometime "+&+" words
|
||||
-- return $ TestCase (linearize tokens) (intercalate " " tokens)
|
||||
-- where intercalateSometime :: a -> [a] -> Gen [a]
|
||||
-- intercalateSometime x (x1:x2:xs) = do
|
||||
-- b <- arbitrary
|
||||
-- let pre = case b of
|
||||
-- True -> x1:x:[]
|
||||
-- False -> x1:[]
|
||||
-- suf <- intercalateSometime x (x2:xs)
|
||||
-- return (pre++suf)
|
||||
-- intercalateSometime _ xs = return xs
|
||||
|
||||
-- linearize :: [String] -> String
|
||||
-- linearize = linearize' False
|
||||
-- where linearize' _ [] = "" -- boolean indicate if the last token was a real word and not +&+
|
||||
-- linearize' _ ("+&+":ss) = linearize' False ss
|
||||
-- linearize' True (s:ss) = ' ':s ++ linearize' True ss
|
||||
-- linearize' False (s:ss) = s ++ linearize' True ss
|
||||
|
||||
-- testTrans :: (String -> Maybe [String]) -> TestCase -> Bool
|
||||
-- testTrans t (TestCase s1 s2) =
|
||||
-- case t s1 of
|
||||
-- Nothing -> False
|
||||
-- Just l -> elem s2 l
|
||||
|
||||
-- main :: IO ()
|
||||
-- main = do
|
||||
-- putStrLn "\n=== Transducer ==="
|
||||
-- print $ lexiconTrans lexicon
|
||||
-- putStrLn "\n=== example output ==="
|
||||
-- putStrLn $ "Input: " ++ show "car elfcar elf"
|
||||
-- putStrLn $ "Output: " ++ (show $ mkTrans lexicon "car elfcar elf")
|
||||
-- putStrLn "\n=== QuickCheck tests ==="
|
||||
-- quickCheck (testTrans myTrans)
|
||||
-- putStrLn "\n=== Examples of test cases ==="
|
||||
-- sample (arbitrary :: Gen TestCase)
|
||||
Reference in New Issue
Block a user