drop the dependency to FST

This commit is contained in:
krasimir
2015-04-20 11:56:13 +00:00
parent 79b710a6b7
commit 61e37c9e5e
3 changed files with 0 additions and 150 deletions

View File

@@ -1120,13 +1120,6 @@ allCommands = Map.fromList [
], ],
flags = [("file","the output filename")] flags = [("file","the output filename")]
}), }),
("t", emptyCommandInfo {
longname = "tokenize",
synopsis = "Tokenize string using the vocabulary",
exec = execToktok,
options = [],
flags = [("lang","The name of the concrete to use")]
}),
("ai", emptyCommandInfo { ("ai", emptyCommandInfo {
longname = "abstract_info", longname = "abstract_info",
syntax = "ai IDENTIFIER or ai EXPR", syntax = "ai IDENTIFIER or ai EXPR",
@@ -1426,32 +1419,6 @@ prMorphoAnalysis (w,lps) =
unlines (w:[showCId l ++ " : " ++ p | (l,p) <- lps]) unlines (w:[showCId l ++ " : " ++ p | (l,p) <- lps])
-- This function is to be excuted when the command 'tok' is parsed
execToktok :: Monad m => PGFEnv -> [Option] -> [Expr] -> m CommandOutput
execToktok (pgf, _) opts exprs = do
let tokenizers = Map.fromList [ (l, mkTokenizer pgf l) | l <- languages pgf]
case getLang opts of
Nothing -> do
let output = concatMap toStringList [t input | (_,t) <- Map.toList tokenizers]
return (fromStrings output)
Just lang -> case Map.lookup lang tokenizers of
Just tok -> do
let output = toStringList $ tok input
return (fromStrings output)
Nothing -> return (pipeMessage ("Unknown language: " ++ show lang))
where input = case exprs of
[ELit (LStr s)] -> s
_ -> ""
toStringList :: Maybe [String] -> [String]
toStringList Nothing = []
toStringList (Just l) = l
getLang :: [Option] -> Maybe Language
getLang [] = Nothing
getLang (OFlag "lang" (VId l):_) = readLanguage l
getLang (_:os) = getLang os
trie = render . pptss . toTrie . map toATree trie = render . pptss . toTrie . map toATree
where where
pptss [ts] = "*"<+>nest 2 (ppts ts) pptss [ts] = "*"<+>nest 2 (ppts ts)

View File

@@ -119,9 +119,6 @@ module PGF(
-- extra: -- extra:
morphoKnown, isInMorpho, morphoKnown, isInMorpho,
-- ** Tokenizing
mkTokenizer,
-- ** Visualizations -- ** Visualizations
graphvizAbstractTree, graphvizAbstractTree,
graphvizParseTree, graphvizParseTree,
@@ -165,7 +162,6 @@ import PGF.Expr (Tree)
import PGF.Morphology import PGF.Morphology
import PGF.Data import PGF.Data
import PGF.Binary() import PGF.Binary()
import PGF.Tokenizer
import qualified PGF.Forest as Forest import qualified PGF.Forest as Forest
import qualified PGF.Parse as Parse import qualified PGF.Parse as Parse
import PGF.Utilities(replace) import PGF.Utilities(replace)

View File

@@ -1,113 +0,0 @@
{-
GF Tokenizer.
In this module are implemented function that build a fst-based tokenizer
from a Concrete grammar.
-}
{-# LANGUAGE CPP #-}
module PGF.Tokenizer
( mkTokenizer
) where
#if MIN_VERSION_base(4,8,0)
import Prelude hiding ((<*>))
#endif
--import Data.List (intercalate)
--import Test.QuickCheck
import FST.TransducerInterface
import PGF.Morphology (fullFormLexicon, buildMorpho)
import PGF.Data (PGF, Language)
data LexSymbol = Tok String
deriving (Show, Read)
type Lexicon = [LexSymbol]
-- | This is the construction function. Given a PGF and a Language, it
-- extract the lexicon for this language and build a tokenization fst
-- from it.
mkTokenizer :: PGF -> Language -> (String -> Maybe [String])
mkTokenizer pgf lang = mkTrans lexicon
where lexicon = map (Tok . fst) lexicon'
lexicon' = fullFormLexicon $ buildMorpho pgf lang
mkTrans :: Lexicon -> (String -> Maybe [String])
mkTrans = applyDown . lexiconTrans
lexiconTrans :: Lexicon -> Transducer Char
lexiconTrans lexicon = compile (words |> star ((spaces <|> glue) |> words)) "abcdefghijklmnopqrstuvwxyz "
where words = foldr (<|>) (empty) $ map tokToRR lexicon
glue = eps <*> stringReg " &+ "
stringReg :: String -> Reg Char
stringReg str = foldr (\x y -> s x |> y) eps str
tokToRR:: LexSymbol -> RReg Char
tokToRR (Tok str) = foldr ((|>) . idR . s) (idR eps) str
spaces :: RReg Char
spaces = idR $ s ' '
-- TESTING
-- verry small test lexicon
-- testLexicon :: Lexicon
-- testLexicon
-- = [ Tok "car"
-- , Tok "elf"
-- ]
-- myTrans :: String -> Maybe [String]
-- myTrans = mkTrans testLexicon
-- data TestCase = TestCase String String
-- deriving (Show, Read)
-- instance Arbitrary TestCase where
-- arbitrary = arbitraryTestCase
-- --coarbitrary c = variant (ord c `rem` 4)
-- arbitraryTestCase:: Gen TestCase
-- arbitraryTestCase = do
-- words <- listOf1 $ elements [t | Tok t <- testLexicon]
-- tokens <- intercalateSometime "+&+" words
-- return $ TestCase (linearize tokens) (intercalate " " tokens)
-- where intercalateSometime :: a -> [a] -> Gen [a]
-- intercalateSometime x (x1:x2:xs) = do
-- b <- arbitrary
-- let pre = case b of
-- True -> x1:x:[]
-- False -> x1:[]
-- suf <- intercalateSometime x (x2:xs)
-- return (pre++suf)
-- intercalateSometime _ xs = return xs
-- linearize :: [String] -> String
-- linearize = linearize' False
-- where linearize' _ [] = "" -- boolean indicate if the last token was a real word and not +&+
-- linearize' _ ("+&+":ss) = linearize' False ss
-- linearize' True (s:ss) = ' ':s ++ linearize' True ss
-- linearize' False (s:ss) = s ++ linearize' True ss
-- testTrans :: (String -> Maybe [String]) -> TestCase -> Bool
-- testTrans t (TestCase s1 s2) =
-- case t s1 of
-- Nothing -> False
-- Just l -> elem s2 l
-- main :: IO ()
-- main = do
-- putStrLn "\n=== Transducer ==="
-- print $ lexiconTrans lexicon
-- putStrLn "\n=== example output ==="
-- putStrLn $ "Input: " ++ show "car elfcar elf"
-- putStrLn $ "Output: " ++ (show $ mkTrans lexicon "car elfcar elf")
-- putStrLn "\n=== QuickCheck tests ==="
-- quickCheck (testTrans myTrans)
-- putStrLn "\n=== Examples of test cases ==="
-- sample (arbitrary :: Gen TestCase)