drop the dependency to FST

This commit is contained in:
krasimir
2015-04-20 11:56:13 +00:00
parent 5d7633fdb4
commit 8c697b72a4
3 changed files with 0 additions and 150 deletions

View File

@@ -1120,13 +1120,6 @@ allCommands = Map.fromList [
],
flags = [("file","the output filename")]
}),
("t", emptyCommandInfo {
longname = "tokenize",
synopsis = "Tokenize string using the vocabulary",
exec = execToktok,
options = [],
flags = [("lang","The name of the concrete to use")]
}),
("ai", emptyCommandInfo {
longname = "abstract_info",
syntax = "ai IDENTIFIER or ai EXPR",
@@ -1426,32 +1419,6 @@ prMorphoAnalysis (w,lps) =
unlines (w:[showCId l ++ " : " ++ p | (l,p) <- lps])
-- This function is to be excuted when the command 'tok' is parsed
execToktok :: Monad m => PGFEnv -> [Option] -> [Expr] -> m CommandOutput
execToktok (pgf, _) opts exprs = do
let tokenizers = Map.fromList [ (l, mkTokenizer pgf l) | l <- languages pgf]
case getLang opts of
Nothing -> do
let output = concatMap toStringList [t input | (_,t) <- Map.toList tokenizers]
return (fromStrings output)
Just lang -> case Map.lookup lang tokenizers of
Just tok -> do
let output = toStringList $ tok input
return (fromStrings output)
Nothing -> return (pipeMessage ("Unknown language: " ++ show lang))
where input = case exprs of
[ELit (LStr s)] -> s
_ -> ""
toStringList :: Maybe [String] -> [String]
toStringList Nothing = []
toStringList (Just l) = l
getLang :: [Option] -> Maybe Language
getLang [] = Nothing
getLang (OFlag "lang" (VId l):_) = readLanguage l
getLang (_:os) = getLang os
trie = render . pptss . toTrie . map toATree
where
pptss [ts] = "*"<+>nest 2 (ppts ts)

View File

@@ -119,9 +119,6 @@ module PGF(
-- extra:
morphoKnown, isInMorpho,
-- ** Tokenizing
mkTokenizer,
-- ** Visualizations
graphvizAbstractTree,
graphvizParseTree,
@@ -165,7 +162,6 @@ import PGF.Expr (Tree)
import PGF.Morphology
import PGF.Data
import PGF.Binary()
import PGF.Tokenizer
import qualified PGF.Forest as Forest
import qualified PGF.Parse as Parse
import PGF.Utilities(replace)

View File

@@ -1,113 +0,0 @@
{-
GF Tokenizer.
In this module are implemented function that build a fst-based tokenizer
from a Concrete grammar.
-}
{-# LANGUAGE CPP #-}
module PGF.Tokenizer
( mkTokenizer
) where
#if MIN_VERSION_base(4,8,0)
import Prelude hiding ((<*>))
#endif
--import Data.List (intercalate)
--import Test.QuickCheck
import FST.TransducerInterface
import PGF.Morphology (fullFormLexicon, buildMorpho)
import PGF.Data (PGF, Language)
data LexSymbol = Tok String
deriving (Show, Read)
type Lexicon = [LexSymbol]
-- | This is the construction function. Given a PGF and a Language, it
-- extract the lexicon for this language and build a tokenization fst
-- from it.
mkTokenizer :: PGF -> Language -> (String -> Maybe [String])
mkTokenizer pgf lang = mkTrans lexicon
where lexicon = map (Tok . fst) lexicon'
lexicon' = fullFormLexicon $ buildMorpho pgf lang
mkTrans :: Lexicon -> (String -> Maybe [String])
mkTrans = applyDown . lexiconTrans
lexiconTrans :: Lexicon -> Transducer Char
lexiconTrans lexicon = compile (words |> star ((spaces <|> glue) |> words)) "abcdefghijklmnopqrstuvwxyz "
where words = foldr (<|>) (empty) $ map tokToRR lexicon
glue = eps <*> stringReg " &+ "
stringReg :: String -> Reg Char
stringReg str = foldr (\x y -> s x |> y) eps str
tokToRR:: LexSymbol -> RReg Char
tokToRR (Tok str) = foldr ((|>) . idR . s) (idR eps) str
spaces :: RReg Char
spaces = idR $ s ' '
-- TESTING
-- verry small test lexicon
-- testLexicon :: Lexicon
-- testLexicon
-- = [ Tok "car"
-- , Tok "elf"
-- ]
-- myTrans :: String -> Maybe [String]
-- myTrans = mkTrans testLexicon
-- data TestCase = TestCase String String
-- deriving (Show, Read)
-- instance Arbitrary TestCase where
-- arbitrary = arbitraryTestCase
-- --coarbitrary c = variant (ord c `rem` 4)
-- arbitraryTestCase:: Gen TestCase
-- arbitraryTestCase = do
-- words <- listOf1 $ elements [t | Tok t <- testLexicon]
-- tokens <- intercalateSometime "+&+" words
-- return $ TestCase (linearize tokens) (intercalate " " tokens)
-- where intercalateSometime :: a -> [a] -> Gen [a]
-- intercalateSometime x (x1:x2:xs) = do
-- b <- arbitrary
-- let pre = case b of
-- True -> x1:x:[]
-- False -> x1:[]
-- suf <- intercalateSometime x (x2:xs)
-- return (pre++suf)
-- intercalateSometime _ xs = return xs
-- linearize :: [String] -> String
-- linearize = linearize' False
-- where linearize' _ [] = "" -- boolean indicate if the last token was a real word and not +&+
-- linearize' _ ("+&+":ss) = linearize' False ss
-- linearize' True (s:ss) = ' ':s ++ linearize' True ss
-- linearize' False (s:ss) = s ++ linearize' True ss
-- testTrans :: (String -> Maybe [String]) -> TestCase -> Bool
-- testTrans t (TestCase s1 s2) =
-- case t s1 of
-- Nothing -> False
-- Just l -> elem s2 l
-- main :: IO ()
-- main = do
-- putStrLn "\n=== Transducer ==="
-- print $ lexiconTrans lexicon
-- putStrLn "\n=== example output ==="
-- putStrLn $ "Input: " ++ show "car elfcar elf"
-- putStrLn $ "Output: " ++ (show $ mkTrans lexicon "car elfcar elf")
-- putStrLn "\n=== QuickCheck tests ==="
-- quickCheck (testTrans myTrans)
-- putStrLn "\n=== Examples of test cases ==="
-- sample (arbitrary :: Gen TestCase)