mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-05-22 17:42:51 -06:00
drop the dependency to FST
This commit is contained in:
@@ -1120,13 +1120,6 @@ allCommands = Map.fromList [
|
|||||||
],
|
],
|
||||||
flags = [("file","the output filename")]
|
flags = [("file","the output filename")]
|
||||||
}),
|
}),
|
||||||
("t", emptyCommandInfo {
|
|
||||||
longname = "tokenize",
|
|
||||||
synopsis = "Tokenize string using the vocabulary",
|
|
||||||
exec = execToktok,
|
|
||||||
options = [],
|
|
||||||
flags = [("lang","The name of the concrete to use")]
|
|
||||||
}),
|
|
||||||
("ai", emptyCommandInfo {
|
("ai", emptyCommandInfo {
|
||||||
longname = "abstract_info",
|
longname = "abstract_info",
|
||||||
syntax = "ai IDENTIFIER or ai EXPR",
|
syntax = "ai IDENTIFIER or ai EXPR",
|
||||||
@@ -1426,32 +1419,6 @@ prMorphoAnalysis (w,lps) =
|
|||||||
unlines (w:[showCId l ++ " : " ++ p | (l,p) <- lps])
|
unlines (w:[showCId l ++ " : " ++ p | (l,p) <- lps])
|
||||||
|
|
||||||
|
|
||||||
-- This function is to be excuted when the command 'tok' is parsed
|
|
||||||
execToktok :: Monad m => PGFEnv -> [Option] -> [Expr] -> m CommandOutput
|
|
||||||
execToktok (pgf, _) opts exprs = do
|
|
||||||
let tokenizers = Map.fromList [ (l, mkTokenizer pgf l) | l <- languages pgf]
|
|
||||||
case getLang opts of
|
|
||||||
Nothing -> do
|
|
||||||
let output = concatMap toStringList [t input | (_,t) <- Map.toList tokenizers]
|
|
||||||
return (fromStrings output)
|
|
||||||
Just lang -> case Map.lookup lang tokenizers of
|
|
||||||
Just tok -> do
|
|
||||||
let output = toStringList $ tok input
|
|
||||||
return (fromStrings output)
|
|
||||||
Nothing -> return (pipeMessage ("Unknown language: " ++ show lang))
|
|
||||||
where input = case exprs of
|
|
||||||
[ELit (LStr s)] -> s
|
|
||||||
_ -> ""
|
|
||||||
toStringList :: Maybe [String] -> [String]
|
|
||||||
toStringList Nothing = []
|
|
||||||
toStringList (Just l) = l
|
|
||||||
getLang :: [Option] -> Maybe Language
|
|
||||||
getLang [] = Nothing
|
|
||||||
getLang (OFlag "lang" (VId l):_) = readLanguage l
|
|
||||||
getLang (_:os) = getLang os
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
trie = render . pptss . toTrie . map toATree
|
trie = render . pptss . toTrie . map toATree
|
||||||
where
|
where
|
||||||
pptss [ts] = "*"<+>nest 2 (ppts ts)
|
pptss [ts] = "*"<+>nest 2 (ppts ts)
|
||||||
|
|||||||
@@ -119,9 +119,6 @@ module PGF(
|
|||||||
-- extra:
|
-- extra:
|
||||||
morphoKnown, isInMorpho,
|
morphoKnown, isInMorpho,
|
||||||
|
|
||||||
-- ** Tokenizing
|
|
||||||
mkTokenizer,
|
|
||||||
|
|
||||||
-- ** Visualizations
|
-- ** Visualizations
|
||||||
graphvizAbstractTree,
|
graphvizAbstractTree,
|
||||||
graphvizParseTree,
|
graphvizParseTree,
|
||||||
@@ -165,7 +162,6 @@ import PGF.Expr (Tree)
|
|||||||
import PGF.Morphology
|
import PGF.Morphology
|
||||||
import PGF.Data
|
import PGF.Data
|
||||||
import PGF.Binary()
|
import PGF.Binary()
|
||||||
import PGF.Tokenizer
|
|
||||||
import qualified PGF.Forest as Forest
|
import qualified PGF.Forest as Forest
|
||||||
import qualified PGF.Parse as Parse
|
import qualified PGF.Parse as Parse
|
||||||
import PGF.Utilities(replace)
|
import PGF.Utilities(replace)
|
||||||
|
|||||||
@@ -1,113 +0,0 @@
|
|||||||
{-
|
|
||||||
GF Tokenizer.
|
|
||||||
|
|
||||||
In this module are implemented function that build a fst-based tokenizer
|
|
||||||
from a Concrete grammar.
|
|
||||||
|
|
||||||
-}
|
|
||||||
{-# LANGUAGE CPP #-}
|
|
||||||
module PGF.Tokenizer
|
|
||||||
( mkTokenizer
|
|
||||||
) where
|
|
||||||
|
|
||||||
#if MIN_VERSION_base(4,8,0)
|
|
||||||
import Prelude hiding ((<*>))
|
|
||||||
#endif
|
|
||||||
--import Data.List (intercalate)
|
|
||||||
--import Test.QuickCheck
|
|
||||||
import FST.TransducerInterface
|
|
||||||
import PGF.Morphology (fullFormLexicon, buildMorpho)
|
|
||||||
import PGF.Data (PGF, Language)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
data LexSymbol = Tok String
|
|
||||||
deriving (Show, Read)
|
|
||||||
|
|
||||||
type Lexicon = [LexSymbol]
|
|
||||||
|
|
||||||
-- | This is the construction function. Given a PGF and a Language, it
|
|
||||||
-- extract the lexicon for this language and build a tokenization fst
|
|
||||||
-- from it.
|
|
||||||
mkTokenizer :: PGF -> Language -> (String -> Maybe [String])
|
|
||||||
mkTokenizer pgf lang = mkTrans lexicon
|
|
||||||
where lexicon = map (Tok . fst) lexicon'
|
|
||||||
lexicon' = fullFormLexicon $ buildMorpho pgf lang
|
|
||||||
|
|
||||||
mkTrans :: Lexicon -> (String -> Maybe [String])
|
|
||||||
mkTrans = applyDown . lexiconTrans
|
|
||||||
|
|
||||||
lexiconTrans :: Lexicon -> Transducer Char
|
|
||||||
lexiconTrans lexicon = compile (words |> star ((spaces <|> glue) |> words)) "abcdefghijklmnopqrstuvwxyz "
|
|
||||||
where words = foldr (<|>) (empty) $ map tokToRR lexicon
|
|
||||||
glue = eps <*> stringReg " &+ "
|
|
||||||
|
|
||||||
stringReg :: String -> Reg Char
|
|
||||||
stringReg str = foldr (\x y -> s x |> y) eps str
|
|
||||||
|
|
||||||
tokToRR:: LexSymbol -> RReg Char
|
|
||||||
tokToRR (Tok str) = foldr ((|>) . idR . s) (idR eps) str
|
|
||||||
|
|
||||||
spaces :: RReg Char
|
|
||||||
spaces = idR $ s ' '
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
-- TESTING
|
|
||||||
|
|
||||||
-- verry small test lexicon
|
|
||||||
-- testLexicon :: Lexicon
|
|
||||||
-- testLexicon
|
|
||||||
-- = [ Tok "car"
|
|
||||||
-- , Tok "elf"
|
|
||||||
-- ]
|
|
||||||
|
|
||||||
-- myTrans :: String -> Maybe [String]
|
|
||||||
-- myTrans = mkTrans testLexicon
|
|
||||||
|
|
||||||
-- data TestCase = TestCase String String
|
|
||||||
-- deriving (Show, Read)
|
|
||||||
|
|
||||||
-- instance Arbitrary TestCase where
|
|
||||||
-- arbitrary = arbitraryTestCase
|
|
||||||
-- --coarbitrary c = variant (ord c `rem` 4)
|
|
||||||
|
|
||||||
-- arbitraryTestCase:: Gen TestCase
|
|
||||||
-- arbitraryTestCase = do
|
|
||||||
-- words <- listOf1 $ elements [t | Tok t <- testLexicon]
|
|
||||||
-- tokens <- intercalateSometime "+&+" words
|
|
||||||
-- return $ TestCase (linearize tokens) (intercalate " " tokens)
|
|
||||||
-- where intercalateSometime :: a -> [a] -> Gen [a]
|
|
||||||
-- intercalateSometime x (x1:x2:xs) = do
|
|
||||||
-- b <- arbitrary
|
|
||||||
-- let pre = case b of
|
|
||||||
-- True -> x1:x:[]
|
|
||||||
-- False -> x1:[]
|
|
||||||
-- suf <- intercalateSometime x (x2:xs)
|
|
||||||
-- return (pre++suf)
|
|
||||||
-- intercalateSometime _ xs = return xs
|
|
||||||
|
|
||||||
-- linearize :: [String] -> String
|
|
||||||
-- linearize = linearize' False
|
|
||||||
-- where linearize' _ [] = "" -- boolean indicate if the last token was a real word and not +&+
|
|
||||||
-- linearize' _ ("+&+":ss) = linearize' False ss
|
|
||||||
-- linearize' True (s:ss) = ' ':s ++ linearize' True ss
|
|
||||||
-- linearize' False (s:ss) = s ++ linearize' True ss
|
|
||||||
|
|
||||||
-- testTrans :: (String -> Maybe [String]) -> TestCase -> Bool
|
|
||||||
-- testTrans t (TestCase s1 s2) =
|
|
||||||
-- case t s1 of
|
|
||||||
-- Nothing -> False
|
|
||||||
-- Just l -> elem s2 l
|
|
||||||
|
|
||||||
-- main :: IO ()
|
|
||||||
-- main = do
|
|
||||||
-- putStrLn "\n=== Transducer ==="
|
|
||||||
-- print $ lexiconTrans lexicon
|
|
||||||
-- putStrLn "\n=== example output ==="
|
|
||||||
-- putStrLn $ "Input: " ++ show "car elfcar elf"
|
|
||||||
-- putStrLn $ "Output: " ++ (show $ mkTrans lexicon "car elfcar elf")
|
|
||||||
-- putStrLn "\n=== QuickCheck tests ==="
|
|
||||||
-- quickCheck (testTrans myTrans)
|
|
||||||
-- putStrLn "\n=== Examples of test cases ==="
|
|
||||||
-- sample (arbitrary :: Gen TestCase)
|
|
||||||
Reference in New Issue
Block a user