mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-09 04:59:31 -06:00
Adding a missing file for the tokenizer...
This commit is contained in:
109
src/runtime/haskell/PGF/Tokenizer.hs
Normal file
109
src/runtime/haskell/PGF/Tokenizer.hs
Normal file
@@ -0,0 +1,109 @@
|
||||
{-
|
||||
GF Tokenizer.
|
||||
|
||||
In this module are implemented function that build a fst-based tokenizer
|
||||
from a Concrete grammar.
|
||||
|
||||
-}
|
||||
module PGF.Tokenizer
|
||||
( mkTokenizer
|
||||
) where
|
||||
|
||||
import Data.List (intercalate)
|
||||
--import Test.QuickCheck
|
||||
import FST.TransducerInterface
|
||||
import PGF.Morphology (fullFormLexicon, buildMorpho)
|
||||
import PGF.Data (PGF, Language)
|
||||
|
||||
|
||||
|
||||
data LexSymbol = Tok String
|
||||
deriving (Show, Read)
|
||||
|
||||
type Lexicon = [LexSymbol]
|
||||
|
||||
-- | This is the construction function. Given a PGF and a Language, it
|
||||
-- extract the lexicon for this language and build a tokenization fst
|
||||
-- from it.
|
||||
mkTokenizer :: PGF -> Language -> (String -> Maybe [String])
|
||||
mkTokenizer pgf lang = mkTrans lexicon
|
||||
where lexicon = map (Tok . fst) lexicon'
|
||||
lexicon' = fullFormLexicon $ buildMorpho pgf lang
|
||||
|
||||
mkTrans :: Lexicon -> (String -> Maybe [String])
|
||||
mkTrans = applyDown . lexiconTrans
|
||||
|
||||
lexiconTrans :: Lexicon -> Transducer Char
|
||||
lexiconTrans lexicon = compile (words |> star ((spaces <|> glue) |> words)) "abcdefghijklmnopqrstuvwxyz "
|
||||
where words = foldr (<|>) (empty) $ map tokToRR lexicon
|
||||
glue = eps <*> stringReg " &+ "
|
||||
|
||||
stringReg :: String -> Reg Char
|
||||
stringReg str = foldr (\x y -> s x |> y) eps str
|
||||
|
||||
tokToRR:: LexSymbol -> RReg Char
|
||||
tokToRR (Tok str) = foldr ((|>) . idR . s) (idR eps) str
|
||||
|
||||
spaces :: RReg Char
|
||||
spaces = idR $ s ' '
|
||||
|
||||
|
||||
|
||||
-- TESTING
|
||||
|
||||
-- verry small test lexicon
|
||||
-- testLexicon :: Lexicon
|
||||
-- testLexicon
|
||||
-- = [ Tok "car"
|
||||
-- , Tok "elf"
|
||||
-- ]
|
||||
|
||||
-- myTrans :: String -> Maybe [String]
|
||||
-- myTrans = mkTrans testLexicon
|
||||
|
||||
-- data TestCase = TestCase String String
|
||||
-- deriving (Show, Read)
|
||||
|
||||
-- instance Arbitrary TestCase where
|
||||
-- arbitrary = arbitraryTestCase
|
||||
-- --coarbitrary c = variant (ord c `rem` 4)
|
||||
|
||||
-- arbitraryTestCase:: Gen TestCase
|
||||
-- arbitraryTestCase = do
|
||||
-- words <- listOf1 $ elements [t | Tok t <- testLexicon]
|
||||
-- tokens <- intercalateSometime "+&+" words
|
||||
-- return $ TestCase (linearize tokens) (intercalate " " tokens)
|
||||
-- where intercalateSometime :: a -> [a] -> Gen [a]
|
||||
-- intercalateSometime x (x1:x2:xs) = do
|
||||
-- b <- arbitrary
|
||||
-- let pre = case b of
|
||||
-- True -> x1:x:[]
|
||||
-- False -> x1:[]
|
||||
-- suf <- intercalateSometime x (x2:xs)
|
||||
-- return (pre++suf)
|
||||
-- intercalateSometime _ xs = return xs
|
||||
|
||||
-- linearize :: [String] -> String
|
||||
-- linearize = linearize' False
|
||||
-- where linearize' _ [] = "" -- boolean indicate if the last token was a real word and not +&+
|
||||
-- linearize' _ ("+&+":ss) = linearize' False ss
|
||||
-- linearize' True (s:ss) = ' ':s ++ linearize' True ss
|
||||
-- linearize' False (s:ss) = s ++ linearize' True ss
|
||||
|
||||
-- testTrans :: (String -> Maybe [String]) -> TestCase -> Bool
|
||||
-- testTrans t (TestCase s1 s2) =
|
||||
-- case t s1 of
|
||||
-- Nothing -> False
|
||||
-- Just l -> elem s2 l
|
||||
|
||||
-- main :: IO ()
|
||||
-- main = do
|
||||
-- putStrLn "\n=== Transducer ==="
|
||||
-- print $ lexiconTrans lexicon
|
||||
-- putStrLn "\n=== example output ==="
|
||||
-- putStrLn $ "Input: " ++ show "car elfcar elf"
|
||||
-- putStrLn $ "Output: " ++ (show $ mkTrans lexicon "car elfcar elf")
|
||||
-- putStrLn "\n=== QuickCheck tests ==="
|
||||
-- quickCheck (testTrans myTrans)
|
||||
-- putStrLn "\n=== Examples of test cases ==="
|
||||
-- sample (arbitrary :: Gen TestCase)
|
||||
Reference in New Issue
Block a user