1
0
forked from GitHub/gf-core
Files
gf-core/src/runtime/haskell/PGF/Tokenizer.hs
hallgren 3814841d7d Eliminate mutual dependencies between the GF compiler and the PGF library
+ References to modules under src/compiler have been eliminated from the PGF
  library (under src/runtime/haskell). Only two functions had to be moved (from
  GF.Data.Utilities to PGF.Utilities) to make this possible, other apparent
  dependencies turned out to be vacuous.

+ In gf.cabal, the GF executable no longer directly depends on the PGF library
  source directory, but only on the exposed library modules. This means that
  there is less duplication in gf.cabal and that the 30 modules in the
  PGF library will no longer be compiled twice while building GF.

  To make this possible, additional PGF library modules have been exposed, even
  though they should probably be considered for internal use only. They could
  be collected in a PGF.Internal module, or marked as "unstable", to make
  this explicit.

+ Also, by using the -fwarn-unused-imports flag, ~220 redundant imports were
  found and removed, reducing the total number of imports by ~15%.
2013-11-05 13:11:10 +00:00

110 lines
3.2 KiB
Haskell

{-
GF Tokenizer.
In this module are implemented function that build a fst-based tokenizer
from a Concrete grammar.
-}
module PGF.Tokenizer
( mkTokenizer
) where
--import Data.List (intercalate)
--import Test.QuickCheck
import FST.TransducerInterface
import PGF.Morphology (fullFormLexicon, buildMorpho)
import PGF.Data (PGF, Language)
data LexSymbol = Tok String
deriving (Show, Read)
type Lexicon = [LexSymbol]
-- | This is the construction function. Given a PGF and a Language, it
-- extract the lexicon for this language and build a tokenization fst
-- from it.
mkTokenizer :: PGF -> Language -> (String -> Maybe [String])
mkTokenizer pgf lang = mkTrans lexicon
where lexicon = map (Tok . fst) lexicon'
lexicon' = fullFormLexicon $ buildMorpho pgf lang
mkTrans :: Lexicon -> (String -> Maybe [String])
mkTrans = applyDown . lexiconTrans
lexiconTrans :: Lexicon -> Transducer Char
lexiconTrans lexicon = compile (words |> star ((spaces <|> glue) |> words)) "abcdefghijklmnopqrstuvwxyz "
where words = foldr (<|>) (empty) $ map tokToRR lexicon
glue = eps <*> stringReg " &+ "
stringReg :: String -> Reg Char
stringReg str = foldr (\x y -> s x |> y) eps str
tokToRR:: LexSymbol -> RReg Char
tokToRR (Tok str) = foldr ((|>) . idR . s) (idR eps) str
spaces :: RReg Char
spaces = idR $ s ' '
-- TESTING
-- verry small test lexicon
-- testLexicon :: Lexicon
-- testLexicon
-- = [ Tok "car"
-- , Tok "elf"
-- ]
-- myTrans :: String -> Maybe [String]
-- myTrans = mkTrans testLexicon
-- data TestCase = TestCase String String
-- deriving (Show, Read)
-- instance Arbitrary TestCase where
-- arbitrary = arbitraryTestCase
-- --coarbitrary c = variant (ord c `rem` 4)
-- arbitraryTestCase:: Gen TestCase
-- arbitraryTestCase = do
-- words <- listOf1 $ elements [t | Tok t <- testLexicon]
-- tokens <- intercalateSometime "+&+" words
-- return $ TestCase (linearize tokens) (intercalate " " tokens)
-- where intercalateSometime :: a -> [a] -> Gen [a]
-- intercalateSometime x (x1:x2:xs) = do
-- b <- arbitrary
-- let pre = case b of
-- True -> x1:x:[]
-- False -> x1:[]
-- suf <- intercalateSometime x (x2:xs)
-- return (pre++suf)
-- intercalateSometime _ xs = return xs
-- linearize :: [String] -> String
-- linearize = linearize' False
-- where linearize' _ [] = "" -- boolean indicate if the last token was a real word and not +&+
-- linearize' _ ("+&+":ss) = linearize' False ss
-- linearize' True (s:ss) = ' ':s ++ linearize' True ss
-- linearize' False (s:ss) = s ++ linearize' True ss
-- testTrans :: (String -> Maybe [String]) -> TestCase -> Bool
-- testTrans t (TestCase s1 s2) =
-- case t s1 of
-- Nothing -> False
-- Just l -> elem s2 l
-- main :: IO ()
-- main = do
-- putStrLn "\n=== Transducer ==="
-- print $ lexiconTrans lexicon
-- putStrLn "\n=== example output ==="
-- putStrLn $ "Input: " ++ show "car elfcar elf"
-- putStrLn $ "Output: " ++ (show $ mkTrans lexicon "car elfcar elf")
-- putStrLn "\n=== QuickCheck tests ==="
-- quickCheck (testTrans myTrans)
-- putStrLn "\n=== Examples of test cases ==="
-- sample (arbitrary :: Gen TestCase)