Files
gf-core/src-3.0/PGF/Data.hs

202 lines
6.0 KiB
Haskell

module PGF.Data where
import PGF.CId
import GF.Text.UTF8
import GF.Data.Assoc
import qualified Data.Map as Map
import Data.List
import Data.Array
-- internal datatypes for PGF
-- | An abstract data type representing multilingual grammar
-- in Portable Grammar Format.
data PGF = PGF {
absname :: CId ,
cncnames :: [CId] ,
gflags :: Map.Map CId String, -- value of a global flag
abstract :: Abstr ,
concretes :: Map.Map CId Concr
}
data Abstr = Abstr {
aflags :: Map.Map CId String, -- value of a flag
funs :: Map.Map CId (Type,Expr), -- type and def of a fun
cats :: Map.Map CId [Hypo], -- context of a cat
catfuns :: Map.Map CId [CId] -- funs to a cat (redundant, for fast lookup)
}
data Concr = Concr {
cflags :: Map.Map CId String, -- value of a flag
lins :: Map.Map CId Term, -- lin of a fun
opers :: Map.Map CId Term, -- oper generated by subex elim
lincats :: Map.Map CId Term, -- lin type of a cat
lindefs :: Map.Map CId Term, -- lin default of a cat
printnames :: Map.Map CId Term, -- printname of a cat or a fun
paramlincats :: Map.Map CId Term, -- lin type of cat, with printable param names
parser :: Maybe ParserInfo -- parser
}
data Type =
DTyp [Hypo] CId [Expr]
deriving (Eq,Ord,Show)
data Literal =
LStr String -- ^ string constant
| LInt Integer -- ^ integer constant
| LFlt Double -- ^ floating point constant
deriving (Eq,Ord,Show)
-- | The tree is an evaluated expression in the abstract syntax
-- of the grammar. The type is especially restricted to not
-- allow unapplied lambda abstractions. The tree is used directly
-- from the linearizer and is produced directly from the parser.
data Tree =
Abs [CId] Tree -- ^ lambda abstraction. The list of variables is non-empty
| Var CId -- ^ variable
| Fun CId [Tree] -- ^ function application
| Lit Literal -- ^ literal
| Meta Int -- ^ meta variable
deriving (Show, Eq, Ord)
-- | An expression represents a potentially unevaluated expression
-- in the abstract syntax of the grammar. It can be evaluated with
-- the 'expr2tree' function and then linearized or it can be used
-- directly in the dependent types.
data Expr =
EAbs CId Expr -- ^ lambda abstraction
| EApp Expr Expr -- ^ application
| ELit Literal -- ^ literal
| EMeta Int -- ^ meta variable
| EVar CId -- ^ variable or function reference
| EEq [Equation] -- ^ lambda function defined as a set of equations with pattern matching
deriving (Eq,Ord,Show)
data Term =
R [Term]
| P Term Term
| S [Term]
| K Tokn
| V Int
| C Int
| F CId
| FV [Term]
| W String Term
| TM String
deriving (Eq,Ord,Show)
data Tokn =
KS String
| KP [String] [Alternative]
deriving (Eq,Ord,Show)
data Alternative =
Alt [String] [String]
deriving (Eq,Ord,Show)
data Hypo =
Hyp CId Type
deriving (Eq,Ord,Show)
-- | The equation is used to define lambda function as a sequence
-- of equations with pattern matching. The list of 'Expr' represents
-- the patterns and the second 'Expr' is the function body for this
-- equation.
data Equation =
Equ [Expr] Expr
deriving (Eq,Ord,Show)
type FToken = String
type FCat = Int
type FIndex = Int
data FSymbol
= FSymCat {-# UNPACK #-} !FIndex {-# UNPACK #-} !Int
| FSymTok FToken
type Profile = [Int]
type FPointPos = Int
type FGrammar = ([FRule], Map.Map CId [FCat])
data FRule = FRule CId [Profile] [FCat] FCat (Array FIndex (Array FPointPos FSymbol))
type RuleId = Int
data ParserInfo
= ParserInfo { allRules :: Array RuleId FRule
, topdownRules :: Assoc FCat [RuleId]
-- ^ used in 'GF.Parsing.MCFG.Active' (Earley):
-- , emptyRules :: [RuleId]
, epsilonRules :: [RuleId]
-- ^ used in 'GF.Parsing.MCFG.Active' (Kilbury):
, leftcornerCats :: Assoc FCat [RuleId]
, leftcornerTokens :: Assoc FToken [RuleId]
-- ^ used in 'GF.Parsing.MCFG.Active' (Kilbury):
, grammarCats :: [FCat]
, grammarToks :: [FToken]
, startupCats :: Map.Map CId [FCat]
}
fcatString, fcatInt, fcatFloat, fcatVar :: Int
fcatString = (-1)
fcatInt = (-2)
fcatFloat = (-3)
fcatVar = (-4)
-- print statistics
statGFCC :: PGF -> String
statGFCC pgf = unlines [
"Abstract\t" ++ prCId (absname pgf),
"Concretes\t" ++ unwords (map prCId (cncnames pgf)),
"Categories\t" ++ unwords (map prCId (Map.keys (cats (abstract pgf))))
]
-- merge two GFCCs; fails is differens absnames; priority to second arg
unionPGF :: PGF -> PGF -> PGF
unionPGF one two = case absname one of
n | n == wildCId -> two -- extending empty grammar
| n == absname two -> one { -- extending grammar with same abstract
concretes = Map.union (concretes two) (concretes one),
cncnames = union (cncnames two) (cncnames one)
}
_ -> one -- abstracts don't match ---- print error msg
emptyPGF :: PGF
emptyPGF = PGF {
absname = wildCId,
cncnames = [] ,
gflags = Map.empty,
abstract = error "empty grammar, no abstract",
concretes = Map.empty
}
-- encode idenfifiers and strings in UTF8
utf8GFCC :: PGF -> PGF
utf8GFCC pgf = pgf {
concretes = Map.map u8concr (concretes pgf)
}
where
u8concr cnc = cnc {
lins = Map.map u8term (lins cnc),
opers = Map.map u8term (opers cnc)
}
u8term = convertStringsInTerm encodeUTF8
---- TODO: convert identifiers and flags
convertStringsInTerm conv t = case t of
K (KS s) -> K (KS (conv s))
W s r -> W (conv s) (convs r)
R ts -> R $ map convs ts
S ts -> S $ map convs ts
FV ts -> FV $ map convs ts
P u v -> P (convs u) (convs v)
_ -> t
where
convs = convertStringsInTerm conv