forked from GitHub/gf-core
202 lines
6.0 KiB
Haskell
202 lines
6.0 KiB
Haskell
module PGF.Data where
|
|
|
|
import PGF.CId
|
|
import GF.Text.UTF8
|
|
import GF.Data.Assoc
|
|
|
|
import qualified Data.Map as Map
|
|
import Data.List
|
|
import Data.Array
|
|
|
|
-- internal datatypes for PGF
|
|
|
|
-- | An abstract data type representing multilingual grammar
|
|
-- in Portable Grammar Format.
|
|
data PGF = PGF {
|
|
absname :: CId ,
|
|
cncnames :: [CId] ,
|
|
gflags :: Map.Map CId String, -- value of a global flag
|
|
abstract :: Abstr ,
|
|
concretes :: Map.Map CId Concr
|
|
}
|
|
|
|
data Abstr = Abstr {
|
|
aflags :: Map.Map CId String, -- value of a flag
|
|
funs :: Map.Map CId (Type,Expr), -- type and def of a fun
|
|
cats :: Map.Map CId [Hypo], -- context of a cat
|
|
catfuns :: Map.Map CId [CId] -- funs to a cat (redundant, for fast lookup)
|
|
}
|
|
|
|
data Concr = Concr {
|
|
cflags :: Map.Map CId String, -- value of a flag
|
|
lins :: Map.Map CId Term, -- lin of a fun
|
|
opers :: Map.Map CId Term, -- oper generated by subex elim
|
|
lincats :: Map.Map CId Term, -- lin type of a cat
|
|
lindefs :: Map.Map CId Term, -- lin default of a cat
|
|
printnames :: Map.Map CId Term, -- printname of a cat or a fun
|
|
paramlincats :: Map.Map CId Term, -- lin type of cat, with printable param names
|
|
parser :: Maybe ParserInfo -- parser
|
|
}
|
|
|
|
data Type =
|
|
DTyp [Hypo] CId [Expr]
|
|
deriving (Eq,Ord,Show)
|
|
|
|
data Literal =
|
|
LStr String -- ^ string constant
|
|
| LInt Integer -- ^ integer constant
|
|
| LFlt Double -- ^ floating point constant
|
|
deriving (Eq,Ord,Show)
|
|
|
|
-- | The tree is an evaluated expression in the abstract syntax
|
|
-- of the grammar. The type is especially restricted to not
|
|
-- allow unapplied lambda abstractions. The tree is used directly
|
|
-- from the linearizer and is produced directly from the parser.
|
|
data Tree =
|
|
Abs [CId] Tree -- ^ lambda abstraction. The list of variables is non-empty
|
|
| Var CId -- ^ variable
|
|
| Fun CId [Tree] -- ^ function application
|
|
| Lit Literal -- ^ literal
|
|
| Meta Int -- ^ meta variable
|
|
deriving (Show, Eq, Ord)
|
|
|
|
-- | An expression represents a potentially unevaluated expression
|
|
-- in the abstract syntax of the grammar. It can be evaluated with
|
|
-- the 'expr2tree' function and then linearized or it can be used
|
|
-- directly in the dependent types.
|
|
data Expr =
|
|
EAbs CId Expr -- ^ lambda abstraction
|
|
| EApp Expr Expr -- ^ application
|
|
| ELit Literal -- ^ literal
|
|
| EMeta Int -- ^ meta variable
|
|
| EVar CId -- ^ variable or function reference
|
|
| EEq [Equation] -- ^ lambda function defined as a set of equations with pattern matching
|
|
deriving (Eq,Ord,Show)
|
|
|
|
data Term =
|
|
R [Term]
|
|
| P Term Term
|
|
| S [Term]
|
|
| K Tokn
|
|
| V Int
|
|
| C Int
|
|
| F CId
|
|
| FV [Term]
|
|
| W String Term
|
|
| TM String
|
|
deriving (Eq,Ord,Show)
|
|
|
|
data Tokn =
|
|
KS String
|
|
| KP [String] [Alternative]
|
|
deriving (Eq,Ord,Show)
|
|
|
|
data Alternative =
|
|
Alt [String] [String]
|
|
deriving (Eq,Ord,Show)
|
|
|
|
data Hypo =
|
|
Hyp CId Type
|
|
deriving (Eq,Ord,Show)
|
|
|
|
-- | The equation is used to define lambda function as a sequence
|
|
-- of equations with pattern matching. The list of 'Expr' represents
|
|
-- the patterns and the second 'Expr' is the function body for this
|
|
-- equation.
|
|
data Equation =
|
|
Equ [Expr] Expr
|
|
deriving (Eq,Ord,Show)
|
|
|
|
|
|
type FToken = String
|
|
type FCat = Int
|
|
type FIndex = Int
|
|
data FSymbol
|
|
= FSymCat {-# UNPACK #-} !FIndex {-# UNPACK #-} !Int
|
|
| FSymTok FToken
|
|
type Profile = [Int]
|
|
type FPointPos = Int
|
|
type FGrammar = ([FRule], Map.Map CId [FCat])
|
|
data FRule = FRule CId [Profile] [FCat] FCat (Array FIndex (Array FPointPos FSymbol))
|
|
|
|
type RuleId = Int
|
|
|
|
data ParserInfo
|
|
= ParserInfo { allRules :: Array RuleId FRule
|
|
, topdownRules :: Assoc FCat [RuleId]
|
|
-- ^ used in 'GF.Parsing.MCFG.Active' (Earley):
|
|
-- , emptyRules :: [RuleId]
|
|
, epsilonRules :: [RuleId]
|
|
-- ^ used in 'GF.Parsing.MCFG.Active' (Kilbury):
|
|
, leftcornerCats :: Assoc FCat [RuleId]
|
|
, leftcornerTokens :: Assoc FToken [RuleId]
|
|
-- ^ used in 'GF.Parsing.MCFG.Active' (Kilbury):
|
|
, grammarCats :: [FCat]
|
|
, grammarToks :: [FToken]
|
|
, startupCats :: Map.Map CId [FCat]
|
|
}
|
|
|
|
|
|
fcatString, fcatInt, fcatFloat, fcatVar :: Int
|
|
fcatString = (-1)
|
|
fcatInt = (-2)
|
|
fcatFloat = (-3)
|
|
fcatVar = (-4)
|
|
|
|
|
|
-- print statistics
|
|
|
|
statGFCC :: PGF -> String
|
|
statGFCC pgf = unlines [
|
|
"Abstract\t" ++ prCId (absname pgf),
|
|
"Concretes\t" ++ unwords (map prCId (cncnames pgf)),
|
|
"Categories\t" ++ unwords (map prCId (Map.keys (cats (abstract pgf))))
|
|
]
|
|
|
|
-- merge two GFCCs; fails is differens absnames; priority to second arg
|
|
|
|
unionPGF :: PGF -> PGF -> PGF
|
|
unionPGF one two = case absname one of
|
|
n | n == wildCId -> two -- extending empty grammar
|
|
| n == absname two -> one { -- extending grammar with same abstract
|
|
concretes = Map.union (concretes two) (concretes one),
|
|
cncnames = union (cncnames two) (cncnames one)
|
|
}
|
|
_ -> one -- abstracts don't match ---- print error msg
|
|
|
|
emptyPGF :: PGF
|
|
emptyPGF = PGF {
|
|
absname = wildCId,
|
|
cncnames = [] ,
|
|
gflags = Map.empty,
|
|
abstract = error "empty grammar, no abstract",
|
|
concretes = Map.empty
|
|
}
|
|
|
|
-- encode idenfifiers and strings in UTF8
|
|
|
|
utf8GFCC :: PGF -> PGF
|
|
utf8GFCC pgf = pgf {
|
|
concretes = Map.map u8concr (concretes pgf)
|
|
}
|
|
where
|
|
u8concr cnc = cnc {
|
|
lins = Map.map u8term (lins cnc),
|
|
opers = Map.map u8term (opers cnc)
|
|
}
|
|
u8term = convertStringsInTerm encodeUTF8
|
|
|
|
---- TODO: convert identifiers and flags
|
|
|
|
convertStringsInTerm conv t = case t of
|
|
K (KS s) -> K (KS (conv s))
|
|
W s r -> W (conv s) (convs r)
|
|
R ts -> R $ map convs ts
|
|
S ts -> S $ map convs ts
|
|
FV ts -> FV $ map convs ts
|
|
P u v -> P (convs u) (convs v)
|
|
_ -> t
|
|
where
|
|
convs = convertStringsInTerm conv
|
|
|