gf-core/src-3.0/PGF/Data.hs

module PGF.Data where

import PGF.CId
import GF.Text.UTF8
import GF.Data.Assoc

import qualified Data.Map as Map
import Data.List
import Data.Array

-- internal datatypes for PGF

-- | An abstract data type representing multilingual grammar
-- in Portable Grammar Format.
data PGF = PGF {
  absname   :: CId ,
  cncnames  :: [CId] ,
  gflags    :: Map.Map CId String,   -- value of a global flag
  abstract  :: Abstr ,
  concretes :: Map.Map CId Concr
  }

data Abstr = Abstr {
  aflags  :: Map.Map CId String,      -- value of a flag
  funs    :: Map.Map CId (Type,Expr), -- type and def of a fun
  cats    :: Map.Map CId [Hypo],      -- context of a cat
  catfuns :: Map.Map CId [CId]        -- funs to a cat (redundant, for fast lookup)
  }

data Concr = Concr {
  cflags  :: Map.Map CId String,    -- value of a flag
  lins    :: Map.Map CId Term,      -- lin of a fun
  opers   :: Map.Map CId Term,      -- oper generated by subex elim
  lincats :: Map.Map CId Term,      -- lin type of a cat
  lindefs :: Map.Map CId Term,      -- lin default of a cat
  printnames :: Map.Map CId Term,   -- printname of a cat or a fun
  paramlincats :: Map.Map CId Term, -- lin type of cat, with printable param names
  parser  :: Maybe ParserInfo       -- parser
  }

data Type =
   DTyp [Hypo] CId [Expr]
  deriving (Eq,Ord,Show)

data Literal =
   LStr String                      -- ^ string constant
 | LInt Integer                     -- ^ integer constant
 | LFlt Double                      -- ^ floating point constant
 deriving (Eq,Ord,Show)

-- | The tree is an evaluated expression in the abstract syntax
-- of the grammar. The type is especially restricted to not
-- allow unapplied lambda abstractions. The tree is used directly
-- from the linearizer and is produced directly from the parser.
data Tree =
   Abs [CId] Tree                   -- ^ lambda abstraction. The list of variables is non-empty
 | Var CId                          -- ^ variable
 | Fun CId [Tree]                   -- ^ function application
 | Lit Literal                      -- ^ literal
 | Meta Int                         -- ^ meta variable
  deriving (Show, Eq, Ord)

-- | An expression represents a potentially unevaluated expression
-- in the abstract syntax of the grammar. It can be evaluated with
-- the 'expr2tree' function and then linearized or it can be used
-- directly in the dependent types.
data Expr =
   EAbs CId Expr                    -- ^ lambda abstraction
 | EApp Expr Expr                   -- ^ application
 | ELit Literal                     -- ^ literal
 | EMeta  Int                       -- ^ meta variable
 | EVar   CId                       -- ^ variable or function reference
 | EEq [Equation]                   -- ^ lambda function defined as a set of equations with pattern matching
  deriving (Eq,Ord,Show)

data Term =
   R [Term]
 | P Term Term
 | S [Term]
 | K Tokn
 | V Int
 | C Int
 | F CId
 | FV [Term]
 | W String Term
 | TM String
  deriving (Eq,Ord,Show)

data Tokn =
   KS String
 | KP [String] [Alternative]
  deriving (Eq,Ord,Show)

data Alternative =
   Alt [String] [String]
  deriving (Eq,Ord,Show)

data Hypo =
   Hyp CId Type
  deriving (Eq,Ord,Show)

-- | The equation is used to define lambda function as a sequence
-- of equations with pattern matching. The list of 'Expr' represents
-- the patterns and the second 'Expr' is the function body for this
-- equation.
data Equation =
   Equ [Expr] Expr
  deriving (Eq,Ord,Show)


type FToken    = String
type FCat      = Int
type FIndex    = Int
data FSymbol
  = FSymCat {-# UNPACK #-} !FIndex {-# UNPACK #-} !Int
  | FSymTok FToken
type Profile   = [Int]
type FPointPos = Int
type FGrammar  = ([FRule], Map.Map CId [FCat])
data FRule     = FRule CId [Profile] [FCat] FCat (Array FIndex (Array FPointPos FSymbol))

type RuleId = Int

data ParserInfo
    = ParserInfo { allRules           :: Array RuleId FRule
                 , topdownRules       :: Assoc FCat [RuleId]
	  	   -- ^ used in 'GF.Parsing.MCFG.Active' (Earley):
	           -- , emptyRules    :: [RuleId]
	         , epsilonRules       :: [RuleId]
		   -- ^ used in 'GF.Parsing.MCFG.Active' (Kilbury):
	         , leftcornerCats     :: Assoc FCat   [RuleId]
	         , leftcornerTokens   :: Assoc FToken [RuleId]
		   -- ^ used in 'GF.Parsing.MCFG.Active' (Kilbury):
	         , grammarCats        :: [FCat]
	         , grammarToks        :: [FToken]
	         , startupCats        :: Map.Map CId [FCat]
	         }


fcatString, fcatInt, fcatFloat, fcatVar :: Int
fcatString = (-1)
fcatInt    = (-2)
fcatFloat  = (-3)
fcatVar    = (-4)


-- print statistics

statGFCC :: PGF -> String
statGFCC pgf = unlines [
  "Abstract\t" ++ prCId (absname pgf),
  "Concretes\t" ++ unwords (map prCId (cncnames pgf)),
  "Categories\t" ++ unwords (map prCId (Map.keys (cats (abstract pgf))))
  ]

-- merge two GFCCs; fails is differens absnames; priority to second arg

unionPGF :: PGF -> PGF -> PGF
unionPGF one two = case absname one of
  n | n == wildCId     -> two    -- extending empty grammar
    | n == absname two -> one {  -- extending grammar with same abstract
      concretes = Map.union (concretes two) (concretes one),
      cncnames  = union (cncnames two) (cncnames one)
    }
  _ -> one   -- abstracts don't match ---- print error msg

emptyPGF :: PGF
emptyPGF = PGF {
  absname   = wildCId,
  cncnames  = [] ,
  gflags    = Map.empty,
  abstract  = error "empty grammar, no abstract",
  concretes = Map.empty
  }

-- encode idenfifiers and strings in UTF8

utf8GFCC :: PGF -> PGF
utf8GFCC pgf = pgf {
  concretes = Map.map u8concr (concretes pgf)
  }
 where
   u8concr cnc = cnc {
     lins = Map.map u8term (lins cnc),
     opers = Map.map u8term (opers cnc)
     }
   u8term = convertStringsInTerm encodeUTF8

---- TODO: convert identifiers and flags

convertStringsInTerm conv t = case t of
  K (KS s) -> K (KS (conv s))
  W s r    -> W (conv s) (convs r)
  R ts     -> R $ map convs ts
  S ts     -> S $ map convs ts
  FV ts    -> FV $ map convs ts
  P u v    -> P (convs u) (convs v)
  _        -> t
 where
  convs = convertStringsInTerm conv