gfcc.txt almost ready

2026-07-08 14:42:46 -06:00 · 2006-10-02 19:16:38 +00:00
parent 514f732d16
commit 2f284deb1c
3 changed files with 29 additions and 521 deletions
@@ -12,7 +12,7 @@
 -- GFC to GFCC compiler. AR Aug-Oct 2006
 -----------------------------------------------------------------------------

-module GF.Canon.CanonToGFCC (prCanon2gfcc) where
+module GF.Canon.CanonToGFCC (prCanon2gfcc, prCanon2f_gfcc) where

 import GF.Canon.AbsGFC
 import qualified GF.Canon.GFC as GFC
@@ -30,6 +30,11 @@ import qualified GF.Infra.Modules as M
 import qualified GF.Infra.Option as O
 import GF.UseGrammar.Linear (expandLinTables, unoptimizeCanon)

+-- these are needed for FCFG printing and might be moved
+import GF.FCFG.ToFCFG (printFGrammar)
+import GF.Conversion.GFC (gfc2fcfg)
+import GF.Infra.Option (noOptions)
+
 import GF.Infra.Ident
 import GF.Data.Operations
 import GF.Text.UTF8
@@ -44,12 +49,25 @@ prCanon2gfcc :: CanonGrammar -> String
 prCanon2gfcc = 
  Pr.printTree . canon2gfcc . reorder . utf8Conv . canon2canon . normalize

+-- print FCFG corresponding to the GFCC
+prCanon2f_gfcc :: CanonGrammar -> String
+prCanon2f_gfcc = 
+  unlines . map printFGrammar . toFCFG . 
+  reorder . utf8Conv . canon2canon . normalizeNoOpt
+ where
+   toFCFG cgr@(M.MGrammar (am:cms)) = 
+    [gfc2fcfg noOptions (M.MGrammar [am,cm],c) | cm@(c,_) <- cms]
+-- gfc2fcfg :: Options -> (CanonGrammar, Ident) -> FGrammar
+
 -- This is needed to reorganize the grammar. GFCC has its own back-end optimization.
 -- But we need to have the canonical order in tables, created by valOpt
 normalize :: CanonGrammar -> CanonGrammar
 normalize = share . unoptimizeCanon . Sub.unSubelimCanon where
  share = M.MGrammar . map (shareModule valOpt) . M.modules --- allOpt

+-- for FCFG generation
+normalizeNoOpt = unoptimizeCanon . Sub.unSubelimCanon
+
 -- Generate GFCC from GFCM.
 -- this assumes a grammar translated by canon2canon

@@ -115,20 +133,10 @@ reorder cg = M.MGrammar $
       cncs = sortBy (\ (x,_) (y,_) -> compare x y)
            [(lang, concr lang) | lang <- M.allConcretes cg abs]
       concr la = sortBy (\ (f,_) (g,_) -> compare f g) 
-            [finfo | 
+            [changeTyp finfo | 
              (i,mo) <- mos, M.isModCnc mo, elem i (M.allExtends cg la),
              finfo <- tree2list (M.jments mo)]

-- one grammar per language - needed for symtab generation
-repartition :: CanonGrammar -> [CanonGrammar]
-repartition cg = [M.partOfGrammar cg (lang,mo) | 
-  let abs = maybe (error "no abstract") id $ M.greatestAbstract cg,
-  let mos = M.allModMod cg,
-  lang <- M.allConcretes cg abs,
-  let mo = errVal 
-       (error ("no module found for " ++ A.prt lang)) $ M.lookupModule cg lang
-  ]
-
 -- convert to UTF8 if not yet converted
 utf8Conv :: CanonGrammar -> CanonGrammar
 utf8Conv = M.MGrammar . map toUTF8 . M.modules where
@@ -1,13 +1,4 @@
 The GFCC Grammar Format
-Aarne Ranta
-October 3, 2006
-
-Author's address:
-[``http://www.cs.chalmers.se/~aarne`` http://www.cs.chalmers.se/~aarne]
-
-% to compile: txt2tags -thtml --toc gfcc.txt
-
-==What is GFCC==

 GFCC is a low-level format for GF grammars. Its aim is to contain the minimum
 that is needed to process GF grammars at runtime. This minimality has three
@@ -17,43 +8,25 @@ advantages:
 - simple definition of interpreters


-The idea is that all embedded GF applications are compiled to GFCC.
-The GF system would be primarily used as a compiler and as a grammar
-development tool.
-
-Since GFCC is implemented in BNFC, a parser of the format is readily
-available for C, C++, Haskell, Java, and OCaml. Also an XML 
-representation is generated in BNFC. A 
-[reference implementation ../]
-of linearization and some other functions has been written in Haskell.
-
-
-==GFCC vs. GFC==
-
-GFCC is aimed to replace GFC as the run-time grammar format. GFC was designed
-to be a run-time format, but also to
-support separate compilation of grammars, i.e.
-to store the results of compiling 
-individual GF modules. But this means that GFC has to contain extra information,
-such as type annotations, which is only needed in compilation and not at
+GFCC is aimed to replace GFC as the run-time grammar format. GFC is designed
+to support separate compilation of grammars, to store the results of compiling 
+individual GF modules. But this means it has to contain extra information,
+such as type information, which is only needed in compilation and not at
 run-time. In particular, the pattern matching syntax and semantics of GFC is
 complex and therefore difficult to implement in new platforms.

-The main differences of GFCC compared with GFC can be summarized as follows:
- there are no modules, and therefore no qualified names
+The main novelties of GFCC compared with GFC can be summarized as follows:
 - a GFCC grammar is multilingual, and consists of a common abstract syntax 
  together with one concrete syntax per language
+- there are no modules, and therefore no qualified names
 - records and tables are replaced by arrays
 - record labels and parameter values are replaced by integers
 - record projection and table selection are replaced by array indexing
- there is (so far) no support for dependent types or higher-order abstract
-  syntax (which would be easy to add, but make interpreters much more difficult
-  to write)
+


 Here is an example of a GF grammar, consisting of three modules, 
-as translated to GFCC. The representations are aligned, with the exceptions
-due to the alphabetical sorting of GFCC grammars.
+as translated to GFCC.
 ```  
                                    grammar Ex (Eng Swe);

@@ -102,477 +75,3 @@ concrete Swe of Ex = {              concrete Swe {
                                      } ;
 }                                   ;
 ```
-
-==The syntax of GFCC files==
-
-===Top level===
-
-A grammar has a header telling the name of the abstract syntax
-(often specifying an application domain), and the names of
-the concrete languages. The abstract syntax and the concrete
-syntaxes themselves follow.
-```
-  Grammar  ::= Header ";" Abstract ";" [Concrete] ";" ;
-  Header   ::= "grammar" CId "(" [CId] ")" ;
-  Abstract ::= "abstract" "{" [AbsDef] "}" ";" ;
-  Concrete ::= "concrete" CId "{" [CncDef] "}" ;
-```
-Abstract syntax judgements give typings and semantic definitions.
-Concrete syntax judgements give linearizations.
-```
-  AbsDef   ::= CId ":" Type "=" Exp ;
-  CncDef   ::= CId "=" Term ;
-```
-Also flags are possible, local to each "module" (i.e. abstract and concretes).
-```
-  AbsDef   ::= "%" CId "=" String ;
-  CncDef   ::= "%" CId "=" String ;
-```
-For the run-time system, the reference implementation in Haskell
-uses a structure that gives efficient look-up:
-```
-  data GFCC = GFCC {
-    absname   :: CId ,
-    cncnames  :: [CId] ,
-    abstract  :: Abstr ,
-    concretes :: Map CId Concr
-    }
-
-  data Abstr = Abstr {
-    funs :: Map CId Type,   -- find the type of a fun
-    cats :: Map CId [CId]   -- find the funs giving a cat
-    }
-
-  type Concr = Map CId Term
-```
-
-
-===Abstract syntax===
-
-Types are first-order function types built from
-category symbols. Syntax trees (``Exp``) are
-rose trees with the head (``Atom``) either a function
-constant, a metavariable, or a string, integer, or float
-literal.
-```
-  Type     ::= [CId] "->" CId ;
-  Exp      ::= "(" Atom [Exp] ")" ;
-  Atom     ::= CId ;        -- function constant
-  Atom     ::= "?" ;        -- metavariable
-  Atom     ::= String ;     -- string literal
-  Atom     ::= Integer ;    -- integer literal
-  Atom     ::= Double ;     -- float literal
-```
-
-
-===Concrete syntax===
-
-Linearization terms (``Term``) are built as follows.
-```
-  Term     ::= "[" [Term] "]" ;          -- array
-  Term     ::= Term "[" Term "]" ;       -- access to indexed field
-  Term     ::= "(" [Term] ")" ;          -- sequence with ++
-  Term     ::= Tokn ;                    -- token
-  Term     ::= "$" Integer ;             -- argument subtree
-  Term     ::= Integer ;                 -- array index
-  Term     ::= "[|" [Term] "|]" ;        -- free variation
-```
-Tokens are strings or (maybe obsolescent) prefix-dependent
-variant lists.
-```
-  Tokn     ::= String ;
-  Tokn     ::= "[" "pre" [String] "[" [Variant] "]" "]" ;
-  Variant  ::= [String] "/" [String] ;
-```
-Three special forms of terms are introduced by the compiler
-as optimizations. They can in principle be eliminated, but
-their presence makes grammars much more compact. Their semantics
-will be explained in a later section.
-```
-  Term     ::= CId ;                     -- global constant
-  Term     ::= "(" String "+" Term ")" ; -- prefix + suffix table
-  Term     ::= "(" Term "@" Term ")";    -- record parameter alias
-```
-Identifiers are like ``Ident`` in GF and GFC, except that
-the compiler produces constants prefixed with ``_`` in
-the common subterm elimination optimization.
-```
-  token CId (('_' | letter) (letter | digit | '\'' | '_')*) ;
-```
-
-
-==The semantics of concrete syntax terms==
-
-===Linearization and realization===
-
-The linearization algorithm is essentially the same as in
-GFC: a tree is linearized by evaluating its linearization term
-in the environment of the linearizations of the subtrees.
-Literal atoms are linearized in the obvious way.
-The function also needs to know the language (i.e. concrete syntax)
-in which linearization is performed.
-```
-  linExp :: GFCC -> CId -> Exp -> Term
-  linExp mcfg lang tree@(Tr at trees) = case at of
-    AC fun -> comp (Prelude.map lin trees) $ look fun
-    AS s   -> R [kks (show s)] -- quoted
-    AI i   -> R [kks (show i)]
-    AF d   -> R [kks (show d)]
-    AM     -> R [kks "?"]      ---- TODO: proper lincat
-   where
-     lin  = linExp mcfg lang
-     comp = compute mcfg lang
-     look = lookLin mcfg lang
-```
-The result of linearization is usually a record, which is realized as
-a string using the following algorithm.
-```
-  realize :: Term -> String
-  realize trm = case trm of
-    R (t:_)  -> realize t
-    S ss     -> unwords $ Prelude.map realize ss
-    K (KS s) -> s
-    K (KP s _) -> unwords s ---- prefix choice TODO
-    W s t    -> s ++ realize t
-    FV (t:_) -> realize t
-```
-Since the order of record fields is not necessarily
-the same as in GF source,
-this realization does not work securely for
-categories whose lincats more than one field.
-
-
-===Term evaluation===
-
-Evaluation follows call-by-value order, with two environments
-needed:
- the grammar (a concrete syntax) to give the global constants
- an array of terms to give the subtree linearizations
-
-
-The code is cleaned from debugging information present in the working
-version.
-```
-  compute :: GFCC -> CId -> [Term] -> Term -> Term
-  compute mcfg lang args = comp where
-    comp trm = case trm of
-      P r (FV ts) -> FV $ Prelude.map (comp . P r) ts
-
-      P r p -> case (comp r, comp p) of 
-
-        -- for the suffix optimization
-        (W s (R ss), p') -> case comp $ idx ss (getIndex p') of
-          K (KS u) -> kks (s ++ u)
-
-        (r', p') -> comp $ (getFields r') !! (getIndex p')
-
-      RP i t -> RP (comp i) (comp t)
-      W s t -> W s (comp t)
-      R ts  -> R $ Prelude.map comp ts
-      V i   -> args !! (fromInteger i)    -- already computed
-      S ts  -> S $ Prelude.filter (/= S []) $ Prelude.map comp ts
-      F c   -> comp $ lookLin mcfg lang   -- not yet computed
-      FV ts -> FV $ Prelude.map comp ts
-      _ -> trm
-
-    getIndex t =  case t of
-      C i -> fromInteger i
-      RP p _ -> getIndex p
-
-    getFields t = case t of
-      R rs -> rs
-      RP _ r -> getFields r
-```
-
-===The special term constructors===
-
-The three forms introduced by the compiler may a need special
-explanation.
-
-Global constants
-```
-  Term     ::= CId ;
-```
-are shorthands for complex terms. They are produced by the
-compiler by (iterated) common subexpression elimination.
-They are often more powerful than hand-devised code sharing in the source
-code. They could be computed off-line by replacing each identifier by 
-its definition.
-
-Prefix-suffix tables 
-```
-  Term     ::= "(" String "+" Term ")" ; 
-```
-represent tables of word forms divided to the longest common prefix
-and its array of suffixes. In the example grammar above, we have
-```
-  Sleep = [("sleep" + ["s",""])]
-```
-which in fact is equal to the array of full forms
-```
-  ["sleeps", "sleep"]
-```
-The power of this construction comes from the fact that suffix sets
-tend to be repeated in a language, and can therefore be collected
-by common subexpression elimination. It is this technique that
-explains the used syntax rather than the more accurate
-```
-  "(" String "+" [String] ")"
-```
-since we want the suffix part to be a ``Term`` for the optimization to
-take effect.
-
-The most curious construct of GFCC is the parameter array alias, 
-```
-  Term     ::= "(" Term "@" Term ")";
-```
-This form is used as the value of parameter records, such as the type
-```
-  {n : Number ; p : Person}
-```
-The problem with parameter records is their double role.
-They can be used like parameter values, as indices in selection,
-```
-  VP.s ! {n = Sg ; p = P3}
-```
-but also as records, from which parameters can be projected:
-```
-  {n = Sg ; p = P3}.n
-```
-Whichever use is selected as primary, a prohibitively complex
-case expression must be generated at compilation to GFCC to get the
-other use. The adopted
-solution is to generate a pair containing both a parameter value index 
-and an array of indices of record fields. For instance, if we have
-```
-  param Number = Sg | Pl ; Person = P1 | P2 | P3 ;
-```
-we get the encoding
-```
-  {n = Sg ; p = P3}  ---> (2 @ [0,2])
-```
-The GFCC computation rules are essentially
-```
-  t [(i @ r)]  = t[i]
-  (i @ r) [j]  = r[j]
-```
-
-
-==Compiling to GFCC==
-
-Compilation to GFCC is performed by the GF grammar compiler, and
-GFCC interpreters need not know what it does. For grammar writers,
-however, it might be interesting to know what happens to the grammars
-in the process.
-
-The compilation phases are the following
-+ translate GF source to GFC, as always in GF
-+ undo GFC back-end optimizations
-+ perform the ``values`` optimization to normalize tables
-+ create a symbol table mapping the GFC parameter and record types to
-  fixed-size arrays, and parameter values and record labels to integers
-+ traverse the linearization rules replacing parameters and labels by integers
-+ reorganize the created GFC grammar so that it has just one abstract syntax
-  and one concrete syntax per language
-+ apply UTF8 encoding to the grammar, if not yet applied (this is told by the
-  ``coding`` flag)
-+ translate the GFC syntax tree to a GFCC syntax tree, using a simple
-  compositional mapping
-+ perform the word-suffix optimization on GFCC linearization terms
-+ perform subexpression elimination on each concrete syntax module
-+ print out the GFCC code
-
-
-Notice that a major part of the compilation is done within GFC, so that
-GFC-related tasks (such as parser generation) could be performed by
-using the old algorithms.
-
-
-===Problems in GFCC compilation===
-
-Two major problems had to be solved in compiling GFC to GFCC:
- consistent order of tables and records, to permit the array translation
- run-time variables in complex parameter values.
-
-
-The current implementation is still experimental and may fail
-to generate correct code. Any errors remaining are likely to be 
-related to the two problems just mentioned.
-
-The order problem is solved in different ways for tables and records.
-For tables, the ``values`` optimization of GFC already manages to
-maintain a canonical order. But this order can be destroyed by the
-``share`` optimization. To make sure that GFCC compilation works properly,
-it is safest to recompile the GF grammar by using the ``values``
-optimization flag.
-
-Records can be canonically ordered by sorting them by labels.
-In fact, this was done in connection of the GFCC work as a part
-of the GFC generation, to guarantee consistency. This means that
-e.g. the ``s`` field will in general no longer appear as the first
-field, even if it does so in the GF source code. But relying on the
-order of fields in a labelled record would be misplaced anyway.
-
-The canonical form of records is further complicated by lock fields,
-i.e. dummy fields of form ``lock_C = <>``, which are added to grammar
-libraries to force intensionality of linearization types. The problem
-is that the absence of a lock field only generates a warning, not
-an error. Therefore a GFC grammar can contain objects of the same
-type with and without a lock field. This problem was solved in GFCC
-generation by just removing all lock fields (defined as fields whose
-type is the empty record type). This has the further advantage of
-(slightly) reducing the grammar size. More importantly, it is safe
-to remove lock fields, because they are never used in computation,
-and because intensional types are only needed in grammars reused
-as libraries, not in grammars used at runtime.
-
-While the order problem is rather bureaucratic in nature, run-time 
-variables are an interesting problem. They arise in the presence
-of complex parameter values, created by argument-taking constructors
-and parameter records. To give an example, consider the GF parameter
-type system
-```
-  Number = Sg | Pl ;
-  Person = P1 | P2 | P3 ;
-  Agr = Ag Number Person ;
-```
-The values can be translated to integers in the expected way,
-```
-  Sg = 0, Pl = 1
-  P1 = 0, P2 = 1, P3 = 2
-  Ag Sg P1 = 0, Ag Sg P2 = 1, Ag Sg P3 = 2,
-  Ag Pl P1 = 3, Ag Pl P2 = 4, Ag Pl P3 = 5
-```
-However, an argument of ``Agr`` can be a run-time variable, as in
-```
-  Ag np.n P3
-```
-This expression must first be translated to a case expression,
-```
-  case np.n of {
-    0 => 2 ;
-    1 => 5
-    }
-```
-which can then be translated to the GFCC term
-```
-  [2,5][$0[$1]]  
-```
-assuming that the variable $np$ is the first argument and that its
-$Number$ field is the second in the record.
-
-This transformation of course has to be performed recursively, since
-there can be several run-time variables in a parameter value:
-```
-  Ag np.n np.p
-```
-A similar transformation would be possible to deal with the double
-role of parameter records discussed above. Thus the type
-```
-  RNP = {n : Number ; p : Person}
-```
-could be uniformly translated into the set ``{0,1,2,3,4,5}``
-as ``Agr`` above. Selections would be simple instances of indexing.
-But any projection from the record should be translated into
-a case expression,
-```
-  rnp.n  ===> 
-  case rnp of {
-    0 => 0 ;
-    1 => 0 ;
-    2 => 0 ;
-    3 => 1 ;
-    4 => 1 ;
-    5 => 1
-    }
-```
-To avoid the code bloat resulting from this, we chose the alias representation
-which is easy enough to deal with in interpreters.
-
-
-
-===Running the compiler and the GFCC interpreter===
-
-GFCC generation is a part of the 
-[developers' version http://www.cs.chalmers.se/Cs/Research/Language-technology/darcs/GF/doc/darcs.html] 
-of GF since September 2006. To invoke the compiler, the flag 
-``-printer=gfcc`` to the command
-``pm = print_multi`` is used. It is wise to recompile the grammar from
-source, since previously compiled libraries may not obey the canonical
-order of records. To ``strip`` the grammar before
-GFCC translation removes unnecessary interface references.
-Here is an example, performed in
-[example/bronzeage ../../../../../examples/bronzeage].
-```
-  i -src -path=.:prelude:resource-1.0/* -optimize=all_subs BronzeageEng.gf
-  i -src -path=.:prelude:resource-1.0/* -optimize=all_subs BronzeageGer.gf
-  strip
-  pm -printer=gfcc | wf bronze.gfcc
-```
-
-
-
-==The reference interpreter==
-
-The reference interpreter written in Haskell consists of the following files:
-```
-  -- source file for BNFC
-  GFCC.cf       -- labelled BNF grammar of gfcc
-
-  -- files generated by BNFC
-  AbsGFCC.hs    -- abstrac syntax of gfcc
-  ErrM.hs       -- error monad used internally
-  LexGFCC.hs    -- lexer of gfcc files
-  ParGFCC.hs    -- parser of gfcc files and syntax trees
-  PrintGFCC.hs  -- printer of gfcc files and syntax trees
-
-  -- hand-written files
-  DataGFCC.hs   -- post-parser grammar creation, linearization and evaluation
-  GenGFCC.hs    -- random and exhaustive generation, generate-and-test parsing
-  RunGFCC.hs    -- main function - a simple command interpreter
-```
-It is included in the
-[developers' version http://www.cs.chalmers.se/Cs/Research/Language-technology/darcs/GF/doc/darcs.html]
-of GF, in the subdirectory [``GF/src/GF/Canon/GFCC`` ../].
-
-To compile the interpreter, type
-```
-  make gfcc
-```
-in ``GF/src``. To run it, type
-```
-  ./gfcc <GFCC-file>
-```
-The available commands are
- ``gr <Cat> <Int>``:  generate a number of random trees in category.
-  and show their linearizations in all languages
- ``grt <Cat> <Int>``:  generate a number of random trees in category.
-  and show the trees and their linearizations in all languages
- ``gt <Cat> <Int>``:  generate a number of trees in category from smallest,
-  and show their linearizations in all languages
- ``gtt <Cat> <Int>``:  generate a number of trees in category from smallest,
-  and show the trees and their linearizations in all languages
- ``p <Int> <Cat> <String>``: "parse", i.e. generate trees until match or 
-  until the given number have been generated
- ``<Tree>``: linearize tree in all languages, also showing full records
- ``quit``: terminate the system cleanly
-
-
-==Some things to do==
-
-Interpreters in Java and C++.
-
-Parsing via MCFG 
- the FCFG format can possibly be simplified
- parser grammars should be saved in files to make interpreters easier
-
-
-File compression of GFCC output.
-
-Syntax editor based on GFCC.
-
-Rewriting of resource libraries in order to exploit the
-word-suffix sharing better (depth-one tables, as in FM).
-
-
-
@@ -345,6 +345,7 @@ customMultiGrammarPrinter =
  [
   (strCI "gfcm", const MC.prCanon)
  ,(strCI "gfcc", const GFCC.prCanon2gfcc)
+  ,(strCI "f_gfcc", const GFCC.prCanon2f_gfcc)
  ,(strCI "header", const (MC.prCanonMGr . unoptimizeCanon))
  ,(strCI "cfgm", prCanonAsCFGM)
  ,(strCI "graph", visualizeCanonGrammar)