gfcc report

2026-07-12 16:42:46 -06:00 · 2004-09-28 19:44:02 +00:00
parent 422b626a36
commit 01f872bf5d
11 changed files with 137 additions and 71 deletions
@@ -1,6 +1,6 @@
 --# -path=.:../prelude
 concrete ImperC of Imper = open ResImper in {
-  flags lexer=codevars ; unlexer=code ; startcat=Stm ;
+  flags lexer=codevars ; unlexer=code ; startcat=Program ;

  lincat
    Exp = PrecExp ; 
@@ -28,12 +28,12 @@ jvm.tmp       -- pseudo-JVM produced by GF linearization

 Required programs to use the compiler:
 -------------------------------------
-gf+       -- Grammatical Framework version 2.0+, >= 23/9/2004
-jasmin    -- JVM assembler (to compile Foo.j to Foo.class)
+gf+        -- Grammatical Framework version 2.1beta, >= 23/9/2004
+jasmin     -- JVM assembler (to compile Foo.j to Foo.class)

 Required programs to build the compiler:
 ---------------------------------------
-bnfc       -- BNF Converter version 2.1+,  >= 23/9/2004
+bnfc       -- BNF Converter version 2.2beta,  >= 23/9/2004
 happy      -- parser generator for Haskell, >= 1.13 
 alex       -- lexer generator for Haskell, >= 2.0
 Profile.hs -- BNFC source file (formats/profile), must be on your path
@@ -10,11 +10,28 @@ int fact (int n) {
  return f ;
 } ;

+int factr (int n) {
+  int f ;
+  {
+  if (n < 2) {
+    f = 1 ;
+    }
+  else {
+    f = n * factr (n-1) ;
+  }
+  }
+  return f ;
+} ;
+
 int main () {
  int n ;
  n = 1 ;
  {
-    while (n < 11) printf("%d",fact(n)) ; n = n+1 ;
+    while (n < 11) {
+      printf("%d",fact(n)) ;
+      printf("%d",factr(n)) ; 
+      n = n+1 ;
+    }
  }
  return ;
 } ;
@@ -1,4 +1,4 @@
 ./TestImperC $1 | tail -1 >gft.tmp
-echo "es -file=typecheck.gfs" | gf+ -s Imper.gfcm
+echo "es -file=typecheck.gfs" | gf -s Imper.gfcm
 runhugs CleanJVM jvm.tmp $1
-#rm *.tmp
+rm *.tmp
@@ -1,10 +1,10 @@
-GF=gf+
+GF=gf
 SRC=../

 all: compiler

 compiler:
-	echo "pm | wf Imper.gfcm ;; pg -lang=ImperC -printer=lbnf | wf ImperC.tmp" | $(GF) $(SRC)ImperC.gf $(SRC)ImperJVM.gf
+	echo "pm | wf Imper.gfcm ;; pg -lang=ImperC -printer=plbnf | wf ImperC.tmp" | $(GF) $(SRC)ImperC.gf $(SRC)ImperJVM.gf
 	echo "entrypoints Program, Stm, Exp ;" >entry.tmp
 	cat entry.tmp ImperC.tmp >ImperC.cf
 	bnfc -m -prof ImperC.cf
@@ -81,11 +81,12 @@ The second point, code generation by linearization, means that the
 back end is likewise implemented by a grammar of the target
 language (in this case, a fragment of JVM). This grammar is the
 declarative source from which the compiler back end is derived.
-In addition, some simple string processing is needed to
+In addition, some postprocessing is needed to
 make the code conform to Jasmin assembler requirements.

-The complete code of the compiler is 300 lines. It is presented in 
-the appendices of this paper.
+The complete code of the compiler is 300 lines: 250 lines for the grammars,
+50 lines for the postprocessor. The code 
+is presented in the appendices of this paper.



@@ -148,6 +149,11 @@ GF then works much the same way as any grammar
 formalism or parser generator. 
 The largest number of languages in an application known to us is 88;
 its domain are numeral expressions from 1 to 999,999 \cite{gf-homepage}.
+
+In addition to linearization and parsing, GF supports grammar-based
+\empha{multilingual authoring} \cite{khegai}: interactive editing
+of abstract syntax trees with immediate feedback as linearized texts,
+and the possibility to textual through the parsers. 
 
 From the GF point of view, the goal of the compiler experiment
 is to investigate if GF is capable of implementing
@@ -191,10 +197,13 @@ hence needs some postprocessing.

 Using \HOAS\ to encode all bindings is sometimes cumbersome.
 \enqu
-The first two shortcomings seem to be inevitable with the technique
-we use. The best we can do with the JVM syntax is to use simple
-postprocessing, on string level, to obtain valid JVM. The last 
-shortcoming is partly inherent to the problem of binding:
+The first shortcoming seems to be inevitable with the technique
+we use: just like lambda calculus, our C semantics allows
+overshadowing of earlier bindings by later ones. 
+The second problem is systematically solved by using
+an intermediate JVM format, where symbolic variable addresses
+are used instead of numeric stack addresses.
+The last shortcoming is partly inherent in the problem of binding:
 to spell out, in any formal notation,
 what happens in complex binding structures \textit{is}
 complicated. But it also suggests ways in which GF could be 
@@ -232,8 +241,8 @@ until the next keyword is encountered.

 The abstract syntax that we will present is no doubt closer
 to C than to JVM. One reason is that what we are building is a
-\textit{C compiler}, and match with the target language is
-secondary consideration. Another, more general reason is tha
+\textit{C compiler}, and match with the target language is a
+secondary consideration. Another, more general reason is that
 C is a higher-level language and JVM which means, among
 other things, that C makes more semantic distinctions.
 In general, the abstract syntax of a translation system
@@ -258,8 +267,8 @@ that are needed to construct statements
    Var Typ ;
 \end{verbatim}
 The type \texttt{Typ} is the type of C's datatypes.
-The type (\texttt{Exp}) of expressions is a dependent type, 
-since it has a nonempty context, indicating that \texttt{Exp} take
+The type of expressions is a dependent type, 
+since it has a nonempty context, indicating that \texttt{Exp} takes
 a \texttt{Typ} as argument. The rules for \texttt{Exp}
 will thus be rules to construct well-typed expressions of
 a given type. \texttt{Var}\ is the type of variables,
@@ -435,7 +444,8 @@ to concrete syntax is somewhat remote. Here is an example of
 the code of a function and its abstract syntax:
 \begin{verbatim}
                      let int = TNum TInt in
-  int fact (int n) {  Funct int int (\fact -> RecOne int (\n -> 
+  int fact            Funct (ConsTyp int NilTyp) int (\fact -> 
+        (int n) {                                       RecOne int (\n -> 
    int f ;             Decl int (\f -> 
    f = 1 ;             Assign int f (EInt 1) ( 
    while (1 < n) {     While (ELt int (EInt 1) (EVar int n)) (Block (
@@ -521,8 +531,8 @@ extended with fields for each of the variable symbols:
 \;=\;
 \sugmap{b} *\!* \{\$_{0} = \sugmap{x_{0}} ; \ldots ; \$_{n} = \sugmap{x_{n}}\}
 \]
-Notice that the requirement the variable symbols can
-also be found because linearizable trees are in $\eta$-long normal form.
+Notice that the variable symbols can
+always be found because linearizable trees are in $\eta$-long normal form.
 Also notice that we are here using the 
 \sugmap{} notation in yet another way, to denote the magic operation
 that converts variable symbols into strings.
@@ -552,6 +562,11 @@ may not be recursive. It is due to these restrictions that
 we can always derive a parsing algorithm from a set of
 linearization rules.

+In addition to types defined in \texttt{param} judgements,
+initial segments of natural numbers, \texttt{Ints n},
+can be used as parameter types. This is the most important parameter
+type we use in the syntax of C, to represent precedence.
+
 The following string operations are useful in almost
 all grammars. They are actually included in a GF \texttt{Prelude},
 but are here defined from scratch to make the code shown in
@@ -571,9 +586,9 @@ the Appendices complete.
 We want to be able to recognize and generate one and the same expression with
 or without parentheses, depending on whether its precedence level
 is lower or higher than expected. For instance, a sum used as
-an operand of multiplication should be in parentheses. We
+an operand of multiplication must be in parentheses. We
 capture this by defining a parameter type of
-precedence levels. Four levels are enough for the present
+precedence levels. Five levels are enough for the present
 fragment of C, so we use the enumeration type of 
 integers from 0 to 4 to define the \empha{inherent precedence level}
 of an expression
@@ -588,7 +603,7 @@ in a resource module (see Appendix D), and
 \end{verbatim}
 in the concrete syntax of C itself. 

-To state that an expression has a certain inherent precedence level,
+To build an expression that has a certain inherent precedence level,
 we use the operation
 \begin{verbatim}
    mkPrec : Prec -> Str -> PrecExp = \p,s -> {s = s ; p = p} ;
@@ -598,8 +613,8 @@ we define a function that says that, if the inherent level is lower
 than the expected level, parentheses are required. 
 \begin{verbatim}
    usePrec : PrecExp -> Prec -> Str = \x,p ->
-      ifThenElse 
-        (Predef.lessInt x.p p) 
+      ifThenStr
+        (less x.p p) 
        (paren x.s)
        x.s ;
 \end{verbatim}
@@ -731,13 +746,13 @@ components into a linear structure:

 JVM syntax is, linguistically, more straightforward than
 the syntax of C, and could even be defined by a regular
-expression. However, the JVM syntax that our compiler is
-generating does not comprise full JVM, but only the fragment
+expression. However, the JVM syntax that our compiler
+generates does not comprise full JVM, but only the fragment
 that corresponds to well-formed C programs.

-The JVM syntax we use is from the Jasmin assembler
-\cite{jasmin}, with some deviations which are corrected
-by a postprocessor. The main deviation are
+The JVM syntax we use is a symbolic variant of the Jasmin assembler
+\cite{jasmin}.
+The main deviation from Jasmin are
 variable addresses, as described in Section~\ref{postproc}.
 The other deviations have to do with spacing: the normal
 unlexer of GF puts spaces between constituents, whereas
@@ -789,8 +804,9 @@ is the generation of fresh labels for
 jumps. We solve this in linearization
 by maintaining a growing label suffix
 as a field of the linearization of statements into
-instructions. The problem remains that two branches
-in an \texttt{if-else} statement can use the same
+instructions. The problem remains that statements on the
+same nesting level, e.g.\ the two branches
+of an \texttt{if-else} statement can use the same
 labels. Making them unique must be
 added to the post-processing pass. This is
 always possible, because labels are nested in a
@@ -799,7 +815,7 @@ disciplined way, and jumps can never go to remote labels.
 As it turned out laborious to thread the label counter
 to expressions, we decided to compile comparison 
 expressions (\verb6x < y6) into function calls, and provide the functions in
-a run-time library. This would no more work for the 
+a run-time library. This will no more work for the 
 conjunction (\verb6x && y6)
 and disjunction (\verb6x || y6), if we want to keep their semantics
 lazy, since function calls are strict in their arguments.
@@ -840,7 +856,7 @@ target languages.

 \subsection{Problems with the JVM bytecode verifier}

-An important restriction for linearization in GF is compositionality.
+An inherent restriction for linearization in GF is compositionality.
 This prevents optimizations during linearization
 by clever instruction selection, elimination of superfluous
 labels and jumps, etc. One such optimization, the removal
@@ -848,12 +864,13 @@ of unreachable code (i.e.\ code after a \texttt{return} instruction)
 is actually required by the JVM byte code verifier.
 The solution is, again, to perform this optimization in postprocessing.
 What we currently do, however, is to be careful and write
-C programs so that they always end with a return statement.
+C programs so that they always end with a return statement in the
+outermost block.

 Another problem related to \texttt{return} instructions is that
 both C and JVM programs have a designated \texttt{main} function.
 This function must have a certain type, which is different in C and
-JVM. In C, \texttt{main} returns an integer encoding what runtime
+JVM. In C, \texttt{main} returns an integer encoding what
 errors may have happend during execution. The JVM 
 \texttt{main}, on the other hand, returns a \texttt{void}, i.e.\
 no value at all. A \texttt{main} program returning an
@@ -866,6 +883,13 @@ The parameter list of \texttt{main} is also different in C (empty list)
 and JVM (a string array \texttt{args}). We handle this problem
 with an \empha{ad hoc} postprocessor rule.

+Every function prelude in JVM must indicate the maximum space for
+local variables, and the maximum evaluation stack space (within
+the function's own stack frame). The locals limit is computed in
+linearization by maintaining a counter field. The stack limit
+is blindly set to 1000; it would be possible to set an
+accurate limit in the postprocessing phase.
+

 \section{Translation as linearization vs.\ transfer}

@@ -890,12 +914,16 @@ function from the abstract syntax of C to a different abstract
 syntax of JVM. The abstract syntax notation of GF permits
 definitions of functions, and the GF interpreter can be used
 for evaluating terms into normal form. Thus one could write
+the code generator just like in any functional language:
+by sending in an environment and a syntax tree, and
+returning a new environment with an instruction list:
 \begin{verbatim}
  fun 
    transStm : Env -> Stm -> EnvInstr ;
  def
-    transStm env (Decl typ rest) = ... 
-    transStm env (Assign typ var exp rest) = ... 
+    transStm env (Decl typ cont) = ... 
+    transStm env (While (ELt a b) stm cont) = ... 
+    transStm env (While exp stm cont) = ... 
 \end{verbatim}
 This would be cumbersome in practice, because
 GF does not have programming-language facilities 
@@ -937,6 +965,12 @@ performs at parsing; it is much more difficult to do this for
 a grammar written in the abstract way that GF permits (cf.\ the
 example in Appendix B).

+The current version of the C grammar is ambiguous. GF's own
+parser returns all alternatives, whereas the parser generated by
+Happy rules out some of them by its normal conflict handling
+policy. This means, in practice, that extra brackets are
+sometimes needed to group staments together.
+

 \subsection{Another notation for \HOAS}

@@ -951,8 +985,8 @@ Compare this with a corresponding LBNF rule (also using a continuation):
 \begin{verbatim}
  Decl. Stm ::= Typ Ident ";" Stm ;
 \end{verbatim}
-To explain bindings attached to this rule, one can say, in English, 
-that the identifier gets bound in the following statement.
+To explain bindings attached to this rule, one can say, in natural language, 
+that the identifier gets bound in the statement that follows.
 This means that syntax trees formed by this rule do not have 
 the form \verb6(Decl typ x stm)6, but the form \verb6(Decl typ (\x -> stm))6.

@@ -990,7 +1024,7 @@ by linearization are explained in Section~\ref{postproc} above.

 In addition to the batch compiler, GF provides an interactive 
 syntax editor, in which C programs can be constructed by
-stepwise refinements, local changes, etc. The user of the
+stepwise refinements, local changes, etc.\ \cite{khegai}. The user of the
 editor can work simultaneously on all languages involved.
 In our case, this means that changes can be done both to
 the C code and to the JVM code, and they are automatically
@@ -1062,18 +1096,24 @@ possible. To build a parser that is more efficient than
 GF's generic one, GF offers code generation for standard 
 parser tools.

+One result of the experiment is the beginning of a
+library for dealing with typical programming language structures
+such as precedence. This library is exploited in the parser
+generator, which maps certain parameters used into GF grammars
+into precedence directives in labelled BNF grammars.
+
 The most serious difficulty with JVM code generation by linearization
 is to maintain a symbol table mapping variables to addresses.
 The solution we have chosen is to generate Symbolic JVM, that is,
 JVM with symbolic addresses, and translate the symbolic addresses to
 (relative) memory locations by a postprocessor. 

-Since the postprocessor works uniformly for whole Symbolic JVM,
+Since the postprocessor works uniformly for the whole Symbolic JVM,
 building a new compiler to generate JVM should now be 
 possible by just writing GF grammars. The most immediate
 idea for developing GF as a compiler tool is to define
 a similar symbolic format for an intermediate language,
-using three-operand code and virtual registers.
+which uses three-operand code and virtual registers.



@@ -1337,30 +1377,15 @@ resource ResImper = open Predef in {

  -- string operations

-    SS  : Type = {s : Str} ;
-    ss  : Str -> SS = \s -> {s = s} ;
-    cc2 : (_,_ : SS) -> SS = \x,y -> ss (x.s ++ y.s) ;
-
+    SS    : Type = {s : Str} ;
+    ss    : Str -> SS = \s -> {s = s} ;
+    cc2   : (_,_ : SS) -> SS = \x,y -> ss (x.s ++ y.s) ;
    paren : Str -> Str = \str -> "(" ++ str ++ ")" ;

    continues : Str -> SS -> SS = \s,t -> ss (s ++ ";" ++ t.s) ; 
    continue  : Str -> SS -> SS = \s,t -> ss (s ++ t.s) ;
    statement : Str -> SS       = \s   -> ss (s ++ ";"); 

-  -- taking cases of list size
-
-  param
-    Size = Zero | One | More ;
-  oper
-    nextSize : Size -> Size = \n -> case n of {
-      Zero => One ;
-      _    => More
-      } ;
-    separator : Str -> Size -> Str = \t,n -> case n of {
-      Zero => [] ;
-     _ => t
-     } ;
-
  -- operations for JVM

    Instr  : Type = {s,s2,s3 : Str} ; -- code, variables, labels
@@ -15,6 +15,8 @@ mkdir -pv gf-grammars/resource/romance
 mkdir -pv gf-grammars/resource/russian
 mkdir -pv gf-grammars/resource/swedish
 mkdir -pv gf-grammars/database
+mkdir -pv gf-grammars/imperative
+mkdir -pv gf-grammars/imperative/compiler

 cp -pv letter/README gf-grammars/letter/
 cp -pv letter/mkLetter.gfs gf-grammars/letter/
@@ -34,6 +36,7 @@ cp -pv newresource/mkParadigms.gfs gf-grammars/resource/
 cp -pv newresource/README gf-grammars/resource/
 cp -pv newresource/abstract/*.gf gf-grammars/resource/abstract/
 cp -pv newresource/english/*.gf gf-grammars/resource/english/
+rm -f  gf-grammars/resource/english/ResLex*
 cp -pv newresource/finnish/*.gf gf-grammars/resource/finnish/
 cp -pv newresource/french/*.gf gf-grammars/resource/french/
 cp -pv newresource/german/*.gf gf-grammars/resource/german/
@@ -45,4 +48,15 @@ cp -pv newresource/swedish/*.gf gf-grammars/resource/swedish/
 cp -pv database/README gf-grammars/database/
 cp -pv database/*.gf gf-grammars/database/

+cp -pv imperative/*.gf gf-grammars/imperative/
+cp -pv imperative/compiler/*.hs gf-grammars/imperative/compiler/
+cp -pv imperative/compiler/README gf-grammars/imperative/compiler/
+cp -pv imperative/compiler/FILES gf-grammars/imperative/compiler/
+cp -pv imperative/compiler/gfcc gf-grammars/imperative/compiler/
+cp -pv imperative/compiler/makefile gf-grammars/imperative/compiler/
+cp -pv imperative/compiler/*.c gf-grammars/imperative/compiler/
+cp -pv imperative/compiler/*.gfs gf-grammars/imperative/compiler/
+cp -pv imperative/compiler/runtime.j gf-grammars/imperative/compiler/
+
+
 tar cvfz gf-grammars.tgz gf-grammars
@@ -75,7 +75,7 @@ welcomeMsg =
  "Welcome to " ++ authorMsg ++++ welcomeArch ++ "\n\nType 'h' for help."

 authorMsg = unlines [
- "Grammatical Framework, Version 2.0+",
+ "Grammatical Framework, Version 2.1 beta",
 "Compiled " ++ today,
 "Copyright (c)", 
 "Björn Bringert, Markus Forsberg, Thomas Hallgren, Harald Hammarström,",
@@ -22,7 +22,7 @@ import List (nub)
 prLBNF :: Bool -> StateGrammar -> String
 prLBNF new gr = unlines $ pragmas ++ (map (prCFRule cs) rules)
  where    
-    cs = map IC ["Int","String"] ++ [catId c | (_,(c,_)) <- rules]
+    cs = map IC ["Int","String"] ++ [catIdPlus c | (_,(c,_)) <- rules]
    cf = stateCF gr
    (pragmas,rules) = if new  -- tries to treat precedence levels
                         then mkLBNF (stateGrammarST gr) $ rulesOfCF cf
@@ -82,6 +82,10 @@ mkLBNF gr rules = (coercions, nub $ concatMap mkRule rules) where

 catId ((CFCat ((CIQ _ c),l))) = c

+catIdPlus ((CFCat ((CIQ _ c@(IC s)),l))) = case reverse s of
+  '+':cs -> IC $ reverse $ dropWhile isDigit cs
+  _ -> c
+
 prCFRule :: [Ident] -> CFRule -> String
 prCFRule cs (fun,(cat,its)) = 
  prCFFun cat fun ++ "." +++ prCFCat True cat +++ "::=" +++  --- err in cat -> in syntax
@@ -121,7 +125,7 @@ prCFCat :: Bool -> CFCat -> String
 prCFCat b (CFCat ((CIQ _ c),l)) = prId b c ++ prLab l ----

 -- if a category does not have a production of its own, we replace it by Ident
-prCFItem cs (CFNonterm c) = if elem (catId c) cs then prCFCat False c else "Ident"
+prCFItem cs (CFNonterm c) = if elem (catIdPlus c) cs then prCFCat False c else "Ident"
 prCFItem _ (CFTerm a) = prRegExp a

 prRegExp (RegAlts tt) = case tt of
@@ -435,8 +435,11 @@ q, quit: q
    -printer=gfc            GFC grammar
    -printer=gf             GF grammar
    -printer=old            old GF grammar
-    -printer=cf             context-free grammar
-   *-printer=happy          source file for Happy parser generator
+    -printer=cf             context-free grammar, with profiles
+    -printer=bnf            context-free grammar, without profiles
+    -printer=lbnf           labelled context-free grammar for BNF Converter
+    -printer=plbnf          grammar for BNF Converter, with precedence levels
+   *-printer=happy          source file for Happy parser generator (use lbnf!)
    -printer=srg            speech recognition grammar
    -printer=haskell        abstract syntax in Haskell, with transl to/from GF
    -printer=morpho         full-form lexicon, long format
@@ -448,8 +448,11 @@ txtHelpFile =
  "\n    -printer=gfc            GFC grammar" ++
  "\n    -printer=gf             GF grammar" ++
  "\n    -printer=old            old GF grammar" ++
-  "\n    -printer=cf             context-free grammar" ++
-  "\n   *-printer=happy          source file for Happy parser generator" ++
+  "\n    -printer=cf             context-free grammar, with profiles" ++
+  "\n    -printer=bnf            context-free grammar, without profiles" ++
+  "\n    -printer=lbnf           labelled context-free grammar for BNF Converter" ++
+  "\n    -printer=plbnf          grammar for BNF Converter, with precedence levels" ++
+  "\n   *-printer=happy          source file for Happy parser generator (use lbnf!)" ++
  "\n    -printer=srg            speech recognition grammar" ++
  "\n    -printer=haskell        abstract syntax in Haskell, with transl to/from GF" ++
  "\n    -printer=morpho         full-form lexicon, long format" ++