From 57ff3cc5a391ca95a31357e4fed6ee456a2c6dcc Mon Sep 17 00:00:00 2001 From: aarne Date: Tue, 28 Sep 2004 19:44:02 +0000 Subject: [PATCH] gfcc report --- examples/gfcc/ImperC.gf | 2 +- examples/gfcc/compiler/FILES | 6 +- examples/gfcc/compiler/factorial.c | 19 +++- examples/gfcc/compiler/gfcc | 4 +- examples/gfcc/compiler/makefile | 4 +- examples/gfcc/complin.tex | 135 +++++++++++++++++------------ grammars/mkDistr.sh | 14 +++ src/GF.hs | 2 +- src/GF/CF/PrLBNF.hs | 8 +- src/HelpFile | 7 +- src/HelpFile.hs | 7 +- 11 files changed, 137 insertions(+), 71 deletions(-) diff --git a/examples/gfcc/ImperC.gf b/examples/gfcc/ImperC.gf index fe79cab9a..191cdcf26 100644 --- a/examples/gfcc/ImperC.gf +++ b/examples/gfcc/ImperC.gf @@ -1,6 +1,6 @@ --# -path=.:../prelude concrete ImperC of Imper = open ResImper in { - flags lexer=codevars ; unlexer=code ; startcat=Stm ; + flags lexer=codevars ; unlexer=code ; startcat=Program ; lincat Exp = PrecExp ; diff --git a/examples/gfcc/compiler/FILES b/examples/gfcc/compiler/FILES index b33a27a1c..a52e74c9e 100644 --- a/examples/gfcc/compiler/FILES +++ b/examples/gfcc/compiler/FILES @@ -28,12 +28,12 @@ jvm.tmp -- pseudo-JVM produced by GF linearization Required programs to use the compiler: ------------------------------------- -gf+ -- Grammatical Framework version 2.0+, >= 23/9/2004 -jasmin -- JVM assembler (to compile Foo.j to Foo.class) +gf+ -- Grammatical Framework version 2.1beta, >= 23/9/2004 +jasmin -- JVM assembler (to compile Foo.j to Foo.class) Required programs to build the compiler: --------------------------------------- -bnfc -- BNF Converter version 2.1+, >= 23/9/2004 +bnfc -- BNF Converter version 2.2beta, >= 23/9/2004 happy -- parser generator for Haskell, >= 1.13 alex -- lexer generator for Haskell, >= 2.0 Profile.hs -- BNFC source file (formats/profile), must be on your path diff --git a/examples/gfcc/compiler/factorial.c b/examples/gfcc/compiler/factorial.c index 2a1c3f5f3..76fee32d0 100644 --- 
a/examples/gfcc/compiler/factorial.c +++ b/examples/gfcc/compiler/factorial.c @@ -10,11 +10,28 @@ int fact (int n) { return f ; } ; +int factr (int n) { + int f ; + { + if (n < 2) { + f = 1 ; + } + else { + f = n * factr (n-1) ; + } + } + return f ; +} ; + int main () { int n ; n = 1 ; { - while (n < 11) printf("%d",fact(n)) ; n = n+1 ; + while (n < 11) { + printf("%d",fact(n)) ; + printf("%d",factr(n)) ; + n = n+1 ; + } } return ; } ; diff --git a/examples/gfcc/compiler/gfcc b/examples/gfcc/compiler/gfcc index c36e42404..9750b8133 100644 --- a/examples/gfcc/compiler/gfcc +++ b/examples/gfcc/compiler/gfcc @@ -1,4 +1,4 @@ ./TestImperC $1 | tail -1 >gft.tmp -echo "es -file=typecheck.gfs" | gf+ -s Imper.gfcm +echo "es -file=typecheck.gfs" | gf -s Imper.gfcm runhugs CleanJVM jvm.tmp $1 -#rm *.tmp +rm *.tmp diff --git a/examples/gfcc/compiler/makefile b/examples/gfcc/compiler/makefile index 802d024cc..cc175731b 100644 --- a/examples/gfcc/compiler/makefile +++ b/examples/gfcc/compiler/makefile @@ -1,10 +1,10 @@ -GF=gf+ +GF=gf SRC=../ all: compiler compiler: - echo "pm | wf Imper.gfcm ;; pg -lang=ImperC -printer=lbnf | wf ImperC.tmp" | $(GF) $(SRC)ImperC.gf $(SRC)ImperJVM.gf + echo "pm | wf Imper.gfcm ;; pg -lang=ImperC -printer=plbnf | wf ImperC.tmp" | $(GF) $(SRC)ImperC.gf $(SRC)ImperJVM.gf echo "entrypoints Program, Stm, Exp ;" >entry.tmp cat entry.tmp ImperC.tmp >ImperC.cf bnfc -m -prof ImperC.cf diff --git a/examples/gfcc/complin.tex b/examples/gfcc/complin.tex index 32fdb87a7..625ceed6e 100644 --- a/examples/gfcc/complin.tex +++ b/examples/gfcc/complin.tex @@ -81,11 +81,12 @@ The second point, code generation by linearization, means that the back end is likewise implemented by a grammar of the target language (in this case, a fragment of JVM). This grammar is the declarative source from which the compiler back end is derived. 
-In addition, some simple string processing is needed to +In addition, some postprocessing is needed to make the code conform to Jasmin assembler requirements. -The complete code of the compiler is 300 lines. It is presented in -the appendices of this paper. +The complete code of the compiler is 300 lines: 250 lines for the grammars, +50 lines for the postprocessor. The code +is presented in the appendices of this paper. @@ -148,6 +149,11 @@ GF then works much the same way as any grammar formalism or parser generator. The largest number of languages in an application known to us is 88; its domain are numeral expressions from 1 to 999,999 \cite{gf-homepage}. + +In addition to linearization and parsing, GF supports grammar-based +\empha{multilingual authoring} \cite{khegai}: interactive editing +of abstract syntax trees with immediate feedback as linearized texts, +and the possibility of textual input through the parsers. From the GF point of view, the goal of the compiler experiment is to investigate if GF is capable of implementing @@ -191,10 +197,13 @@ hence needs some postprocessing. Using \HOAS\ to encode all bindings is sometimes cumbersome. \enqu -The first two shortcomings seem to be inevitable with the technique -we use. The best we can do with the JVM syntax is to use simple -postprocessing, on string level, to obtain valid JVM. The last -shortcoming is partly inherent to the problem of binding: +The first shortcoming seems to be inevitable with the technique +we use: just like lambda calculus, our C semantics allows +overshadowing of earlier bindings by later ones. +The second problem is systematically solved by using +an intermediate JVM format, where symbolic variable addresses +are used instead of numeric stack addresses. +The last shortcoming is partly inherent in the problem of binding: to spell out, in any formal notation, what happens in complex binding structures \textit{is} complicated. 
But it also suggests ways in which GF could be @@ -232,8 +241,8 @@ until the next keyword is encountered. The abstract syntax that we will present is no doubt closer to C than to JVM. One reason is that what we are building is a -\textit{C compiler}, and match with the target language is -secondary consideration. Another, more general reason is tha +\textit{C compiler}, and match with the target language is a +secondary consideration. Another, more general reason is that C is a higher-level language and JVM which means, among other things, that C makes more semantic distinctions. In general, the abstract syntax of a translation system @@ -258,8 +267,8 @@ that are needed to construct statements Var Typ ; \end{verbatim} The type \texttt{Typ} is the type of C's datatypes. -The type (\texttt{Exp}) of expressions is a dependent type, -since it has a nonempty context, indicating that \texttt{Exp} take +The type of expressions is a dependent type, +since it has a nonempty context, indicating that \texttt{Exp} takes a \texttt{Typ} as argument. The rules for \texttt{Exp} will thus be rules to construct well-typed expressions of a given type. \texttt{Var}\ is the type of variables, @@ -435,7 +444,8 @@ to concrete syntax is somewhat remote. Here is an example of the code of a function and its abstract syntax: \begin{verbatim} let int = TNum TInt in - int fact (int n) { Funct int int (\fact -> RecOne int (\n -> + int fact Funct (ConsTyp int NilTyp) int (\fact -> + (int n) { RecOne int (\n -> int f ; Decl int (\f -> f = 1 ; Assign int f (EInt 1) ( while (1 < n) { While (ELt int (EInt 1) (EVar int n)) (Block ( @@ -521,8 +531,8 @@ extended with fields for each of the variable symbols: \;=\; \sugmap{b} *\!* \{\$_{0} = \sugmap{x_{0}} ; \ldots ; \$_{n} = \sugmap{x_{n}}\} \] -Notice that the requirement the variable symbols can -also be found because linearizable trees are in $\eta$-long normal form. 
+Notice that the variable symbols can +always be found because linearizable trees are in $\eta$-long normal form. Also notice that we are here using the \sugmap{} notation in yet another way, to denote the magic operation that converts variable symbols into strings. @@ -552,6 +562,11 @@ may not be recursive. It is due to these restrictions that we can always derive a parsing algorithm from a set of linearization rules. +In addition to types defined in \texttt{param} judgements, +initial segments of natural numbers, \texttt{Ints n}, +can be used as parameter types. This is the most important parameter +type we use in the syntax of C, to represent precedence. + The following string operations are useful in almost all grammars. They are actually included in a GF \texttt{Prelude}, but are here defined from scratch to make the code shown in @@ -571,9 +586,9 @@ the Appendices complete. We want to be able to recognize and generate one and the same expression with or without parentheses, depending on whether its precedence level is lower or higher than expected. For instance, a sum used as -an operand of multiplication should be in parentheses. We +an operand of multiplication must be in parentheses. We capture this by defining a parameter type of -precedence levels. Four levels are enough for the present +precedence levels. Five levels are enough for the present fragment of C, so we use the enumeration type of integers from 0 to 4 to define the \empha{inherent precedence level} of an expression @@ -588,7 +603,7 @@ in a resource module (see Appendix D), and \end{verbatim} in the concrete syntax of C itself. 
-To state that an expression has a certain inherent precedence level, +To build an expression that has a certain inherent precedence level, we use the operation \begin{verbatim} mkPrec : Prec -> Str -> PrecExp = \p,s -> {s = s ; p = p} ; @@ -598,8 +613,8 @@ we define a function that says that, if the inherent level is lower than the expected level, parentheses are required. \begin{verbatim} usePrec : PrecExp -> Prec -> Str = \x,p -> - ifThenElse - (Predef.lessInt x.p p) + ifThenStr + (less x.p p) (paren x.s) x.s ; \end{verbatim} @@ -731,13 +746,13 @@ components into a linear structure: JVM syntax is, linguistically, more straightforward than the syntax of C, and could even be defined by a regular -expression. However, the JVM syntax that our compiler is -generating does not comprise full JVM, but only the fragment +expression. However, the JVM syntax that our compiler +generates does not comprise full JVM, but only the fragment that corresponds to well-formed C programs. -The JVM syntax we use is from the Jasmin assembler -\cite{jasmin}, with some deviations which are corrected -by a postprocessor. The main deviation are +The JVM syntax we use is a symbolic variant of the Jasmin assembler +\cite{jasmin}. +The main deviations from Jasmin are variable addresses, as described in Section~\ref{postproc}. The other deviations have to do with spacing: the normal unlexer of GF puts spaces between constituents, whereas @@ -789,8 +804,9 @@ is the generation of fresh labels for jumps. We solve this in linearization by maintaining a growing label suffix as a field of the linearization of statements into -instructions. The problem remains that two branches -in an \texttt{if-else} statement can use the same +instructions. The problem remains that statements on the +same nesting level, e.g.\ the two branches +of an \texttt{if-else} statement, can use the same labels. Making them unique must be added to the post-processing pass. 
This is always possible, because labels are nested in a @@ -799,7 +815,7 @@ disciplined way, and jumps can never go to remote labels. As it turned out laborious to thread the label counter to expressions, we decided to compile comparison expressions (\verb6x < y6) into function calls, and provide the functions in -a run-time library. This would no more work for the +a run-time library. This will no more work for the conjunction (\verb6x && y6) and disjunction (\verb6x || y6), if we want to keep their semantics lazy, since function calls are strict in their arguments. @@ -840,7 +856,7 @@ target languages. \subsection{Problems with the JVM bytecode verifier} -An important restriction for linearization in GF is compositionality. +An inherent restriction for linearization in GF is compositionality. This prevents optimizations during linearization by clever instruction selection, elimination of superfluous labels and jumps, etc. One such optimization, the removal @@ -848,12 +864,13 @@ of unreachable code (i.e.\ code after a \texttt{return} instruction) is actually required by the JVM byte code verifier. The solution is, again, to perform this optimization in postprocessing. What we currently do, however, is to be careful and write -C programs so that they always end with a return statement. +C programs so that they always end with a return statement in the +outermost block. Another problem related to \texttt{return} instructions is that both C and JVM programs have a designated \texttt{main} function. This function must have a certain type, which is different in C and -JVM. In C, \texttt{main} returns an integer encoding what runtime +JVM. In C, \texttt{main} returns an integer encoding what errors may have happend during execution. The JVM \texttt{main}, on the other hand, returns a \texttt{void}, i.e.\ no value at all. 
A \texttt{main} program returning an @@ -866,6 +883,13 @@ The parameter list of \texttt{main} is also different in C (empty list) and JVM (a string array \texttt{args}). We handle this problem with an \empha{ad hoc} postprocessor rule. +Every function prelude in JVM must indicate the maximum space for +local variables, and the maximum evaluation stack space (within +the function's own stack frame). The locals limit is computed in +linearization by maintaining a counter field. The stack limit +is blindly set to 1000; it would be possible to set an +accurate limit in the postprocessing phase. + \section{Translation as linearization vs.\ transfer} @@ -890,12 +914,16 @@ function from the abstract syntax of C to a different abstract syntax of JVM. The abstract syntax notation of GF permits definitions of functions, and the GF interpreter can be used for evaluating terms into normal form. Thus one could write +the code generator just like in any functional language: +by sending in an environment and a syntax tree, and +returning a new environment with an instruction list: \begin{verbatim} fun transStm : Env -> Stm -> EnvInstr ; def - transStm env (Decl typ rest) = ... - transStm env (Assign typ var exp rest) = ... + transStm env (Decl typ cont) = ... + transStm env (While (ELt a b) stm cont) = ... + transStm env (While exp stm cont) = ... \end{verbatim} This would be cumbersome in practice, because GF does not have programming-language facilities @@ -937,6 +965,12 @@ performs at parsing; it is much more difficult to do this for a grammar written in the abstract way that GF permits (cf.\ the example in Appendix B). +The current version of the C grammar is ambiguous. GF's own +parser returns all alternatives, whereas the parser generated by +Happy rules out some of them by its normal conflict handling +policy. This means, in practice, that extra brackets are +sometimes needed to group statements together. 
+ \subsection{Another notation for \HOAS} @@ -951,8 +985,8 @@ Compare this with a corresponding LBNF rule (also using a continuation): \begin{verbatim} Decl. Stm ::= Typ Ident ";" Stm ; \end{verbatim} -To explain bindings attached to this rule, one can say, in English, -that the identifier gets bound in the following statement. +To explain bindings attached to this rule, one can say, in natural language, +that the identifier gets bound in the statement that follows. This means that syntax trees formed by this rule do not have the form \verb6(Decl typ x stm)6, but the form \verb6(Decl typ (\x -> stm))6. @@ -990,7 +1024,7 @@ by linearization are explained in Section~\ref{postproc} above. In addition to the batch compiler, GF provides an interactive syntax editor, in which C programs can be constructed by -stepwise refinements, local changes, etc. The user of the +stepwise refinements, local changes, etc.\ \cite{khegai}. The user of the editor can work simultaneously on all languages involved. In our case, this means that changes can be done both to the C code and to the JVM code, and they are automatically @@ -1062,18 +1096,24 @@ possible. To build a parser that is more efficient than GF's generic one, GF offers code generation for standard parser tools. +One result of the experiment is the beginning of a +library for dealing with typical programming language structures +such as precedence. This library is exploited in the parser +generator, which maps certain parameters used in GF grammars +into precedence directives in labelled BNF grammars. + The most serious difficulty with JVM code generation by linearization is to maintain a symbol table mapping variables to addresses. The solution we have chosen is to generate Symbolic JVM, that is, JVM with symbolic addresses, and translate the symbolic addresses to (relative) memory locations by a postprocessor. 
-Since the postprocessor works uniformly for whole Symbolic JVM, +Since the postprocessor works uniformly for the whole Symbolic JVM, building a new compiler to generate JVM should now be possible by just writing GF grammars. The most immediate idea for developing GF as a compiler tool is to define a similar symbolic format for an intermediate language, -using three-operand code and virtual registers. +which uses three-operand code and virtual registers. @@ -1337,30 +1377,15 @@ resource ResImper = open Predef in { -- string operations - SS : Type = {s : Str} ; - ss : Str -> SS = \s -> {s = s} ; - cc2 : (_,_ : SS) -> SS = \x,y -> ss (x.s ++ y.s) ; - + SS : Type = {s : Str} ; + ss : Str -> SS = \s -> {s = s} ; + cc2 : (_,_ : SS) -> SS = \x,y -> ss (x.s ++ y.s) ; paren : Str -> Str = \str -> "(" ++ str ++ ")" ; continues : Str -> SS -> SS = \s,t -> ss (s ++ ";" ++ t.s) ; continue : Str -> SS -> SS = \s,t -> ss (s ++ t.s) ; statement : Str -> SS = \s -> ss (s ++ ";"); - -- taking cases of list size - - param - Size = Zero | One | More ; - oper - nextSize : Size -> Size = \n -> case n of { - Zero => One ; - _ => More - } ; - separator : Str -> Size -> Str = \t,n -> case n of { - Zero => [] ; - _ => t - } ; - -- operations for JVM Instr : Type = {s,s2,s3 : Str} ; -- code, variables, labels diff --git a/grammars/mkDistr.sh b/grammars/mkDistr.sh index 02806784a..c88140148 100644 --- a/grammars/mkDistr.sh +++ b/grammars/mkDistr.sh @@ -15,6 +15,8 @@ mkdir -pv gf-grammars/resource/romance mkdir -pv gf-grammars/resource/russian mkdir -pv gf-grammars/resource/swedish mkdir -pv gf-grammars/database +mkdir -pv gf-grammars/imperative +mkdir -pv gf-grammars/imperative/compiler cp -pv letter/README gf-grammars/letter/ cp -pv letter/mkLetter.gfs gf-grammars/letter/ @@ -34,6 +36,7 @@ cp -pv newresource/mkParadigms.gfs gf-grammars/resource/ cp -pv newresource/README gf-grammars/resource/ cp -pv newresource/abstract/*.gf gf-grammars/resource/abstract/ cp -pv newresource/english/*.gf 
gf-grammars/resource/english/ +rm -f gf-grammars/resource/english/ResLex* cp -pv newresource/finnish/*.gf gf-grammars/resource/finnish/ cp -pv newresource/french/*.gf gf-grammars/resource/french/ cp -pv newresource/german/*.gf gf-grammars/resource/german/ @@ -45,4 +48,15 @@ cp -pv newresource/swedish/*.gf gf-grammars/resource/swedish/ cp -pv database/README gf-grammars/database/ cp -pv database/*.gf gf-grammars/database/ +cp -pv imperative/*.gf gf-grammars/imperative/ +cp -pv imperative/compiler/*.hs gf-grammars/imperative/compiler/ +cp -pv imperative/compiler/README gf-grammars/imperative/compiler/ +cp -pv imperative/compiler/FILES gf-grammars/imperative/compiler/ +cp -pv imperative/compiler/gfcc gf-grammars/imperative/compiler/ +cp -pv imperative/compiler/makefile gf-grammars/imperative/compiler/ +cp -pv imperative/compiler/*.c gf-grammars/imperative/compiler/ +cp -pv imperative/compiler/*.gfs gf-grammars/imperative/compiler/ +cp -pv imperative/compiler/runtime.j gf-grammars/imperative/compiler/ + + tar cvfz gf-grammars.tgz gf-grammars diff --git a/src/GF.hs b/src/GF.hs index 451a519a7..c573e0e1c 100644 --- a/src/GF.hs +++ b/src/GF.hs @@ -75,7 +75,7 @@ welcomeMsg = "Welcome to " ++ authorMsg ++++ welcomeArch ++ "\n\nType 'h' for help." 
authorMsg = unlines [ - "Grammatical Framework, Version 2.0+", + "Grammatical Framework, Version 2.1 beta", "Compiled " ++ today, "Copyright (c)", "Björn Bringert, Markus Forsberg, Thomas Hallgren, Harald Hammarström,", diff --git a/src/GF/CF/PrLBNF.hs b/src/GF/CF/PrLBNF.hs index fe06cbf9e..a0b33de87 100644 --- a/src/GF/CF/PrLBNF.hs +++ b/src/GF/CF/PrLBNF.hs @@ -22,7 +22,7 @@ import List (nub) prLBNF :: Bool -> StateGrammar -> String prLBNF new gr = unlines $ pragmas ++ (map (prCFRule cs) rules) where - cs = map IC ["Int","String"] ++ [catId c | (_,(c,_)) <- rules] + cs = map IC ["Int","String"] ++ [catIdPlus c | (_,(c,_)) <- rules] cf = stateCF gr (pragmas,rules) = if new -- tries to treat precedence levels then mkLBNF (stateGrammarST gr) $ rulesOfCF cf @@ -82,6 +82,10 @@ mkLBNF gr rules = (coercions, nub $ concatMap mkRule rules) where catId ((CFCat ((CIQ _ c),l))) = c +catIdPlus ((CFCat ((CIQ _ c@(IC s)),l))) = case reverse s of + '+':cs -> IC $ reverse $ dropWhile isDigit cs + _ -> c + prCFRule :: [Ident] -> CFRule -> String prCFRule cs (fun,(cat,its)) = prCFFun cat fun ++ "." 
+++ prCFCat True cat +++ "::=" +++ --- err in cat -> in syntax @@ -121,7 +125,7 @@ prCFCat :: Bool -> CFCat -> String prCFCat b (CFCat ((CIQ _ c),l)) = prId b c ++ prLab l ---- -- if a category does not have a production of its own, we replace it by Ident -prCFItem cs (CFNonterm c) = if elem (catId c) cs then prCFCat False c else "Ident" +prCFItem cs (CFNonterm c) = if elem (catIdPlus c) cs then prCFCat False c else "Ident" prCFItem _ (CFTerm a) = prRegExp a prRegExp (RegAlts tt) = case tt of diff --git a/src/HelpFile b/src/HelpFile index 85b39c8d9..af09b5e01 100644 --- a/src/HelpFile +++ b/src/HelpFile @@ -435,8 +435,11 @@ q, quit: q -printer=gfc GFC grammar -printer=gf GF grammar -printer=old old GF grammar - -printer=cf context-free grammar - *-printer=happy source file for Happy parser generator + -printer=cf context-free grammar, with profiles + -printer=bnf context-free grammar, without profiles + -printer=lbnf labelled context-free grammar for BNF Converter + -printer=plbnf grammar for BNF Converter, with precedence levels + *-printer=happy source file for Happy parser generator (use lbnf!) 
-printer=srg speech recognition grammar -printer=haskell abstract syntax in Haskell, with transl to/from GF -printer=morpho full-form lexicon, long format diff --git a/src/HelpFile.hs b/src/HelpFile.hs index 3cc6b2ada..742279fc4 100644 --- a/src/HelpFile.hs +++ b/src/HelpFile.hs @@ -448,8 +448,11 @@ txtHelpFile = "\n -printer=gfc GFC grammar" ++ "\n -printer=gf GF grammar" ++ "\n -printer=old old GF grammar" ++ - "\n -printer=cf context-free grammar" ++ - "\n *-printer=happy source file for Happy parser generator" ++ + "\n -printer=cf context-free grammar, with profiles" ++ + "\n -printer=bnf context-free grammar, without profiles" ++ + "\n -printer=lbnf labelled context-free grammar for BNF Converter" ++ + "\n -printer=plbnf grammar for BNF Converter, with precedence levels" ++ + "\n *-printer=happy source file for Happy parser generator (use lbnf!)" ++ "\n -printer=srg speech recognition grammar" ++ "\n -printer=haskell abstract syntax in Haskell, with transl to/from GF" ++ "\n -printer=morpho full-form lexicon, long format" ++