1
0
forked from GitHub/gf-core

almost the final version

This commit is contained in:
aarne
2004-09-26 15:44:08 +00:00
parent a06dcaab30
commit 1b5c20dced
8 changed files with 364 additions and 192 deletions

View File

@@ -3,7 +3,8 @@ concrete ImperC of Imper = open ResImper in {
flags lexer=codevars ; unlexer=code ; startcat=Stm ;
lincat
Exp = PrecExp ;
Exp = PrecExp ;
Typ, NumTyp = {s,s2 : Str} ;
Rec = {s,s2,s3 : Str} ;
lin
@@ -30,7 +31,7 @@ concrete ImperC of Imper = open ResImper in {
While exp loop = continue ("while" ++ paren exp.s ++ loop.s) ;
IfElse exp t f = continue ("if" ++ paren exp.s ++ t.s ++ "else" ++ f.s) ;
Block stm = continue ("{" ++ stm.s ++ "}") ;
Printf t e = continues ("printf" ++ paren (t.s ++ "," ++ e.s)) ;
Printf t e = continues ("printf" ++ paren (t.s2 ++ "," ++ e.s)) ;
Return _ exp = statement ("return" ++ exp.s) ;
Returnv = statement "return" ;
End = ss [] ;
@@ -47,8 +48,8 @@ concrete ImperC of Imper = open ResImper in {
EApp args val f exps = constant (f.s ++ paren exps.s) ;
TNum t = t ;
TInt = ss "int" ;
TFloat = ss "float" ;
TInt = {s = "int" ; s2 = "\"%d\""} ;
TFloat = {s = "float" ; s2 = "\"%f\""} ;
NilTyp = ss [] ;
ConsTyp = cc2 ;
OneExp _ e = e ;

View File

@@ -3,7 +3,7 @@ module Main where
import Char
import System
--- now works for programs with exactly 2 functions, main last
--- translation from Symbolic JVM to real Jasmin code
main :: IO ()
main = do
@@ -18,30 +18,36 @@ main = do
return ()
mkJVM :: String -> String -> String
mkJVM cls = unlines . map trans . lines where
trans s = case words s of
mkJVM cls = unlines . reverse . fst . foldl trans ([],([],0)) . lines where
trans (code,(env,v)) s = case words s of
".method":p:s:f:ns
| take 5 f == "main_" -> ".method public static main([Ljava/lang/String;)V"
| otherwise -> unwords [".method",p,s, unindex f ++ typesig ns]
".limit":"locals":ns -> ".limit locals " ++ show (length ns)
"invokestatic":t:f:ns | take 8 f == "runtime/" ->
"invokestatic " ++ "runtime/" ++ t ++ drop 8 f ++ typesig ns
"invokestatic":f:ns -> "invokestatic " ++ cls ++ "/" ++ unindex f ++ typesig ns
"alloc":ns -> "; " ++ s
t:('_':instr):[] -> t ++ instr
t:('_':instr):x:_ -> t ++ instr ++ " " ++ address x
"goto":ns -> "goto " ++ label ns
"ifeq":ns -> "ifeq " ++ label ns
"label":ns -> label ns ++ ":"
";":[] -> ""
_ -> s
| f == "main" ->
(".method public static main([Ljava/lang/String;)V":code,([],1))
| otherwise ->
(unwords [".method",p,s, f ++ typesig ns] : code,([],0))
"alloc":t:x:_ -> (("; " ++ s):code, ((x,v):env, v + size t))
".limit":"locals":ns -> chCode (".limit locals " ++ show (length ns))
"invokestatic":t:f:ns
| take 8 f == "runtime/" ->
chCode $ "invokestatic " ++ "runtime/" ++ t ++ drop 8 f ++ typesig ns
"invokestatic":f:ns ->
chCode $ "invokestatic " ++ cls ++ "/" ++ f ++ typesig ns
"alloc":ns -> chCode $ "; " ++ s
t:('_':instr):[";"] -> chCode $ t ++ instr
t:('_':instr):x:_ -> chCode $ t ++ instr ++ " " ++ look x
"goto":ns -> chCode $ "goto " ++ label ns
"ifeq":ns -> chCode $ "ifeq " ++ label ns
"label":ns -> chCode $ label ns ++ ":"
";":[] -> chCode ""
_ -> chCode s
where
unindex = reverse . drop 1 . dropWhile (/= '_') . reverse
typesig = init . map toUpper . concat
address x = case (filter isDigit . reverse . takeWhile (/= '_') . reverse) x of
s@(_:_) -> show $ read s - (1 :: Int)
s -> s
label = init . concat
chCode c = (c:code,(env,v))
look x = maybe (error $ x ++ show env) show $ lookup x env
typesig = init . map toUpper . concat
label = init . concat
size t = case t of
"d" -> 2
_ -> 1
boilerplate :: String -> String
boilerplate cls = unlines [

View File

@@ -13,7 +13,7 @@ int abs (int x){
int main () {
int i ;
i = abs (16);
printf (int,i) ;
printf ("%d",i) ;
return ;
} ;

View File

@@ -14,7 +14,7 @@ int main () {
int n ;
n = 1 ;
{
while (n < 11) printf(int,fact(n)) ; n = n+1 ;
while (n < 11) printf("%d",fact(n)) ; n = n+1 ;
}
return ;
} ;

View File

@@ -6,10 +6,10 @@ int main () {
int lo ; int hi ;
lo = 1 ;
hi = lo ;
printf(int,lo) ;
printf("%d",lo) ;
{
while (hi < mx()) {
printf(int,hi) ;
printf("%d",hi) ;
hi = lo + hi ;
lo = hi - lo ;
}

View File

@@ -1,4 +1,4 @@
./TestImperC $1 | tail -1 >gft.tmp
echo "es -file=typecheck.gfs" | gf+ -s Imper.gfcm
runhugs CleanJVM jvm.tmp $1
rm *.tmp
#rm *.tmp

View File

@@ -3,6 +3,4 @@ open gft.tmp
'
c solve
'
c reindex
'
save ImperJVM jvm.tmp

View File

@@ -49,7 +49,7 @@ its definition consists of an abstract syntax of program
structures and two concrete syntaxes matching the abstract
syntax: one for C and one for JVM. From these grammar components,
the compiler is derived by using the GF (Grammatical Framework)
grammar tool: the front-end consists of parsing and semantic
grammar tool: the front end consists of parsing and semantic
checking in accordance to the C grammar, and the back end consists
of linearization in accordance to the JVM grammar. The tool provides
other functionalities as well, such as decompilation and interactive
@@ -112,9 +112,9 @@ An abstract syntax is similar to a \empha{theory}, or a
concrete syntax defines, in a declarative way,
a translation of abstract syntax trees (well-formed terms)
into concrete language structures, and from this definition, one can
derive both both linearization and parsing.
derive both linearization and parsing.
For example,
To give an example,
a (somewhat simplified) translator for addition expressions
consists of the abstract syntax rule
\begin{verbatim}
@@ -136,16 +136,16 @@ The C rule shows that the type information is suppressed,
and that the expression has precedence level 2 (which is a simplification,
since we will also treat associativity).
The JVM rule shows how addition is translated to stack machine
instructions, where the type of the postfixed addition instruction again has to
instructions, where the type of the postfixed addition instruction has to
be made explicit. Our compiler, like any GF translation system, will
consist of rules like these.
The number of languages related to one abstract syntax in
a translation system is of course not limited to two.
Sometimes just just one language is involved;
Sometimes just one language is involved;
GF then works much the same way as any grammar
formalism or parser generator.
The largest number of languages in an application known to us is 88, and
The largest number of languages in an application known to us is 88;
its domain is numeral expressions from 1 to 999,999 \cite{gf-homepage}.
From the GF point of view, the goal of the compiler experiment
@@ -182,27 +182,32 @@ compilation.
The problems that we encountered and their causes will be explained in
the relevant sections of this report. To summarize,
\bequ
The scoping conditions resulting from \HOAS are slightly different
The scoping conditions resulting from \HOAS\ are slightly different
from the standard ones of C.
Our JVM syntax is slightly different from the specification, and
hence needs some postprocessing.
Using \HOAS to encode all bindings is sometimes cumbersome.
Using \HOAS\ to encode all bindings is sometimes cumbersome.
\enqu
The first two shortcomings seem to be inevitable with the technique
we use. The best we can do with the JVM syntax is to use simple
postprocessing, on string level, to obtain valid JVM. The last
shortcoming is partly inherent to the problem of binding:
to spell out, in any formal notation,
what happens in nested binding structures \textit{is}
what happens in complex binding structures \textit{is}
complicated. But it also suggests ways in which GF could be
fine-tuned to give better support
tuned to give better support
to compiler construction, which, after all, is not an intended
use of GF as it is now.
\section{The abstract syntax}
An \empha{abstract syntax} in GF consists of \texttt{cat} judgements
@@ -372,7 +377,7 @@ forbids case analysis on the length of the lists.
On the top level, a program is a sequence of functions.
Each function may refer to functions defined earlier
in the program. The idea to express the binding of
function symbols with \HOAS is analogous to the binding
function symbols with \HOAS\ is analogous to the binding
of variables in statements, using a continuation.
As with variables, the principal way to build function symbols is as
bound variables (in addition, there can be some
@@ -406,7 +411,7 @@ expressions are used to give arguments to functions.
However, this would lead to the need of cumbersome
projection functions when using the parameters
in the function body. A more elegant solution is
to use \HOAS to build function bodies:
to use \HOAS\ to build function bodies:
\begin{verbatim}
RecOne : (A : Typ) ->
(Var A -> Stm) -> Program -> Rec (ConsTyp A NilTyp) ;
@@ -446,7 +451,6 @@ statements, to be able to print values of different types.
\section{The concrete syntax of C}
A concrete syntax, for a given abstract syntax,
@@ -625,6 +629,23 @@ are simple and concise:
\end{verbatim}
\subsection{Types}
Types are expressed in two different ways:
in declarations, we have \texttt{int} and \texttt{float}, but
as formatting arguments to \texttt{printf}, we have
\verb6"%d"6 and \verb6"%f"6, with the quotes belonging to the
names. The simplest solution in GF is to linearize types
to records with two string fields.
\begin{verbatim}
lincat
Typ, NumTyp = {s,s2 : Str} ;
lin
TInt = {s = "int" ; s2 = "\"%d\""} ;
TFloat = {s = "float" ; s2 = "\"%f\""} ;
\end{verbatim}
\subsection{Statements}
Statements in C have
@@ -639,6 +660,8 @@ the use of semicolons on a high level.
\end{verbatim}
As for declarations, which bind variables, we notice the
projection \verb6.$06 to refer to the bound variable.
Also notice the use of the \texttt{s2} field of the type
in \texttt{printf}.
\begin{verbatim}
lin
Decl typ cont = continues (typ.s ++ cont.$0) cont ;
@@ -646,14 +669,13 @@ projection \verb6.$06 to refer to the bound variable.
While exp loop = continue ("while" ++ paren exp.s ++ loop.s) ;
IfElse exp t f = continue ("if" ++ paren exp.s ++ t.s ++ "else" ++ f.s) ;
Block stm = continue ("{" ++ stm.s ++ "}") ;
Printf t e = continues ("printf" ++ paren (t.s ++ "," ++ e.s)) ;
Printf t e = continues ("printf" ++ paren (t.s2 ++ "," ++ e.s)) ;
Return _ exp = statement ("return" ++ exp.s) ;
Returnv = statement "return" ;
End = ss [] ;
\end{verbatim}
\subsection{Functions and programs}
The category \texttt{Rec} of recursive function bodies with continuations
@@ -689,26 +711,55 @@ components into a linear structure:
\end{verbatim}
%%\subsection{Lexing and unlexing}
\section{The concrete syntax of JVM}
JVM syntax is, linguistically, more straightforward than
the syntax of C, and could even be defined by a regular
expression. The translation from our abstract syntax to JVM,
however, is tricky because variables are replaced by
their addresses (relative to the frame pointer), and
code generation must therefore maintain a symbol table that permits
the lookup of variable addresses. As shown in the code
in Appendix C, we have not attempted to do this
in linearization, but instead
generated code with symbolic addresses.
The postprocessor has to resolve the symbolic addresses, which
we help by
generating \texttt{alloc} pseudoinstructions from declarations
(in final JVM, no \texttt{alloc} instructions appear).
expression. However, the JVM syntax that our compiler is
generating does not comprise full JVM, but only the fragment
that corresponds to well-formed C programs.
The following example shows how the three representations (C, pseudo-JVM, JVM) look like
for a piece of code.
The JVM syntax we use is from the Jasmin assembler
\cite{jasmin}, with some deviations which are corrected
by a postprocessor. The main deviations are
variable addresses, as described in Section~\ref{postproc}.
The other deviations have to do with spacing: the normal
unlexer of GF puts spaces between constituents, whereas
in JVM, type names are integral parts of instruction names.
We indicate gluing uniformly by generating an underscore
on the side from which the adjacent element is glued. Thus
e.g.\ \verb6i _load6 becomes \verb6iload6.
\subsection{Symbolic JVM}
\label{postproc}
What makes the translation from our abstract syntax to JVM
tricky is that variables must be replaced by
numeric addresses (relative to the frame pointer).
Code generation must therefore maintain a symbol table that permits
the lookup of variable addresses. As shown in the code
in Appendix C, we do not treat symbol tables
in linearization, but instead generated code in
\empha{Symbolic JVM}---that is, JVM with symbolic addresses.
Therefore we need a postprocessor that resolves the symbolic addresses,
shown in Appendix D.
To make the postprocessor straightforward,
Symbolic JVM has special \texttt{alloc} instructions,
which are not present in real JVM.
Our compiler generates \texttt{alloc} instructions from
variable declarations.
The postprocessor comments out the \texttt{alloc} instructions, but we
found it a good idea not to erase them completely, since they make the
code more readable.
The following example shows what the three representations (C, Symbolic JVM, JVM)
look like for a piece of code.
\begin{verbatim}
int x ; alloc i x ; x gets address 0
int y ; alloc i y ; y gets address 1
@@ -717,33 +768,30 @@ for a piece of code.
y = x ; i _load x iload 0
i _store y istore 1
\end{verbatim}
A related problem is the generation of fresh labels for
jumps. We solve this by maintaining a growing label suffix
\subsection{Labels and jumps}
A problem related to variable addresses
is the generation of fresh labels for
jumps. We solve this in linearization
by maintaining a growing label suffix
as a field of the linearization of statements into
instructions. The problem remains that the two branches
instructions. The problem remains that two branches
in an \texttt{if-else} statement can use the same
labels. Making them unique will have to be
labels. Making them unique must be
added to the post-processing pass. This is
always possible, because labels are nested in a
disciplined way, and jumps can never go to remote labels.
As it turned out laborious to thread the label counter
to expressions, we decided to compile comparison \verb6x < y6
expressions into function calls, which are provided
by a run-time library. This would no more work for the
conjunction \verb6x && y6
and disjunction \verb6x || y6, if we want to keep their semantics
to expressions, we decided to compile comparison
expressions (\verb6x < y6) into function calls, and provide the functions in
a run-time library. This would no longer work for the
conjunction (\verb6x && y6)
and disjunction (\verb6x || y6), if we want to keep their semantics
lazy, since function calls are strict in their arguments.
The JVM syntax used is from the Jasmin assembler
\cite{jasmin}, with small deviations which are corrected
by the postprocessor. The deviations other than
variable addresses have to do with spacing: the normal
unlexer of GF puts spaces between constituents, whereas
in JVM, type names are integral parts of instruction names.
We indicate gluing uniformly by generating an underscore
on the side from which the adjacent element is glued. Thus
e.g.\ \verb6i _load6 becomes \verb6iload6.
@@ -751,58 +799,79 @@ e.g.\ \verb6i _load6 becomes \verb6iload6.
\subsection{How to restore code generation by linearization}
Since postprocessing is needed, we have not quite achieved
the goal of code generation as linearization.
If linearization is understood in the
sense of GF. In GF, linearization rules must be
compositional, and can only depend on parameters from
finite parameter sets. Hence it is not possible to encode
linearization with updates to and lookups from a symbol table,
as is usual in code generation.
the goal of code generation as linearization---if
linearization is understood in the
sense of GF. In GF, linearization can only depend
on parameters from finite parameter sets. Since the size of
a symbol table can grow indefinitely, it is not
possible to encode linearization with updates to and
lookups from a symbol table, as is usual in code generation.
Compositionality also prevents optimizations during linearization
by clever instruction selection, elimination of superfluous
labels and jumps, etc.
One way to achieve compositional JVM linearization is to
alpha-convert abstract syntax trees
so that variables are indexed with integers
that indicate their depths in the tree.
This hack works in the present fragment of C
because all variables need same amount of
memory (one word), but would break down if we
added double-precision floats. Therefore
One attempt we made to achieve JVM linearization with
numeric addresses was to alpha-convert abstract syntax trees
so that variables get indexed with integers that indicate their
depths in the tree. This hack works in the present fragment of C
because all variables need the same amount of memory (one word),
but would break down if we added double-precision floats. Therefore
we have used the less pure (from the point of view of
code generation as linearization) method of
symbolic addresses.
It would certainly be possible to generate variable addresses
directly in the syntax trees
by using dependent types; but this would clutter the abstract
directly in the syntax trees by using dependent types; but this
would clutter the abstract
syntax in a way that is hard to motivate when we are in
the business of describing the syntax of C. The abstract syntax would
have to, so to say, anticipate all demands of the compiler's
target languages.
\subsection{Problems with JVM bytecode verifier}
\subsection{Problems with the JVM bytecode verifier}
An important restriction for linearization in GF is compositionality.
This prevents optimizations during linearization
by clever instruction selection, elimination of superfluous
labels and jumps, etc. One such optimization, the removal
of unreachable code (i.e.\ code after a \texttt{return} instruction)
is actually required by the JVM byte code verifier.
The solution is, again, to perform this optimization in postprocessing.
What we currently do, however, is to be careful and write
C programs so that they always end with a return statement.
Another problem related to \texttt{return} instructions is that
both C and JVM programs have a designated \texttt{main} function.
This function must have a certain type, which is different in C and
JVM. In C, \texttt{main} returns an integer encoding what runtime
errors may have happened during execution. The JVM
\texttt{main}, on the other hand, returns a \texttt{void}, i.e.\
no value at all. A \texttt{main} program returning an
integer therefore provokes a JVM bytecode verifier error.
The postprocessor could take care of this; but currently
we just write programs with void \texttt{return}s in the
\texttt{main} functions.
The parameter list of \texttt{main} is also different in C (empty list)
and JVM (a string array \texttt{args}). We handle this problem
with an \empha{ad hoc} postprocessor rule.
\section{Translation as linearization vs.\ translation by transfer}
\section{Translation as linearization vs.\ transfer}
The kind of problems we encountered in code generation by
Many of the problems we have encountered in code generation by
linearization are familiar from
translation systems for natural languages. For instance, to translate
the English pronoun \eex{you} to German, you have to choose
between \eex{du, ihr, Sie}; for Italian, there are four
variants, and so on. All semantic distinctions
made in any of the involved languages have to be present
in the common abstract syntax. The usual solution to
variants, and so on. To deal with this by linearization,
all semantic distinctions made in any of the involved languages
have to be present in the common abstract syntax. The usual solution to
this problem is not a universal abstract syntax, but
\empha{transfer}: translation does not just linearize
the same syntax trees to different languages, but defines
functions that translates
the trees of one language into the trees of another.
the same syntax trees to another language, but uses
a noncompositional function that translates
trees of one language into trees of another.
Using transfer in the compiler
Using transfer in the
back end is precisely what traditional compilers do.
The transfer function in our case would be a noncompositional
function from the abstract syntax of C to a different abstract
@@ -818,17 +887,114 @@ for evaluating terms into normal form. Thus one could write
\end{verbatim}
This would be cumbersome in practice, because
GF does not have programming-language facilities
like built-in lists and tuples, or monads. Of course,
like built-in lists and tuples, or monads. Moreover,
the compiler could no longer be inverted into a decompiler,
in the way true linearization can be inverted into a parser.
\section{Parser generation}
\label{bnfc}
The whole GF part of the compiler (parser, type checker, Symbolic JVM
generator) can be run in the GF interpreter.
The weakest point of the resulting compiler, by current
standards, is the parser. GF is a powerful grammar formalism, which
needs a very general parser, taking care of ambiguities and other
problems that are typical of natural languages but should be
overcome in programming languages by design. The parser is moreover run
in an interpreter that takes the grammar (in a suitably compiled form)
as an argument.
Fortunately, it is easy to replace the generic, interpreting GF parser
by a compiled LR(1) parser. GF supports the translation of a concrete
syntax into the \empha{Labelled BNF} (LBNF) format \cite{lbnf},
which in turn can be translated to parser generator code
(Happy, Bison, or JavaCUP), by the BNF Converter \cite{bnfc}.
The parser we are therefore using in the compiler is a Haskell
program generated by Happy \cite{happy}.
We regard parser generation
as a first step towards developing GF into a
production-quality compiler compiler. The efficiency of the parser
is not the only relevant thing. Another advantage of an LR(1)
parser generator is that it performs an analysis on the grammar
finding conflicts, and provides a debugger. It may be
difficult for a human to predict how a context-free grammar
performs at parsing; it is much more difficult to do this for
a grammar written in the abstract way that GF permits (cf.\ the
example in Appendix B).
\section{How to use the compiler}
\subsection{Another notation for \HOAS}
Describing variable bindings with \HOAS\ is sometimes considered
unintuitive. Let us consider the declaration rule of C (without
type dependencies for simplicity):
\begin{verbatim}
fun Decl : Typ -> (Var -> Stm) -> Stm ;
lin Decl typ stm = {s = typ.s ++ stm.$0 ++ ";" ++stm.s} ;
\end{verbatim}
Compare this with a corresponding LBNF rule (also using a continuation):
\begin{verbatim}
Decl. Stm ::= Typ Ident ";" Stm ;
\end{verbatim}
To explain bindings attached to this rule, one can say, in English,
that the identifier gets bound in the following statement.
This means that syntax trees formed by this rule do not have
the form \verb6(Decl typ x stm)6, but the form \verb6(Decl typ (\x -> stm))6.
One way to formalize the informal binding rules stated beside
BNF rules is to use \empha{profiles}: data structures describing
the way in which the logical arguments of the syntax tree are
represented by the linearized form. The declaration rule can be
written using a profile notation as follows:
\begin{verbatim}
Decl [1,(2)3]. Stm ::= Typ Ident ";" Stm ;
\end{verbatim}
When compiling GF grammars into LBNF, we were forced to enrich
LBNF by a (more general) profile notation
(cf.\ \cite{gf-jfp}, Section 3.3). This suggested at the same
time that profiles could provide a user-friendly notation for
\HOAS\ avoiding the explicit use of lambda calculus.
\section{Using the compiler}
Our compiler is invoked, of course, by the command \texttt{gfcc}.
It produces a JVM \texttt{.class} file, by running the
Jasmin bytecode assembler \cite{jasmin} on a Jasmin (\texttt{.j})
file:
\begin{verbatim}
% gfcc factorial.c
> > wrote file factorial.j
Generated: factorial.class
\end{verbatim}
The Jasmin code is produced by a postprocessor, written in Haskell
(Appendix E), from the Symbolic JVM format that is produced by
linearization. The reasons why actual Jasmin is not generated
by linearization are explained in Section~\ref{postproc} above.
In addition to the batch compiler, GF provides an interactive
syntax editor, in which C programs can be constructed by
stepwise refinements, local changes, etc. The user of the
editor can work simultaneously on all languages involved.
In our case, this means that changes can be done both to
the C code and to the JVM code, and they are automatically
carried over from one language to the other.
A screen dump of the editor is shown in Fig~\ref{demo}.
\begin{figure}
\centerline{\psfig{figure=demo2.ps}} \caption{
GF editor session where an integer
expression is expected to be given. The left window shows the
abstract syntax tree, and the right window the evolving C and
JVM code. The editor focus is shadowed, and the refinement alternatives
are shown in a pop-up window.
}
\label{demo}
\end{figure}
@@ -838,28 +1004,26 @@ The theoretical ideas behind our compiler experiment
are familiar from various sources.
Single-source language and compiler definitions
can be built using attribute grammars \cite{knuth-attr}.
Building single-source language definitions with
dependent types and higher-order abstract syntax
The use of
dependent types in combination with higher-order abstract syntax
has been studied in various logical frameworks
\cite{harper-honsell,magnusson-nordstr,twelf}.
The addition of linearization rules to
type-theoretical abstract syntax is studied in
\cite{semBNF}, which also compares the method with
attribute grammars.
The addition of linearization rules to type-theoretical
abstract syntax is studied in \cite{semBNF}, which also
compares the method with attribute grammars.
The idea of using a common abstract syntax for different
languages was clearly exposed by Landin \cite{landin}. The view of
code generation as linearization is a central aspect of
the classic compiler textbook by Aho, Sethi, and Ullman
\cite{aho-ullman}.
The use of the same grammar both for parsing and linearization
The use of one and the same grammar both for parsing and linearization
is a guiding principle of unification-based linguistic grammar
formalisms \cite{pereira-shieber}. Interactive editors derived from
grammars have been used in various programming and proof
grammars have been developed in various programming and proof
assistants \cite{teitelbaum,metal,magnusson-nordstr}.
Even though the different ideas are well-known, they are
applied less in practice than in theory. In particular,
Even though the different ideas are well-known,
we have not seen them used together to construct a complete
compiler. In our view, putting these ideas together is
an attractive approach to compiling, since a compiler written
@@ -875,28 +1039,29 @@ semantics that is actually used in the implementation.
\section{Conclusion}
We have managed to compile a representative
subset of C to JVM, and growing it
The \texttt{gfcc} compiler translates a representative
fragment of C to JVM, and growing the fragment
does not necessarily pose any new kinds of problems.
Using \HOAS and dependent types to describe the abstract
Using \HOAS\ and dependent types to describe the abstract
syntax of C works fine, and defining the concrete syntax
of C on top of this using GF linearization machinery is
already possible, even though more support could be
desired for things like literals and precedences.
possible. To build a parser that is more efficient than
GF's generic one, GF offers code generation for standard
parser tools.
The parser generated by GF is not able to parse all
source programs, because some cyclic parse
rules (of the form $C ::= C$) are generated from our grammar.
Recovery from cyclic rules is ongoing work in GF independently of this
experiment. For the time being, the interactive editor is the best way to
construct C programs using our grammar.
The most serious difficulty with JVM code generation by linearization
is to maintain a symbol table mapping variables to addresses.
The solution we have chosen is to generate Symbolic JVM, that is,
JVM with symbolic addresses, and translate the symbolic addresses to
(relative) memory locations by a postprocessor.
Since the postprocessor works uniformly for whole Symbolic JVM,
building a new compiler to generate JVM should now be
possible by just writing GF grammars. The most immediate
idea for developing GF as a compiler tool is to define
a similar symbolic format for an intermediate language,
using three-operand code and virtual registers.
The most serious difficulty with using GF as a compiler tool
is how to generate machine code by linearization if this depends on
a symbol table mapping variables to addresses.
Since the compositional linearization model of GF does not
support this, we needed postprocessing to get real JVM code
from the linearization result.
\bibliographystyle{plain}
@@ -925,10 +1090,8 @@ abstract Imper = PredefAbs ** {
fun
Empty : Program ;
Funct : (AS : ListTyp) -> (V : Typ) ->
(Fun AS V -> Rec AS) -> Program ;
FunctNil : (V : Typ) ->
Stm -> (Fun NilTyp V -> Program) -> Program ;
Funct : (AS : ListTyp) -> (V : Typ) -> (Fun AS V -> Rec AS) -> Program ;
FunctNil : (V : Typ) -> Stm -> (Fun NilTyp V -> Program) -> Program ;
RecOne : (A : Typ) -> (Var A -> Stm) -> Program -> Rec (ConsTyp A NilTyp) ;
RecCons : (A : Typ) -> (AS : ListTyp) ->
(Var A -> Rec AS) -> Program -> Rec (ConsTyp A AS) ;
@@ -973,17 +1136,14 @@ concrete ImperC of Imper = open ResImper in {
lincat
Exp = PrecExp ;
Typ, NumTyp = {s,s2 : Str} ;
Rec = {s,s2,s3 : Str} ;
lin
Empty = ss [] ;
FunctNil val stm cont = ss (
val.s ++ cont.$0 ++ paren [] ++ "{" ++
stm.s ++ "}" ++ ";" ++ cont.s) ;
val.s ++ cont.$0 ++ paren [] ++ "{" ++ stm.s ++ "}" ++ ";" ++ cont.s) ;
Funct args val rec = ss (
val.s ++ rec.$0 ++ paren rec.s2 ++ "{" ++
rec.s ++ "}" ++ ";" ++ rec.s3) ;
val.s ++ rec.$0 ++ paren rec.s2 ++ "{" ++ rec.s ++ "}" ++ ";" ++ rec.s3) ;
RecOne typ stm prg = stm ** {
s2 = typ.s ++ stm.$0 ;
s3 = prg.s
@@ -999,7 +1159,7 @@ concrete ImperC of Imper = open ResImper in {
While exp loop = continue ("while" ++ paren exp.s ++ loop.s) ;
IfElse exp t f = continue ("if" ++ paren exp.s ++ t.s ++ "else" ++ f.s) ;
Block stm = continue ("{" ++ stm.s ++ "}") ;
Printf t e = continues ("printf" ++ paren (t.s ++ "," ++ e.s)) ;
Printf t e = continues ("printf" ++ paren (t.s2 ++ "," ++ e.s)) ;
Return _ exp = statement ("return" ++ exp.s) ;
Returnv = statement "return" ;
End = ss [] ;
@@ -1011,17 +1171,13 @@ concrete ImperC of Imper = open ResImper in {
EAdd _ = infixL 2 "+" ;
ESub _ = infixL 2 "-" ;
ELt _ = infixN 1 "<" ;
EAppNil val f = constant (f.s ++ paren []) ;
EApp args val f exps = constant (f.s ++ paren exps.s) ;
TNum t = t ;
TInt = ss "int" ;
TFloat = ss "float" ;
NilTyp = ss [] ;
ConsTyp = cc2 ;
OneExp _ e = e ;
ConsExp _ _ e es = ss (e.s ++ "," ++ es.s) ;
TNum t = t ;
TInt = {s = "int" ; s2 = "\"%d\""} ; TFloat = {s = "float" ; s2 = "\"%f\""} ;
NilTyp = ss [] ; ConsTyp = cc2 ;
OneExp _ e = e ; ConsExp _ _ e es = ss (e.s ++ "," ++ es.s) ;
}
\end{verbatim}
\normalsize
@@ -1213,54 +1369,65 @@ resource ResImper = open Predef in {
\newpage
\subsection*{Appendix E: Translation to real JVM}
\subsection*{Appendix E: Translation of Symbolic JVM to Jasmin}
This program is written in Haskell. Most of the changes concern
spacing and could be done line by line; the really substantial
change is due to the need to build a symbol table of variables
stored relative to the frame pointer and look up variable
addresses at each load and store.
\small
\begin{verbatim}
module JVM where
module Main where
import Char
import System
mkJVM :: String -> String
mkJVM = unlines . reverse . fst . foldl trans ([],([],0)) . lines where
main :: IO ()
main = do
jvm:src:_ <- getArgs
s <- readFile jvm
let cls = takeWhile (/='.') src
let obj = cls ++ ".j"
writeFile obj $ boilerplate cls
appendFile obj $ mkJVM cls s
putStrLn $ "wrote file " ++ obj
mkJVM :: String -> String -> String
mkJVM cls = unlines . reverse . fst . foldl trans ([],([],0)) . lines where
trans (code,(env,v)) s = case words s of
".method":f:ns -> ((".method " ++ f ++ concat ns):code,([],0))
"alloc":t:x:_ -> (code, ((x,v):env, v + size t))
".limit":"locals":ns -> chCode (".limit locals " ++ show (length ns - 1))
t:"_load" :x:_ -> chCode (t ++ "load " ++ look x)
t:"_store":x:_ -> chCode (t ++ "store " ++ look x)
t:"_return":_ -> chCode (t ++ "return")
"goto":ns -> chCode ("goto " ++ concat ns)
"ifzero":ns -> chCode ("ifeq " ++ concat ns)
_ -> chCode s
".method":p:s:f:ns
| f == "main" -> (".method public static main([Ljava/lang/String;)V":code,([],1))
| otherwise -> (unwords [".method",p,s, f ++ typesig ns] : code,([],0))
"alloc":t:x:_ -> (("; " ++ s):code, ((x,v):env, v + size t))
".limit":"locals":ns -> chCode (".limit locals " ++ show (length ns))
"invokestatic":t:f:ns | take 8 f == "runtime/" ->
chCode $ "invokestatic " ++ "runtime/" ++ t ++ drop 8 f ++ typesig ns
"invokestatic":f:ns -> chCode $ "invokestatic " ++ cls ++ "/" ++ f ++ typesig ns
"alloc":ns -> chCode $ "; " ++ s
t:('_':instr):[";"] -> chCode $ t ++ instr
t:('_':instr):x:_ -> chCode $ t ++ instr ++ " " ++ look x
"goto":ns -> chCode $ "goto " ++ label ns
"ifeq":ns -> chCode $ "ifeq " ++ label ns
"label":ns -> chCode $ label ns ++ ":"
";":[] -> chCode ""
_ -> chCode s
where
chCode c = (c:code,(env,v))
look x = maybe (x ++ show env) show $ lookup x env
size t = case t of
look x = maybe (error $ x ++ show env) show $ lookup x env
typesig = init . map toUpper . concat
label = init . concat
size t = case t of
"d" -> 2
_ -> 1
boilerplate :: String -> String
boilerplate cls = unlines [
".class public " ++ cls, ".super java/lang/Object",
".method public <init>()V","aload_0",
"invokenonvirtual java/lang/Object/<init>()V","return",
".end method"]
\end{verbatim}
\normalsize
\newpage
\subsection*{Appendix F: A Syntax Editor screen dump}
%Show Fig~\ref{demo}
\begin{figure}
\centerline{\psfig{figure=demo2.ps}} \caption{
GF editor session where an integer
expression is to be given. The left window shows the
abstract syntax tree, and the right window the evolving C and
JVM code. The focus is shadowed, and the possible refinements
are shown in a pop-up window.
}
\label{demo}
\end{figure}
\end{document}