This commit is contained in:
2026-06-11 10:59:54 -06:00
commit 8650a71f67
159 changed files with 78653 additions and 0 deletions
+82
View File
@@ -0,0 +1,82 @@
#include "../all.h"
/* forward declaration; the struct is defined below */
typedef struct Amd64Op Amd64Op;
/* Machine register ids for amd64.
 *
 * The order is significant: the N* counts at the end of the enum
 * are computed as differences between enumerators, so every group
 * (caller-save, callee-save, sse) has to stay contiguous.
 */
enum Amd64Reg {
RAX = RXX+1, /* caller-save */
RCX, /* caller-save */
RDX, /* caller-save */
RSI, /* caller-save on sysv, callee-save on win */
RDI, /* caller-save on sysv, callee-save on win */
R8, /* caller-save */
R9, /* caller-save */
R10, /* caller-save */
R11, /* caller-save */
RBX, /* callee-save */
R12,
R13,
R14,
R15,
RBP, /* globally live */
RSP,
XMM0, /* sse */
XMM1,
XMM2,
XMM3,
XMM4,
XMM5,
XMM6,
XMM7,
XMM8,
XMM9,
XMM10,
XMM11,
XMM12,
XMM13,
XMM14,
XMM15,
/* XMM0..XMM14 only; XMM15 is kept out of the count because
 * the emitter uses it as a scratch register
 */
NFPR = XMM14 - XMM0 + 1, /* reserve XMM15 */
/* RAX..RSP */
NGPR = RSP - RAX + 1,
/* NOTE(review): presumably the number of caller-save sse
 * registers (all of them) -- confirm against the users
 */
NFPS = NFPR,
/* sysv: RAX..R11 and RBX..R15 */
NGPS_SYSV = R11 - RAX + 1,
NCLR_SYSV = R15 - RBX + 1,
/* win: same ranges, with RSI and RDI moved from the first
 * group to the second
 */
NGPS_WIN = R11 - RAX + 1 - 2, /* -2 for RSI/RDI */
NCLR_WIN = R15 - RBX + 1 + 2, /* +2 for RSI/RDI */
};
/* every machine register id must be smaller than Tmp0
 * (MAKESURE is presumably a compile-time assertion -- confirm)
 */
MAKESURE(reg_not_tmp, XMM15 < (int)Tmp0);
/* Per-opcode properties, looked up as amd64_op[op]. */
struct Amd64Op {
/* NOTE(review): not read in the code visible here; presumably
 * describes the memory operands the opcode accepts -- confirm
 */
char nmem;
/* non-zero: flagi() in isel.c returns this instruction as the
 * one whose result the flags reflect
 */
char zflag;
/* non-zero: flagi() steps over this instruction and keeps
 * scanning backwards
 */
char lflag;
};
/* targ.c */
/* opcode property table, indexed by opcode */
extern Amd64Op amd64_op[];
/* sysv.c (abi) */
/* register lists: emit.c walks the first six entries of rsave when
 * spilling variadic arguments and NCLR_SYSV entries of rclob when
 * saving and restoring registers around the function body
 */
extern int amd64_sysv_rsave[];
extern int amd64_sysv_rclob[];
bits amd64_sysv_retregs(Ref, int[2]);
bits amd64_sysv_argregs(Ref, int[2]);
void amd64_sysv_abi(Fn *);
/* winabi.c */
/* windows counterparts of the sysv declarations above */
extern int amd64_winabi_rsave[];
extern int amd64_winabi_rclob[];
bits amd64_winabi_retregs(Ref, int[2]);
bits amd64_winabi_argregs(Ref, int[2]);
void amd64_winabi_abi(Fn *);
/* isel.c */
void amd64_isel(Fn *);
/* emit.c */
/* write the assembly of one function to the given stream */
void amd64_sysv_emitfn(Fn *, FILE *);
void amd64_winabi_emitfn(Fn *, FILE *);
+881
View File
@@ -0,0 +1,881 @@
#include "all.h"
typedef struct E E;
/* Emission state shared by the helpers below while one function
 * is being written out.
 */
struct E {
/* output stream */
FILE *f;
/* function being emitted */
Fn *fn;
/* register used to address stack slots: RBP or RSP, chosen
 * in the emitfn functions
 */
int fp;
/* frame size in bytes, computed by the *_framesz functions */
uint64_t fsz;
/* number of registers pushed by the prologue */
int nclob;
};
/* X-macro listing the comparison codes with their x86 condition
 * suffixes: X(code, suffix, negated suffix).  It is expanded below
 * to build the set* entries of omap[], the cmov[] table and the
 * jump suffix tables in the emitfn functions.
 *
 * Cfle and Cflt have "?" placeholders; cmpswap() in isel.c asks for
 * the operands of these two comparisons to be swapped, which
 * presumably means they never reach the emitter -- confirm.
 * Cfeq and Cfne are not listed; omap[] has explicit Oflagfeq and
 * Oflagfne entries instead.
 */
#define CMP(X) \
X(Ciule, "be", "a") \
X(Ciult, "b", "ae") \
X(Cisle, "le", "g") \
X(Cislt, "l", "ge") \
X(Cisgt, "g", "le") \
X(Cisge, "ge", "l") \
X(Ciugt, "a", "be") \
X(Ciuge, "ae", "b") \
X(Cieq, "z", "nz") \
X(Cine, "nz", "z") \
X(NCmpI+Cfle, "?" , "?") \
X(NCmpI+Cflt, "?", "?") \
X(NCmpI+Cfgt, "a", "be") \
X(NCmpI+Cfge, "ae", "b") \
X(NCmpI+Cfo, "np", "p") \
X(NCmpI+Cfuo, "p", "np")
/* SLong..SByte select the operand width in emitf() and are the
 * column index into rname[] (see regtoa()).  Ki and Ka are
 * wildcard classes for the cls field of omap[] entries; the
 * matching is done in emitins().
 */
enum {
SLong = 0,
SWord = 1,
SShort = 2,
SByte = 3,
Ki = -1, /* matches Kw and Kl */
Ka = -2, /* matches all classes */
};
/* Instruction format strings:
*
* if the format string starts with -, the instruction
* is assumed to be 3-address and is put in 2-address
* mode using an extra mov if necessary
*
* if the format string starts with +, the same as the
* above applies, but commutativity is also assumed
*
* %k is used to set the class of the instruction,
* it'll expand to "l", "q", "ss", "sd", depending
* on the instruction class
* %0 designates the first argument
* %1 designates the second argument
* %= designates the result
*
* if %k is not used, a prefix to 0, 1, or = must be
* added, it can be:
* M - memory reference
* L - long (64 bits)
* W - word (32 bits)
* H - short (16 bits)
* B - byte (8 bits)
* S - single precision float
* D - double precision float
*/
/* Opcode to format string table, searched linearly by emitins().
 * The first entry whose op matches and whose cls accepts the
 * instruction class is used, so when an opcode has several entries
 * their order matters (e.g. the two Ocast lines).  The table is
 * terminated by the NOp entry.
 */
static struct {
short op;
short cls;
char *fmt;
} omap[] = {
{ Oadd, Ka, "+add%k %1, %=" },
{ Osub, Ka, "-sub%k %1, %=" },
{ Oand, Ki, "+and%k %1, %=" },
{ Oor, Ki, "+or%k %1, %=" },
{ Oxor, Ki, "+xor%k %1, %=" },
{ Osar, Ki, "-sar%k %B1, %=" },
{ Oshr, Ki, "-shr%k %B1, %=" },
{ Oshl, Ki, "-shl%k %B1, %=" },
{ Omul, Ki, "+imul%k %1, %=" },
{ Omul, Ks, "+mulss %1, %=" },
{ Omul, Kd, "+mulsd %1, %=" },
{ Odiv, Ka, "-div%k %1, %=" },
{ Ostorel, Ka, "movq %L0, %M1" },
{ Ostorew, Ka, "movl %W0, %M1" },
{ Ostoreh, Ka, "movw %H0, %M1" },
{ Ostoreb, Ka, "movb %B0, %M1" },
{ Ostores, Ka, "movss %S0, %M1" },
{ Ostored, Ka, "movsd %D0, %M1" },
{ Oload, Ka, "mov%k %M0, %=" },
{ Oloadsw, Kl, "movslq %M0, %L=" },
{ Oloadsw, Kw, "movl %M0, %W=" },
{ Oloaduw, Ki, "movl %M0, %W=" },
{ Oloadsh, Ki, "movsw%k %M0, %=" },
{ Oloaduh, Ki, "movzw%k %M0, %=" },
{ Oloadsb, Ki, "movsb%k %M0, %=" },
{ Oloadub, Ki, "movzb%k %M0, %=" },
{ Oextsw, Kl, "movslq %W0, %L=" },
{ Oextuw, Kl, "movl %W0, %W=" },
{ Oextsh, Ki, "movsw%k %H0, %=" },
{ Oextuh, Ki, "movzw%k %H0, %=" },
{ Oextsb, Ki, "movsb%k %B0, %=" },
{ Oextub, Ki, "movzb%k %B0, %=" },
{ Oexts, Kd, "cvtss2sd %0, %=" },
{ Otruncd, Ks, "cvtsd2ss %0, %=" },
{ Ostosi, Ki, "cvttss2si%k %0, %=" },
{ Odtosi, Ki, "cvttsd2si%k %0, %=" },
{ Oswtof, Ka, "cvtsi2%k %W0, %=" },
{ Osltof, Ka, "cvtsi2%k %L0, %=" },
{ Ocast, Ki, "movq %D0, %L=" },
{ Ocast, Ka, "movq %L0, %D=" },
{ Oaddr, Ki, "lea%k %M0, %=" },
{ Oswap, Ki, "xchg%k %0, %1" },
{ Osign, Kl, "cqto" },
{ Osign, Kw, "cltd" },
{ Oxdiv, Ki, "div%k %0" },
{ Oxidiv, Ki, "idiv%k %0" },
{ Oxcmp, Ks, "ucomiss %S0, %S1" },
{ Oxcmp, Kd, "ucomisd %D0, %D1" },
{ Oxcmp, Ki, "cmp%k %0, %1" },
{ Oxtest, Ki, "test%k %0, %1" },
/* one set<cc> entry per comparison listed in CMP; the byte set
 * by set<cc> is zero-extended into the full result register
 */
#define X(c, s, _) \
{ Oflag+c, Ki, "set" s " %B=\n\tmovzb%k %B=, %=" },
CMP(X)
#undef X
{ Oflagfeq, Ki, "setz %B=\n\tmovzb%k %B=, %=" },
{ Oflagfne, Ki, "setnz %B=\n\tmovzb%k %B=, %=" },
{ NOp, 0, 0 }
};
/* Conditional move formats, indexed by comparison code.  Entry 0
 * moves the first argument into the result when the condition
 * holds; entry 1 moves the second argument when the negated
 * condition holds.  emitins() picks one depending on which
 * argument already lives in the result.
 */
static char cmov[][2][16] = {
#define X(c, s0, s1) \
[c] = { \
"cmov" s0 " %0, %=", \
"cmov" s1 " %1, %=", \
},
CMP(X)
#undef X
};
/* Names of the general purpose registers, indexed by register id
 * and then by operand size (SLong, SWord, SShort, SByte).  The sse
 * registers are not in this table; regtoa() formats their names.
 */
static char *rname[][4] = {
[RAX] = {"rax", "eax", "ax", "al"},
[RBX] = {"rbx", "ebx", "bx", "bl"},
[RCX] = {"rcx", "ecx", "cx", "cl"},
[RDX] = {"rdx", "edx", "dx", "dl"},
[RSI] = {"rsi", "esi", "si", "sil"},
[RDI] = {"rdi", "edi", "di", "dil"},
[RBP] = {"rbp", "ebp", "bp", "bpl"},
[RSP] = {"rsp", "esp", "sp", "spl"},
[R8 ] = {"r8" , "r8d", "r8w", "r8b"},
[R9 ] = {"r9" , "r9d", "r9w", "r9b"},
[R10] = {"r10", "r10d", "r10w", "r10b"},
[R11] = {"r11", "r11d", "r11w", "r11b"},
[R12] = {"r12", "r12d", "r12w", "r12b"},
[R13] = {"r13", "r13d", "r13w", "r13b"},
[R14] = {"r14", "r14d", "r14w", "r14b"},
[R15] = {"r15", "r15d", "r15w", "r15b"},
};
/* Translate the slot reference r into a byte displacement from the
 * register e->fp.  Slot numbers are in units of 4 bytes.  Negative
 * slot numbers are placed at positive displacements; when the frame
 * is addressed through rsp, the frame size and the pushed registers
 * are added in.
 */
static int
slot(Ref r, E *e)
{
	int idx, rsprel, below;

	idx = rsval(r);
	assert(idx <= e->fn->slot);
	rsprel = e->fp == RSP;
	/* specific to NAlign == 3 */
	if (idx < 0) {
		if (!rsprel)
			return 4*-idx;
		return 4*-idx - 8 + e->fsz + e->nclob*8;
	}
	if (rsprel)
		return 4*idx + e->nclob*8;
	/* rbp-relative local: counted down from the frame base */
	below = -4 * (e->fn->slot - idx);
	if (e->fn->vararg && !T.windows)
		/* skip the 176 bytes used by the variadic prologue */
		return -176 + below;
	return below;
}
/* Print the constant con to e->f: either a plain integer, or a
 * symbol name followed by an optional signed offset.  Names that
 * start with a double quote do not get the T.assym prefix.
 * Thread-local symbols are printed as %fs-relative @tpoff
 * references.
 */
static void
emitcon(Con *con, E *e)
{
	char *name, *prefix;

	if (con->type == CAddr) {
		name = str(con->sym.id);
		prefix = T.assym;
		if (name[0] == '"')
			prefix = "";
		if (con->sym.type == SThr) {
			assert(!T.apple);
			fprintf(e->f, "%%fs:%s%s@tpoff", prefix, name);
		} else {
			assert((con->sym.type & ~SExt) == SGlo);
			fprintf(e->f, "%s%s", prefix, name);
		}
		if (con->bits.i)
			fprintf(e->f, "%+"PRId64, con->bits.i);
	} else if (con->type == CBits)
		fprintf(e->f, "%"PRId64, con->bits.i);
	else
		die("unreachable");
}
/* Return the assembly name of register reg (without the leading
 * '%').  General purpose registers are looked up in rname[] using
 * the size index sz; sse registers are formatted into a static
 * buffer that the next call for an sse register overwrites.
 */
static char *
regtoa(int reg, int sz)
{
	static char xmmname[6];

	assert(reg <= XMM15);
	if (reg < XMM0)
		return rname[reg][sz];
	sprintf(xmmname, "xmm%d", reg - XMM0);
	return xmmname;
}
/* Map an operand letter of a format string to the matching
 * reference of instruction i: '0' and '1' are the arguments,
 * '=' is the result.  Any other letter is a fatal error.
 */
static Ref
getarg(char c, Ins *i)
{
	if (c == '0')
		return i->arg[0];
	if (c == '1')
		return i->arg[1];
	if (c == '=')
		return i->to;
	die("invalid arg letter %c", c);
}
static void emitins(Ins, E *);

/* Emit a copy of r2 into r1 with class k by building an Ocopy
 * instruction and handing it to emitins().
 *
 * The instruction is built with a designated initializer so that
 * the members not named here (the second argument) are zeroed
 * instead of being passed along with indeterminate values.
 */
static void
emitcopy(Ref r1, Ref r2, int k, E *e)
{
	Ins icp;

	icp = (Ins){.op = Ocopy, .cls = k, .to = r1, .arg = {r2}};
	emitins(icp, e);
}
/* Print one line of assembly for instruction i to e->f, following
 * the format string s.  The format language is described in the
 * comment placed before omap[].
 *
 * The function may modify *i (argument swap for '+' formats) and
 * the Mem entries of the function (slot bases are rewritten, see
 * the Mem label).
 */
static void
emitf(char *s, Ins *i, E *e)
{
/* suffixes printed for %k, indexed by instruction class */
static char clstoa[][3] = {"l", "q", "ss", "sd"};
char c;
int sz;
Ref ref;
Mem *m;
Con off;
switch (*s) {
case '+':
/* commutative: when the result is also the second
 * argument, swap the arguments so that the result ends
 * up being the first one
 */
if (req(i->arg[1], i->to)) {
ref = i->arg[0];
i->arg[0] = i->arg[1];
i->arg[1] = ref;
}
/* fall through */
case '-':
/* 2-address conversion: copy the first argument into the
 * result, the rest of the format then works on the result;
 * this would clobber the second argument if it were the
 * result, hence the assertion
 */
assert((!req(i->arg[1], i->to) || req(i->arg[0], i->to)) &&
"cannot convert to 2-address");
emitcopy(i->to, i->arg[0], i->cls, e);
s++;
break;
}
fputc('\t', e->f);
Next:
/* copy literal text up to the next '%'; the end of the format
 * ends the line
 */
while ((c = *s++) != '%')
if (!c) {
fputc('\n', e->f);
return;
} else
fputc(c, e->f);
switch ((c = *s++)) {
case '%':
fputc('%', e->f);
break;
case 'k':
fputs(clstoa[i->cls], e->f);
break;
case '0':
case '1':
case '=':
/* no size prefix: the width follows the instruction class;
 * step back so that the operand letter is read again below
 */
sz = KWIDE(i->cls) ? SLong : SWord;
s--;
goto Ref;
case 'D':
case 'S':
sz = SLong; /* does not matter for floats */
Ref:
c = *s++;
ref = getarg(c, i);
switch (rtype(ref)) {
case RTmp:
assert(isreg(ref));
fprintf(e->f, "%%%s", regtoa(ref.val, sz));
break;
case RSlot:
fprintf(e->f, "%d(%%%s)",
slot(ref, e),
regtoa(e->fp, SLong)
);
break;
case RMem:
Mem:
m = &e->fn->mem[ref.val];
/* a slot used as base is folded into the offset and
 * replaced by the frame register; the Mem entry itself
 * is updated, so this happens once per entry
 */
if (rtype(m->base) == RSlot) {
off.type = CBits;
off.bits.i = slot(m->base, e);
addcon(&m->offset, &off, 1);
m->base = TMP(e->fp);
}
if (m->offset.type != CUndef)
emitcon(&m->offset, e);
fputc('(', e->f);
/* without a base register, a symbolic offset is
 * addressed relative to %rip
 */
if (!req(m->base, R))
fprintf(e->f, "%%%s",
regtoa(m->base.val, SLong)
);
else if (m->offset.type == CAddr)
fprintf(e->f, "%%rip");
if (!req(m->index, R))
fprintf(e->f, ", %%%s, %d",
regtoa(m->index.val, SLong),
m->scale
);
fputc(')', e->f);
break;
case RCon:
/* immediate operand */
fputc('$', e->f);
emitcon(&e->fn->con[ref.val], e);
break;
default:
die("unreachable");
}
break;
case 'L':
sz = SLong;
goto Ref;
case 'W':
sz = SWord;
goto Ref;
case 'H':
sz = SShort;
goto Ref;
case 'B':
sz = SByte;
goto Ref;
case 'M':
/* the operand is used as a memory location */
c = *s++;
ref = getarg(c, i);
switch (rtype(ref)) {
case RMem:
goto Mem;
case RSlot:
fprintf(e->f, "%d(%%%s)",
slot(ref, e),
regtoa(e->fp, SLong)
);
break;
case RCon:
/* symbols are addressed relative to %rip, except
 * thread-local ones
 */
off = e->fn->con[ref.val];
emitcon(&off, e);
if (off.type == CAddr)
if (off.sym.type != SThr)
fprintf(e->f, "(%%rip)");
break;
case RTmp:
/* the register holds the address */
assert(isreg(ref));
fprintf(e->f, "(%%%s)", regtoa(ref.val, SLong));
break;
default:
die("unreachable");
}
break;
default:
die("invalid format specifier %%%c", c);
}
goto Next;
}
/* Sign bit masks for the two floating point classes; emitins()
 * xors one of them into a register to negate a float.
 */
static bits negmask[4] = {
[Ks] = 0x80000000,
[Kd] = 0x8000000000000000,
};
/* Emit the assembly for one instruction.  Most opcodes are simply
 * looked up in omap[] and printed with emitf(); the cases below
 * handle the instructions that need more than a format string.
 * The instruction is received by value, so the rewrites done here
 * stay local to this call.
 */
static void
emitins(Ins i, E *e)
{
Ref r;
int64_t val;
int o, t0;
Ins ineg;
Con *con;
char *sym;
switch (i.op) {
default:
if (isxsel(i.op))
goto case_Oxsel;
Table:
/* most instructions are just pulled out of
 * the table omap[], some special cases are
 * detailed below */
for (o=0;; o++) {
/* this linear search should really be a binary
 * search */
if (omap[o].op == NOp)
die("no match for %s(%c)",
optab[i.op].name, "wlsd"[i.cls]);
if (omap[o].op == i.op)
if (omap[o].cls == i.cls
|| (omap[o].cls == Ki && KBASE(i.cls) == 0)
|| (omap[o].cls == Ka))
break;
}
emitf(omap[o].fmt, &i, e);
break;
case Onop:
/* just do nothing for nops, they are inserted
 * by some passes */
break;
case Omul:
/* here, we try to use the 3-address form
 * of multiplication when possible */
if (rtype(i.arg[1]) == RCon) {
r = i.arg[0];
i.arg[0] = i.arg[1];
i.arg[1] = r;
}
if (KBASE(i.cls) == 0 /* only available for ints */
&& rtype(i.arg[0]) == RCon
&& rtype(i.arg[1]) == RTmp) {
emitf("imul%k %0, %1, %=", &i, e);
break;
}
goto Table;
case Osub:
/* we have to use the negation trick to handle
 * some 3-address subtractions */
if (req(i.to, i.arg[1]) && !req(i.arg[0], i.to)) {
/* to = arg0 - to is computed as to = -to; to += arg0 */
ineg = (Ins){Oneg, i.cls, i.to, {i.to}};
emitins(ineg, e);
emitf("add%k %0, %=", &i, e);
break;
}
goto Table;
case Oneg:
if (!req(i.to, i.arg[0]))
emitf("mov%k %0, %=", &i, e);
if (KBASE(i.cls) == 0)
emitf("neg%k %=", &i, e);
else
/* floats: flip the sign bit with a mask stored
 * through stashbits()
 */
fprintf(e->f,
"\txorp%c %sfp%d(%%rip), %%%s\n",
"xxsd"[i.cls],
T.asloc,
stashbits(negmask[i.cls], 16),
regtoa(i.to.val, SLong)
);
break;
case Odiv:
/* use xmm15 to adjust the instruction when the
 * conversion to 2-address in emitf() would fail */
if (req(i.to, i.arg[1])) {
i.arg[1] = TMP(XMM0+15);
emitf("mov%k %=, %1", &i, e);
emitf("mov%k %0, %=", &i, e);
i.arg[0] = i.to;
}
goto Table;
case Ocopy:
/* copies are used for many things; see my note
 * to understand how to load big constants:
 * https://c9x.me/notes/2015-09-19.html */
assert(rtype(i.to) != RMem);
/* copies from or to R, and self-copies, emit nothing */
if (req(i.to, R) || req(i.arg[0], R))
break;
if (req(i.to, i.arg[0]))
break;
t0 = rtype(i.arg[0]);
if (i.cls == Kl
&& t0 == RCon
&& e->fn->con[i.arg[0].val].type == CBits) {
val = e->fn->con[i.arg[0].val].bits.i;
/* a value that fits in 32 unsigned bits is loaded
 * with a movl into the 32-bit register
 */
if (isreg(i.to))
if (val >= 0 && val <= UINT32_MAX) {
emitf("movl %W0, %W=", &i, e);
break;
}
/* a constant outside the signed 32-bit range is
 * stored to a slot as two 32-bit halves
 */
if (rtype(i.to) == RSlot)
if (val < INT32_MIN || val > INT32_MAX) {
emitf("movl %0, %=", &i, e);
emitf("movl %0>>32, 4+%=", &i, e);
break;
}
}
if (isreg(i.to)
&& t0 == RCon
&& e->fn->con[i.arg[0].val].type == CAddr) {
emitf("lea%k %M0, %=", &i, e);
break;
}
/* memory to memory: go through xmm15, using the float
 * class of the same width
 */
if (rtype(i.to) == RSlot
&& (t0 == RSlot || t0 == RMem)) {
i.cls = KWIDE(i.cls) ? Kd : Ks;
i.arg[1] = TMP(XMM0+15);
emitf("mov%k %0, %1", &i, e);
emitf("mov%k %1, %=", &i, e);
break;
}
/* conveniently, the assembler knows if it
 * should use movabsq when reading movq */
emitf("mov%k %0, %=", &i, e);
break;
case Oaddr:
if (rtype(i.arg[0]) != RCon)
goto Table;
con = &e->fn->con[i.arg[0].val];
assert(isreg(i.to) && con->type == CAddr);
sym = str(con->sym.id);
if (T.apple && (con->sym.type & SThr)) {
fprintf(e->f,
"\tmovq %s%s@tlvp(%%rip), %%%s\n",
sym[0] == '"' ? "" : T.assym, sym,
regtoa(i.to.val, SLong));
break;
}
if (T.windows && con->sym.type != SGlo)
die("extern/thread unsupported on amd64_win");
switch (con->sym.type) {
case SThr:
/* derive the symbol address from the TCB
 * address at offset 0 of %fs */
emitf("movq %%fs:0, %L=", &i, e);
fprintf(e->f, "\tleaq %s%s@tpoff",
sym[0] == '"' ? "" : T.assym, sym);
if (con->bits.i)
fprintf(e->f, "%+"PRId64,
con->bits.i);
fprintf(e->f, "(%%%s), %%%s\n",
regtoa(i.to.val, SLong),
regtoa(i.to.val, SLong));
break;
case SExtThr:
/* initial-exec TLS: load offset from
 * GOT, add to thread-base register */
assert(!con->bits.i);
emitf("movq %%fs:0, %L=", &i, e);
fprintf(e->f,
"\taddq %s%s@gottpoff(%%rip), %%%s\n",
sym[0] == '"' ? "" : T.assym, sym,
regtoa(i.to.val, SLong));
break;
case SExt:
/* load address from the GOT */
assert(!con->bits.i);
fprintf(e->f,
"\tmovq %s%s@gotpcrel(%%rip), %%%s\n",
sym[0] == '"' ? "" : T.assym, sym,
regtoa(i.to.val, SLong));
break;
default:
goto Table;
}
break;
case Ocall:
/* calls simply have a weird syntax in AT&T
 * assembly... */
switch (rtype(i.arg[0])) {
case RCon:
con = &e->fn->con[i.arg[0].val];
fprintf(e->f, "\tcallq ");
emitcon(con, e);
/* symbols with the SExt bit are called through
 * the PLT, except on apple targets
 */
if (con->type == CAddr
&& (con->sym.type & SExt)
&& !T.apple)
fprintf(e->f, "@plt");
fprintf(e->f, "\n");
break;
case RTmp:
emitf("callq *%L0", &i, e);
break;
default:
die("invalid call argument");
}
break;
case Osalloc:
/* there is no good reason why this is here
 * maybe we should split Osalloc in 2 different
 * instructions depending on the result
 */
assert(e->fp == RBP);
emitf("subq %L0, %%rsp", &i, e);
if (!req(i.to, R))
emitcopy(i.to, TMP(RSP), Kl, e);
break;
case Oswap:
if (KBASE(i.cls) == 0)
goto Table;
/* for floats, there is no swap instruction
 * so we use xmm15 as a temporary
 */
emitcopy(TMP(XMM0+15), i.arg[0], i.cls, e);
emitcopy(i.arg[0], i.arg[1], i.cls, e);
emitcopy(i.arg[1], TMP(XMM0+15), i.cls, e);
break;
case Odbgloc:
emitdbgloc(i.arg[0].val, i.arg[1].val, e->f);
break;
case_Oxsel:
/* conditional select: when the result already holds the
 * second argument, conditionally move the first one in;
 * otherwise put the first argument in the result and
 * conditionally move the second one in under the negated
 * condition
 */
if (req(i.to, i.arg[1]))
emitf(cmov[i.op-Oxsel][0], &i, e);
else {
if (!req(i.to, i.arg[0]))
emitf("mov %0, %=", &i, e);
emitf(cmov[i.op-Oxsel][1], &i, e);
}
break;
}
}
/* Compute the frame size of the function in bytes and store it
 * in e->fsz.  Slot counts are in units of 4 bytes and are rounded
 * up to a multiple of 4.  For non-leaf functions, 8 bytes are
 * added when the number of NCLR_SYSV registers set in fn->reg is
 * odd.  Variadic functions get 176 more bytes.
 */
static void
sysv_framesz(E *e)
{
	uint64_t n, odd, nslot;

	/* specific to NAlign == 3 */
	odd = 0;
	if (!e->fn->leaf) {
		/* parity of the registers that will be pushed */
		n = 0;
		while (n < NCLR_SYSV) {
			odd ^= e->fn->reg >> amd64_sysv_rclob[n];
			n++;
		}
		odd &= 1;
	}
	nslot = e->fn->slot;
	nslot = (nslot + 3) & -4;
	if (nslot > 0 && e->fp == RSP && e->fn->salign == 4)
		nslot += 2;
	e->fsz = 4*nslot + 8*odd + 176*e->fn->vararg;
}
/* Write the assembly of function fn to f (sysv variant): linkage
 * directives, prologue, the instructions and jumps of every block
 * in link order, and the epilogue at each return.
 */
void
amd64_sysv_emitfn(Fn *fn, FILE *f)
{
/* jump suffixes by comparison code: [0] is the condition,
 * [1] its negation
 */
static char *ctoa[][2] = {
#define X(c, s, n) [c] = {s, n},
CMP(X)
#undef X
};
/* label numbering base; advanced by fn->nblk after each
 * function so that labels of different functions differ
 */
static int id0;
Blk *b, *s;
Ins *i, itmp;
int *r, c, o, n, lbl;
uint p;
E *e;
e = &(E){.f = f, .fn = fn};
emitfnlnk(fn->name, &fn->lnk, f);
fputs("\tendbr64\n", f);
/* leaf functions without variadic arguments or dynamic stack
 * allocation address their frame through rsp
 */
if (!fn->leaf || fn->vararg || fn->dynalloc) {
e->fp = RBP;
fputs("\tpushq %rbp\n\tmovq %rsp, %rbp\n", f);
} else
e->fp = RSP;
sysv_framesz(e);
if (e->fsz)
fprintf(f, "\tsubq $%"PRIu64", %%rsp\n", e->fsz);
if (fn->vararg) {
/* store the first six registers of amd64_sysv_rsave
 * (8 bytes each) and xmm0-xmm7 (16 bytes each) in the
 * 176 bytes below rbp
 */
o = -176;
for (r=amd64_sysv_rsave; r<&amd64_sysv_rsave[6]; r++, o+=8)
fprintf(f, "\tmovq %%%s, %d(%%rbp)\n", rname[*r][0], o);
for (n=0; n<8; ++n, o+=16)
fprintf(f, "\tmovaps %%xmm%d, %d(%%rbp)\n", n, o);
}
/* push the registers of amd64_sysv_rclob that are set in
 * fn->reg; only arg[0] of itmp is read by this format
 */
for (r=amd64_sysv_rclob; r<&amd64_sysv_rclob[NCLR_SYSV]; r++)
if (fn->reg & BIT(*r)) {
itmp.arg[0] = TMP(*r);
emitf("pushq %L0", &itmp, e);
e->nclob++;
}
/* lbl is cleared when the previous block falls through into
 * this one; in that case a label is only needed if the block
 * has several predecessors
 */
for (lbl=0, b=fn->start; b; b=b->link) {
if (lbl || b->npred > 1) {
/* align blocks that have a predecessor with an id
 * greater than or equal to their own
 */
for (p=0; p<b->npred; p++)
if (b->pred[p]->id >= b->id)
break;
if (p != b->npred)
fprintf(f, ".p2align 4\n");
fprintf(f, "%sbb%d:\n", T.asloc, id0+b->id);
}
for (i=b->ins; i!=&b->ins[b->nins]; i++)
emitins(*i, e);
lbl = 1;
switch (b->jmp.type) {
case Jhlt:
fprintf(f, "\tud2\n");
break;
case Jret0:
/* with dynamic allocations rsp is unknown here, so
 * it is recomputed from rbp before the pops
 */
if (fn->dynalloc)
fprintf(f,
"\tmovq %%rbp, %%rsp\n"
"\tsubq $%"PRIu64", %%rsp\n",
e->fsz + e->nclob * 8);
/* pop in the reverse order of the pushes */
for (r=&amd64_sysv_rclob[NCLR_SYSV]; r>amd64_sysv_rclob;)
if (fn->reg & BIT(*--r)) {
itmp.arg[0] = TMP(*r);
emitf("popq %L0", &itmp, e);
}
if (e->fp == RBP)
fputs("\tleave\n", f);
else if (e->fsz)
fprintf(f,
"\taddq $%"PRIu64", %%rsp\n",
e->fsz);
fputs("\tret\n", f);
break;
case Jjmp:
Jmp:
/* no jump is needed to reach the next block */
if (b->s1 != b->link)
fprintf(f, "\tjmp %sbb%d\n",
T.asloc, id0+b->s1->id);
else
lbl = 0;
break;
default:
c = b->jmp.type - Jjf;
if (0 <= c && c <= NCmp) {
/* when the second successor is the next block,
 * swap the successors and jump on the condition;
 * otherwise jump to the second successor on the
 * negated condition.  Either way the code at Jmp
 * then deals with the first successor.
 */
if (b->link == b->s2) {
s = b->s1;
b->s1 = b->s2;
b->s2 = s;
n = 0;
} else
n = 1;
fprintf(f, "\tj%s %sbb%d\n", ctoa[c][n],
T.asloc, id0+b->s2->id);
goto Jmp;
}
die("unhandled jump %d", b->jmp.type);
}
}
id0 += fn->nblk;
if (!T.apple)
elf_emitfnfin(fn->name, f);
}
/* Compute the frame size of the function in bytes and store it
 * in e->fsz.  Slot counts are in units of 4 bytes and are rounded
 * up to a multiple of 4.  For non-leaf functions, 8 bytes are
 * added when the number of NCLR_WIN registers set in fn->reg is
 * odd.
 */
static void
winabi_framesz(E *e)
{
	uint64_t n, odd, nslot;

	/* specific to NAlign == 3 */
	odd = 0;
	if (!e->fn->leaf) {
		/* parity of the registers that will be pushed */
		n = 0;
		while (n < NCLR_WIN) {
			odd ^= e->fn->reg >> amd64_winabi_rclob[n];
			n++;
		}
		odd &= 1;
	}
	nslot = e->fn->slot;
	nslot = (nslot + 3) & -4;
	if (nslot > 0 && e->fp == RSP && e->fn->salign == 4)
		nslot += 2;
	e->fsz = 4*nslot + 8*odd;
}
/* Write the assembly of function fn to f (windows variant):
 * linkage directives, prologue, the instructions and jumps of
 * every block in link order, and the epilogue at each return.
 */
void
amd64_winabi_emitfn(Fn *fn, FILE *f)
{
/* jump suffixes by comparison code: [0] is the condition,
 * [1] its negation
 */
static char *ctoa[][2] = {
#define X(c, s, n) [c] = {s, n},
CMP(X)
#undef X
};
/* label numbering base; advanced by fn->nblk after each
 * function so that labels of different functions differ
 */
static int id0;
Blk *b, *s;
Ins *i, itmp;
int *r, c, n, lbl;
E *e;
e = &(E){.f = f, .fn = fn};
emitfnlnk(fn->name, &fn->lnk, f);
fputs("\tendbr64\n", f);
if (fn->vararg) {
/* store the four register arguments in the 32 bytes
 * above the return address, before rbp is pushed
 */
fprintf(f, "\tmovq %%rcx, 0x8(%%rsp)\n");
fprintf(f, "\tmovq %%rdx, 0x10(%%rsp)\n");
fprintf(f, "\tmovq %%r8, 0x18(%%rsp)\n");
fprintf(f, "\tmovq %%r9, 0x20(%%rsp)\n");
}
/* leaf functions without variadic arguments or dynamic stack
 * allocation address their frame through rsp
 */
if (!fn->leaf || fn->vararg || fn->dynalloc) {
e->fp = RBP;
fputs("\tpushq %rbp\n\tmovq %rsp, %rbp\n", f);
} else
e->fp = RSP;
winabi_framesz(e);
if (e->fsz)
fprintf(f, "\tsubq $%"PRIu64", %%rsp\n", e->fsz);
/* push the registers of amd64_winabi_rclob that are set in
 * fn->reg; only arg[0] of itmp is read by this format
 */
for (r=amd64_winabi_rclob; r<&amd64_winabi_rclob[NCLR_WIN]; r++)
if (fn->reg & BIT(*r)) {
itmp.arg[0] = TMP(*r);
emitf("pushq %L0", &itmp, e);
e->nclob++;
}
/* lbl is cleared when the previous block falls through into
 * this one; in that case a label is only needed if the block
 * has several predecessors
 */
for (lbl=0, b=fn->start; b; b=b->link) {
if (lbl || b->npred > 1)
fprintf(f, "%sbb%d:\n", T.asloc, id0+b->id);
for (i=b->ins; i!=&b->ins[b->nins]; i++)
emitins(*i, e);
lbl = 1;
switch (b->jmp.type) {
case Jhlt:
fprintf(f, "\tud2\n");
break;
case Jret0:
/* with dynamic allocations rsp is unknown here, so
 * it is recomputed from rbp before the pops
 */
if (fn->dynalloc)
fprintf(f,
"\tmovq %%rbp, %%rsp\n"
"\tsubq $%"PRIu64", %%rsp\n",
e->fsz + e->nclob * 8);
/* pop in the reverse order of the pushes */
for (r=&amd64_winabi_rclob[NCLR_WIN]; r>amd64_winabi_rclob;)
if (fn->reg & BIT(*--r)) {
itmp.arg[0] = TMP(*r);
emitf("popq %L0", &itmp, e);
}
if (e->fp == RBP)
fputs("\tleave\n", f);
else if (e->fsz)
fprintf(f,
"\taddq $%"PRIu64", %%rsp\n",
e->fsz);
fputs("\tret\n", f);
break;
case Jjmp:
Jmp:
/* no jump is needed to reach the next block */
if (b->s1 != b->link)
fprintf(f, "\tjmp %sbb%d\n",
T.asloc, id0+b->s1->id);
else
lbl = 0;
break;
default:
c = b->jmp.type - Jjf;
if (0 <= c && c <= NCmp) {
/* when the second successor is the next block,
 * swap the successors and jump on the condition;
 * otherwise jump to the second successor on the
 * negated condition.  Either way the code at Jmp
 * then deals with the first successor.
 *
 * NOTE(review): floating point comparisons
 * (c >= NCmpI) always take the swapped form here,
 * so their negated suffix is never used; the sysv
 * emitter does not have this condition -- confirm
 * the difference is intended.
 */
if (b->link == b->s2 || c >= NCmpI) {
s = b->s1;
b->s1 = b->s2;
b->s2 = s;
n = 0;
} else
n = 1;
fprintf(f, "\tj%s %sbb%d\n", ctoa[c][n],
T.asloc, id0+b->s2->id);
goto Jmp;
}
die("unhandled jump %d", b->jmp.type);
}
}
id0 += fn->nblk;
}
+944
View File
@@ -0,0 +1,944 @@
#include "all.h"
#include <limits.h>
/* For x86_64, do the following:
*
* - check that constants are used only in
* places allowed
* - ensure immediates always fit in 32b
* - expose machine register constraints
* on instructions like division.
* - implement fast locals (the streak of
* constant allocX in the first basic block)
* - recognize complex addressing modes
*
* Invariant: the use counts that are used
* in sel() must be sound. This
* is not so trivial, maybe the
* dce should be moved out...
*/
static int amatch(Addr *, Num *, Ref, Fn *);

/* Return 1 when r is an integer constant that does not fit in a
 * signed 32-bit immediate, 0 otherwise.  Address constants always
 * return 0.
 */
static int
noimm(Ref r, Fn *fn)
{
	Con *c;

	if (rtype(r) != RCon)
		return 0;
	c = &fn->con[r.val];
	if (c->type == CAddr)
		/* we only support the 'small'
		 * code model of the ABI, this
		 * means that we can always
		 * address data with 32bits
		 */
		return 0;
	if (c->type == CBits)
		return !(INT32_MIN <= c->bits.i && c->bits.i <= INT32_MAX);
	die("invalid constant");
}
/* Return the slot field of the temporary r, or -1 when r is not
 * a temporary.
 */
static int
rslot(Ref r, Fn *fn)
{
	return rtype(r) == RTmp ? fn->tmp[r.val].slot : -1;
}
/* When r carries a constant, store its address in *pc and return
 * 1: the constant itself for RCon, the offset for RMem.  Return 0
 * and leave *pc untouched for any other kind of reference.
 */
static int
hascon(Ref r, Con **pc, Fn *fn)
{
	int t;

	t = rtype(r);
	if (t == RCon)
		*pc = &fn->con[r.val];
	else if (t == RMem)
		*pc = &fn->mem[r.val].offset;
	else
		return 0;
	return 1;
}
/* Make the argument *r of instruction i legal for the machine,
 * emitting helper instructions when needed and storing the new
 * reference back in *r.  k is the class of the argument.  i may
 * be null, in which case the argument is treated like the one of
 * a copy.
 *
 * Exactly one of the cases below applies, the first one that
 * matches.  NOTE(review): within a case, the emit() calls are
 * written in the reverse of the intended execution order, which
 * suggests emit() adds instructions before the previous ones --
 * confirm against its definition.
 */
static void
fixarg(Ref *r, int k, Ins *i, Fn *fn)
{
char buf[32];
Addr a, *m;
Con cc, *c;
Ref r0, r1, r2, r3;
int s, n, op;
r1 = r0 = *r;
s = rslot(r0, fn);
op = i ? i->op : Ocopy;
if (KBASE(k) == 1 && rtype(r0) == RCon) {
/* load floating points from memory
 * slots, they can't be used as
 * immediates
 */
r1 = MEM(fn->nmem);
vgrow(&fn->mem, ++fn->nmem);
memset(&a, 0, sizeof a);
a.offset.type = CAddr;
n = stashbits(fn->con[r0.val].bits.i, KWIDE(k) ? 8 : 4);
/* quote the name so that we do not
 * add symbol prefixes on the apple
 * target variant
 */
sprintf(buf, "\"%sfp%d\"", T.asloc, n);
a.offset.sym.id = intern(buf);
fn->mem[fn->nmem-1] = a;
}
else if (op == Ocall && r == &i->arg[0]
&& rtype(r0) == RCon && fn->con[r0.val].type != CAddr) {
/* use a temporary register so that we
 * produce an indirect call
 */
r1 = newtmp("isel", Kl, fn);
emit(Ocopy, Kl, r1, r0, R);
}
else if (op != Ocopy && k == Kl && noimm(r0, fn)) {
/* load constants that do not fit in
 * a 32bit signed integer into a
 * long temporary
 */
r1 = newtmp("isel", Kl, fn);
emit(Ocopy, Kl, r1, r0, R);
}
else if (s != -1) {
/* load fast locals' addresses into
 * temporaries right before the
 * instruction
 */
r1 = newtmp("isel", Kl, fn);
emit(Oaddr, Kl, r1, SLOT(s), R);
}
else if (op != Ocall && hascon(r0, &c, fn)
&& c->type == CAddr && ((c->sym.type & SExt)
|| (T.apple && c->sym.type == SThr))) {
/* symbols with the SExt bit, and thread-local symbols
 * on apple targets, have their address computed into
 * the temporary r1 first; a non-zero offset is added
 * with a separate instruction
 */
r1 = newtmp("isel", Kl, fn);
if (c->bits.i) {
r2 = newtmp("isel", Kl, fn);
cc = (Con){.type = CBits};
cc.bits.i = c->bits.i;
r3 = newcon(&cc, fn);
emit(Oadd, Kl, r1, r2, r3);
} else
r2 = r1;
if (T.apple && (c->sym.type & SThr)) {
/* the value produced by Oaddr is passed in RDI
 * to a function loaded from the location it
 * points to; the result is taken from RAX
 */
emit(Ocopy, Kl, r2, TMP(RAX), R);
r2 = newtmp("isel", Kl, fn);
r3 = newtmp("isel", Kl, fn);
emit(Ocall, 0, R, r3, CALL(17));
emit(Ocopy, Kl, TMP(RDI), r2, R);
emit(Oload, Kl, r3, r2, R);
}
cc = *c;
cc.bits.i = 0;
r3 = newcon(&cc, fn);
emit(Oaddr, Kl, r2, r3, R);
if (rtype(r0) == RMem) {
/* keep the memory reference, with the computed
 * address as base in place of the symbolic
 * offset
 */
m = &fn->mem[r0.val];
m->offset.type = CUndef;
m->base = r1;
r1 = r0;
}
}
else if (!(isstore(op) && r == &i->arg[1])
&& !isload(op) && op != Ocall && rtype(r0) == RCon
&& fn->con[r0.val].type == CAddr) {
/* turn address operands into
 * lea/mov instructions
 */
r1 = newtmp("isel", Kl, fn);
emit(Oaddr, Kl, r1, r0, R);
}
else if (rtype(r0) == RMem) {
/* eliminate memory operands of
 * the form $foo(%rip, ...)
 */
m = &fn->mem[r0.val];
if (req(m->base, R))
if (m->offset.type == CAddr) {
r0 = newtmp("isel", Kl, fn);
emit(Oaddr, Kl, r0, newcon(&m->offset, fn), R);
m->offset.type = CUndef;
m->base = r0;
}
}
else if (isxsel(op) && rtype(*r) == RCon) {
/* constant arguments of conditional selects are
 * copied to a temporary
 */
r1 = newtmp("isel", i->cls, fn);
emit(Ocopy, i->cls, r1, *r, R);
}
*r = r1;
}
/* Try to replace the temporary *r, used as an address, with a
 * memory reference built by amatch().  On success a new entry is
 * appended to fn->mem, the use counts of the temporaries involved
 * are updated, and *r is set to the new RMem reference.  *r is
 * left alone when it is not a temporary, when amatch() fails, or
 * when the matched address cannot be expressed.
 */
static void
seladdr(Ref *r, Num *tn, Fn *fn)
{
	Addr am;
	Ref old;
	int n;

	old = *r;
	if (rtype(old) != RTmp)
		return;
	memset(&am, 0, sizeof am);
	if (!amatch(&am, tn, old, fn))
		return;
	if (!req(am.base, R) && am.offset.type == CAddr) {
		/* apple as does not support
		 * $foo(%r0, %r1, M); try to
		 * rewrite it or bail out if
		 * impossible
		 */
		if (!req(am.index, R) || rtype(am.base) != RTmp)
			return;
		am.index = am.base;
		am.scale = 1;
		am.base = R;
	}
	chuse(old, -1, fn);
	vgrow(&fn->mem, ++fn->nmem);
	n = fn->nmem - 1;
	fn->mem[n] = am;
	chuse(am.base, +1, fn);
	chuse(am.index, +1, fn);
	*r = MEM(n);
}
/* Tell whether the operands of comparison op should be swapped:
 * always for the float comparisons lt and le, never for gt and
 * ge, and for every other comparison when the first operand is a
 * constant.
 */
static int
cmpswap(Ref arg[2], int op)
{
	if (op == NCmpI+Cflt || op == NCmpI+Cfle)
		return 1;
	if (op == NCmpI+Cfgt || op == NCmpI+Cfge)
		return 0;
	return rtype(arg[0]) == RCon;
}
/* Emit an Oxcmp of class k for the two operands in arg, swapping
 * them in place first when swap is non-zero.  The operands are
 * handed to Oxcmp in reverse order (arg[1] first).  Both Oxcmp
 * arguments are then legalized with fixarg().
 */
static void
selcmp(Ref arg[2], int k, int swap, Fn *fn)
{
Ref r;
Ins *icmp;
if (swap) {
r = arg[1];
arg[1] = arg[0];
arg[0] = r;
}
emit(Oxcmp, k, R, arg[1], arg[0]);
/* remember the comparison; the calls below move curi */
icmp = curi;
if (rtype(arg[0]) == RCon) {
/* a constant in this position is copied to a fresh
 * temporary that replaces it in the comparison
 */
assert(k != Kw);
icmp->arg[1] = newtmp("isel", k, fn);
emit(Ocopy, k, icmp->arg[1], arg[0], R);
fixarg(&curi->arg[0], k, curi, fn);
}
fixarg(&icmp->arg[0], k, icmp, fn);
fixarg(&icmp->arg[1], k, icmp, fn);
}
/* Select machine instructions for the instruction i.
 *
 * Instructions whose result is an unused, non-register temporary
 * and whose arguments are not registers are dropped, after the
 * use counts of their arguments have been decremented.
 *
 * NOTE(review): the emit() sequences below read in the reverse of
 * execution order (compare them with the pseudo-code comments),
 * which suggests that emit() adds instructions before the ones
 * emitted earlier -- confirm against its definition.
 */
static void
sel(Ins i, Num *tn, Fn *fn)
{
Ref r0, r1, tmp[7];
int x, j, k, kc, sh, swap;
Ins *i0, *i1;
if (rtype(i.to) == RTmp)
if (!isreg(i.to) && !isreg(i.arg[0]) && !isreg(i.arg[1]))
if (fn->tmp[i.to.val].nuse == 0) {
chuse(i.arg[0], -1, fn);
chuse(i.arg[1], -1, fn);
return;
}
/* remembered for the consistency check at the end */
i0 = curi;
k = i.cls;
switch (i.op) {
case Odiv:
case Orem:
case Oudiv:
case Ourem:
/* floating point divisions need no special treatment */
if (KBASE(k) == 1)
goto Emit;
/* r0 is the register copied into the result, r1 the
 * other one of the RAX/RDX pair: quotient in RAX,
 * remainder in RDX
 */
if (i.op == Odiv || i.op == Oudiv)
r0 = TMP(RAX), r1 = TMP(RDX);
else
r0 = TMP(RDX), r1 = TMP(RAX);
emit(Ocopy, k, i.to, r0, R);
emit(Ocopy, k, R, r1, R);
if (rtype(i.arg[1]) == RCon) {
/* immediates not allowed for
 * divisions in x86
 */
r0 = newtmp("isel", k, fn);
} else
r0 = i.arg[1];
if (fn->tmp[r0.val].slot != -1)
err("unlikely argument %%%s in %s",
fn->tmp[r0.val].name, optab[i.op].name);
/* signed: RDX gets the sign extension of RAX;
 * unsigned: RDX is cleared
 */
if (i.op == Odiv || i.op == Orem) {
emit(Oxidiv, k, R, r0, R);
emit(Osign, k, TMP(RDX), TMP(RAX), R);
} else {
emit(Oxdiv, k, R, r0, R);
emit(Ocopy, k, TMP(RDX), CON_Z, R);
}
emit(Ocopy, k, TMP(RAX), i.arg[0], R);
fixarg(&curi->arg[0], k, curi, fn);
if (rtype(i.arg[1]) == RCon)
emit(Ocopy, k, r0, i.arg[1], R);
break;
case Osar:
case Oshr:
case Oshl:
/* a shift amount that is not a constant goes in RCX */
r0 = i.arg[1];
if (rtype(r0) == RCon)
goto Emit;
if (fn->tmp[r0.val].slot != -1)
err("unlikely argument %%%s in %s",
fn->tmp[r0.val].name, optab[i.op].name);
i.arg[1] = TMP(RCX);
emit(Ocopy, Kw, R, TMP(RCX), R);
emiti(i);
i1 = curi;
emit(Ocopy, Kw, TMP(RCX), r0, R);
fixarg(&i1->arg[0], argcls(&i, 0), i1, fn);
break;
case Ouwtof:
/* zero-extend the word to a long, then convert the long
 * as a signed number
 */
r0 = newtmp("utof", Kl, fn);
emit(Osltof, k, i.to, r0, R);
emit(Oextuw, Kl, r0, i.arg[0], R);
fixarg(&curi->arg[0], k, curi, fn);
break;
case Oultof:
/* %mask =l and %arg.0, 1
 * %isbig =l shr %arg.0, 63
 * %divided =l shr %arg.0, %isbig
 * %or =l or %mask, %divided
 * %float =d sltof %or
 * %cast =l cast %float
 * %addend =l shl %isbig, 52
 * %sum =l add %cast, %addend
 * %result =d cast %sum
 *
 * the sequence above is the one for a double result;
 * for a single, the last four steps use class w and a
 * shift of 23
 */
r0 = newtmp("utof", k, fn);
if (k == Ks)
kc = Kw, sh = 23;
else
kc = Kl, sh = 52;
for (j=0; j<4; j++)
tmp[j] = newtmp("utof", Kl, fn);
for (; j<7; j++)
tmp[j] = newtmp("utof", kc, fn);
emit(Ocast, k, i.to, tmp[6], R);
emit(Oadd, kc, tmp[6], tmp[4], tmp[5]);
emit(Oshl, kc, tmp[5], tmp[1], getcon(sh, fn));
emit(Ocast, kc, tmp[4], r0, R);
emit(Osltof, k, r0, tmp[3], R);
emit(Oor, Kl, tmp[3], tmp[0], tmp[2]);
emit(Oshr, Kl, tmp[2], i.arg[0], tmp[1]);
/* the shift by a temporary just emitted is taken back
 * and run through sel() to get the RCX treatment
 */
sel(*curi++, 0, fn);
emit(Oshr, Kl, tmp[1], i.arg[0], getcon(63, fn));
fixarg(&curi->arg[0], Kl, curi, fn);
emit(Oand, Kl, tmp[0], i.arg[0], getcon(1, fn));
fixarg(&curi->arg[0], Kl, curi, fn);
break;
case Ostoui:
i.op = Ostosi;
kc = Ks;
/* bit pattern of the single -2^63 */
tmp[4] = getcon(0xdf000000, fn);
goto Oftoui;
case Odtoui:
i.op = Odtosi;
kc = Kd;
/* bit pattern of the double -2^63 */
tmp[4] = getcon(0xc3e0000000000000, fn);
Oftoui:
if (k == Kw) {
/* word result: do a signed conversion to a long
 * and copy it into the result as a word
 */
r0 = newtmp("ftou", Kl, fn);
emit(Ocopy, Kw, i.to, r0, R);
i.cls = Kl;
i.to = r0;
goto Emit;
}
/* %try0 =l {s,d}tosi %fp
 * %mask =l sar %try0, 63
 *
 * mask is all ones if the first
 * try was oob, all zeroes o.w.
 *
 * %fps ={s,d} sub %fp, (1<<63)
 * %try1 =l {s,d}tosi %fps
 *
 * %tmp =l and %mask, %try1
 * %res =l or %tmp, %try0
 *
 * the subtraction is emitted as an addition of the
 * constant stored in tmp[4] above
 */
r0 = newtmp("ftou", kc, fn);
for (j=0; j<4; j++)
tmp[j] = newtmp("ftou", Kl, fn);
emit(Oor, Kl, i.to, tmp[0], tmp[3]);
emit(Oand, Kl, tmp[3], tmp[2], tmp[1]);
emit(i.op, Kl, tmp[2], r0, R);
emit(Oadd, kc, r0, tmp[4], i.arg[0]);
i1 = curi; /* fixarg() can change curi */
fixarg(&i1->arg[0], kc, i1, fn);
fixarg(&i1->arg[1], kc, i1, fn);
emit(Osar, Kl, tmp[1], tmp[0], getcon(63, fn));
emit(i.op, Kl, tmp[0], i.arg[0], R);
fixarg(&curi->arg[0], Kl, curi, fn);
break;
case Onop:
break;
case Ostored:
case Ostores:
case Ostorel:
case Ostorew:
case Ostoreh:
case Ostoreb:
/* float constants are stored with the integer store of
 * the same width
 */
if (rtype(i.arg[0]) == RCon) {
if (i.op == Ostored)
i.op = Ostorel;
if (i.op == Ostores)
i.op = Ostorew;
}
seladdr(&i.arg[1], tn, fn);
goto Emit;
case_Oload:
seladdr(&i.arg[0], tn, fn);
goto Emit;
case Odbgloc:
case Ocall:
case Osalloc:
case Ocopy:
case Oadd:
case Osub:
case Oneg:
case Omul:
case Oand:
case Oor:
case Oxor:
case Oxtest:
case Ostosi:
case Odtosi:
case Oswtof:
case Osltof:
case Oexts:
case Otruncd:
case Ocast:
case_Oxsel:
case_Oext:
Emit:
/* generic case: emit the instruction unchanged and
 * legalize its two arguments
 */
emiti(i);
i1 = curi; /* fixarg() can change curi */
fixarg(&i1->arg[0], argcls(&i, 0), i1, fn);
fixarg(&i1->arg[1], argcls(&i, 1), i1, fn);
break;
case Oalloc4:
case Oalloc8:
case Oalloc16:
salloc(i.to, i.arg[0], fn);
break;
default:
if (isext(i.op))
goto case_Oext;
if (isxsel(i.op))
goto case_Oxsel;
if (isload(i.op))
goto case_Oload;
if (iscmp(i.op, &kc, &x)) {
/* comparisons become a compare followed by a flag
 * read; float eq and ne also have to combine the
 * result with the ordered/unordered flag
 */
switch (x) {
case NCmpI+Cfeq:
/* zf is set when operands are
 * unordered, so we may have to
 * check pf
 */
r0 = newtmp("isel", Kw, fn);
r1 = newtmp("isel", Kw, fn);
emit(Oand, Kw, i.to, r0, r1);
emit(Oflagfo, k, r1, R, R);
i.to = r0;
break;
case NCmpI+Cfne:
r0 = newtmp("isel", Kw, fn);
r1 = newtmp("isel", Kw, fn);
emit(Oor, Kw, i.to, r0, r1);
emit(Oflagfuo, k, r1, R, R);
i.to = r0;
break;
}
swap = cmpswap(i.arg, x);
if (swap)
x = cmpop(x);
emit(Oflag+x, k, i.to, R, R);
selcmp(i.arg, kc, swap, fn);
break;
}
die("unknown instruction %s", optab[i.op].name);
}
/* check that no argument of the instructions emitted by this
 * call is a temporary with a slot
 */
while (i0>curi && --i0) {
assert(rslot(i0->arg[0], fn) == -1);
assert(rslot(i0->arg[1], fn) == -1);
}
}
/* Scan backwards from i (exclusive) down to i0 (inclusive) and
 * return the first instruction whose opcode has zflag set.
 * Instructions whose opcode has lflag set are skipped; any other
 * instruction stops the scan.  Return 0 when nothing is found.
 */
static Ins *
flagi(Ins *i0, Ins *i)
{
	Ins *p;

	for (p = i; p > i0;) {
		--p;
		if (amd64_op[p->op].zflag)
			return p;
		if (!amd64_op[p->op].lflag)
			break;
	}
	return 0;
}
/* Select code for a group made of one Osel0 followed by Osel1
 * instructions; i points to the last Osel1 of the group.  Each
 * Osel1 becomes a conditional select on the condition held by the
 * argument of the Osel0.  The Osel0 is turned into a nop and its
 * address is returned.
 */
static Ins*
selsel(Fn *fn, Blk *b, Ins *i, Num *tn)
{
Ref r, cr[2];
int c, k, swap, gencmp, gencpy;
Ins *isel0, *isel1, *fi;
Tmp *t;
assert(i->op == Osel1);
/* walk back to the Osel0 that starts the group */
for (isel0=i; b->ins<isel0; isel0--) {
if (isel0->op == Osel0)
break;
assert(isel0->op == Osel1);
}
assert(isel0->op == Osel0);
r = isel0->arg[0];
assert(rtype(r) == RTmp);
t = &fn->tmp[r.val];
fi = flagi(b->ins, isel0);
cr[0] = cr[1] = R;
gencmp = gencpy = swap = 0;
k = Kw;
c = Cine;
if (!fi || !req(fi->to, r)) {
/* no usable flag-setting instruction: compare the
 * condition with zero
 */
gencmp = 1;
cr[0] = r;
cr[1] = CON_Z;
}
else if (iscmp(fi->op, &k, &c)) {
if (c == NCmpI+Cfeq
|| c == NCmpI+Cfne) {
/* these are selected as 'and'
 * or 'or', so we check their
 * result with Cine
 */
c = Cine;
goto Other;
}
swap = cmpswap(fi->arg, c);
if (swap)
c = cmpop(c);
/* when the select is the only user of the comparison,
 * the comparison is dropped and re-emitted as a bare
 * compare after the selects
 */
if (t->nuse == 1) {
gencmp = 1;
cr[0] = fi->arg[0];
cr[1] = fi->arg[1];
*fi = (Ins){.op = Onop};
}
}
else if (fi->op == Oand && t->nuse == 1
&& (rtype(fi->arg[0]) == RTmp ||
rtype(fi->arg[1]) == RTmp)) {
/* an 'and' used only as condition becomes a test; a
 * constant operand is moved to the first position
 */
fi->op = Oxtest;
fi->to = R;
if (rtype(fi->arg[1]) == RCon) {
r = fi->arg[1];
fi->arg[1] = fi->arg[0];
fi->arg[0] = r;
}
}
else {
Other:
/* since flags are not tracked in liveness,
 * the result of the flag-setting instruction
 * has to be marked as live
 */
if (t->nuse == 1)
gencpy = 1;
}
/* generate conditional moves */
for (isel1=i; isel0<isel1; --isel1) {
isel1->op = Oxsel+c;
sel(*isel1, tn, fn);
}
assert(!gencmp || !gencpy);
if (gencmp)
selcmp(cr, k, swap, fn);
if (gencpy)
emit(Ocopy, Kw, R, r, R);
*isel0 = (Ins){.op = Onop};
return isel0;
}
/* Select the jump of block b.  Returns, unconditional jumps and
 * halts are left alone.  A jnz is rewritten into a jump on flags
 * (Jjf + comparison code), re-using the flags of the instruction
 * that computes the condition when possible, or emitting a
 * comparison with zero otherwise.
 */
static void
seljmp(Blk *b, Fn *fn)
{
Ref r;
int c, k, swap;
Ins *fi;
Tmp *t;
if (b->jmp.type == Jret0
|| b->jmp.type == Jjmp
|| b->jmp.type == Jhlt)
return;
assert(b->jmp.type == Jjnz);
r = b->jmp.arg;
t = &fn->tmp[r.val];
b->jmp.arg = R;
assert(rtype(r) == RTmp);
/* both successors are the same: plain jump */
if (b->s1 == b->s2) {
chuse(r, -1, fn);
b->jmp.type = Jjmp;
b->s2 = 0;
return;
}
fi = flagi(b->ins, &b->ins[b->nins]);
if (!fi || !req(fi->to, r)) {
/* no usable flag-setting instruction: compare the
 * condition with zero
 */
selcmp((Ref[2]){r, CON_Z}, Kw, 0, fn);
b->jmp.type = Jjf + Cine;
}
else if (iscmp(fi->op, &k, &c)
&& c != NCmpI+Cfeq /* see sel(), selsel() */
&& c != NCmpI+Cfne) {
swap = cmpswap(fi->arg, c);
if (swap)
c = cmpop(c);
/* when the jump is the only user of the comparison,
 * the comparison is replaced by a bare compare
 */
if (t->nuse == 1) {
selcmp(fi->arg, k, swap, fn);
*fi = (Ins){.op = Onop};
}
b->jmp.type = Jjf + c;
}
else if (fi->op == Oand && t->nuse == 1
&& (rtype(fi->arg[0]) == RTmp ||
rtype(fi->arg[1]) == RTmp)) {
/* an 'and' used only as condition becomes a test; a
 * constant operand is moved to the first position
 */
fi->op = Oxtest;
fi->to = R;
b->jmp.type = Jjf + Cine;
if (rtype(fi->arg[1]) == RCon) {
r = fi->arg[1];
fi->arg[1] = fi->arg[0];
fi->arg[0] = r;
}
}
else {
/* since flags are not tracked in liveness,
 * the result of the flag-setting instruction
 * has to be marked as live
 */
if (t->nuse == 1)
emit(Ocopy, Kw, R, r, R);
b->jmp.type = Jjf + Cine;
}
}
/* identifiers of the address patterns handled by the
 * generated matcher; the letters name the components of
 * the sum: o is a constant offset, b a base temporary,
 * i an index temporary, s a constant scale (1, 2, 4, 8)
 * and 1 stands for an index used without scale
 */
enum {
Pob, /* o + b */
Pbis, /* b + i*s */
Pois, /* o + i*s */
Pobis, /* o + b + i*s */
Pbi1, /* b + i */
Pobi1, /* o + b + i */
};
/* mgen generated code
*
* (with-vars (o b i s)
* (patterns
* (ob (add (con o) (tmp b)))
* (bis (add (tmp b) (mul (tmp i) (con s 1 2 4 8))))
* (ois (add (con o) (mul (tmp i) (con s 1 2 4 8))))
* (obis (add (con o) (tmp b) (mul (tmp i) (con s 1 2 4 8))))
* (bi1 (add (tmp b) (tmp i)))
* (obi1 (add (con o) (tmp b) (tmp i)))
* ))
*/
/* compute the matcher state number of an instruction
 * from its opcode and the state numbers of its two
 * operands; the operand order does not matter
 */
static int
opn(int op, int l, int r)
{
	static uchar Oaddtbl[91] = {
		2,
		2,2,
		4,4,5,
		6,6,8,8,
		4,4,9,10,9,
		7,7,5,8,9,5,
		4,4,12,10,12,12,12,
		4,4,9,10,9,9,12,9,
		11,11,5,8,9,5,12,9,5,
		7,7,5,8,9,5,12,9,5,5,
		11,11,5,8,9,5,12,9,5,5,5,
		4,4,9,10,9,9,12,9,9,9,9,9,
		7,7,5,8,9,5,12,9,5,5,5,9,5,
	};
	int hi, lo;

	/* normalize so that hi >= lo; the add table is
	 * triangular and indexed by (hi, lo) */
	hi = l < r ? r : l;
	lo = l < r ? l : r;
	if (op == Oadd)
		return Oaddtbl[(hi + hi*hi)/2 + lo];
	if (op == Omul && hi >= 2 && lo == 0)
		return 3;
	return 2;
}
/* return the matcher state number of a reference:
 * temporaries carry their own number (defaulting to 2
 * when not numbered yet), the constants 1, 2, 4 and 8
 * are numbered 0, any other constant is numbered 1,
 * and all remaining references yield INT_MIN
 */
static int
refn(Ref r, Num *tn, Con *con)
{
	Num *num;
	Con *c;
	int64_t v;
	int ty;

	ty = rtype(r);
	if (ty == RTmp) {
		num = &tn[r.val];
		if (!num->n)
			num->n = 2;
		return num->n;
	}
	if (ty != RCon)
		return INT_MIN;
	c = &con[r.val];
	if (c->type != CBits)
		return 1;
	v = c->bits.i;
	return (v == 1 || v == 2 || v == 4 || v == 8) ? 0 : 1;
}
/* generated table indexed by a state number (as returned
 * by refn() and opn()); each entry is the set of address
 * patterns, as BIT(P...) flags, that can be matched at a
 * temporary in that state; entries not listed are empty
 */
static bits match[13] = {
[4] = BIT(Pob),
[5] = BIT(Pbi1),
[6] = BIT(Pob) | BIT(Pois),
[7] = BIT(Pob) | BIT(Pobi1),
[8] = BIT(Pbi1) | BIT(Pbis),
[9] = BIT(Pbi1) | BIT(Pobi1),
[10] = BIT(Pbi1) | BIT(Pbis) | BIT(Pobi1) | BIT(Pobis),
[11] = BIT(Pob) | BIT(Pobi1) | BIT(Pobis),
[12] = BIT(Pbi1) | BIT(Pobi1) | BIT(Pobis),
};
/* generated table indexed by pattern identifier; each
 * entry is the byte program handed to runmatch() to
 * extract the components of that pattern
 * NOTE(review): the encoding of these programs is defined
 * by runmatch(), which is not visible here -- do not edit
 * by hand, regenerate instead
 */
static uchar *matcher[] = {
[Pbi1] = (uchar[]){
1,3,1,3,2,0
},
[Pbis] = (uchar[]){
5,1,8,5,27,1,5,1,2,5,13,3,1,1,3,3,3,2,0,1,
3,3,3,2,3,1,0,1,29
},
[Pob] = (uchar[]){
1,3,0,3,1,0
},
[Pobi1] = (uchar[]){
5,3,9,9,10,33,12,35,45,1,5,3,11,9,7,9,4,9,
17,1,3,0,3,1,3,2,0,3,1,1,3,0,34,1,37,1,5,2,
5,7,2,7,8,37,29,1,3,0,1,32
},
[Pobis] = (uchar[]){
5,2,10,7,11,19,49,1,1,3,3,3,2,1,3,0,3,1,0,
1,3,0,5,1,8,5,25,1,5,1,2,5,13,3,1,1,3,3,3,
2,0,1,3,3,3,2,26,1,51,1,5,1,6,5,9,1,3,0,51,
3,1,1,3,0,45
},
[Pois] = (uchar[]){
1,3,0,1,3,3,3,2,0
},
};
/* end of generated code */
/* number the temporaries defined in block b: for every
 * instruction writing a temporary, record its operands
 * and their state numbers in tn, then derive the state
 * number of the result
 */
static void
anumber(Num *tn, Blk *b, Con *con)
{
	Ins *ins, *end;
	Num *num;

	end = &b->ins[b->nins];
	for (ins=b->ins; ins<end; ins++)
		if (rtype(ins->to) == RTmp) {
			num = &tn[ins->to.val];
			num->l = ins->arg[0];
			num->r = ins->arg[1];
			num->nl = refn(num->l, tn, con);
			num->nr = refn(num->r, tn, con);
			num->n = opn(ins->op, num->nl, num->nr);
		}
}
/* peel constant displacements off r: as long as r
 * matches the 'constant + temporary' pattern, the
 * constant is accumulated into c (via addcon with
 * factor s) and r is replaced by the temporary;
 * returns what remains of r
 */
static Ref
adisp(Con *c, Num *tn, Ref r, Fn *fn, int s)
{
	Ref parts[2];

	for (;;) {
		if (req(r, R))
			return r;
		assert(rtype(r) == RTmp);
		if ((match[refn(r, tn, fn->con)] & BIT(Pob)) == 0)
			return r;
		runmatch(matcher[Pob], tn, r, parts);
		assert(rtype(parts[0]) == RCon);
		addcon(c, &fn->con[parts[0].val], s);
		r = parts[1];
	}
}
/* try to express the temporary r as an addressing mode
 * (offset, base, index, scale) stored in *a; returns 0
 * when r is not a temporary or when the constants or the
 * stack slots involved cannot be combined, and non-zero
 * on success
 */
static int
amatch(Addr *a, Num *tn, Ref r, Fn *fn)
{
/* patterns in the order they are tried; the first one
 * allowed by the state of r is used */
static int pat[] = {Pobis, Pobi1, Pbis, Pois, Pbi1, -1};
Ref ro, rb, ri, rs, v[4];
Con *c, co;
int s, n, *p;
if (rtype(r) != RTmp)
return 0;
n = refn(r, tn, fn->con);
memset(v, 0, sizeof v);
for (p=pat; *p>=0; p++)
if (match[n] & BIT(*p)) {
runmatch(matcher[*p], tn, r, v);
break;
}
/* no pattern applies, r itself is used as the base */
if (*p < 0)
v[1] = r;
/* v holds, in order: offset, base, index, scale */
memset(&co, 0, sizeof co);
ro = v[0];
rb = adisp(&co, tn, v[1], fn, 1);
ri = v[2];
rs = v[3];
s = 1;
/* r matched nothing but adisp() folded a constant out
 * of it: retry on what is left and add the constant to
 * the resulting offset */
if (*p < 0 && co.type != CUndef)
if (amatch(a, tn, rb, fn))
return addcon(&a->offset, &co, 1);
if (!req(ro, R)) {
assert(rtype(ro) == RCon);
c = &fn->con[ro.val];
if (!addcon(&co, c, 1))
return 0;
}
if (!req(rs, R)) {
assert(rtype(rs) == RCon);
c = &fn->con[rs.val];
assert(c->type == CBits);
s = c->bits.i;
}
/* displacements found in the index are scaled by s */
ri = adisp(&co, tn, ri, fn, s);
*a = (Addr){co, rb, ri, s};
/* an index that lives in a stack slot is exchanged with
 * the base; this requires a scale of 1 and a base that
 * is not a slot itself */
if (rtype(ri) == RTmp)
if (fn->tmp[ri.val].slot != -1) {
if (a->scale != 1
|| fn->tmp[rb.val].slot != -1)
return 0;
a->base = ri;
a->index = rb;
}
/* a base with an assigned slot is replaced by the slot
 * reference */
if (!req(a->base, R)) {
assert(rtype(a->base) == RTmp);
s = fn->tmp[a->base.val].slot;
if (s != -1)
a->base = SLOT(s);
}
return 1;
}
/* instruction selection
 * requires use counts (as given by parsing)
 *
 * constant-size allocs of the start block are turned
 * into stack slots first; then, for every block, the phi
 * arguments flowing out of it are fixed, its jump is
 * lowered and its instructions are selected from last
 * to first into the instruction buffer, which finally
 * replaces the block's contents
 */
void
amd64_isel(Fn *fn)
{
Blk *b, **sb;
Ins *i;
Phi *p;
uint a;
int n, al;
int64_t sz;
Num *num;
/* assign slots to fast allocs */
b = fn->start;
/* specific to NAlign == 3 */ /* or change n=4 and sz /= 4 below */
/* one pass per alloc opcode; n is the alignment in
 * bytes of the current opcode (4, then doubling) */
for (al=Oalloc, n=4; al<=Oalloc1; al++, n*=2)
for (i=b->ins; i<&b->ins[b->nins]; i++)
if (i->op == al) {
/* a dynamically sized alloc ends the scan
 * for this opcode */
if (rtype(i->arg[0]) != RCon)
break;
sz = fn->con[i->arg[0].val].bits.i;
if (sz < 0 || sz >= INT_MAX-15)
err("invalid alloc size %"PRId64, sz);
/* round up to the alignment, then convert
 * bytes to 4-byte slot units */
sz = (sz + n-1) & -n;
sz /= 4;
if (sz > INT_MAX - fn->slot)
die("alloc too large");
fn->tmp[i->to.val].slot = fn->slot;
fn->slot += sz;
fn->salign = 2 + al - Oalloc;
*i = (Ins){.op = Onop};
}
/* process basic blocks */
n = fn->ntmp;
num = emalloc(n * sizeof num[0]);
for (b=fn->start; b; b=b->link) {
/* the buffer is filled from its end backwards */
curi = &insb[NIns];
/* fix the phi arguments coming from b in each of
 * its successors */
for (sb=(Blk*[3]){b->s1, b->s2, 0}; *sb; sb++)
for (p=(*sb)->phi; p; p=p->link) {
for (a=0; p->blk[a] != b; a++)
assert(a+1 < p->narg);
fixarg(&p->arg[a], p->cls, 0, fn);
}
memset(num, 0, n * sizeof num[0]);
anumber(num, b, fn->con);
seljmp(b, fn);
for (i=&b->ins[b->nins]; i!=b->ins;) {
--i;
assert(i->op != Osel0);
/* selsel() handles a whole group of sel
 * instructions and returns where to resume */
if (i->op == Osel1)
i = selsel(fn, b, i, num);
else
sel(*i, num, fn);
}
idup(b, curi, &insb[NIns]-curi);
}
free(num);
if (debug['I']) {
fprintf(stderr, "\n> After instruction selection:\n");
printfn(fn, stderr);
}
}
+721
View File
@@ -0,0 +1,721 @@
#include "all.h"
typedef struct AClass AClass;
typedef struct RAlloc RAlloc;
/* classification of an argument or return value */
struct AClass {
Typ *type; /* aggregate type, set by typclass() */
int inmem; /* 0: in registers, 1: aggregate in memory, 2: scalar on the stack */
int align; /* log2 of the alignment in bytes */
uint size; /* size in bytes, rounded up by typclass() */
int cls[2]; /* class of each 8-byte half */
Ref ref[2]; /* temporaries holding each 8-byte half */
};
/* list of alloc instructions collected while lowering
 * calls, emitted later in the start block */
struct RAlloc {
Ins i;
RAlloc *link;
};
/* walk the fields of type t, laid out from byte offset
 * s, and update the class of each 8-byte half of a: any
 * integer field makes its half Kl, a float field makes
 * it Kd only when the half has no class yet; all the
 * alternatives of a union are overlaid at offset s
 */
static void
classify(AClass *a, Typ *t, uint s)
{
	Field *fld;
	int *half;
	uint u, off, adv;

	for (u=0; u<t->nunion; u++) {
		off = s;
		for (fld=t->fields[u]; fld->type!=FEnd; fld++) {
			assert(off <= 16);
			half = &a->cls[off/8];
			adv = 0;
			switch (fld->type) {
			case FEnd:
				die("unreachable");
			case FPad:
				/* padding never affects the class */
				adv = fld->len;
				break;
			case Fs:
			case Fd:
				if (*half == Kx)
					*half = Kd;
				adv = fld->len;
				break;
			case Fb:
			case Fh:
			case Fw:
			case Fl:
				*half = Kl;
				adv = fld->len;
				break;
			case FTyp:
				/* nested aggregate, len is its type index */
				classify(a, &typ[fld->len], off);
				adv = typ[fld->len].size;
				break;
			}
			off += adv;
		}
	}
}
/* fill a with the passing class of aggregate type t:
 * its size is rounded up to a multiple of its alignment
 * (at least 8 bytes) and, unless the type has to live in
 * memory, each 8-byte half is classified
 */
static void
typclass(AClass *a, Typ *t)
{
	uint unit, size;

	/* rounding to at least 8 bytes lets whole
	 * registers be used to load and store the value */
	unit = 1u << t->align;
	if (unit < 8)
		unit = 8;
	size = t->size;
	size = (size + unit-1) & -unit;

	a->type = t;
	a->size = size;
	a->align = t->align;
	/* opaque, empty and larger than 16 bytes
	 * aggregates are passed in memory */
	a->inmem = t->isdark || size > 16 || size == 0;
	if (a->inmem)
		return;
	a->cls[0] = Kx;
	a->cls[1] = Kx;
	classify(a, t, 0);
}
/* pick the registers used to return an aggregate of
 * class aret, one per 8-byte half, and store them in
 * reg; the result counts integer registers in units of
 * 1 and sse registers in units of 4, as expected in the
 * low bits of an RCall value
 */
static int
retr(Ref reg[2], AClass *aret)
{
	static int retreg[2][2] = {{RAX, RDX}, {XMM0, XMM0+1}};
	int used[2] = {0, 0};
	int idx, base, ca;

	ca = 0;
	idx = 0;
	while ((uint)idx*8 < aret->size) {
		base = KBASE(aret->cls[idx]);
		reg[idx] = TMP(retreg[base][used[base]]);
		used[base]++;
		ca += 1 << (2 * base);
		idx++;
	}
	return ca;
}
/* lower a value-returning jump of block b into a plain
 * Jret0 whose argument records the registers carrying
 * the result; the instructions moving the value into
 * place are emitted in reverse program order
 */
static void
selret(Blk *b, Fn *fn)
{
	int jty, cls, ca;
	Ref val, hi, reg[2];
	AClass aret;

	jty = b->jmp.type;
	if (jty == Jret0 || !isret(jty))
		return;

	val = b->jmp.arg;
	b->jmp.type = Jret0;

	if (jty != Jretc) {
		/* scalar return, in xmm0 or rax */
		cls = jty - Jretw;
		if (KBASE(cls) != 0) {
			emit(Ocopy, cls, TMP(XMM0), val, R);
			ca = 1 << 2;
		} else {
			emit(Ocopy, cls, TMP(RAX), val, R);
			ca = 1;
		}
	} else {
		typclass(&aret, &typ[fn->retty]);
		if (!aret.inmem) {
			/* load each 8-byte half in its register */
			ca = retr(reg, &aret);
			if (aret.size > 8) {
				hi = newtmp("abi", Kl, fn);
				emit(Oload, Kl, reg[1], hi, R);
				emit(Oadd, Kl, hi, val, getcon(8, fn));
			}
			emit(Oload, Kl, reg[0], val, R);
		} else {
			/* copy the aggregate to the location given
			 * by the caller and return that address */
			assert(rtype(fn->retr) == RTmp);
			emit(Ocopy, Kl, TMP(RAX), fn->retr, R);
			emit(Oblit1, 0, R, INT(aret.type->size), R);
			emit(Oblit0, 0, R, val, fn->retr);
			ca = 1;
		}
	}
	b->jmp.arg = CALL(ca);
}
/* classify the arguments (op == Oarg) or parameters
 * (op == Opar) in [i0, i1), filling one AClass per
 * instruction in ac; aret, when non-null, describes an
 * aggregate return value and env receives the
 * environment reference if there is one; returns the
 * register usage in the 'passed' fields of the RCall
 * layout
 */
static int
argsclass(Ins *i0, Ins *i1, AClass *ac, int op, AClass *aret, Ref *env)
{
	int freeint, freesse, needint, needsse;
	int hasvar, hasenv, n, *avail;
	AClass *a;
	Ins *i;

	/* an aggregate returned in memory takes one
	 * integer register for its hidden address */
	freeint = (aret && aret->inmem) ? 5 : 6;
	freesse = 8;
	hasvar = 0;
	hasenv = 0;
	for (i=i0, a=ac; i<i1; i++, a++) {
		/* map parameter opcodes onto argument ones */
		switch (i->op - op + Oarg) {
		case Oarg:
			avail = KBASE(i->cls) == 0 ? &freeint : &freesse;
			a->inmem = 2;
			if (*avail > 0) {
				a->inmem = 0;
				--*avail;
			}
			a->align = 3;
			a->size = 8;
			a->cls[0] = i->cls;
			break;
		case Oargc:
			typclass(a, &typ[i->arg[0].val]);
			if (a->inmem)
				break;
			needint = 0;
			needsse = 0;
			for (n=0; (uint)n*8<a->size; n++) {
				if (KBASE(a->cls[n]) == 0)
					needint++;
				else
					needsse++;
			}
			/* an aggregate goes to memory as a whole
			 * when registers are missing for a part */
			if (needint > freeint || needsse > freesse)
				a->inmem = 1;
			else {
				freeint -= needint;
				freesse -= needsse;
			}
			break;
		case Oarge:
			hasenv = 1;
			*env = op == Opar ? i->to : i->arg[0];
			break;
		case Oargv:
			hasvar = 1;
			break;
		default:
			die("unreachable");
		}
	}

	if (hasvar && hasenv)
		err("sysv abi does not support variadic env calls");

	return ((hasvar|hasenv) << 12) | ((6-freeint) << 4) | ((8-freesse) << 8);
}
/* caller-save registers, terminated by -1; the first
 * entries are indexed directly to pick the integer
 * argument registers, so their order matters
 */
int amd64_sysv_rsave[] = {
RDI, RSI, RDX, RCX, R8, R9, R10, R11, RAX,
XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, -1
};
/* callee-save registers, terminated by -1 */
int amd64_sysv_rclob[] = {RBX, R12, R13, R14, R15, -1};
/* both arrays must agree with the register counts
 * declared with the register enumeration */
MAKESURE(sysv_arrays_ok,
sizeof amd64_sysv_rsave == (NGPS_SYSV+NFPS+1) * sizeof(int) &&
sizeof amd64_sysv_rclob == (NCLR_SYSV+1) * sizeof(int)
);
/* layout of call's second argument (RCall)
 *
 *  29     12    8    4  3  0
 *  |0...00|x|xxxx|xxxx|xx|xx|                  range
 *          |    |    |  |  ` gp regs returned (0..2)
 *          |    |    |  ` sse regs returned   (0..2)
 *          |    |    ` gp regs passed         (0..6)
 *          |    ` sse regs passed             (0..8)
 *          ` 1 if rax is used to pass data    (0..1)
 */
/* decode the registers carrying the return value from
 * the RCall reference r; the register set is returned
 * and, when p is non-null, p[0] and p[1] receive the
 * number of integer and sse registers
 */
bits
amd64_sysv_retregs(Ref r, int p[2])
{
	static int gpret[2] = {RAX, RDX};
	static int fpret[2] = {XMM0, XMM1};
	int ngp, nfp, j;
	bits set;

	assert(rtype(r) == RCall);
	ngp = r.val & 3;
	nfp = (r.val >> 2) & 3;
	if (p) {
		p[0] = ngp;
		p[1] = nfp;
	}
	set = 0;
	for (j=0; j<ngp && j<2; j++)
		set |= BIT(gpret[j]);
	for (j=0; j<nfp && j<2; j++)
		set |= BIT(fpret[j]);
	return set;
}
/* decode the registers used to pass arguments from the
 * RCall reference r; the register set is returned and,
 * when p is non-null, p[0] and p[1] receive the number
 * of integer (rax included) and sse registers
 */
bits
amd64_sysv_argregs(Ref r, int p[2])
{
	int ngp, nfp, rax, j;
	bits set;

	assert(rtype(r) == RCall);
	ngp = (r.val >> 4) & 15;
	nfp = (r.val >> 8) & 15;
	rax = (r.val >> 12) & 1;
	set = rax ? BIT(RAX) : 0;
	/* integer arguments use the first entries of the
	 * caller-save array, sse ones start at xmm0 */
	for (j=0; j<ngp; j++)
		set |= BIT(amd64_sysv_rsave[j]);
	for (j=0; j<nfp; j++)
		set |= BIT(XMM0+j);
	if (p) {
		p[0] = ngp + rax;
		p[1] = nfp;
	}
	return set;
}
/* return the next argument register for class ty and
 * advance the matching counter, *ni for integer
 * registers or *ns for sse registers
 */
static Ref
rarg(int ty, int *ni, int *ns)
{
	int reg;

	if (KBASE(ty) != 0) {
		reg = XMM0 + *ns;
		++*ns;
	} else {
		reg = amd64_sysv_rsave[*ni];
		++*ni;
	}
	return TMP(reg);
}
/* lower the call i1 whose argument instructions are
 * [i0, i1); allocs needed for aggregate return values
 * are pushed on the list *rap instead of being emitted
 * here; instructions are emitted in reverse program
 * order, so the code below reads from the end of the
 * call sequence towards its beginning
 */
static void
selcall(Fn *fn, Ins *i0, Ins *i1, RAlloc **rap)
{
Ins *i;
AClass *ac, *a, aret;
int ca, ni, ns, al;
uint stk, off;
Ref r, r1, r2, reg[2], env;
RAlloc *ra;
env = R;
ac = alloc((i1-i0) * sizeof ac[0]);
/* a non-empty second argument of the call is the type
 * of the aggregate it returns */
if (!req(i1->arg[1], R)) {
assert(rtype(i1->arg[1]) == RType);
typclass(&aret, &typ[i1->arg[1].val]);
ca = argsclass(i0, i1, ac, Oarg, &aret, &env);
} else
ca = argsclass(i0, i1, ac, Oarg, 0, &env);
/* compute the stack space taken by the arguments
 * passed in memory, padding for the 16-byte aligned
 * ones (align is a log2) */
for (stk=0, a=&ac[i1-i0]; a>ac;)
if ((--a)->inmem) {
if (a->align > 4)
err("sysv abi requires alignments of 16 or less");
stk += a->size;
if (a->align == 4)
stk += stk & 15;
}
stk += stk & 15;
/* release the argument area after the call; the
 * matching allocation is the last thing emitted */
if (stk) {
r = getcon(-(int64_t)stk, fn);
emit(Osalloc, Kl, R, r, R);
}
if (!req(i1->arg[1], R)) {
if (aret.inmem) {
/* get the return location from eax
* it saves one callee-save reg */
r1 = newtmp("abi", Kl, fn);
emit(Ocopy, Kl, i1->to, TMP(RAX), R);
ca += 1;
} else {
/* todo, may read out of bounds.
* gcc did this up until 5.2, but
* this should still be fixed.
*/
/* store the returned registers into the
 * return pad addressed by i1->to */
if (aret.size > 8) {
r = newtmp("abi", Kl, fn);
aret.ref[1] = newtmp("abi", aret.cls[1], fn);
emit(Ostorel, 0, R, aret.ref[1], r);
emit(Oadd, Kl, r, i1->to, getcon(8, fn));
}
aret.ref[0] = newtmp("abi", aret.cls[0], fn);
emit(Ostorel, 0, R, aret.ref[0], i1->to);
ca += retr(reg, &aret);
if (aret.size > 8)
emit(Ocopy, aret.cls[1], aret.ref[1], reg[1], R);
emit(Ocopy, aret.cls[0], aret.ref[0], reg[0], R);
r1 = i1->to;
}
/* allocate return pad */
ra = alloc(sizeof *ra);
/* specific to NAlign == 3 */
al = aret.align >= 2 ? aret.align - 2 : 0;
ra->i = (Ins){Oalloc+al, Kl, r1, {getcon(aret.size, fn)}};
ra->link = (*rap);
*rap = ra;
} else {
/* scalar result, copied out of rax or xmm0 */
ra = 0;
if (KBASE(i1->cls) == 0) {
emit(Ocopy, i1->cls, i1->to, TMP(RAX), R);
ca += 1;
} else {
emit(Ocopy, i1->cls, i1->to, TMP(XMM0), R);
ca += 1 << 2;
}
}
emit(Ocall, i1->cls, R, i1->arg[0], CALL(ca));
/* rax carries either the environment or, for variadic
 * calls, the number of sse registers used */
if (!req(R, env))
emit(Ocopy, Kl, TMP(RAX), env, R);
else if ((ca >> 12) & 1) /* vararg call */
emit(Ocopy, Kw, TMP(RAX), getcon((ca >> 8) & 15, fn), R);
ni = ns = 0;
if (ra && aret.inmem)
emit(Ocopy, Kl, rarg(Kl, &ni, &ns), ra->i.to, R); /* pass hidden argument */
/* move the register arguments in place; aggregates
 * are loaded from the address in arg[1] */
for (i=i0, a=ac; i<i1; i++, a++) {
if (i->op >= Oarge || a->inmem)
continue;
r1 = rarg(a->cls[0], &ni, &ns);
if (i->op == Oargc) {
if (a->size > 8) {
r2 = rarg(a->cls[1], &ni, &ns);
r = newtmp("abi", Kl, fn);
emit(Oload, a->cls[1], r2, r, R);
emit(Oadd, Kl, r, i->arg[1], getcon(8, fn));
}
emit(Oload, a->cls[0], r1, i->arg[1], R);
} else
emit(Ocopy, i->cls, r1, i->arg[0], R);
}
if (!stk)
return;
/* r is the base of the argument area; store or blit
 * every in-memory argument at its offset */
r = newtmp("abi", Kl, fn);
for (i=i0, a=ac, off=0; i<i1; i++, a++) {
if (i->op >= Oarge || !a->inmem)
continue;
r1 = newtmp("abi", Kl, fn);
if (i->op == Oargc) {
if (a->align == 4)
off += off & 15;
emit(Oblit1, 0, R, INT(a->type->size), R);
emit(Oblit0, 0, R, i->arg[1], r1);
} else
emit(Ostorel, 0, R, i->arg[0], r1);
emit(Oadd, Kl, r1, r, getcon(off, fn));
off += a->size;
}
emit(Osalloc, Kl, r, getcon(stk, fn), R);
}
/* lower the parameter instructions [i0, i1) of fn into
 * copies from the argument registers and loads from the
 * stack slots; also sets fn->reg and, for aggregates
 * returned in memory, fn->retr; the result combines the
 * register usage computed by argsclass() with, from bit
 * 12 up, the byte offset following the last stack
 * parameter; instructions are emitted in reverse order
 */
static int
selpar(Fn *fn, Ins *i0, Ins *i1)
{
AClass *ac, *a, aret;
Ins *i;
int ni, ns, s, al, fa;
Ref r, env;
env = R;
ac = alloc((i1-i0) * sizeof ac[0]);
curi = &insb[NIns];
ni = ns = 0;
if (fn->retty >= 0) {
typclass(&aret, &typ[fn->retty]);
fa = argsclass(i0, i1, ac, Opar, &aret, &env);
} else
fa = argsclass(i0, i1, ac, Opar, 0, &env);
fn->reg = amd64_sysv_argregs(CALL(fa), 0);
/* aggregates received in registers get a stack copy:
 * an alloc defines the parameter and the halves are
 * stored into it */
for (i=i0, a=ac; i<i1; i++, a++) {
if (i->op != Oparc || a->inmem)
continue;
if (a->size > 8) {
r = newtmp("abi", Kl, fn);
a->ref[1] = newtmp("abi", Kl, fn);
emit(Ostorel, 0, R, a->ref[1], r);
emit(Oadd, Kl, r, i->to, getcon(8, fn));
}
a->ref[0] = newtmp("abi", Kl, fn);
emit(Ostorel, 0, R, a->ref[0], i->to);
/* specific to NAlign == 3 */
al = a->align >= 2 ? a->align - 2 : 0;
emit(Oalloc+al, Kl, i->to, getcon(a->size, fn), R);
}
/* the address of an in-memory return value comes in
 * the first integer argument register */
if (fn->retty >= 0 && aret.inmem) {
r = newtmp("abi", Kl, fn);
emit(Ocopy, Kl, r, rarg(Kl, &ni, &ns), R);
fn->retr = r;
}
/* s counts 4-byte units and starts at 4; stack
 * parameters get negative slot numbers */
for (i=i0, a=ac, s=4; i<i1; i++, a++) {
switch (a->inmem) {
case 1:
/* aggregate in memory, used in place */
if (a->align > 4)
err("sysv abi requires alignments of 16 or less");
if (a->align == 4)
s = (s+3) & -4;
fn->tmp[i->to.val].slot = -s;
s += a->size / 4;
continue;
case 2:
/* scalar on the stack, loaded from its slot */
emit(Oload, i->cls, i->to, SLOT(-s), R);
s += 2;
continue;
}
if (i->op == Opare)
continue;
r = rarg(a->cls[0], &ni, &ns);
if (i->op == Oparc) {
emit(Ocopy, a->cls[0], a->ref[0], r, R);
if (a->size > 8) {
r = rarg(a->cls[1], &ni, &ns);
emit(Ocopy, a->cls[1], a->ref[1], r, R);
}
} else
emit(Ocopy, i->cls, i->to, r, R);
}
/* the environment is received in rax */
if (!req(R, env))
emit(Ocopy, Kl, env, TMP(RAX), R);
return fa | (s*4)<<12;
}
/* create a new block holding the instructions emitted
 * so far, reset the instruction buffer and link the new
 * block right after b; the new block is named after b
 * with a numeric suffix and its jump is left for the
 * caller to fill in
 */
static Blk *
split(Fn *fn, Blk *b)
{
	Blk *nb;

	++fn->nblk;
	nb = newblk();
	idup(nb, curi, &insb[NIns]-curi);
	curi = &insb[NIns];
	/* the visit counter of b numbers its pieces */
	b->visit++;
	nb->visit = b->visit;
	nb->name = strf(PFn, "%s.%d", b->name, b->visit);
	nb->loop = b->loop;
	nb->link = b->link;
	b->link = nb;
	return nb;
}
/* in every phi of block b, replace the predecessor bp
 * by bp1; bp must be a predecessor in each phi
 */
static void
chpred(Blk *b, Blk *bp, Blk *bp1)
{
	Phi *phi;
	uint n;

	for (phi=b->phi; phi; phi=phi->link) {
		n = 0;
		while (phi->blk[n] != bp) {
			assert(n+1<phi->narg);
			n++;
		}
		phi->blk[n] = bp1;
	}
}
/* lower the vaarg instruction i of block b; the block is
 * split in four as pictured below: b tests whether a
 * register slot is left, breg and bstk compute the
 * address of the value in the register save area or in
 * the stack area, and b0 loads it and takes over the
 * original jump of b; everything is emitted in reverse
 * order, so each split() call captures the instructions
 * emitted just before it
 */
static void
selvaarg(Fn *fn, Blk *b, Ins *i)
{
Ref loc, lreg, lstk, nr, r0, r1, c4, c8, c16, c, ap;
Blk *b0, *bstk, *breg;
int isint;
c4 = getcon(4, fn);
c8 = getcon(8, fn);
c16 = getcon(16, fn);
ap = i->arg[0];
isint = KBASE(i->cls) == 0;
/* @b [...]
r0 =l add ap, (0 or 4)
nr =l loadsw r0
r1 =w cultw nr, (48 or 176)
jnz r1, @breg, @bstk
@breg
r0 =l add ap, 16
r1 =l loadl r0
lreg =l add r1, nr
r0 =w add nr, (8 or 16)
r1 =l add ap, (0 or 4)
storew r0, r1
@bstk
r0 =l add ap, 8
lstk =l loadl r0
r1 =l add lstk, 8
storel r1, r0
@b0
%loc =l phi @breg %lreg, @bstk %lstk
i->to =(i->cls) load %loc
*/
/* @b0: the final load, plus whatever was emitted for
 * the instructions following the vaarg */
loc = newtmp("abi", Kl, fn);
emit(Oload, i->cls, i->to, loc, R);
b0 = split(fn, b);
b0->jmp = b->jmp;
b0->s1 = b->s1;
b0->s2 = b->s2;
/* the successors of b are now reached from b0 */
if (b->s1)
chpred(b->s1, b, b0);
if (b->s2 && b->s2 != b->s1)
chpred(b->s2, b, b0);
/* @breg: address in the register save area, then
 * advance the offset stored in the va_list */
lreg = newtmp("abi", Kl, fn);
nr = newtmp("abi", Kl, fn);
r0 = newtmp("abi", Kw, fn);
r1 = newtmp("abi", Kl, fn);
emit(Ostorew, Kw, R, r0, r1);
emit(Oadd, Kl, r1, ap, isint ? CON_Z : c4);
emit(Oadd, Kw, r0, nr, isint ? c8 : c16);
r0 = newtmp("abi", Kl, fn);
r1 = newtmp("abi", Kl, fn);
emit(Oadd, Kl, lreg, r1, nr);
emit(Oload, Kl, r1, r0, R);
emit(Oadd, Kl, r0, ap, c16);
breg = split(fn, b);
breg->jmp.type = Jjmp;
breg->s1 = b0;
/* @bstk: take the current stack location and bump
 * the pointer stored in the va_list by 8 */
lstk = newtmp("abi", Kl, fn);
r0 = newtmp("abi", Kl, fn);
r1 = newtmp("abi", Kl, fn);
emit(Ostorel, Kw, R, r1, r0);
emit(Oadd, Kl, r1, lstk, c8);
emit(Oload, Kl, lstk, r0, R);
emit(Oadd, Kl, r0, ap, c8);
bstk = split(fn, b);
bstk->jmp.type = Jjmp;
bstk->s1 = b0;
/* loc merges the two candidate addresses */
b0->phi = alloc(sizeof *b0->phi);
*b0->phi = (Phi){
.cls = Kl, .to = loc,
.narg = 2,
.blk = vnew(2, sizeof b0->phi->blk[0], PFn),
.arg = vnew(2, sizeof b0->phi->arg[0], PFn),
};
b0->phi->blk[0] = bstk;
b0->phi->blk[1] = breg;
b0->phi->arg[0] = lstk;
b0->phi->arg[1] = lreg;
/* @b: load the current offset and branch on whether
 * it is below the end of the register area (48 for
 * integers, 176 for floats) */
r0 = newtmp("abi", Kl, fn);
r1 = newtmp("abi", Kw, fn);
b->jmp.type = Jjnz;
b->jmp.arg = r1;
b->s1 = breg;
b->s2 = bstk;
c = getcon(isint ? 48 : 176, fn);
emit(Ocmpw+Ciult, Kw, r1, nr, c);
emit(Oloadsw, Kl, nr, r0, R);
emit(Oadd, Kl, r0, ap, isint ? CON_Z : c4);
}
/* lower a vastart: initialize the va_list at address
 * ap from fa, the value returned by selpar(); the four
 * fields are written at offsets 0, 4, 8 and 16, and the
 * stores are emitted in reverse program order
 */
static void
selvastart(Fn *fn, int fa, Ref ap)
{
	Ref addr, val;
	int gpoff, fpoff, stkoff;

	/* offsets of the first unnamed integer and sse
	 * argument in the register area, and offset from
	 * rbp of the first unnamed stack argument */
	gpoff = ((fa >> 4) & 15) * 8;
	fpoff = 48 + ((fa >> 8) & 15) * 16;
	stkoff = fa >> 12;

	/* ap+16: address of the register area, rbp-176 */
	addr = newtmp("abi", Kl, fn);
	val = newtmp("abi", Kl, fn);
	emit(Ostorel, Kw, R, val, addr);
	emit(Oadd, Kl, val, TMP(RBP), getcon(-176, fn));
	emit(Oadd, Kl, addr, ap, getcon(16, fn));

	/* ap+8: address of the next stack argument */
	addr = newtmp("abi", Kl, fn);
	val = newtmp("abi", Kl, fn);
	emit(Ostorel, Kw, R, val, addr);
	emit(Oadd, Kl, val, TMP(RBP), getcon(stkoff, fn));
	emit(Oadd, Kl, addr, ap, getcon(8, fn));

	/* ap+4: sse offset, ap+0: integer offset */
	addr = newtmp("abi", Kl, fn);
	emit(Ostorew, Kw, R, getcon(fpoff, fn), addr);
	emit(Oadd, Kl, addr, ap, getcon(4, fn));
	emit(Ostorew, Kw, R, getcon(gpoff, fn), ap);
}
/* entry point of the sysv abi lowering for fn: the
 * parameters of the start block are lowered first, then
 * calls, returns and vararg instructions of every block
 */
void
amd64_sysv_abi(Fn *fn)
{
Blk *b;
Ins *i, *i0;
RAlloc *ral;
int n0, n1, ioff, fa;
/* visit doubles as a mark for the blocks created by
 * split(), which must not be processed again */
for (b=fn->start; b; b=b->link)
b->visit = 0;
/* lower parameters */
for (b=fn->start, i=b->ins; i<&b->ins[b->nins]; i++)
if (!ispar(i->op))
break;
fa = selpar(fn, b->ins, i);
/* replace the ioff parameter instructions by the n0
 * instructions selpar() left in the buffer, keeping
 * the n1 instructions that follow them */
n0 = &insb[NIns] - curi;
ioff = i - b->ins;
n1 = b->nins - ioff;
vgrow(&b->ins, n0+n1);
icpy(b->ins+n0, b->ins+ioff, n1);
icpy(b->ins, curi, n0);
b->nins = n0+n1;
/* lower calls, returns, and vararg instructions */
ral = 0;
b = fn->start;
do {
if (!(b = b->link))
b = fn->start; /* do it last */
if (b->visit)
continue;
curi = &insb[NIns];
selret(b, fn);
/* walk the block backwards, re-emitting what
 * needs no lowering */
for (i=&b->ins[b->nins]; i!=b->ins;)
switch ((--i)->op) {
default:
emiti(*i);
break;
case Ocall:
/* find the first argument instruction of
 * the call and lower the whole group */
for (i0=i; i0>b->ins; i0--)
if (!isarg((i0-1)->op))
break;
selcall(fn, i0, i, &ral);
i = i0;
break;
case Ovastart:
selvastart(fn, fa, i->arg[0]);
break;
case Ovaarg:
selvaarg(fn, b, i);
break;
case Oarg:
case Oargc:
die("unreachable");
}
/* the start block comes last so that it receives
 * the allocs collected from all the calls */
if (b == fn->start)
for (; ral; ral=ral->link)
emiti(ral->i);
idup(b, curi, &insb[NIns]-curi);
} while (b != fn->start);
if (debug['A']) {
fprintf(stderr, "\n> After ABI lowering:\n");
printfn(fn, stderr);
}
}
+67
View File
@@ -0,0 +1,67 @@
#include "all.h"
/* per-opcode amd64 properties, indexed by opcode; the
 * table is filled by including ops.h with O() expanding
 * to the designator of an entry and X() to its three
 * fields (nmem, zflag, lflag)
 * NOTE(review): this relies on ops.h pairing every O()
 * with one X() -- confirm against ops.h
 */
Amd64Op amd64_op[NOp] = {
#define O(op, t, x) [O##op] =
#define X(nm, zf, lf) { nm, zf, lf, },
#include "../ops.h"
};
static int
amd64_memargs(int op)
{
return amd64_op[op].nmem;
}
/* Target fields shared by all the amd64 targets; meant
 * to be expanded last inside a Target initializer
 */
#define AMD64_COMMON \
.gpr0 = RAX, \
.ngpr = NGPR, \
.fpr0 = XMM0, \
.nfpr = NFPR, \
.rglob = BIT(RBP) | BIT(RSP), \
.nrglob = 2, \
.memargs = amd64_memargs, \
.abi0 = elimsb, \
.isel = amd64_isel, \
.cansel = 1,
/* amd64 target using the sysv abi, with the elf
 * finalizer and ".L" as local symbol prefix */
Target T_amd64_sysv = {
.name = "amd64_sysv",
.emitfin = elf_emitfin,
.asloc = ".L",
.abi1 = amd64_sysv_abi,
.rsave = amd64_sysv_rsave,
.nrsave = {NGPS_SYSV, NFPS},
.retregs = amd64_sysv_retregs,
.argregs = amd64_sysv_argregs,
.emitfn = amd64_sysv_emitfn,
AMD64_COMMON
};
/* amd64 target for apple systems: same abi hooks as the
 * sysv target, with the mach-o finalizer, "L" as local
 * symbol prefix and "_" as global symbol prefix */
Target T_amd64_apple = {
.name = "amd64_apple",
.apple = 1,
.emitfin = macho_emitfin,
.asloc = "L",
.assym = "_",
.abi1 = amd64_sysv_abi,
.rsave = amd64_sysv_rsave,
.nrsave = {NGPS_SYSV, NFPS},
.retregs = amd64_sysv_retregs,
.argregs = amd64_sysv_argregs,
.emitfn = amd64_sysv_emitfn,
AMD64_COMMON
};
/* amd64 target for windows: uses the winabi hooks and
 * register sets, with the pe finalizer and "L" as local
 * symbol prefix */
Target T_amd64_win = {
.name = "amd64_win",
.windows = 1,
.emitfin = pe_emitfin,
.asloc = "L",
.abi1 = amd64_winabi_abi,
.rsave = amd64_winabi_rsave,
.nrsave = {NGPS_WIN, NFPS},
.retregs = amd64_winabi_retregs,
.argregs = amd64_winabi_argregs,
.emitfn = amd64_winabi_emitfn,
AMD64_COMMON
};
Executable
+763
View File
@@ -0,0 +1,763 @@
#include "all.h"
#include <stdbool.h>
// How a single argument or parameter instruction is handled.
typedef enum ArgPassStyle {
// Not classified; this is the value of a zero-initialized ArgClass.
APS_Invalid = 0,
// Value passed directly in a register.
APS_Register,
// Value stored directly in the stack argument area.
APS_InlineOnStack,
// Value copied to a temporary, pointer to the copy passed in a register.
APS_CopyAndPointerInRegister,
// Value copied to a temporary, pointer to the copy passed on the stack.
APS_CopyAndPointerOnStack,
// Marker for the varargs instruction (not a real argument).
APS_VarargsTag,
// Marker for the env argument/parameter, which is passed in RAX.
APS_EnvTag,
} ArgPassStyle;
// Classification of one argument or parameter instruction.
typedef struct ArgClass {
// NOTE(review): not assigned by any code visible in this chunk -- confirm use.
Typ* type;
// How the value is passed.
ArgPassStyle style;
// Set to 3 by classify_arguments (presumably log2 of 8 bytes -- confirm).
int align;
// Size in bytes: 8 for basic types, the type's size for aggregates.
uint size;
// Class (Kw, Kl, Ks, Kd) used to move the value.
int cls;
// NOTE(review): not assigned by any code visible in this chunk -- confirm use.
Ref ref;
} ArgClass;
// Singly-linked list of alloc instructions created while lowering calls
// (return pads and by-copy argument temporaries), collected for later
// emission.
typedef struct ExtraAlloc ExtraAlloc;
struct ExtraAlloc {
Ins instr;
ExtraAlloc* link;
};
// Round n down/up to a multiple of a; a must be a power of two. Note that
// ALIGN_UP evaluates `a` more than once.
#define ALIGN_DOWN(n, a) ((n) & ~((a)-1))
#define ALIGN_UP(n, a) ALIGN_DOWN((n) + (a)-1, (a))
// Number of stack bytes required to be reserved for the callee.
#define SHADOW_SPACE_SIZE 32
// Caller-save registers, terminated by -1. The first four entries are
// indexed directly by register_for_arg to pick integer argument registers,
// so their order matters.
int amd64_winabi_rsave[] = {RCX, RDX, R8, R9, R10, R11, RAX, XMM0,
XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8,
XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, -1};
// Callee-save registers, terminated by -1.
int amd64_winabi_rclob[] = {RBX, R12, R13, R14, R15, RSI, RDI, -1};
// Both arrays must agree with the register counts declared with the
// register enumeration.
MAKESURE(winabi_arrays_ok,
sizeof amd64_winabi_rsave == (NGPS_WIN + NFPS + 1) * sizeof(int) &&
sizeof amd64_winabi_rclob == (NCLR_WIN + 1) * sizeof(int));
// layout of call's second argument (RCall)
//
// bit 0: rax returned
// bit 1: xmm0 returned
// bits 23: 0
// bits 4567: rcx, rdx, r8, r9 passed
// bits 89ab: xmm0,1,2,3 passed
// bit c: env call (rax passed)
// bits d..1f: 0
// Decodes the registers carrying the return value from the RCall reference
// `r` (bit 0: RAX returned, bit 1: XMM0 returned). Returns the set of those
// registers; when `p` is non-null, p[0] and p[1] receive the number of
// integer and float return registers (each 0 or 1).
bits amd64_winabi_retregs(Ref r, int p[2]) {
  assert(rtype(r) == RCall);

  // Extract each flag as 0 or 1 so that it can be used as a register count;
  // masking bit 1 without shifting would report two float registers.
  int num_int_returns = r.val & 1;
  int num_float_returns = (r.val >> 1) & 1;

  // The two flags are independent: only report a register whose bit is set,
  // so that the set always agrees with the counts stored in `p`.
  bits b = 0;
  if (num_int_returns) {
    b |= BIT(RAX);
  }
  if (num_float_returns) {
    b |= BIT(XMM0);
  }
  if (p) {
    p[0] = num_int_returns;
    p[1] = num_float_returns;
  }
  return b;
}
// Returns the number of bits set in `b`.
static uint popcnt(bits b) {
  uint count = 0;
  // Each iteration clears the lowest set bit.
  while (b != 0) {
    b &= b - 1;
    ++count;
  }
  return count;
}
// Decodes the registers used to pass arguments from the RCall reference `r`
// and returns them as a set. When `p` is non-null, p[0] and p[1] receive the
// number of integer and float registers contained in that set.
bits amd64_winabi_argregs(Ref r, int p[2]) {
  assert(rtype(r) == RCall);

  // On SysV, these are counts. Here, a count isn't sufficient, we actually need
  // to know which ones are in use because they're not necessarily contiguous.
  int int_passed = (r.val >> 4) & 15;
  int float_passed = (r.val >> 8) & 15;
  bool env_param = (r.val >> 12) & 1;

  bits b = 0;
  b |= (int_passed & 1) ? BIT(RCX) : 0;
  b |= (int_passed & 2) ? BIT(RDX) : 0;
  b |= (int_passed & 4) ? BIT(R8) : 0;
  b |= (int_passed & 8) ? BIT(R9) : 0;
  b |= (float_passed & 1) ? BIT(XMM0) : 0;
  b |= (float_passed & 2) ? BIT(XMM1) : 0;
  b |= (float_passed & 4) ? BIT(XMM2) : 0;
  b |= (float_passed & 8) ? BIT(XMM3) : 0;
  b |= env_param ? BIT(RAX) : 0;
  if (p) {
    // The counts must match the registers reported in `b`, even though
    // those are not necessarily contiguous. RAX is an integer register, so
    // an env call counts one more, as amd64_sysv_argregs does.
    // TODO: The only place this is used is live.c; confirm that nothing
    // there relies on int/float sharing a single position counter on win.
    p[0] = popcnt(int_passed) + env_param;
    p[1] = popcnt(float_passed);
  }
  return b;
}
// Register usage of one call (or of the parameters of one function),
// accumulated while classifying and later packed into an RCall value.
typedef struct RegisterUsage {
// Counter for both int/float as they're counted together. Only if the bool's
// set in regs_passed is the given register *actually* needed for a value
// (i.e. needs to be saved, etc.).
int num_regs_passed;
// Indexed first by 0=int, 1=float, use KBASE(cls).
// Indexed second by register index in calling convention, so for integer,
// 0=RCX, 1=RDX, 2=R8, 3=R9, and for float XMM0, XMM1, XMM2, XMM3.
bool regs_passed[2][4];
// Which register holds the return value; these become bits 0 and 1 of the
// RCall value.
bool rax_returned;
bool xmm0_returned;
// This is also used as where the va_start will start for varargs functions
// (there's no 'Oparv', so we need to keep track of a count here.)
int num_named_args_passed;
// This is set when classifying the arguments for a call (but not when
// classifying the parameters of a function definition).
bool is_varargs_call;
// Set when an env argument/parameter is present; becomes bit 12 of the
// RCall value.
bool has_env;
} RegisterUsage;
// Packs `reg_usage` into the integer stored in an RCall reference: bits 0-1
// are the returned registers, bits 4-7 the integer argument registers, bits
// 8-11 the float argument registers and bit 12 the env flag.
static int register_usage_to_call_arg_value(RegisterUsage reg_usage) {
  int value = 0;
  if (reg_usage.rax_returned) {
    value |= 1 << 0;
  }
  if (reg_usage.xmm0_returned) {
    value |= 1 << 1;
  }
  // Four bits per class, integer registers first.
  for (int cls = 0; cls < 2; ++cls) {
    for (int idx = 0; idx < 4; ++idx) {
      if (reg_usage.regs_passed[cls][idx]) {
        value |= 1 << (4 + 4 * cls + idx);
      }
    }
  }
  if (reg_usage.has_env) {
    value |= 1 << 12;
  }
  return value;
}
// Decides whether `arg` travels in a register or on the stack. Integer and
// float arguments share one position counter: while fewer than four positions
// are taken, the next one is marked as used in the class selected by
// `is_float`; otherwise the argument goes to the stack. `by_copy` selects the
// copy-and-pointer variant of either style. Every call also counts one more
// named argument.
static void assign_register_or_stack(RegisterUsage* reg_usage,
                                     ArgClass* arg,
                                     bool is_float,
                                     bool by_copy) {
  bool in_register = reg_usage->num_regs_passed != 4;
  if (in_register) {
    int position = reg_usage->num_regs_passed;
    reg_usage->regs_passed[is_float][position] = true;
    reg_usage->num_regs_passed = position + 1;
    arg->style = by_copy ? APS_CopyAndPointerInRegister : APS_Register;
  } else {
    arg->style = by_copy ? APS_CopyAndPointerOnStack : APS_InlineOnStack;
  }
  reg_usage->num_named_args_passed += 1;
}
// Tells whether an aggregate of the given type is passed by copy-and-pointer.
// Only aggregates of exactly 1, 2, 4 or 8 bytes are passed by value; any
// other size (a 5 byte struct for instance) and all opaque types are copied
// and passed by pointer.
static bool type_is_by_copy(Typ* type) {
  if (type->isdark) {
    return true;
  }
  switch (type->size) {
    case 1:
    case 2:
    case 4:
    case 8:
      return false;
    default:
      return true;
  }
}
// Classifies a run of argument or parameter instructions.
// `begin_instr` points at the first Oarg/Opar and `end_instr` one past the
// last (the Ocall for arguments, the first 'real' instruction of the function
// for parameters). One entry of `arg_classes` is filled per instruction,
// `reg_usage` is updated accordingly and `env` receives the env reference
// when there is one.
static void classify_arguments(RegisterUsage* reg_usage,
                               Ins* begin_instr,
                               Ins* end_instr,
                               ArgClass* arg_classes,
                               Ref* env) {
  Ins* instr = begin_instr;
  ArgClass* arg = arg_classes;
  for (; instr < end_instr; ++instr, ++arg) {
    int op = instr->op;
    if (op == Oarg || op == Opar) {
      // Basic type: int or float register, or a stack slot.
      assign_register_or_stack(reg_usage, arg, KBASE(instr->cls),
                               /*by_copy=*/false);
      arg->cls = instr->cls;
      arg->align = 3;
      arg->size = 8;
    } else if (op == Oargc || op == Oparc) {
      // Aggregate: always handled as an integer, either the value itself
      // when small enough or a pointer to a copy.
      Typ* type = &typ[instr->arg[0].val];
      bool by_copy = type_is_by_copy(type);
      assign_register_or_stack(reg_usage, arg, /*is_float=*/false, by_copy);
      arg->cls = (!by_copy && type->size <= 4) ? Kw : Kl;
      arg->align = 3;
      arg->size = type->size;
    } else if (op == Oarge || op == Opare) {
      // The env is an operand for arguments and a result for parameters.
      *env = (op == Oarge) ? instr->arg[0] : instr->to;
      arg->style = APS_EnvTag;
      reg_usage->has_env = true;
    } else if (op == Oargv) {
      reg_usage->is_varargs_call = true;
      arg->style = APS_VarargsTag;
    }
  }

  if (reg_usage->has_env && reg_usage->is_varargs_call) {
    die("can't use env with varargs");
  }

  if (!reg_usage->is_varargs_call) {
    return;
  }
  // During a varargs call, float arguments have to be duplicated to their
  // associated integer register, so mark those as in-use too.
  for (int i = 0; i < 4; ++i) {
    if (reg_usage->regs_passed[/*float*/ 1][i]) {
      reg_usage->regs_passed[/*int*/ 0][i] = true;
    }
  }
}
// Tells whether the basic class `ty` (Kw, Kl, Ks or Kd) is an integer class.
static bool is_integer_type(int ty) {
  assert(ty >= 0 && ty < 4 && "expecting Kw Kl Ks Kd");
  const int base = KBASE(ty);
  return base == 0;
}
// Returns the register used for the argument at position `counter` (0..3):
// the matching entry of amd64_winabi_rsave for integer classes, XMM0 plus
// the position otherwise.
static Ref register_for_arg(int cls, int counter) {
  assert(counter < 4);
  int reg = is_integer_type(cls) ? amd64_winabi_rsave[counter]
                                 : XMM0 + counter;
  return TMP(reg);
}
static Ins* lower_call(Fn* func,
Blk* block,
Ins* call_instr,
ExtraAlloc** pextra_alloc) {
// Call arguments are instructions. Walk through them to find the end of the
// call+args that we need to process (and return the instruction past the body
// of the instruction for continuing processing).
Ins* instr_past_args = call_instr - 1;
for (; instr_past_args >= block->ins; --instr_past_args) {
if (!isarg(instr_past_args->op)) {
break;
}
}
Ins* earliest_arg_instr = instr_past_args + 1;
// Don't need an ArgClass for the call itself, so one less than the total
// number of instructions we're dealing with.
uint num_args = call_instr - earliest_arg_instr;
ArgClass* arg_classes = alloc(num_args * sizeof(ArgClass));
RegisterUsage reg_usage = {0};
ArgClass ret_arg_class = {0};
// Ocall's two arguments are the the function to be called in 0, and, if the
// the function returns a non-basic type, then arg[1] is a reference to the
// type of the return. req checks if Refs are equal; `R` is 0.
bool il_has_struct_return = !req(call_instr->arg[1], R);
bool is_struct_return = false;
if (il_has_struct_return) {
Typ* ret_type = &typ[call_instr->arg[1].val];
is_struct_return = type_is_by_copy(ret_type);
if (is_struct_return) {
assign_register_or_stack(&reg_usage, &ret_arg_class, /*is_float=*/false,
/*by_copy=*/true);
}
ret_arg_class.size = ret_type->size;
}
Ref env = R;
classify_arguments(&reg_usage, earliest_arg_instr, call_instr, arg_classes,
&env);
// We now know which arguments are on the stack and which are in registers, so
// we can allocate the correct amount of space to stash the stack-located ones
// into.
uint stack_usage = 0;
for (uint i = 0; i < num_args; ++i) {
ArgClass* arg = &arg_classes[i];
// stack_usage only accounts for pushes that are for values that don't have
// enough registers. Large struct copies are alloca'd separately, and then
// only have (potentially) 8 bytes to add to stack_usage here.
if (arg->style == APS_InlineOnStack) {
if (arg->align > 4) {
err("win abi cannot pass alignments > 16");
}
stack_usage += arg->size;
} else if (arg->style == APS_CopyAndPointerOnStack) {
stack_usage += 8;
}
}
stack_usage = ALIGN_UP(stack_usage, 16);
// Note that here we're logically 'after' the call (due to emitting
// instructions in reverse order), so we're doing a negative stack
// allocation to clean up after the call.
Ref stack_size_ref =
getcon(-(int64_t)(stack_usage + SHADOW_SPACE_SIZE), func);
emit(Osalloc, Kl, R, stack_size_ref, R);
ExtraAlloc* return_pad = NULL;
if (is_struct_return) {
return_pad = alloc(sizeof(ExtraAlloc));
Ref ret_pad_ref = newtmp("abi.ret_pad", Kl, func);
return_pad->instr =
(Ins){Oalloc8, Kl, ret_pad_ref, {getcon(ret_arg_class.size, func)}};
return_pad->link = (*pextra_alloc);
*pextra_alloc = return_pad;
reg_usage.rax_returned = true;
emit(Ocopy, call_instr->cls, call_instr->to, TMP(RAX), R);
} else {
if (il_has_struct_return) {
// In the case that at the IL level, a struct return was specified, but as
// far as the calling convention is concerned it's not actually by
// pointer, we need to store the return value into an alloca because
// subsequent IL will still be treating the function return as a pointer.
ExtraAlloc* return_copy = alloc(sizeof(ExtraAlloc));
return_copy->instr =
(Ins){Oalloc8, Kl, call_instr->to, {getcon(8, func)}};
return_copy->link = (*pextra_alloc);
*pextra_alloc = return_copy;
Ref copy = newtmp("abi.copy", Kl, func);
emit(Ostorel, 0, R, copy, call_instr->to);
emit(Ocopy, Kl, copy, TMP(RAX), R);
reg_usage.rax_returned = true;
} else if (is_integer_type(call_instr->cls)) {
// Only a basic type returned from the call, integer.
emit(Ocopy, call_instr->cls, call_instr->to, TMP(RAX), R);
reg_usage.rax_returned = true;
} else {
// Basic type, floating point.
emit(Ocopy, call_instr->cls, call_instr->to, TMP(XMM0), R);
reg_usage.xmm0_returned = true;
}
}
// Emit the actual call instruction. There's no 'to' value by this point
// because we've lowered it into register manipulation (that's the `R`),
// arg[0] of the call is the function, and arg[1] is register usage is
// documented as above (copied from SysV).
emit(Ocall, call_instr->cls, R, call_instr->arg[0],
CALL(register_usage_to_call_arg_value(reg_usage)));
if (!req(R, env)) {
// If there's an env arg to be passed, it gets stashed in RAX.
emit(Ocopy, Kl, TMP(RAX), env, R);
}
if (reg_usage.is_varargs_call) {
// Any float arguments need to be duplicated to integer registers. This is
// required by the calling convention so that dumping to shadow space can be
// done without a prototype and for varargs.
#define DUP_IF_USED(index, floatreg, intreg) \
if (reg_usage.regs_passed[/*float*/ 1][index]) { \
emit(Ocast, Kl, TMP(intreg), TMP(floatreg), R); \
}
DUP_IF_USED(0, XMM0, RCX);
DUP_IF_USED(1, XMM1, RDX);
DUP_IF_USED(2, XMM2, R8);
DUP_IF_USED(3, XMM3, R9);
#undef DUP_IF_USED
}
int reg_counter = 0;
if (is_struct_return) {
Ref first_reg = register_for_arg(Kl, reg_counter++);
emit(Ocopy, Kl, first_reg, return_pad->instr.to, R);
}
// This is where we actually do the load of values into registers or into
// stack slots.
Ref arg_stack_slots = newtmp("abi.args", Kl, func);
uint slot_offset = SHADOW_SPACE_SIZE;
ArgClass* arg = arg_classes;
for (Ins* instr = earliest_arg_instr; instr != call_instr; ++instr, ++arg) {
switch (arg->style) {
case APS_Register: {
Ref into = register_for_arg(arg->cls, reg_counter++);
if (instr->op == Oargc) {
// If this is a small struct being passed by value. The value in the
// instruction in this case is a pointer, but it needs to be loaded
// into the register.
emit(Oload, arg->cls, into, instr->arg[1], R);
} else {
// Otherwise, a normal value passed in a register.
emit(Ocopy, instr->cls, into, instr->arg[0], R);
}
break;
}
case APS_InlineOnStack: {
Ref slot = newtmp("abi.off", Kl, func);
if (instr->op == Oargc) {
// This is a small struct, so it's not passed by copy, but the
// instruction is a pointer. So we need to copy it into the stack
// slot. (And, remember that these are emitted backwards, so store,
// then load.)
Ref smalltmp = newtmp("abi.smalltmp", arg->cls, func);
emit(Ostorel, 0, R, smalltmp, slot);
emit(Oload, arg->cls, smalltmp, instr->arg[1], R);
} else {
// Stash the value into the stack slot.
emit(Ostorel, 0, R, instr->arg[0], slot);
}
emit(Oadd, Kl, slot, arg_stack_slots, getcon(slot_offset, func));
slot_offset += arg->size;
break;
}
case APS_CopyAndPointerInRegister:
case APS_CopyAndPointerOnStack: {
// Alloca a space to copy into, and blit the value from the instr to the
// copied location.
ExtraAlloc* arg_copy = alloc(sizeof(ExtraAlloc));
Ref copy_ref = newtmp("abi.copy", Kl, func);
arg_copy->instr =
(Ins){Oalloc8, Kl, copy_ref, {getcon(arg->size, func)}};
arg_copy->link = (*pextra_alloc);
*pextra_alloc = arg_copy;
emit(Oblit1, 0, R, INT(arg->size), R);
emit(Oblit0, 0, R, instr->arg[1], copy_ref);
// Now load the pointer into the correct register or stack slot.
if (arg->style == APS_CopyAndPointerInRegister) {
Ref into = register_for_arg(arg->cls, reg_counter++);
emit(Ocopy, Kl, into, copy_ref, R);
} else {
assert(arg->style == APS_CopyAndPointerOnStack);
Ref slot = newtmp("abi.off", Kl, func);
emit(Ostorel, 0, R, copy_ref, slot);
emit(Oadd, Kl, slot, arg_stack_slots, getcon(slot_offset, func));
slot_offset += 8;
}
break;
}
case APS_EnvTag:
case APS_VarargsTag:
// Nothing to do here, see right before the call for reg dupe.
break;
case APS_Invalid:
die("unreachable");
}
}
if (stack_usage) {
// The last (first in call order) thing we do is allocate the stack
// space we're going to fill with temporaries.
emit(Osalloc, Kl, arg_stack_slots,
getcon(stack_usage + SHADOW_SPACE_SIZE, func), R);
} else {
// When there's no usage for temporaries, we can add this into the other
// alloca, but otherwise emit it separately (not storing into a reference)
// so that it doesn't get removed later for being useless.
emit(Osalloc, Kl, R, getcon(SHADOW_SPACE_SIZE, func), R);
}
return instr_past_args;
}
// Rewrites a value-returning terminator of `block` into a plain `Jret0`,
// emitting the instructions that place the return value where the calling
// convention expects it. Blocks that do not end in a return, or that already
// return nothing, are left untouched.
//
// As with everything that goes through `emit`, instructions are produced in
// reverse order, so read each emit sequence bottom-up for execution order.
static void lower_block_return(Fn* func, Blk* block) {
  const int ret_kind = block->jmp.type;
  if (!isret(ret_kind) || ret_kind == Jret0) {
    return;
  }

  // Remember the returned value and demote the jump to a void return; from
  // here on the value travels through registers (or the caller's buffer).
  Ref value = block->jmp.arg;
  RegisterUsage usage = {0};
  block->jmp.type = Jret0;

  if (ret_kind != Jretc) {
    // Basic type: the class is the distance of the jump kind from Jretw.
    int cls = ret_kind - Jretw;
    if (!is_integer_type(cls)) {
      emit(Ocopy, cls, TMP(XMM0), value, R);
      usage.xmm0_returned = true;
    } else {
      emit(Ocopy, cls, TMP(RAX), value, R);
      usage.rax_returned = true;
    }
  } else {
    // Aggregate return; `value` is a pointer to the data.
    Typ* ret_type = &typ[func->retty];
    if (!type_is_by_copy(ret_type)) {
      // Not returned by copy: load the contents straight into RAX.
      emit(Oload, Kl, TMP(RAX), value, R);
    } else {
      // Returned by copy: blit into the buffer referenced by `func->retr`,
      // then hand that same pointer back in RAX.
      assert(rtype(func->retr) == RTmp);
      emit(Ocopy, Kl, TMP(RAX), func->retr, R);
      emit(Oblit1, 0, R, INT(ret_type->size), R);
      emit(Oblit0, 0, R, value, func->retr);
    }
    usage.rax_returned = true;
  }

  // The jump argument now records which return register is live.
  block->jmp.arg = CALL(register_usage_to_call_arg_value(usage));
}
// Lowers a `vastart`: computes an address relative to RBP and stores it into
// the va_list pointed to by `valist`.
//
// This relies on the following properties of varargs functions:
//   1. the int registers are already dumped to the shadow stack space;
//   2. any parameters passed in floating point registers have been
//      duplicated to the integer registers;
//   3. we ensure (later) that varargs functions always use an rbp frame
//      pointer.
// Hence the `...` arguments start just past the named ones, indexed from rbp.
static void lower_vastart(Fn* func,
                          RegisterUsage* param_reg_usage,
                          Ref valist) {
  assert(func->vararg);

  Ref first_vararg = newtmp("abi.vastart", Kl, func);

  // Emitted backwards: at run time the add below executes before this store.
  emit(Ostorel, 0, R, first_vararg, valist);

  // 8 bytes (sizeof(u64)) per named argument, plus 16 to step over the return
  // address and the saved rbp, both of which have been pushed by the time the
  // function body runs.
  Ref bytes_past_rbp =
      getcon(param_reg_usage->num_named_args_passed * 8 + 16, func);
  emit(Oadd, Kl, first_vararg, TMP(RBP), bytes_past_rbp);
}
// Lowers a `vaarg`. On win x64 a va_list is just a void**, so in execution
// order the generated code:
//   1. loads the current argument pointer out of the va_list,
//   2. loads the argument itself through that pointer,
//   3. bumps the pointer by 8,
//   4. writes the bumped pointer back into the va_list.
// The emit calls below appear in the opposite order because emission runs
// backwards.
static void lower_vaarg(Fn* func, Ins* vaarg_instr) {
  Ref valist = vaarg_instr->arg[0];
  Ref advanced = newtmp("abi.vaarg.inc", Kl, func);
  Ref current = newtmp("abi.vaarg.ptr", Kl, func);

  emit(Ostorel, 0, R, advanced, valist);
  emit(Oadd, Kl, advanced, current, getcon(8, func));
  emit(Oload, vaarg_instr->cls, vaarg_instr->to, current, R);
  emit(Oload, Kl, current, valist, R);
}
// Rewrites the instructions of one block: the return is lowered first, then
// the body is walked from last instruction to first. Calls, `vastart` and
// `vaarg` are replaced by their lowered forms; everything else is copied
// through unchanged. When `block` is the function's start block, the allocas
// queued on `*pextra_alloc` are added as well.
static void lower_args_for_block(Fn* func,
                                 Blk* block,
                                 RegisterUsage* param_reg_usage,
                                 ExtraAlloc** pextra_alloc) {
  // `curi` is the cursor into the global scratch buffer used by emit/emiti.
  // It starts at the end and is pre-decremented for every instruction added.
  curi = &insb[NIns];

  lower_block_return(func, block);

  if (block->nins != 0) {
    Ins* cursor = &block->ins[block->nins - 1];
    while (cursor >= block->ins) {
      if (cursor->op == Ocall) {
        // lower_call consumes the call together with its argument
        // instructions and reports where the walk should resume.
        cursor = lower_call(func, block, cursor, pextra_alloc);
        continue;
      }
      switch (cursor->op) {
        case Ovastart:
          lower_vastart(func, param_reg_usage, cursor->arg[0]);
          break;
        case Ovaarg:
          lower_vaarg(func, cursor);
          break;
        case Oarg:
        case Oargc:
          // Argument instructions are only expected right before a call,
          // where lower_call has already dealt with them.
          die("unreachable");
        default:
          emiti(*cursor);
          break;
      }
      --cursor;
    }
  }

  // The start block is processed last, so by now every other block has
  // registered the allocas it needs; add them here.
  if (block == func->start) {
    for (ExtraAlloc* pending = *pextra_alloc; pending;
         pending = pending->link) {
      emiti(pending->instr);
    }
  }

  // The scratch buffer was filled from its end towards its beginning, so the
  // finished sequence spans [curi, &insb[NIns]). Duplicate it into the block.
  block->nins = &insb[NIns] - curi;
  idup(block, curi, block->nins);
}
// Returns a pointer to the first instruction of `start_block` that is not a
// parameter instruction (per `ispar`), or one past the last instruction when
// the whole block consists of parameters.
static Ins* find_end_of_func_parameters(Blk* start_block) {
  Ins* cursor = start_block->ins;
  Ins* const block_end = &start_block->ins[start_block->nins];
  while (cursor < block_end && ispar(cursor->op)) {
    ++cursor;
  }
  return cursor;
}
// Lowers the incoming parameters of `func`: emits the instructions that move
// each parameter from the register or stack slot it arrives in into the
// temporary named by its parameter instruction, then replaces the leading
// parameter instructions of the start block with that generated sequence.
//
// Side effects visible in this function: sets `func->retr` when the return
// type is returned by copy, sets `func->reg`, and replaces
// `func->start->ins` / `func->start->nins`.
//
// Returns the RegisterUsage accumulated while classifying the parameters.
static RegisterUsage lower_func_parameters(Fn* func) {
// This is half-open, so end points after the last Opar.
Blk* start_block = func->start;
Ins* start_of_params = start_block->ins;
Ins* end_of_params = find_end_of_func_parameters(start_block);
size_t num_params = end_of_params - start_of_params;
// One classification record per parameter instruction.
ArgClass* arg_classes = alloc(num_params * sizeof(ArgClass));
// Only used as the output slot for assign_register_or_stack below; it is not
// read again in this function.
ArgClass arg_ret = {0};
// global temporary buffer used by emit. Reset to the end, and predecremented
// when adding to it.
curi = &insb[NIns];
// Index of the next argument register to hand out via register_for_arg.
int reg_counter = 0;
RegisterUsage reg_usage = {0};
if (func->retty >= 0) {
bool by_copy = type_is_by_copy(&typ[func->retty]);
if (by_copy) {
// The return type is returned by copy: account for it in reg_usage, take
// the value arriving in RCX as the destination pointer (kept in
// func->retr), and skip one argument register for the real parameters.
assign_register_or_stack(&reg_usage, &arg_ret, /*is_float=*/false,
by_copy);
Ref ret_ref = newtmp("abi.ret", Kl, func);
emit(Ocopy, Kl, ret_ref, TMP(RCX), R);
func->retr = ret_ref;
++reg_counter;
}
}
// NOTE(review): classify_arguments is defined outside this view; presumably
// it fills arg_classes[] (style/cls/size per parameter) and sets `env` when
// an env parameter is present - confirm against its definition.
Ref env = R;
classify_arguments(&reg_usage, start_of_params, end_of_params, arg_classes,
&env);
func->reg = amd64_winabi_argregs(
CALL(register_usage_to_call_arg_value(reg_usage)), NULL);
// Copy from the registers or stack slots into the named parameters. Depending
// on how they're passed, they either need to be copied or loaded.
ArgClass* arg = arg_classes;
// NOTE(review): slot_offset looks like it is counted in 4-byte units (it
// advances by 2 per stack parameter), with the initial value skipping the
// shadow space plus 4 more units - presumably the return address and saved
// rbp. Confirm against the SLOT handling in the emitter.
uint slot_offset = SHADOW_SPACE_SIZE / 4 + 4;
for (Ins* instr = start_of_params; instr < end_of_params; ++instr, ++arg) {
switch (arg->style) {
case APS_Register: {
Ref from = register_for_arg(arg->cls, reg_counter++);
// If it's a struct at the IL level, we need to copy the register into
// an alloca so we have something to point at (same for InlineOnStack).
if (instr->op == Oparc) {
// Emitted backwards; execution order is alloc, copy, store.
arg->ref = newtmp("abi", Kl, func);
emit(Ostorel, 0, R, arg->ref, instr->to);
emit(Ocopy, instr->cls, arg->ref, from, R);
emit(Oalloc8, Kl, instr->to, getcon(arg->size, func), R);
} else {
emit(Ocopy, instr->cls, instr->to, from, R);
}
break;
}
case APS_InlineOnStack:
if (instr->op == Oparc) {
// Same alloc/copy/store sequence as the register case, but the value
// comes from the incoming stack slot.
arg->ref = newtmp("abi", Kl, func);
emit(Ostorel, 0, R, arg->ref, instr->to);
emit(Ocopy, instr->cls, arg->ref, SLOT(-slot_offset), R);
emit(Oalloc8, Kl, instr->to, getcon(arg->size, func), R);
} else {
emit(Ocopy, Kl, instr->to, SLOT(-slot_offset), R);
}
slot_offset += 2;
break;
case APS_CopyAndPointerOnStack:
// The stack slot holds a pointer to the copy; load it into the parameter.
emit(Oload, Kl, instr->to, SLOT(-slot_offset), R);
slot_offset += 2;
break;
case APS_CopyAndPointerInRegister: {
// Because this has to be a copy (that we own), it is sufficient to just
// copy the register to the target.
Ref from = register_for_arg(Kl, reg_counter++);
emit(Ocopy, Kl, instr->to, from, R);
break;
}
case APS_EnvTag:
// Nothing to emit here; the env value is handled after the loop.
break;
case APS_VarargsTag:
case APS_Invalid:
die("unreachable");
}
}
// If there was an `env`, it was passed in RAX, so copy it into the env ref.
if (!req(R, env)) {
emit(Ocopy, Kl, env, TMP(RAX), R);
}
// Build the new start block body: the instructions generated above (which
// occupy [curi, &insb[NIns]) in the scratch buffer) followed by the original
// instructions that came after the parameters.
int num_created_instrs = &insb[NIns] - curi;
int num_other_after_instrs = (int)(start_block->nins - num_params);
int new_total_instrs = num_other_after_instrs + num_created_instrs;
Ins* new_instrs = vnew(new_total_instrs, sizeof(Ins), PFn);
// NOTE(review): relies on icpy returning the position just past what it
// copied, so the second copy appends - confirm against icpy's definition.
Ins* instr_p = icpy(new_instrs, curi, num_created_instrs);
icpy(instr_p, end_of_params, num_other_after_instrs);
start_block->nins = new_total_instrs;
start_block->ins = new_instrs;
return reg_usage;
}
// Lowers the generic instructions of `func` to the win x64 calling
// convention: how incoming parameters are interpreted, how arguments are
// passed, and how returns and varargs are handled. A useful reference is
// https://learn.microsoft.com/en-us/cpp/build/x64-calling-convention .
//
// Notable differences from the SysV lowering (non-exhaustive), for anyone
// comparing the two:
// - Only 4 int and 4 float registers are used for arguments.
// - Int and float argument registers share a single position counter:
//   assigning a value to an int register leaves the associated float
//   register unused, and vice versa.
// - Structs whose size isn't 1/2/4/8 bytes are passed by pointer rather than
//   copied into the stack. E.g. passing `struct { void*, int64_t }` by value
//   first copies it into another alloca (to keep value semantics at the
//   language level); the pointer to that copy is then treated as a regular
//   integer argument, which may itself end up on the stack when no integer
//   register remains.
// - When calling a varargs function, floating point values must also be
//   duplicated into integer registers. Together with the restrictions above,
//   this makes varargs handling simpler for the callee than on SysV.
void amd64_winabi_abi(Fn* func) {
  // Step one: lower the parameters coming into this function.
  RegisterUsage incoming_usage = lower_func_parameters(func);

  // Step two: visit every block, rewriting returns, calls and varargs
  // instructions into their win x64 forms; all other instructions pass
  // through unchanged via `emiti`.
  //
  // The entry block is deliberately handled after all the others so that
  // stack allocas requested along the way (copies for structs passed or
  // returned by value) can be added to it.
  ExtraAlloc* pending_allocs = NULL;
  Blk* blk = func->start->link;
  while (blk) {
    lower_args_for_block(func, blk, &incoming_usage, &pending_allocs);
    blk = blk->link;
  }
  lower_args_for_block(func, func->start, &incoming_usage, &pending_allocs);

  if (debug['A']) {
    fprintf(stderr, "\n> After ABI lowering:\n");
    printfn(func, stderr);
  }
}