nixify

2026-06-11 10:59:54 -06:00
commit 8650a71f67
159 changed files with 78653 additions and 0 deletions
@@ -0,0 +1,82 @@
+#include "../all.h"
+
+typedef struct Amd64Op Amd64Op;
+/* Machine registers of the amd64 target, numbered from RXX+1.
+ * The integer registers come first, grouped by save class,
+ * then RBP/RSP, then the sse registers.  The N* constants at
+ * the end are register counts derived from that ordering.
+ */
+enum Amd64Reg {
+	RAX = RXX+1, /* caller-save */
+	RCX,  /* caller-save */
+	RDX,  /* caller-save */
+	RSI,  /* caller-save on sysv, callee-save on win */
+	RDI,  /* caller-save on sysv, callee-save on win */
+	R8,  /* caller-save */
+	R9,  /* caller-save */
+	R10,  /* caller-save */
+	R11,  /* caller-save */
+
+	RBX, /* callee-save */
+	R12,
+	R13,
+	R14,
+	R15,
+
+	RBP, /* globally live */
+	RSP,
+
+	XMM0, /* sse */
+	XMM1,
+	XMM2,
+	XMM3,
+	XMM4,
+	XMM5,
+	XMM6,
+	XMM7,
+	XMM8,
+	XMM9,
+	XMM10,
+	XMM11,
+	XMM12,
+	XMM13,
+	XMM14,
+	XMM15,
+
+	NFPR = XMM14 - XMM0 + 1, /* reserve XMM15 */
+	NGPR = RSP - RAX + 1,
+	NFPS = NFPR,
+
+	NGPS_SYSV = R11 - RAX + 1,
+	NCLR_SYSV = R15 - RBX + 1,
+
+	NGPS_WIN = R11 - RAX + 1 - 2,  /* -2 for RSI/RDI */
+	NCLR_WIN = R15 - RBX + 1 + 2,  /* +2 for RSI/RDI */
+};
+MAKESURE(reg_not_tmp, XMM15 < (int)Tmp0); /* register numbers must stay below Tmp0 */
+/* Per-opcode data of the amd64 target; amd64_op[] holds one
+ * entry per opcode.
+ */
+struct Amd64Op {
+	char nmem;  /* NOTE(review): presumably the number of memory operands allowed -- confirm in targ.c */
+	char zflag; /* tested by flagi() in isel.c: the backwards scan stops at and returns such an instruction */
+	char lflag; /* tested by flagi() in isel.c: the backwards scan steps over such an instruction */
+};
+
+/* targ.c */
+extern Amd64Op amd64_op[]; /* indexed by opcode */
+
+/* sysv.c (abi) */
+extern int amd64_sysv_rsave[]; /* emit.c stores entries 0-5 in the frame of vararg functions */
+extern int amd64_sysv_rclob[]; /* NCLR_SYSV registers; emit.c pushes/pops those set in fn->reg */
+bits amd64_sysv_retregs(Ref, int[2]);
+bits amd64_sysv_argregs(Ref, int[2]);
+void amd64_sysv_abi(Fn *);
+
+/* winabi.c */
+extern int amd64_winabi_rsave[];
+extern int amd64_winabi_rclob[]; /* NCLR_WIN registers; emit.c pushes/pops those set in fn->reg */
+bits amd64_winabi_retregs(Ref, int[2]);
+bits amd64_winabi_argregs(Ref, int[2]);
+void amd64_winabi_abi(Fn *);
+
+/* isel.c */
+void amd64_isel(Fn *);
+
+/* emit.c */
+void amd64_sysv_emitfn(Fn *, FILE *);   /* writes the assembly of one function (sysv abi) */
+void amd64_winabi_emitfn(Fn *, FILE *); /* writes the assembly of one function (windows abi) */
@@ -0,0 +1,881 @@
+#include "all.h"
+
+
+typedef struct E E;
+/* Emission state shared by the helpers of this file; one is
+ * built for each function by the *_emitfn() entry points.
+ */
+struct E {
+	FILE *f;      /* assembly output stream */
+	Fn *fn;       /* function being emitted */
+	int fp;       /* register that slots are addressed from: RBP or RSP */
+	uint64_t fsz; /* frame size in bytes, computed by *_framesz() */
+	int nclob;    /* number of callee-save registers pushed by the prologue */
+};
+/* X-macro over the comparison codes.  Each X(c, s0, s1) entry
+ * gives the code c, the x86 condition suffix s0 used for it,
+ * and the suffix s1 of the opposite flag condition (used for
+ * negated jumps in *_emitfn() and for the second cmov[] form).
+ * The "?" entries are placeholders.  NOTE(review): cmpswap()
+ * in isel.c always swaps Cfle/Cflt, so presumably these two
+ * never reach the emitter -- confirm.
+ */
+#define CMP(X) \
+	X(Ciule,      "be", "a") \
+	X(Ciult,      "b", "ae") \
+	X(Cisle,      "le", "g") \
+	X(Cislt,      "l", "ge") \
+	X(Cisgt,      "g", "le") \
+	X(Cisge,      "ge", "l") \
+	X(Ciugt,      "a", "be") \
+	X(Ciuge,      "ae", "b") \
+	X(Cieq,       "z", "nz") \
+	X(Cine,       "nz", "z") \
+	X(NCmpI+Cfle, "?" , "?") \
+	X(NCmpI+Cflt, "?",  "?") \
+	X(NCmpI+Cfgt, "a", "be") \
+	X(NCmpI+Cfge, "ae", "b") \
+	X(NCmpI+Cfo,  "np", "p") \
+	X(NCmpI+Cfuo, "p", "np")
+/* SLong..SByte are operand sizes; they index the columns of
+ * rname[].  Ki and Ka are class wildcards for the cls field
+ * of omap[].
+ */
+enum {
+	SLong = 0,
+	SWord = 1,
+	SShort = 2,
+	SByte = 3,
+
+	Ki = -1, /* matches Kw and Kl */
+	Ka = -2, /* matches all classes */
+};
+
+/* Instruction format strings:
+ *
+ * if the format string starts with -, the instruction
+ * is assumed to be 3-address and is put in 2-address
+ * mode using an extra mov if necessary
+ *
+ * if the format string starts with +, the same as the
+ * above applies, but commutativity is also assumed
+ *
+ * %k  is used to set the class of the instruction,
+ *     it'll expand to "l", "q", "ss", "sd", depending
+ *     on the instruction class
+ * %0  designates the first argument
+ * %1  designates the second argument
+ * %=  designates the result
+ *
+ * if %k is not used, a prefix to 0, 1, or = must be
+ * added, it can be:
+ *   M - memory reference
+ *   L - long  (64 bits)
+ *   W - word  (32 bits)
+ *   H - short (16 bits)
+ *   B - byte  (8 bits)
+ *   S - single precision float
+ *   D - double precision float
+ *
+ * omap[] is searched linearly by emitins(): an entry
+ * matches when its op is the opcode of the instruction
+ * and its cls is the class of the instruction, Ki for
+ * an integer class, or Ka.  The first match is used, so
+ * the order of entries sharing an op matters; the
+ * { NOp, 0, 0 } entry terminates the table.
+ */
+static struct {
+	short op;
+	short cls;
+	char *fmt;
+} omap[] = {
+	{ Oadd,     Ka, "+add%k %1, %=" },
+	{ Osub,     Ka, "-sub%k %1, %=" },
+	{ Oand,     Ki, "+and%k %1, %=" },
+	{ Oor,      Ki, "+or%k %1, %=" },
+	{ Oxor,     Ki, "+xor%k %1, %=" },
+	{ Osar,     Ki, "-sar%k %B1, %=" },
+	{ Oshr,     Ki, "-shr%k %B1, %=" },
+	{ Oshl,     Ki, "-shl%k %B1, %=" },
+	{ Omul,     Ki, "+imul%k %1, %=" },
+	{ Omul,     Ks, "+mulss %1, %=" },
+	{ Omul,     Kd, "+mulsd %1, %=" },
+	{ Odiv,     Ka, "-div%k %1, %=" },
+	{ Ostorel,  Ka, "movq %L0, %M1" },
+	{ Ostorew,  Ka, "movl %W0, %M1" },
+	{ Ostoreh,  Ka, "movw %H0, %M1" },
+	{ Ostoreb,  Ka, "movb %B0, %M1" },
+	{ Ostores,  Ka, "movss %S0, %M1" },
+	{ Ostored,  Ka, "movsd %D0, %M1" },
+	{ Oload,    Ka, "mov%k %M0, %=" },
+	{ Oloadsw,  Kl, "movslq %M0, %L=" },
+	{ Oloadsw,  Kw, "movl %M0, %W=" },
+	{ Oloaduw,  Ki, "movl %M0, %W=" },
+	{ Oloadsh,  Ki, "movsw%k %M0, %=" },
+	{ Oloaduh,  Ki, "movzw%k %M0, %=" },
+	{ Oloadsb,  Ki, "movsb%k %M0, %=" },
+	{ Oloadub,  Ki, "movzb%k %M0, %=" },
+	{ Oextsw,   Kl, "movslq %W0, %L=" },
+	{ Oextuw,   Kl, "movl %W0, %W=" },
+	{ Oextsh,   Ki, "movsw%k %H0, %=" },
+	{ Oextuh,   Ki, "movzw%k %H0, %=" },
+	{ Oextsb,   Ki, "movsb%k %B0, %=" },
+	{ Oextub,   Ki, "movzb%k %B0, %=" },
+
+	{ Oexts,    Kd, "cvtss2sd %0, %=" },
+	{ Otruncd,  Ks, "cvtsd2ss %0, %=" },
+	{ Ostosi,   Ki, "cvttss2si%k %0, %=" },
+	{ Odtosi,   Ki, "cvttsd2si%k %0, %=" },
+	{ Oswtof,   Ka, "cvtsi2%k %W0, %=" },
+	{ Osltof,   Ka, "cvtsi2%k %L0, %=" },
+	{ Ocast,    Ki, "movq %D0, %L=" },
+	{ Ocast,    Ka, "movq %L0, %D=" },
+
+	{ Oaddr,    Ki, "lea%k %M0, %=" },
+	{ Oswap,    Ki, "xchg%k %0, %1" },
+	{ Osign,    Kl, "cqto" },
+	{ Osign,    Kw, "cltd" },
+	{ Oxdiv,    Ki, "div%k %0" },
+	{ Oxidiv,   Ki, "idiv%k %0" },
+	{ Oxcmp,    Ks, "ucomiss %S0, %S1" },
+	{ Oxcmp,    Kd, "ucomisd %D0, %D1" },
+	{ Oxcmp,    Ki, "cmp%k %0, %1" },
+	{ Oxtest,   Ki, "test%k %0, %1" },
+#define X(c, s, _) \
+	{ Oflag+c,  Ki, "set" s " %B=\n\tmovzb%k %B=, %=" },
+	CMP(X)
+#undef X
+	{ Oflagfeq, Ki, "setz %B=\n\tmovzb%k %B=, %=" },
+	{ Oflagfne, Ki, "setnz %B=\n\tmovzb%k %B=, %=" },
+	{ NOp, 0, 0 }
+};
+/* Conditional-move formats indexed by comparison code, used
+ * by emitins() for the xsel opcodes: form [0] moves %0 into
+ * the result under the condition suffix s0 of CMP, form [1]
+ * moves %1 under the opposite suffix s1.
+ */
+static char cmov[][2][16] = {
+#define X(c, s0, s1) \
+	[c] = { \
+		"cmov" s0 " %0, %=", \
+		"cmov" s1 " %1, %=", \
+	},
+	CMP(X)
+#undef X
+};
+/* Names of the integer registers, indexed by register number
+ * and then by operand size (SLong, SWord, SShort, SByte).
+ * The sse registers are not listed, regtoa() formats them.
+ */
+static char *rname[][4] = {
+	[RAX] = {"rax", "eax", "ax", "al"},
+	[RBX] = {"rbx", "ebx", "bx", "bl"},
+	[RCX] = {"rcx", "ecx", "cx", "cl"},
+	[RDX] = {"rdx", "edx", "dx", "dl"},
+	[RSI] = {"rsi", "esi", "si", "sil"},
+	[RDI] = {"rdi", "edi", "di", "dil"},
+	[RBP] = {"rbp", "ebp", "bp", "bpl"},
+	[RSP] = {"rsp", "esp", "sp", "spl"},
+	[R8 ] = {"r8" , "r8d", "r8w", "r8b"},
+	[R9 ] = {"r9" , "r9d", "r9w", "r9b"},
+	[R10] = {"r10", "r10d", "r10w", "r10b"},
+	[R11] = {"r11", "r11d", "r11w", "r11b"},
+	[R12] = {"r12", "r12d", "r12w", "r12b"},
+	[R13] = {"r13", "r13d", "r13w", "r13b"},
+	[R14] = {"r14", "r14d", "r14w", "r14b"},
+	[R15] = {"r15", "r15d", "r15w", "r15b"},
+};
+
+
+/* Return the byte displacement, relative to the frame register
+ * e->fp, of the stack slot designated by r.  Negative slot
+ * numbers are addressed upwards from the frame register; the
+ * others are addressed upwards from rsp when the function has
+ * no frame pointer, or downwards from rbp otherwise.
+ */
+static int
+slot(Ref r, E *e)
+{
+	int idx, off;
+
+	idx = rsval(r);
+	assert(idx <= e->fn->slot);
+	/* specific to NAlign == 3 */
+	if (idx < 0) {
+		if (e->fp != RSP)
+			return 4*-idx;
+		/* skip the frame and the pushed registers */
+		return 4*-idx - 8 + e->fsz + e->nclob*8;
+	}
+	if (e->fp == RSP)
+		return 4*idx + e->nclob*8;
+	off = -4 * (e->fn->slot - idx);
+	if (e->fn->vararg && !T.windows) {
+		/* locals sit below the 176 bytes that the sysv
+		 * prologue fills with argument registers */
+		return -176 + off;
+	}
+	return off;
+}
+
+/* Print the constant con to e->f: a decimal number for CBits,
+ * a symbol (with the %fs: and @tpoff decoration for thread-local
+ * symbols) followed by its signed offset, if any, for CAddr.
+ */
+static void
+emitcon(Con *con, E *e)
+{
+	char *name, *pfx;
+
+	if (con->type == CBits) {
+		fprintf(e->f, "%"PRId64, con->bits.i);
+		return;
+	}
+	if (con->type != CAddr) {
+		die("unreachable");
+		return;
+	}
+	name = str(con->sym.id);
+	/* quoted names are printed verbatim, without prefix */
+	pfx = name[0] == '"' ? "" : T.assym;
+	if (con->sym.type != SThr) {
+		assert((con->sym.type & ~SExt) == SGlo);
+		fprintf(e->f, "%s%s", pfx, name);
+	} else {
+		assert(!T.apple);
+		fprintf(e->f, "%%fs:%s%s@tpoff", pfx, name);
+	}
+	if (con->bits.i)
+		fprintf(e->f, "%+"PRId64, con->bits.i);
+}
+
+/* Return the assembly name of register reg for operand size
+ * sz.  Integer registers come from rname[]; the name of an sse
+ * register is formatted into a static buffer that the next
+ * call for an sse register overwrites (sz is ignored for them).
+ */
+static char *
+regtoa(int reg, int sz)
+{
+	static char xmm[6];
+
+	assert(reg <= XMM15);
+	if (reg < XMM0)
+		return rname[reg][sz];
+	sprintf(xmm, "xmm%d", reg-XMM0);
+	return xmm;
+}
+
+/* Map an operand letter of a format string to the operand of
+ * i that it designates: '0' and '1' are the arguments, '=' is
+ * the result.  Any other letter is a fatal error.
+ */
+static Ref
+getarg(char c, Ins *i)
+{
+	if (c == '0')
+		return i->arg[0];
+	if (c == '1')
+		return i->arg[1];
+	if (c == '=')
+		return i->to;
+	die("invalid arg letter %c", c);
+}
+
+static void emitins(Ins, E *);
+
+/* Emit a copy of r2 into r1 at class k, by handing a
+ * synthesized Ocopy instruction to emitins().
+ */
+static void
+emitcopy(Ref r1, Ref r2, int k, E *e)
+{
+	Ins cp;
+
+	cp.cls = k;
+	cp.to = r1;
+	cp.arg[0] = r2;
+	cp.op = Ocopy;
+	emitins(cp, e);
+}
+/* Print one assembly line to e->f from the format string s
+ * (the syntax is described above omap[]), taking operands
+ * from i.  For formats starting with '+' or '-', i may be
+ * modified (arguments exchanged) and a copy of the first
+ * argument into the result is emitted first so that the
+ * 2-address form can be used.  Memory operands based on a
+ * slot are rewritten in fn->mem the first time they are
+ * printed.
+ */
+static void
+emitf(char *s, Ins *i, E *e)
+{
+	static char clstoa[][3] = {"l", "q", "ss", "sd"};
+	char c;
+	int sz;
+	Ref ref;
+	Mem *m;
+	Con off;
+
+	switch (*s) {
+	case '+':
+		if (req(i->arg[1], i->to)) {
+			ref = i->arg[0];
+			i->arg[0] = i->arg[1];
+			i->arg[1] = ref;
+		}
+		/* fall through */
+	case '-':
+		assert((!req(i->arg[1], i->to) || req(i->arg[0], i->to)) &&
+			"cannot convert to 2-address");
+		emitcopy(i->to, i->arg[0], i->cls, e);
+		s++;
+		break;
+	}
+
+	fputc('\t', e->f);
+Next:
+	while ((c = *s++) != '%') /* copy literal text up to the next directive */
+		if (!c) {
+			fputc('\n', e->f);
+			return;
+		} else
+			fputc(c, e->f);
+	switch ((c = *s++)) {
+	case '%':
+		fputc('%', e->f);
+		break;
+	case 'k':
+		fputs(clstoa[i->cls], e->f);
+		break;
+	case '0':
+	case '1':
+	case '=':
+		sz = KWIDE(i->cls) ? SLong : SWord;
+		s--; /* no size prefix: let Ref re-read the operand letter */
+		goto Ref;
+	case 'D':
+	case 'S':
+		sz = SLong; /* does not matter for floats */
+	Ref:
+		c = *s++;
+		ref = getarg(c, i);
+		switch (rtype(ref)) {
+		case RTmp:
+			assert(isreg(ref));
+			fprintf(e->f, "%%%s", regtoa(ref.val, sz));
+			break;
+		case RSlot:
+			fprintf(e->f, "%d(%%%s)",
+				slot(ref, e),
+				regtoa(e->fp, SLong)
+			);
+			break;
+		case RMem:
+		Mem:
+			m = &e->fn->mem[ref.val];
+			if (rtype(m->base) == RSlot) { /* fold the slot into offset(frame register) */
+				off.type = CBits;
+				off.bits.i = slot(m->base, e);
+				addcon(&m->offset, &off, 1);
+				m->base = TMP(e->fp);
+			}
+			if (m->offset.type != CUndef)
+				emitcon(&m->offset, e);
+			fputc('(', e->f);
+			if (!req(m->base, R))
+				fprintf(e->f, "%%%s",
+					regtoa(m->base.val, SLong)
+				);
+			else if (m->offset.type == CAddr)
+				fprintf(e->f, "%%rip");
+			if (!req(m->index, R))
+				fprintf(e->f, ", %%%s, %d",
+					regtoa(m->index.val, SLong),
+					m->scale
+				);
+			fputc(')', e->f);
+			break;
+		case RCon:
+			fputc('$', e->f);
+			emitcon(&e->fn->con[ref.val], e);
+			break;
+		default:
+			die("unreachable");
+		}
+		break;
+	case 'L':
+		sz = SLong;
+		goto Ref;
+	case 'W':
+		sz = SWord;
+		goto Ref;
+	case 'H':
+		sz = SShort;
+		goto Ref;
+	case 'B':
+		sz = SByte;
+		goto Ref;
+	case 'M': /* the operand is used as an address */
+		c = *s++;
+		ref = getarg(c, i);
+		switch (rtype(ref)) {
+		case RMem:
+			goto Mem;
+		case RSlot:
+			fprintf(e->f, "%d(%%%s)",
+				slot(ref, e),
+				regtoa(e->fp, SLong)
+			);
+			break;
+		case RCon:
+			off = e->fn->con[ref.val];
+			emitcon(&off, e);
+			if (off.type == CAddr)
+			if (off.sym.type != SThr) /* thread-local symbols are printed %fs-relative by emitcon() */
+				fprintf(e->f, "(%%rip)");
+			break;
+		case RTmp:
+			assert(isreg(ref));
+			fprintf(e->f, "(%%%s)", regtoa(ref.val, SLong));
+			break;
+		default:
+			die("unreachable");
+		}
+		break;
+	default:
+		die("invalid format specifier %%%c", c);
+	}
+	goto Next;
+}
+/* Sign-bit masks indexed by floating-point class; emitins()
+ * stashes them with stashbits() and xors them into a register
+ * to implement Oneg on floats.
+ */
+static bits negmask[4] = {
+	[Ks] = 0x80000000,
+	[Kd] = 0x8000000000000000,
+};
+/* Emit the assembly for one instruction.  Most opcodes are
+ * looked up in omap[] and printed with emitf(); the cases of
+ * the switch below handle opcodes that need several machine
+ * instructions or a syntax the table cannot express.  i is
+ * received by value and is freely modified.
+ */
+static void
+emitins(Ins i, E *e)
+{
+	Ref r;
+	int64_t val;
+	int o, t0;
+	Ins ineg;
+	Con *con;
+	char *sym;
+
+	switch (i.op) {
+	default:
+		if (isxsel(i.op))
+			goto case_Oxsel;
+	Table:
+		/* most instructions are just pulled out of
+		 * the table omap[], some special cases are
+		 * detailed below */
+		for (o=0;; o++) {
+			/* this linear search should really be a binary
+			 * search */
+			if (omap[o].op == NOp)
+				die("no match for %s(%c)",
+					optab[i.op].name, "wlsd"[i.cls]);
+			if (omap[o].op == i.op)
+			if (omap[o].cls == i.cls
+			|| (omap[o].cls == Ki && KBASE(i.cls) == 0)
+			|| (omap[o].cls == Ka))
+				break;
+		}
+		emitf(omap[o].fmt, &i, e);
+		break;
+	case Onop:
+		/* just do nothing for nops, they are inserted
+		 * by some passes */
+		break;
+	case Omul:
+		/* here, we try to use the 3-address form
+		 * of multiplication when possible */
+		if (rtype(i.arg[1]) == RCon) {
+			r = i.arg[0];
+			i.arg[0] = i.arg[1];
+			i.arg[1] = r;
+		}
+		if (KBASE(i.cls) == 0 /* only available for ints */
+		&& rtype(i.arg[0]) == RCon
+		&& rtype(i.arg[1]) == RTmp) {
+			emitf("imul%k %0, %1, %=", &i, e);
+			break;
+		}
+		goto Table;
+	case Osub:
+		/* we have to use the negation trick to handle
+		 * some 3-address subtractions */
+		if (req(i.to, i.arg[1]) && !req(i.arg[0], i.to)) {
+			ineg = (Ins){Oneg, i.cls, i.to, {i.to}};
+			emitins(ineg, e);
+			emitf("add%k %0, %=", &i, e);
+			break;
+		}
+		goto Table;
+	case Oneg:
+		if (!req(i.to, i.arg[0]))
+			emitf("mov%k %0, %=", &i, e);
+		if (KBASE(i.cls) == 0)
+			emitf("neg%k %=", &i, e);
+		else
+			fprintf(e->f,
+				"\txorp%c %sfp%d(%%rip), %%%s\n",
+				"xxsd"[i.cls],
+				T.asloc,
+				stashbits(negmask[i.cls], 16),
+				regtoa(i.to.val, SLong)
+			);
+		break;
+	case Odiv:
+		/* use xmm15 to adjust the instruction when the
+		 * conversion to 2-address in emitf() would fail */
+		if (req(i.to, i.arg[1])) {
+			i.arg[1] = TMP(XMM0+15);
+			emitf("mov%k %=, %1", &i, e);
+			emitf("mov%k %0, %=", &i, e);
+			i.arg[0] = i.to;
+		}
+		goto Table;
+	case Ocopy:
+		/* copies are used for many things; see my note
+		 * to understand how to load big constants:
+		 * https://c9x.me/notes/2015-09-19.html */
+		assert(rtype(i.to) != RMem);
+		if (req(i.to, R) || req(i.arg[0], R))
+			break;
+		if (req(i.to, i.arg[0]))
+			break;
+		t0 = rtype(i.arg[0]);
+		if (i.cls == Kl
+		&& t0 == RCon
+		&& e->fn->con[i.arg[0].val].type == CBits) {
+			val = e->fn->con[i.arg[0].val].bits.i;
+			if (isreg(i.to))
+			if (val >= 0 && val <= UINT32_MAX) {
+				emitf("movl %W0, %W=", &i, e);
+				break;
+			}
+			if (rtype(i.to) == RSlot)
+			if (val < INT32_MIN || val > INT32_MAX) { /* store the two 32-bit halves separately */
+				emitf("movl %0, %=", &i, e);
+				emitf("movl %0>>32, 4+%=", &i, e);
+				break;
+			}
+		}
+		if (isreg(i.to)
+		&& t0 == RCon
+		&& e->fn->con[i.arg[0].val].type == CAddr) {
+			emitf("lea%k %M0, %=", &i, e);
+			break;
+		}
+		if (rtype(i.to) == RSlot
+		&& (t0 == RSlot || t0 == RMem)) { /* memory to memory: go through xmm15 */
+			i.cls = KWIDE(i.cls) ? Kd : Ks;
+			i.arg[1] = TMP(XMM0+15);
+			emitf("mov%k %0, %1", &i, e);
+			emitf("mov%k %1, %=", &i, e);
+			break;
+		}
+		/* conveniently, the assembler knows if it
+		 * should use movabsq when reading movq */
+		emitf("mov%k %0, %=", &i, e);
+		break;
+	case Oaddr:
+		if (rtype(i.arg[0]) != RCon)
+			goto Table;
+		con = &e->fn->con[i.arg[0].val];
+		assert(isreg(i.to) && con->type == CAddr);
+		sym = str(con->sym.id);
+		if (T.apple && (con->sym.type & SThr)) {
+			fprintf(e->f,
+				"\tmovq %s%s@tlvp(%%rip), %%%s\n",
+				sym[0] == '"' ? "" : T.assym, sym,
+				regtoa(i.to.val, SLong));
+			break;
+		}
+		if (T.windows && con->sym.type != SGlo)
+			die("extern/thread unsupported on amd64_win");
+		switch (con->sym.type) {
+		case SThr:
+			/* derive the symbol address from the TCB
+			 * address at offset 0 of %fs */
+			emitf("movq %%fs:0, %L=", &i, e);
+			fprintf(e->f, "\tleaq %s%s@tpoff",
+				sym[0] == '"' ? "" : T.assym, sym);
+			if (con->bits.i)
+				fprintf(e->f, "%+"PRId64,
+					con->bits.i);
+			fprintf(e->f, "(%%%s), %%%s\n",
+				regtoa(i.to.val, SLong),
+				regtoa(i.to.val, SLong));
+			break;
+		case SExtThr:
+			/* initial-exec TLS: load offset from
+			 * GOT, add to thread-base register */
+			assert(!con->bits.i);
+			emitf("movq %%fs:0, %L=", &i, e);
+			fprintf(e->f,
+				"\taddq %s%s@gottpoff(%%rip), %%%s\n",
+				sym[0] == '"' ? "" : T.assym, sym,
+				regtoa(i.to.val, SLong));
+			break;
+		case SExt:
+			/* load address from the GOT */
+			assert(!con->bits.i);
+			fprintf(e->f,
+				"\tmovq %s%s@gotpcrel(%%rip), %%%s\n",
+				sym[0] == '"' ? "" : T.assym, sym,
+				regtoa(i.to.val, SLong));
+			break;
+		default:
+			goto Table;
+		}
+		break;
+	case Ocall:
+		/* calls simply have a weird syntax in AT&T
+		 * assembly... */
+		switch (rtype(i.arg[0])) {
+		case RCon:
+			con = &e->fn->con[i.arg[0].val];
+			fprintf(e->f, "\tcallq ");
+			emitcon(con, e);
+			if (con->type == CAddr
+			&& (con->sym.type & SExt)
+			&& !T.apple)
+				fprintf(e->f, "@plt");
+			fprintf(e->f, "\n");
+			break;
+		case RTmp:
+			emitf("callq *%L0", &i, e);
+			break;
+		default:
+			die("invalid call argument");
+		}
+		break;
+	case Osalloc:
+		/* there is no good reason why this is here
+		 * maybe we should split Osalloc in 2 different
+		 * instructions depending on the result
+		 */
+		assert(e->fp == RBP);
+		emitf("subq %L0, %%rsp", &i, e);
+		if (!req(i.to, R))
+			emitcopy(i.to, TMP(RSP), Kl, e);
+		break;
+	case Oswap:
+		if (KBASE(i.cls) == 0)
+			goto Table;
+		/* for floats, there is no swap instruction
+		 * so we use xmm15 as a temporary
+		 */
+		emitcopy(TMP(XMM0+15), i.arg[0], i.cls, e);
+		emitcopy(i.arg[0], i.arg[1], i.cls, e);
+		emitcopy(i.arg[1], TMP(XMM0+15), i.cls, e);
+		break;
+	case Odbgloc:
+		emitdbgloc(i.arg[0].val, i.arg[1].val, e->f);
+		break;
+	case_Oxsel: /* reached from the default case when isxsel(i.op) holds */
+		if (req(i.to, i.arg[1]))
+			emitf(cmov[i.op-Oxsel][0], &i, e);
+		else {
+			if (!req(i.to, i.arg[0]))
+				emitf("mov %0, %=", &i, e);
+			emitf(cmov[i.op-Oxsel][1], &i, e);
+		}
+		break;
+	}
+}
+
+/* Compute the frame size of e->fn for the sysv abi and store
+ * it, in bytes, in e->fsz.  The slot count is rounded up to a
+ * multiple of 4 slots (of 4 bytes each); non-leaf functions
+ * get 8 more bytes when the parity computed over the
+ * callee-save registers in fn->reg is odd, and vararg
+ * functions get 176 more bytes.  e->fp must be set beforehand.
+ */
+static void
+sysv_framesz(E *e)
+{
+	uint64_t n, odd, nslot;
+	Fn *fn;
+
+	fn = e->fn;
+	/* specific to NAlign == 3 */
+	odd = 0;
+	if (!fn->leaf) {
+		n = 0;
+		while (n < NCLR_SYSV) {
+			odd ^= fn->reg >> amd64_sysv_rclob[n];
+			n++;
+		}
+		odd &= 1;
+	}
+	nslot = fn->slot;
+	nslot = (nslot + 3) & -4;
+	if (nslot > 0 && e->fp == RSP && fn->salign == 4)
+		nslot += 2;
+	e->fsz = 4*nslot + 8*odd + 176*fn->vararg;
+}
+/* Write the assembly of function fn to f, for the sysv abi:
+ * the prologue (frame setup, argument register save area of
+ * vararg functions, pushes of the callee-save registers in
+ * use), then every block followed by its jump; the epilogue
+ * is expanded at each Jret0.  The static id0 offsets block
+ * labels so that they differ between emitted functions.
+ */
+void
+amd64_sysv_emitfn(Fn *fn, FILE *f)
+{
+	static char *ctoa[][2] = {
+	#define X(c, s, n) [c] = {s, n},
+		CMP(X)
+	#undef X
+	};
+	static int id0;
+	Blk *b, *s;
+	Ins *i, itmp;
+	int *r, c, o, n, lbl;
+	uint p;
+	E *e;
+
+	e = &(E){.f = f, .fn = fn};
+	emitfnlnk(fn->name, &fn->lnk, f);
+	fputs("\tendbr64\n", f);
+	if (!fn->leaf || fn->vararg || fn->dynalloc) {
+		e->fp = RBP;
+		fputs("\tpushq %rbp\n\tmovq %rsp, %rbp\n", f);
+	} else
+		e->fp = RSP;
+	sysv_framesz(e);
+	if (e->fsz)
+		fprintf(f, "\tsubq $%"PRIu64", %%rsp\n", e->fsz);
+	if (fn->vararg) {
+		o = -176; /* 6 integer registers * 8 + 8 sse registers * 16 bytes */
+		for (r=amd64_sysv_rsave; r<&amd64_sysv_rsave[6]; r++, o+=8)
+			fprintf(f, "\tmovq %%%s, %d(%%rbp)\n", rname[*r][0], o);
+		for (n=0; n<8; ++n, o+=16)
+			fprintf(f, "\tmovaps %%xmm%d, %d(%%rbp)\n", n, o);
+	}
+	for (r=amd64_sysv_rclob; r<&amd64_sysv_rclob[NCLR_SYSV]; r++)
+		if (fn->reg & BIT(*r)) {
+			itmp.arg[0] = TMP(*r); /* the format below only reads arg[0] */
+			emitf("pushq %L0", &itmp, e);
+			e->nclob++;
+		}
+
+	for (lbl=0, b=fn->start; b; b=b->link) {
+		if (lbl || b->npred > 1) {
+			for (p=0; p<b->npred; p++)
+				if (b->pred[p]->id >= b->id)
+					break;
+			if (p != b->npred) /* some predecessor has id >= b->id; NOTE(review): presumably a loop back-edge -- confirm */
+				fprintf(f, ".p2align 4\n");
+			fprintf(f, "%sbb%d:\n", T.asloc, id0+b->id);
+		}
+		for (i=b->ins; i!=&b->ins[b->nins]; i++)
+			emitins(*i, e);
+		lbl = 1;
+		switch (b->jmp.type) {
+		case Jhlt:
+			fprintf(f, "\tud2\n");
+			break;
+		case Jret0:
+			if (fn->dynalloc) /* recompute rsp from rbp before popping */
+				fprintf(f,
+					"\tmovq %%rbp, %%rsp\n"
+					"\tsubq $%"PRIu64", %%rsp\n",
+					e->fsz + e->nclob * 8);
+			for (r=&amd64_sysv_rclob[NCLR_SYSV]; r>amd64_sysv_rclob;)
+				if (fn->reg & BIT(*--r)) { /* pop in reverse push order */
+					itmp.arg[0] = TMP(*r);
+					emitf("popq %L0", &itmp, e);
+				}
+			if (e->fp == RBP)
+				fputs("\tleave\n", f);
+			else if (e->fsz)
+				fprintf(f,
+					"\taddq $%"PRIu64", %%rsp\n",
+					e->fsz);
+			fputs("\tret\n", f);
+			break;
+		case Jjmp:
+		Jmp:
+			if (b->s1 != b->link)
+				fprintf(f, "\tjmp %sbb%d\n",
+					T.asloc, id0+b->s1->id);
+			else
+				lbl = 0; /* falls through: the next block needs no label on this account */
+			break;
+		default:
+			c = b->jmp.type - Jjf;
+			if (0 <= c && c <= NCmp) {
+				if (b->link == b->s2) { /* exchange targets so that s1 is the fall-through */
+					s = b->s1;
+					b->s1 = b->s2;
+					b->s2 = s;
+					n = 0;
+				} else
+					n = 1; /* jump to s2 on the opposite condition */
+				fprintf(f, "\tj%s %sbb%d\n", ctoa[c][n],
+					T.asloc, id0+b->s2->id);
+				goto Jmp;
+			}
+			die("unhandled jump %d", b->jmp.type);
+		}
+	}
+	id0 += fn->nblk;
+	if (!T.apple)
+		elf_emitfnfin(fn->name, f);
+}
+
+/* Compute the frame size of e->fn for the windows abi and
+ * store it, in bytes, in e->fsz.  The slot count is rounded up
+ * to a multiple of 4 slots (of 4 bytes each); non-leaf
+ * functions get 8 more bytes when the parity computed over
+ * the callee-save registers in fn->reg is odd.  e->fp must be
+ * set beforehand.
+ */
+static void
+winabi_framesz(E *e)
+{
+	uint64_t n, odd, nslot;
+	Fn *fn;
+
+	fn = e->fn;
+	/* specific to NAlign == 3 */
+	odd = 0;
+	if (!fn->leaf) {
+		n = 0;
+		while (n < NCLR_WIN) {
+			odd ^= fn->reg >> amd64_winabi_rclob[n];
+			n++;
+		}
+		odd &= 1;
+	}
+	nslot = fn->slot;
+	nslot = (nslot + 3) & -4;
+	if (nslot > 0 && e->fp == RSP && fn->salign == 4)
+		nslot += 2;
+	e->fsz = 4*nslot + 8*odd;
+}
+/* Write the assembly of function fn to f, for the windows
+ * abi.  Same structure as amd64_sysv_emitfn() with these
+ * differences: vararg functions first store rcx, rdx, r8 and
+ * r9 at 0x8..0x20(%rsp), block labels are never preceded by
+ * .p2align, conditional jumps on floating-point comparisons
+ * always exchange their targets and use the plain condition,
+ * and elf_emitfnfin() is not called.
+ */
+void
+amd64_winabi_emitfn(Fn *fn, FILE *f)
+{
+	static char *ctoa[][2] = {
+	#define X(c, s, n) [c] = {s, n},
+		CMP(X)
+	#undef X
+	};
+	static int id0;
+	Blk *b, *s;
+	Ins *i, itmp;
+	int *r, c, n, lbl;
+	E *e;
+
+	e = &(E){.f = f, .fn = fn};
+	emitfnlnk(fn->name, &fn->lnk, f);
+	fputs("\tendbr64\n", f);
+	if (fn->vararg) { /* done before rbp is pushed, hence rsp-relative */
+		fprintf(f, "\tmovq %%rcx, 0x8(%%rsp)\n");
+		fprintf(f, "\tmovq %%rdx, 0x10(%%rsp)\n");
+		fprintf(f, "\tmovq %%r8, 0x18(%%rsp)\n");
+		fprintf(f, "\tmovq %%r9, 0x20(%%rsp)\n");
+	}
+	if (!fn->leaf || fn->vararg || fn->dynalloc) {
+		e->fp = RBP;
+		fputs("\tpushq %rbp\n\tmovq %rsp, %rbp\n", f);
+	} else
+		e->fp = RSP;
+	winabi_framesz(e);
+	if (e->fsz)
+		fprintf(f, "\tsubq $%"PRIu64", %%rsp\n", e->fsz);
+	for (r=amd64_winabi_rclob; r<&amd64_winabi_rclob[NCLR_WIN]; r++)
+		if (fn->reg & BIT(*r)) {
+			itmp.arg[0] = TMP(*r); /* the format below only reads arg[0] */
+			emitf("pushq %L0", &itmp, e);
+			e->nclob++;
+		}
+
+	for (lbl=0, b=fn->start; b; b=b->link) {
+		if (lbl || b->npred > 1)
+			fprintf(f, "%sbb%d:\n", T.asloc, id0+b->id);
+		for (i=b->ins; i!=&b->ins[b->nins]; i++)
+			emitins(*i, e);
+		lbl = 1;
+		switch (b->jmp.type) {
+		case Jhlt:
+			fprintf(f, "\tud2\n");
+			break;
+		case Jret0:
+			if (fn->dynalloc) /* recompute rsp from rbp before popping */
+				fprintf(f,
+					"\tmovq %%rbp, %%rsp\n"
+					"\tsubq $%"PRIu64", %%rsp\n",
+					e->fsz + e->nclob * 8);
+			for (r=&amd64_winabi_rclob[NCLR_WIN]; r>amd64_winabi_rclob;)
+				if (fn->reg & BIT(*--r)) { /* pop in reverse push order */
+					itmp.arg[0] = TMP(*r);
+					emitf("popq %L0", &itmp, e);
+				}
+			if (e->fp == RBP)
+				fputs("\tleave\n", f);
+			else if (e->fsz)
+				fprintf(f,
+					"\taddq $%"PRIu64", %%rsp\n",
+					e->fsz);
+			fputs("\tret\n", f);
+			break;
+		case Jjmp:
+		Jmp:
+			if (b->s1 != b->link)
+				fprintf(f, "\tjmp %sbb%d\n",
+					T.asloc, id0+b->s1->id);
+			else
+				lbl = 0; /* falls through: the next block needs no label on this account */
+			break;
+		default:
+			c = b->jmp.type - Jjf;
+			if (0 <= c && c <= NCmp) {
+				if (b->link == b->s2 || c >= NCmpI) { /* float comparisons never use the opposite condition */
+					s = b->s1;
+					b->s1 = b->s2;
+					b->s2 = s;
+					n = 0;
+				} else
+					n = 1; /* jump to s2 on the opposite condition */
+				fprintf(f, "\tj%s %sbb%d\n", ctoa[c][n],
+					T.asloc, id0+b->s2->id);
+				goto Jmp;
+			}
+			die("unhandled jump %d", b->jmp.type);
+		}
+	}
+	id0 += fn->nblk;
+}
@@ -0,0 +1,944 @@
+#include "all.h"
+#include <limits.h>
+
+/* For x86_64, do the following:
+ *
+ * - check that constants are used only in
+ *   places allowed
+ * - ensure immediates always fit in 32b
+ * - expose machine register constraints
+ *   on instructions like division.
+ * - implement fast locals (the streak of
+ *   constant allocX in the first basic block)
+ * - recognize complex addressing modes
+ *
+ * Invariant: the use counts that are used
+ *            in sel() must be sound.  This
+ *            is not so trivial, maybe the
+ *            dce should be moved out...
+ */
+
+static int amatch(Addr *, Num *, Ref, Fn *);
+
+/* Return non-zero when r is a constant that cannot be used
+ * as an immediate, that is, a CBits constant that does not
+ * fit in a signed 32-bit integer.  Non-constant references
+ * and addresses yield 0.
+ */
+static int
+noimm(Ref r, Fn *fn)
+{
+	Con *c;
+
+	if (rtype(r) != RCon)
+		return 0;
+	c = &fn->con[r.val];
+	if (c->type == CAddr) {
+		/* we only support the 'small'
+		 * code model of the ABI, this
+		 * means that we can always
+		 * address data with 32bits
+		 */
+		return 0;
+	}
+	if (c->type == CBits)
+		return !(INT32_MIN <= c->bits.i && c->bits.i <= INT32_MAX);
+	die("invalid constant");
+}
+
+/* Return the slot field of the temporary r, or -1 when r is
+ * not a temporary.
+ */
+static int
+rslot(Ref r, Fn *fn)
+{
+	if (rtype(r) == RTmp)
+		return fn->tmp[r.val].slot;
+	return -1;
+}
+
+/* When r carries a constant, store a pointer to it in *pc and
+ * return 1: the constant itself for RCon, the offset of the
+ * memory operand for RMem.  Otherwise return 0 and leave *pc
+ * untouched.
+ */
+static int
+hascon(Ref r, Con **pc, Fn *fn)
+{
+	int t;
+
+	t = rtype(r);
+	if (t == RCon)
+		*pc = &fn->con[r.val];
+	else if (t == RMem)
+		*pc = &fn->mem[r.val].offset;
+	else
+		return 0;
+	return 1;
+}
+/* Make the operand *r of instruction i acceptable to the
+ * emitter; k is the class the operand is used at.  When i is
+ * null the operand is treated as belonging to a copy.  The
+ * cases below are tried in order and only the first that
+ * applies is used: it either rewrites the operand in place or
+ * emits instructions computing it into a new temporary that
+ * replaces *r.
+ */
+static void
+fixarg(Ref *r, int k, Ins *i, Fn *fn)
+{
+	char buf[32];
+	Addr a, *m;
+	Con cc, *c;
+	Ref r0, r1, r2, r3;
+	int s, n, op;
+
+	r1 = r0 = *r;
+	s = rslot(r0, fn);
+	op = i ? i->op : Ocopy;
+	if (KBASE(k) == 1 && rtype(r0) == RCon) {
+		/* load floating points from memory
+		 * slots, they can't be used as
+		 * immediates
+		 */
+		r1 = MEM(fn->nmem);
+		vgrow(&fn->mem, ++fn->nmem);
+		memset(&a, 0, sizeof a);
+		a.offset.type = CAddr;
+		n = stashbits(fn->con[r0.val].bits.i, KWIDE(k) ? 8 : 4);
+		/* quote the name so that we do not
+		 * add symbol prefixes on the apple
+		 * target variant
+		 */
+		sprintf(buf, "\"%sfp%d\"", T.asloc, n);
+		a.offset.sym.id = intern(buf);
+		fn->mem[fn->nmem-1] = a;
+	}
+	else if (op == Ocall && r == &i->arg[0]
+	&& rtype(r0) == RCon && fn->con[r0.val].type != CAddr) {
+		/* use a temporary register so that we
+		 * produce an indirect call
+		 */
+		r1 = newtmp("isel", Kl, fn);
+		emit(Ocopy, Kl, r1, r0, R);
+	}
+	else if (op != Ocopy && k == Kl && noimm(r0, fn)) {
+		/* load constants that do not fit in
+		 * a 32bit signed integer into a
+		 * long temporary
+		 */
+		r1 = newtmp("isel", Kl, fn);
+		emit(Ocopy, Kl, r1, r0, R);
+	}
+	else if (s != -1) {
+		/* load fast locals' addresses into
+		 * temporaries right before the
+		 * instruction
+		 */
+		r1 = newtmp("isel", Kl, fn);
+		emit(Oaddr, Kl, r1, SLOT(s), R);
+	}
+	else if (op != Ocall && hascon(r0, &c, fn)
+	&& c->type == CAddr && ((c->sym.type & SExt)
+	 || (T.apple && c->sym.type == SThr))) { /* extern symbol, or thread-local one on apple:
+						  * take the address of the bare symbol with
+						  * Oaddr and add its offset, if any, separately
+						  */
+		r1 = newtmp("isel", Kl, fn);
+		if (c->bits.i) {
+			r2 = newtmp("isel", Kl, fn);
+			cc = (Con){.type = CBits};
+			cc.bits.i = c->bits.i;
+			r3 = newcon(&cc, fn);
+			emit(Oadd, Kl, r1, r2, r3);
+		} else
+			r2 = r1;
+		if (T.apple && (c->sym.type & SThr)) { /* the address is obtained in RAX from a call */
+			emit(Ocopy, Kl, r2, TMP(RAX), R);
+			r2 = newtmp("isel", Kl, fn);
+			r3 = newtmp("isel", Kl, fn);
+			emit(Ocall, 0, R, r3, CALL(17));
+			emit(Ocopy, Kl, TMP(RDI), r2, R);
+			emit(Oload, Kl, r3, r2, R);
+		}
+		cc = *c;
+		cc.bits.i = 0;
+		r3 = newcon(&cc, fn);
+		emit(Oaddr, Kl, r2, r3, R);
+		if (rtype(r0) == RMem) { /* keep the memory operand, now based on the computed address */
+			m = &fn->mem[r0.val];
+			m->offset.type = CUndef;
+			m->base = r1;
+			r1 = r0;
+		}
+	}
+	else if (!(isstore(op) && r == &i->arg[1])
+	&& !isload(op) && op != Ocall && rtype(r0) == RCon
+	&& fn->con[r0.val].type == CAddr) {
+		/* turn address operands into
+		 * lea/mov instructions
+		 */
+		r1 = newtmp("isel", Kl, fn);
+		emit(Oaddr, Kl, r1, r0, R);
+	}
+	else if (rtype(r0) == RMem) {
+		/* eliminate memory operands of
+		 * the form $foo(%rip, ...)
+		 */
+		m = &fn->mem[r0.val];
+		if (req(m->base, R))
+		if (m->offset.type == CAddr) {
+			r0 = newtmp("isel", Kl, fn);
+			emit(Oaddr, Kl, r0, newcon(&m->offset, fn), R);
+			m->offset.type = CUndef;
+			m->base = r0;
+		}
+	}
+	else if (isxsel(op) && rtype(*r) == RCon) { /* constant operand of a select: copy it to a temporary */
+		r1 = newtmp("isel", i->cls, fn);
+		emit(Ocopy, i->cls, r1, *r, R);
+	}
+	*r = r1;
+}
+/* Try to replace the temporary *r, used as an address, by a
+ * memory operand matched by amatch().  On success the operand
+ * is appended to fn->mem, the use counts of the temporaries
+ * involved are adjusted with chuse(), and *r becomes the MEM
+ * reference; otherwise *r is left alone.
+ */
+static void
+seladdr(Ref *r, Num *tn, Fn *fn)
+{
+	Addr a;
+	Ref r0;
+
+	r0 = *r;
+	if (rtype(r0) == RTmp) {
+		memset(&a, 0, sizeof a);
+		if (!amatch(&a, tn, r0, fn))
+			return;
+		if (!req(a.base, R))
+		if (a.offset.type == CAddr) {
+			/* apple as does not support
+			 * $foo(%r0, %r1, M); try to
+			 * rewrite it or bail out if
+			 * impossible
+			 */
+			if (!req(a.index, R) || rtype(a.base) != RTmp)
+				return;
+			else { /* move the base to the index position, with scale 1 */
+				a.index = a.base;
+				a.scale = 1;
+				a.base = R;
+			}
+		}
+		chuse(r0, -1, fn);
+		vgrow(&fn->mem, ++fn->nmem);
+		fn->mem[fn->nmem-1] = a;
+		chuse(a.base, +1, fn);
+		chuse(a.index, +1, fn);
+		*r = MEM(fn->nmem-1);
+	}
+}
+
+/* Tell whether the operands of comparison op must be
+ * exchanged before emitting it: always for the floating-point
+ * "less" comparisons, never for the floating-point "greater"
+ * ones, and for every other comparison exactly when the first
+ * operand is a constant.
+ */
+static int
+cmpswap(Ref arg[2], int op)
+{
+	if (op == NCmpI+Cflt || op == NCmpI+Cfle)
+		return 1;
+	if (op == NCmpI+Cfgt || op == NCmpI+Cfge)
+		return 0;
+	return rtype(arg[0]) == RCon;
+}
+/* Emit the Oxcmp comparing arg[0] and arg[1] at class k,
+ * exchanging the two in arg[] first when swap is set.  The
+ * Oxcmp receives the operands in reverse order (arg[1] first).
+ * If arg[0] is a constant after the exchange, it is copied to
+ * a new temporary that takes its place in the Oxcmp.
+ */
+static void
+selcmp(Ref arg[2], int k, int swap, Fn *fn)
+{
+	Ref r;
+	Ins *icmp;
+
+	if (swap) {
+		r = arg[1];
+		arg[1] = arg[0];
+		arg[0] = r;
+	}
+	emit(Oxcmp, k, R, arg[1], arg[0]);
+	icmp = curi; /* remember the Oxcmp, the emit() below moves curi */
+	if (rtype(arg[0]) == RCon) {
+		assert(k != Kw);
+		icmp->arg[1] = newtmp("isel", k, fn);
+		emit(Ocopy, k, icmp->arg[1], arg[0], R);
+		fixarg(&curi->arg[0], k, curi, fn);
+	}
+	fixarg(&icmp->arg[0], k, icmp, fn);
+	fixarg(&icmp->arg[1], k, icmp, fn);
+}
+/* Select amd64 instructions for i.  The replacement sequence
+ * is produced with emit()/emiti(); successive emit() calls
+ * read in reverse of execution order (compare the listing in
+ * the Oultof case with the calls that follow it).  tn is
+ * handed to seladdr() for address matching.  An instruction
+ * whose result temporary has no use and that involves no
+ * machine register is dropped, after decrementing the use
+ * counts of its arguments.
+ */
+static void
+sel(Ins i, Num *tn, Fn *fn)
+{
+	Ref r0, r1, tmp[7];
+	int x, j, k, kc, sh, swap;
+	Ins *i0, *i1;
+
+	if (rtype(i.to) == RTmp)
+	if (!isreg(i.to) && !isreg(i.arg[0]) && !isreg(i.arg[1]))
+	if (fn->tmp[i.to.val].nuse == 0) {
+		chuse(i.arg[0], -1, fn);
+		chuse(i.arg[1], -1, fn);
+		return;
+	}
+	i0 = curi; /* used by the sanity check at the end */
+	k = i.cls;
+	switch (i.op) {
+	case Odiv:
+	case Orem:
+	case Oudiv:
+	case Ourem:
+		if (KBASE(k) == 1)
+			goto Emit;
+		if (i.op == Odiv || i.op == Oudiv)
+			r0 = TMP(RAX), r1 = TMP(RDX);
+		else
+			r0 = TMP(RDX), r1 = TMP(RAX);
+		emit(Ocopy, k, i.to, r0, R);
+		emit(Ocopy, k, R, r1, R);
+		if (rtype(i.arg[1]) == RCon) {
+			/* immediates not allowed for
+			 * divisions in x86
+			 */
+			r0 = newtmp("isel", k, fn);
+		} else
+			r0 = i.arg[1];
+		if (fn->tmp[r0.val].slot != -1)
+			err("unlikely argument %%%s in %s",
+				fn->tmp[r0.val].name, optab[i.op].name);
+		if (i.op == Odiv || i.op == Orem) {
+			emit(Oxidiv, k, R, r0, R);
+			emit(Osign, k, TMP(RDX), TMP(RAX), R);
+		} else {
+			emit(Oxdiv, k, R, r0, R);
+			emit(Ocopy, k, TMP(RDX), CON_Z, R);
+		}
+		emit(Ocopy, k, TMP(RAX), i.arg[0], R);
+		fixarg(&curi->arg[0], k, curi, fn);
+		if (rtype(i.arg[1]) == RCon)
+			emit(Ocopy, k, r0, i.arg[1], R);
+		break;
+	case Osar:
+	case Oshr:
+	case Oshl:
+		r0 = i.arg[1];
+		if (rtype(r0) == RCon)
+			goto Emit;
+		if (fn->tmp[r0.val].slot != -1)
+			err("unlikely argument %%%s in %s",
+				fn->tmp[r0.val].name, optab[i.op].name);
+		i.arg[1] = TMP(RCX); /* a non-constant shift count goes through RCX */
+		emit(Ocopy, Kw, R, TMP(RCX), R);
+		emiti(i);
+		i1 = curi;
+		emit(Ocopy, Kw, TMP(RCX), r0, R);
+		fixarg(&i1->arg[0], argcls(&i, 0), i1, fn);
+		break;
+	case Ouwtof:
+		r0 = newtmp("utof", Kl, fn);
+		emit(Osltof, k, i.to, r0, R);
+		emit(Oextuw, Kl, r0, i.arg[0], R);
+		fixarg(&curi->arg[0], k, curi, fn);
+		break;
+	case Oultof:
+		/* %mask =l and %arg.0, 1
+		 * %isbig =l shr %arg.0, 63
+		 * %divided =l shr %arg.0, %isbig
+		 * %or =l or %mask, %divided
+		 * %float =d sltof %or
+		 * %cast =l cast %float
+		 * %addend =l shl %isbig, 52
+		 * %sum =l add %cast, %addend
+		 * %result =d cast %sum
+		 */
+		r0 = newtmp("utof", k, fn);
+		if (k == Ks)
+			kc = Kw, sh = 23;
+		else
+			kc = Kl, sh = 52;
+		for (j=0; j<4; j++)
+			tmp[j] = newtmp("utof", Kl, fn);
+		for (; j<7; j++)
+			tmp[j] = newtmp("utof", kc, fn);
+		emit(Ocast, k, i.to, tmp[6], R);
+		emit(Oadd, kc, tmp[6], tmp[4], tmp[5]);
+		emit(Oshl, kc, tmp[5], tmp[1], getcon(sh, fn));
+		emit(Ocast, kc, tmp[4], r0, R);
+		emit(Osltof, k, r0, tmp[3], R);
+		emit(Oor, Kl, tmp[3], tmp[0], tmp[2]);
+		emit(Oshr, Kl, tmp[2], i.arg[0], tmp[1]);
+		sel(*curi++, 0, fn); /* take the variable shift back out and select it recursively */
+		emit(Oshr, Kl, tmp[1], i.arg[0], getcon(63, fn));
+		fixarg(&curi->arg[0], Kl, curi, fn);
+		emit(Oand, Kl, tmp[0], i.arg[0], getcon(1, fn));
+		fixarg(&curi->arg[0], Kl, curi, fn);
+		break;
+	case Ostoui:
+		i.op = Ostosi;
+		kc = Ks;
+		tmp[4] = getcon(0xdf000000, fn);
+		goto Oftoui;
+	case Odtoui:
+		i.op = Odtosi;
+		kc = Kd;
+		tmp[4] = getcon(0xc3e0000000000000, fn);
+	Oftoui:
+		if (k == Kw) { /* word result: convert at class Kl and copy the result as a word */
+			r0 = newtmp("ftou", Kl, fn);
+			emit(Ocopy, Kw, i.to, r0, R);
+			i.cls = Kl;
+			i.to = r0;
+			goto Emit;
+		}
+		/* %try0 =l {s,d}tosi %fp
+		 * %mask =l sar %try0, 63
+		 *
+		 *    mask is all ones if the first
+		 *    try was oob, all zeroes o.w.
+		 *
+		 * %fps ={s,d} sub %fp, (1<<63)
+		 * %try1 =l {s,d}tosi %fps
+		 *
+		 * %tmp =l and %mask, %try1
+		 * %res =l or %tmp, %try0
+		 */
+		r0 = newtmp("ftou", kc, fn);
+		for (j=0; j<4; j++)
+			tmp[j] = newtmp("ftou", Kl, fn);
+		emit(Oor, Kl, i.to, tmp[0], tmp[3]);
+		emit(Oand, Kl, tmp[3], tmp[2], tmp[1]);
+		emit(i.op, Kl, tmp[2], r0, R);
+		emit(Oadd, kc, r0, tmp[4], i.arg[0]);
+		i1 = curi; /* fixarg() can change curi */
+		fixarg(&i1->arg[0], kc, i1, fn);
+		fixarg(&i1->arg[1], kc, i1, fn);
+		emit(Osar, Kl, tmp[1], tmp[0], getcon(63, fn));
+		emit(i.op, Kl, tmp[0], i.arg[0], R);
+		fixarg(&curi->arg[0], Kl, curi, fn);
+		break;
+	case Onop:
+		break;
+	case Ostored:
+	case Ostores:
+	case Ostorel:
+	case Ostorew:
+	case Ostoreh:
+	case Ostoreb:
+		if (rtype(i.arg[0]) == RCon) { /* store constant floats with the integer store of the same width */
+			if (i.op == Ostored)
+				i.op = Ostorel;
+			if (i.op == Ostores)
+				i.op = Ostorew;
+		}
+		seladdr(&i.arg[1], tn, fn);
+		goto Emit;
+	case_Oload:
+		seladdr(&i.arg[0], tn, fn);
+		goto Emit;
+	case Odbgloc:
+	case Ocall:
+	case Osalloc:
+	case Ocopy:
+	case Oadd:
+	case Osub:
+	case Oneg:
+	case Omul:
+	case Oand:
+	case Oor:
+	case Oxor:
+	case Oxtest:
+	case Ostosi:
+	case Odtosi:
+	case Oswtof:
+	case Osltof:
+	case Oexts:
+	case Otruncd:
+	case Ocast:
+	case_Oxsel:
+	case_Oext:
+Emit:
+		emiti(i);
+		i1 = curi; /* fixarg() can change curi */
+		fixarg(&i1->arg[0], argcls(&i, 0), i1, fn);
+		fixarg(&i1->arg[1], argcls(&i, 1), i1, fn);
+		break;
+	case Oalloc4:
+	case Oalloc8:
+	case Oalloc16:
+		salloc(i.to, i.arg[0], fn);
+		break;
+	default:
+		if (isext(i.op))
+			goto case_Oext;
+		if (isxsel(i.op))
+			goto case_Oxsel;
+		if (isload(i.op))
+			goto case_Oload;
+		if (iscmp(i.op, &kc, &x)) {
+			switch (x) {
+			case NCmpI+Cfeq:
+				/* zf is set when operands are
+				 * unordered, so we may have to
+				 * check pf
+				 */
+				r0 = newtmp("isel", Kw, fn);
+				r1 = newtmp("isel", Kw, fn);
+				emit(Oand, Kw, i.to, r0, r1);
+				emit(Oflagfo, k, r1, R, R);
+				i.to = r0;
+				break;
+			case NCmpI+Cfne:
+				r0 = newtmp("isel", Kw, fn);
+				r1 = newtmp("isel", Kw, fn);
+				emit(Oor, Kw, i.to, r0, r1);
+				emit(Oflagfuo, k, r1, R, R);
+				i.to = r0;
+				break;
+			}
+			swap = cmpswap(i.arg, x);
+			if (swap)
+				x = cmpop(x);
+			emit(Oflag+x, k, i.to, R, R);
+			selcmp(i.arg, kc, swap, fn);
+			break;
+		}
+		die("unknown instruction %s", optab[i.op].name);
+	}
+
+	while (i0>curi && --i0) { /* no instruction emitted above may still use a fast local directly */
+		assert(rslot(i0->arg[0], fn) == -1);
+		assert(rslot(i0->arg[1], fn) == -1);
+	}
+}
+
+/* scan backwards from i (exclusive) down to i0 and
+ * return the closest instruction whose amd64_op
+ * entry has zflag set; instructions with lflag set
+ * are stepped over, any other instruction ends the
+ * search and 0 is returned
+ * (presumably zflag = "sets the zero flag" and
+ * lflag = "leaves the flags alone" -- confirm
+ * against ops.h)
+ */
+static Ins *
+flagi(Ins *i0, Ins *i)
+{
+	Ins *p;
+
+	for (p=i; p>i0;) {
+		--p;
+		if (amd64_op[p->op].zflag)
+			return p;
+		if (!amd64_op[p->op].lflag)
+			break;
+	}
+	return 0;
+}
+
+/* lower a run of select instructions ending at i:
+ * one Osel0 holding the condition followed by
+ * Osel1 instructions; every Osel1 is rewritten as
+ * Oxsel+c and handed to sel(), the Osel0 is turned
+ * into a nop, and a pointer to it is returned so
+ * the caller resumes right before it
+ */
+static Ins*
+selsel(Fn *fn, Blk *b, Ins *i, Num *tn)
+{
+	Ref r, cr[2];
+	int c, k, swap, gencmp, gencpy;
+	Ins *isel0, *isel1, *fi;
+	Tmp *t;
+
+	assert(i->op == Osel1);
+	/* walk back to the Osel0 that opens the run */
+	for (isel0=i; b->ins<isel0; isel0--) {
+		if (isel0->op == Osel0)
+			break;
+		assert(isel0->op == Osel1);
+	}
+	assert(isel0->op == Osel0);
+	r = isel0->arg[0];
+	assert(rtype(r) == RTmp);
+	t = &fn->tmp[r.val];
+	fi = flagi(b->ins, isel0);
+	cr[0] = cr[1] = R;
+	gencmp = gencpy = swap = 0;
+	k = Kw;
+	c = Cine;
+	if (!fi || !req(fi->to, r)) {
+		/* no usable flags for r: compare it with zero */
+		gencmp = 1;
+		cr[0] = r;
+		cr[1] = CON_Z;
+	}
+	else if (iscmp(fi->op, &k, &c)) {
+		if (c == NCmpI+Cfeq
+		|| c == NCmpI+Cfne) {
+			/* these are selected as 'and'
+			 * or 'or', so we check their
+			 * result with Cine
+			 */
+			c = Cine;
+			goto Other;
+		}
+		swap = cmpswap(fi->arg, c);
+		if (swap)
+			c = cmpop(c);
+		/* when the select is the only use of the
+		 * comparison, drop it and emit a bare
+		 * compare of its operands instead
+		 */
+		if (t->nuse == 1) {
+			gencmp = 1;
+			cr[0] = fi->arg[0];
+			cr[1] = fi->arg[1];
+			*fi = (Ins){.op = Onop};
+		}
+	}
+	else if (fi->op == Oand && t->nuse == 1
+	     && (rtype(fi->arg[0]) == RTmp ||
+	         rtype(fi->arg[1]) == RTmp)) {
+		/* an 'and' used only here becomes a test;
+		 * a constant operand is moved to arg[0]
+		 */
+		fi->op = Oxtest;
+		fi->to = R;
+		if (rtype(fi->arg[1]) == RCon) {
+			r = fi->arg[1];
+			fi->arg[1] = fi->arg[0];
+			fi->arg[0] = r;
+		}
+	}
+	else {
+	Other:
+		/* since flags are not tracked in liveness,
+		 * the result of the flag-setting instruction
+		 * has to be marked as live
+		 */
+		if (t->nuse == 1)
+			gencpy = 1;
+	}
+	/* generate conditional moves */
+	for (isel1=i; isel0<isel1; --isel1) {
+		isel1->op = Oxsel+c;
+		sel(*isel1, tn, fn);
+	}
+	assert(!gencmp || !gencpy);
+	if (gencmp)
+		selcmp(cr, k, swap, fn);
+	if (gencpy)
+		emit(Ocopy, Kw, R, r, R);
+	*isel0 = (Ins){.op = Onop};
+	return isel0;
+}
+
+/* select the jump ending block b; Jret0, Jjmp and
+ * Jhlt are left alone, a Jjnz on r is turned into
+ * a flag-based jump Jjf+c (or a plain Jjmp when
+ * both successors are the same), reusing the
+ * flags of the instruction defining r when
+ * flagi() finds one
+ */
+static void
+seljmp(Blk *b, Fn *fn)
+{
+	Ref r;
+	int c, k, swap;
+	Ins *fi;
+	Tmp *t;
+
+	if (b->jmp.type == Jret0
+	|| b->jmp.type == Jjmp
+	|| b->jmp.type == Jhlt)
+		return;
+	assert(b->jmp.type == Jjnz);
+	r = b->jmp.arg;
+	t = &fn->tmp[r.val];
+	b->jmp.arg = R;
+	assert(rtype(r) == RTmp);
+	if (b->s1 == b->s2) {
+		/* the condition is irrelevant: drop the use
+		 * of r and jump unconditionally
+		 */
+		chuse(r, -1, fn);
+		b->jmp.type = Jjmp;
+		b->s2 = 0;
+		return;
+	}
+	fi = flagi(b->ins, &b->ins[b->nins]);
+	if (!fi || !req(fi->to, r)) {
+		/* no usable flags for r: compare it with zero */
+		selcmp((Ref[2]){r, CON_Z}, Kw, 0, fn);
+		b->jmp.type = Jjf + Cine;
+	}
+	else if (iscmp(fi->op, &k, &c)
+	     && c != NCmpI+Cfeq /* see sel(), selsel() */
+	     && c != NCmpI+Cfne) {
+		swap = cmpswap(fi->arg, c);
+		if (swap)
+			c = cmpop(c);
+		/* when the jump is the only use of the
+		 * comparison, replace it by a bare compare
+		 */
+		if (t->nuse == 1) {
+			selcmp(fi->arg, k, swap, fn);
+			*fi = (Ins){.op = Onop};
+		}
+		b->jmp.type = Jjf + c;
+	}
+	else if (fi->op == Oand && t->nuse == 1
+	     && (rtype(fi->arg[0]) == RTmp ||
+	         rtype(fi->arg[1]) == RTmp)) {
+		/* an 'and' used only here becomes a test;
+		 * a constant operand is moved to arg[0]
+		 */
+		fi->op = Oxtest;
+		fi->to = R;
+		b->jmp.type = Jjf + Cine;
+		if (rtype(fi->arg[1]) == RCon) {
+			r = fi->arg[1];
+			fi->arg[1] = fi->arg[0];
+			fi->arg[0] = r;
+		}
+	}
+	else {
+		/* since flags are not tracked in liveness,
+		 * the result of the flag-setting instruction
+		 * has to be marked as live
+		 */
+		if (t->nuse == 1)
+			emit(Ocopy, Kw, R, r, R);
+		b->jmp.type = Jjf + Cine;
+	}
+}
+
+/* identifiers of the addressing patterns listed in
+ * the mgen specification below; the letters name
+ * the variables a pattern binds: o = constant
+ * offset, b = base tmp, i = index tmp, s = scale
+ * constant (1 when spelled '1' in the name)
+ */
+enum {
+	Pob,
+	Pbis,
+	Pois,
+	Pobis,
+	Pbi1,
+	Pobi1,
+};
+
+/* mgen generated code
+ *
+ * (with-vars (o b i s)
+ *   (patterns
+ *     (ob   (add (con o) (tmp b)))
+ *     (bis  (add (tmp b) (mul (tmp i) (con s 1 2 4 8))))
+ *     (ois  (add (con o) (mul (tmp i) (con s 1 2 4 8))))
+ *     (obis (add (con o) (tmp b) (mul (tmp i) (con s 1 2 4 8))))
+ *     (bi1  (add (tmp b) (tmp i)))
+ *     (obi1 (add (con o) (tmp b) (tmp i)))
+ * ))
+ */
+
+/* generated: return the matcher state number of an
+ * instruction with opcode op whose arguments are
+ * in states l and r (as computed by refn());
+ * the arguments are first ordered so that l >= r,
+ * which lets Oaddtbl be stored as a lower
+ * triangle indexed by l*(l+1)/2 + r
+ * (91 entries = 13 states)
+ */
+static int
+opn(int op, int l, int r)
+{
+	static uchar Oaddtbl[91] = {
+		2,
+		2,2,
+		4,4,5,
+		6,6,8,8,
+		4,4,9,10,9,
+		7,7,5,8,9,5,
+		4,4,12,10,12,12,12,
+		4,4,9,10,9,9,12,9,
+		11,11,5,8,9,5,12,9,5,
+		7,7,5,8,9,5,12,9,5,5,
+		11,11,5,8,9,5,12,9,5,5,5,
+		4,4,9,10,9,9,12,9,9,9,9,9,
+		7,7,5,8,9,5,12,9,5,5,5,9,5,
+	};
+	int t;
+
+	if (l < r)
+		t = l, l = r, r = t;
+	switch (op) {
+	case Omul:
+		/* state 3 only when one argument is in
+		 * state 0 and the other in a state >= 2
+		 */
+		if (2 <= l)
+		if (r == 0) {
+			return 3;
+		}
+		return 2;
+	case Oadd:
+		return Oaddtbl[(l + l*l)/2 + r];
+	default:
+		return 2;
+	}
+}
+
+/* return the matcher state number of reference r:
+ * a temporary yields its recorded state in tn[]
+ * (initialized to 2 the first time it is seen),
+ * a CBits constant equal to 1, 2, 4 or 8 yields 0,
+ * any other constant yields 1, and every other
+ * kind of reference yields INT_MIN
+ */
+static int
+refn(Ref r, Num *tn, Con *con)
+{
+	Num *t;
+	Con *c;
+
+	if (rtype(r) == RTmp) {
+		t = &tn[r.val];
+		if (t->n == 0)
+			t->n = 2;
+		return t->n;
+	}
+	if (rtype(r) != RCon)
+		return INT_MIN;
+	c = &con[r.val];
+	if (c->type != CBits)
+		return 1;
+	switch (c->bits.i) {
+	case 1:
+	case 2:
+	case 4:
+	case 8:
+		return 0;
+	default:
+		return 1;
+	}
+}
+
+/* generated: match[n] is the set of patterns, as a
+ * mask of BIT(P...) values, that a value in state
+ * n may match; states without an entry match no
+ * pattern
+ */
+static bits match[13] = {
+	[4] = BIT(Pob),
+	[5] = BIT(Pbi1),
+	[6] = BIT(Pob) | BIT(Pois),
+	[7] = BIT(Pob) | BIT(Pobi1),
+	[8] = BIT(Pbi1) | BIT(Pbis),
+	[9] = BIT(Pbi1) | BIT(Pobi1),
+	[10] = BIT(Pbi1) | BIT(Pbis) | BIT(Pobi1) | BIT(Pobis),
+	[11] = BIT(Pob) | BIT(Pobi1) | BIT(Pobis),
+	[12] = BIT(Pbi1) | BIT(Pobi1) | BIT(Pobis),
+};
+
+/* generated: one byte string per pattern, passed
+ * to runmatch() to bind the pattern variables;
+ * the encoding is defined by runmatch(), which is
+ * not part of this file section -- do not edit
+ * by hand
+ */
+static uchar *matcher[] = {
+	[Pbi1] = (uchar[]){
+		1,3,1,3,2,0
+	},
+	[Pbis] = (uchar[]){
+		5,1,8,5,27,1,5,1,2,5,13,3,1,1,3,3,3,2,0,1,
+		3,3,3,2,3,1,0,1,29
+	},
+	[Pob] = (uchar[]){
+		1,3,0,3,1,0
+	},
+	[Pobi1] = (uchar[]){
+		5,3,9,9,10,33,12,35,45,1,5,3,11,9,7,9,4,9,
+		17,1,3,0,3,1,3,2,0,3,1,1,3,0,34,1,37,1,5,2,
+		5,7,2,7,8,37,29,1,3,0,1,32
+	},
+	[Pobis] = (uchar[]){
+		5,2,10,7,11,19,49,1,1,3,3,3,2,1,3,0,3,1,0,
+		1,3,0,5,1,8,5,25,1,5,1,2,5,13,3,1,1,3,3,3,
+		2,0,1,3,3,3,2,26,1,51,1,5,1,6,5,9,1,3,0,51,
+		3,1,1,3,0,45
+	},
+	[Pois] = (uchar[]){
+		1,3,0,1,3,3,3,2,0
+	},
+};
+
+/* end of generated code */
+
+/* number the values defined in block b: for every
+ * instruction writing a temporary, record its two
+ * arguments, their states, and the resulting
+ * state in the tn[] entry of that temporary
+ */
+static void
+anumber(Num *tn, Blk *b, Con *con)
+{
+	Ins *cur, *end;
+	Num *slot;
+
+	end = &b->ins[b->nins];
+	for (cur=b->ins; cur<end; cur++)
+		if (rtype(cur->to) == RTmp) {
+			slot = &tn[cur->to.val];
+			slot->l = cur->arg[0];
+			slot->r = cur->arg[1];
+			slot->nl = refn(slot->l, tn, con);
+			slot->nr = refn(slot->r, tn, con);
+			slot->n = opn(cur->op, slot->nl, slot->nr);
+		}
+}
+
+/* peel constant offsets off r: as long as r is a
+ * temporary whose state matches the Pob pattern
+ * (constant + tmp), the constant is accumulated
+ * into c through addcon() with multiplier s and r
+ * is replaced by the remaining tmp; the reference
+ * left over is returned
+ */
+static Ref
+adisp(Con *c, Num *tn, Ref r, Fn *fn, int s)
+{
+	Ref parts[2];
+	int state;
+
+	for (;;) {
+		if (req(r, R))
+			return r;
+		assert(rtype(r) == RTmp);
+		state = refn(r, tn, fn->con);
+		if ((match[state] & BIT(Pob)) == 0)
+			return r;
+		runmatch(matcher[Pob], tn, r, parts);
+		assert(rtype(parts[0]) == RCon);
+		addcon(c, &fn->con[parts[0].val], s);
+		r = parts[1];
+	}
+}
+
+/* try to decompose the address held in r into
+ * offset + base + index*scale and store the
+ * result in *a; returns 0 when r is not a
+ * temporary or no valid address can be formed,
+ * and 1 (or the result of addcon() in the
+ * recursive case) otherwise
+ */
+static int
+amatch(Addr *a, Num *tn, Ref r, Fn *fn)
+{
+	static int pat[] = {Pobis, Pobi1, Pbis, Pois, Pbi1, -1};
+	Ref ro, rb, ri, rs, v[4];
+	Con *c, co;
+	int s, n, *p;
+
+	if (rtype(r) != RTmp)
+		return 0;
+
+	/* run the first pattern of pat[] that the
+	 * state of r can match; it fills v[] with
+	 * offset, base, index and scale
+	 */
+	n = refn(r, tn, fn->con);
+	memset(v, 0, sizeof v);
+	for (p=pat; *p>=0; p++)
+		if (match[n] & BIT(*p)) {
+			runmatch(matcher[*p], tn, r, v);
+			break;
+		}
+	/* no pattern applies: r alone is the base */
+	if (*p < 0)
+		v[1] = r;
+
+	memset(&co, 0, sizeof co);
+	ro = v[0];
+	rb = adisp(&co, tn, v[1], fn, 1);
+	ri = v[2];
+	rs = v[3];
+	s = 1;
+
+	/* nothing matched but adisp() peeled offsets
+	 * off the base: retry on what is left and add
+	 * the offsets back on success
+	 */
+	if (*p < 0 && co.type != CUndef)
+	if (amatch(a, tn, rb, fn))
+		return addcon(&a->offset, &co, 1);
+	if (!req(ro, R)) {
+		assert(rtype(ro) == RCon);
+		c = &fn->con[ro.val];
+		if (!addcon(&co, c, 1))
+			return 0;
+	}
+	if (!req(rs, R)) {
+		assert(rtype(rs) == RCon);
+		c = &fn->con[rs.val];
+		assert(c->type == CBits);
+		s = c->bits.i;
+	}
+	ri = adisp(&co, tn, ri, fn, s);
+	*a = (Addr){co, rb, ri, s};
+
+	/* an index that lives in a stack slot has to be
+	 * used as the base instead; this is possible
+	 * only with scale 1 and a base that is not a
+	 * slot itself
+	 */
+	if (rtype(ri) == RTmp)
+	if (fn->tmp[ri.val].slot != -1) {
+		if (a->scale != 1
+		|| fn->tmp[rb.val].slot != -1)
+			return 0;
+		a->base = ri;
+		a->index = rb;
+	}
+	/* refer to a slot-allocated base by its slot */
+	if (!req(a->base, R)) {
+		assert(rtype(a->base) == RTmp);
+		s = fn->tmp[a->base.val].slot;
+		if (s != -1)
+			a->base = SLOT(s);
+	}
+	return 1;
+}
+
+/* instruction selection
+ * requires use counts (as given by parsing)
+ *
+ * first turns constant-size allocs of the start
+ * block into stack slots, then rewrites every
+ * block: phi arguments flowing out of the block
+ * are fixed, the jump is selected, and the
+ * instructions are selected from last to first
+ * into the insb buffer, which replaces the
+ * block's instructions
+ */
+void
+amd64_isel(Fn *fn)
+{
+	Blk *b, **sb;
+	Ins *i;
+	Phi *p;
+	uint a;
+	int n, al;
+	int64_t sz;
+	Num *num;
+
+	/* assign slots to fast allocs */
+	b = fn->start;
+	/* specific to NAlign == 3 */ /* or change n=4 and sz /= 4 below */
+	/* one pass per alloc opcode, n being the
+	 * alignment in bytes of that opcode (4, 8, ...)
+	 */
+	for (al=Oalloc, n=4; al<=Oalloc1; al++, n*=2)
+		for (i=b->ins; i<&b->ins[b->nins]; i++)
+			if (i->op == al) {
+				/* a dynamically sized alloc ends the
+				 * scan for this opcode
+				 */
+				if (rtype(i->arg[0]) != RCon)
+					break;
+				sz = fn->con[i->arg[0].val].bits.i;
+				if (sz < 0 || sz >= INT_MAX-15)
+					err("invalid alloc size %"PRId64, sz);
+				/* round up to the alignment, then
+				 * count in 4-byte slot units
+				 */
+				sz = (sz + n-1) & -n;
+				sz /= 4;
+				if (sz > INT_MAX - fn->slot)
+					die("alloc too large");
+				fn->tmp[i->to.val].slot = fn->slot;
+				fn->slot += sz;
+				fn->salign = 2 + al - Oalloc;
+				*i = (Ins){.op = Onop};
+			}
+
+	/* process basic blocks */
+	n = fn->ntmp;
+	num = emalloc(n * sizeof num[0]);
+	for (b=fn->start; b; b=b->link) {
+		curi = &insb[NIns];
+		/* fix the phi arguments that successors of
+		 * b receive from b
+		 */
+		for (sb=(Blk*[3]){b->s1, b->s2, 0}; *sb; sb++)
+			for (p=(*sb)->phi; p; p=p->link) {
+				for (a=0; p->blk[a] != b; a++)
+					assert(a+1 < p->narg);
+				fixarg(&p->arg[a], p->cls, 0, fn);
+			}
+		memset(num, 0, n * sizeof num[0]);
+		anumber(num, b, fn->con);
+		seljmp(b, fn);
+		/* select from the last instruction to the
+		 * first; selsel() consumes a whole run of
+		 * select instructions and returns where to
+		 * resume
+		 */
+		for (i=&b->ins[b->nins]; i!=b->ins;) {
+			--i;
+			assert(i->op != Osel0);
+			if (i->op == Osel1)
+				i = selsel(fn, b, i, num);
+			else
+				sel(*i, num, fn);
+		}
+		idup(b, curi, &insb[NIns]-curi);
+	}
+	free(num);
+
+	if (debug['I']) {
+		fprintf(stderr, "\n> After instruction selection:\n");
+		printfn(fn, stderr);
+	}
+}
@@ -0,0 +1,721 @@
+#include "all.h"
+
+typedef struct AClass AClass;
+typedef struct RAlloc RAlloc;
+
+/* classification of one argument, parameter or
+ * return value for the sysv calling convention
+ */
+struct AClass {
+	Typ *type;  /* aggregate type, set by typclass() */
+	int inmem;  /* 0: registers, 1: aggregate in memory, 2: scalar on the stack */
+	int align;  /* log2 of the alignment in bytes */
+	uint size;  /* size in bytes, rounded up by typclass() */
+	int cls[2]; /* register class of each eightbyte */
+	Ref ref[2]; /* temporaries holding each eightbyte */
+};
+
+/* linked list of alloc instructions created while
+ * lowering calls; amd64_sysv_abi() emits them in
+ * the start block
+ */
+struct RAlloc {
+	Ins i;
+	RAlloc *link;
+};
+
+/* walk the fields of t, starting at byte offset s
+ * of the outermost aggregate, and record in
+ * a->cls[] the register class of each eightbyte:
+ * integer fields force Kl, floating-point fields
+ * set Kd only while the eightbyte is still Kx;
+ * nested types are classified recursively
+ */
+static void
+classify(AClass *a, Typ *t, uint s)
+{
+	Field *fld;
+	Typ *sub;
+	int *kp;
+	uint u, off;
+
+	for (u=0; u<t->nunion; u++) {
+		/* every union variant starts over at s */
+		off = s;
+		for (fld=t->fields[u]; fld->type!=FEnd; fld++) {
+			assert(off <= 16);
+			kp = &a->cls[off/8];
+			if (fld->type == FTyp) {
+				sub = &typ[fld->len];
+				classify(a, sub, off);
+				off += sub->size;
+				continue;
+			}
+			switch (fld->type) {
+			case FEnd:
+				die("unreachable");
+			case FPad:
+				/* padding keeps the class */
+				break;
+			case Fs:
+			case Fd:
+				if (*kp == Kx)
+					*kp = Kd;
+				break;
+			case Fb:
+			case Fh:
+			case Fw:
+			case Fl:
+				*kp = Kl;
+				break;
+			default:
+				/* other kinds do not advance */
+				continue;
+			}
+			off += fld->len;
+		}
+	}
+}
+
+/* fill *a with the classification of aggregate
+ * type t: its rounded size, its alignment, whether
+ * it is passed in memory and, if not, the class
+ * of each of its eightbytes
+ */
+static void
+typclass(AClass *a, Typ *t)
+{
+	uint al, sz;
+
+	/* the ABI requires sizes to be rounded
+	 * up to the nearest multiple of 8, moreover
+	 * it makes it easy to load and store
+	 * structures in registers
+	 */
+	al = 1u << t->align;
+	if (al < 8)
+		al = 8;
+	sz = t->size;
+	sz = (sz + al-1) & -al;
+
+	a->type = t;
+	a->align = t->align;
+	a->size = sz;
+
+	/* large or unaligned structures are
+	 * required to be passed in memory
+	 */
+	a->inmem = t->isdark || sz > 16 || sz == 0;
+	if (!a->inmem) {
+		a->cls[0] = a->cls[1] = Kx;
+		classify(a, t, 0);
+	}
+}
+
+/* pick the return registers of a register-returned
+ * aggregate: integer eightbytes use RAX then RDX,
+ * sse eightbytes use XMM0 then XMM1; the chosen
+ * registers are stored in reg[] and the count
+ * bits are returned (1 per gp register, 4 per
+ * sse register, as in the RCall layout)
+ */
+static int
+retr(Ref reg[2], AClass *aret)
+{
+	static int retreg[2][2] = {{RAX, RDX}, {XMM0, XMM0+1}};
+	int used[2] = {0, 0};
+	int word, base, ca;
+	uint off;
+
+	ca = 0;
+	for (word=0, off=0; off<aret->size; word++, off+=8) {
+		base = KBASE(aret->cls[word]);
+		reg[word] = TMP(retreg[base][used[base]]);
+		used[base]++;
+		ca += 1 << (2 * base);
+	}
+	return ca;
+}
+
+/* lower the return jump of block b: the returned
+ * value is moved to the return registers (or
+ * copied to the caller-provided buffer), the jump
+ * becomes Jret0 and its argument a CALL()
+ * descriptor of the registers used; blocks not
+ * ending in a value-returning jump are left alone
+ * (instructions are emitted last-first)
+ */
+static void
+selret(Blk *b, Fn *fn)
+{
+	int j, k, ca;
+	Ref r, r0, reg[2];
+	AClass aret;
+
+	j = b->jmp.type;
+
+	if (!isret(j) || j == Jret0)
+		return;
+
+	r0 = b->jmp.arg;
+	b->jmp.type = Jret0;
+
+	if (j == Jretc) {
+		typclass(&aret, &typ[fn->retty]);
+		if (aret.inmem) {
+			/* blit the aggregate to the buffer whose
+			 * address is in fn->retr and return that
+			 * address in rax
+			 */
+			assert(rtype(fn->retr) == RTmp);
+			emit(Ocopy, Kl, TMP(RAX), fn->retr, R);
+			emit(Oblit1, 0, R, INT(aret.type->size), R);
+			emit(Oblit0, 0, R, r0, fn->retr);
+			ca = 1;
+		} else {
+			/* load each eightbyte into its register */
+			ca = retr(reg, &aret);
+			if (aret.size > 8) {
+				r = newtmp("abi", Kl, fn);
+				emit(Oload, Kl, reg[1], r, R);
+				emit(Oadd, Kl, r, r0, getcon(8, fn));
+			}
+			emit(Oload, Kl, reg[0], r0, R);
+		}
+	} else {
+		/* scalar return; the class is derived from
+		 * the jump type
+		 */
+		k = j - Jretw;
+		if (KBASE(k) == 0) {
+			emit(Ocopy, k, TMP(RAX), r0, R);
+			ca = 1;
+		} else {
+			emit(Ocopy, k, TMP(XMM0), r0, R);
+			ca = 1 << 2;
+		}
+	}
+
+	b->jmp.arg = CALL(ca);
+}
+
+/* classify the arguments (op == Oarg) or
+ * parameters (op == Opar) in [i0, i1) into ac[],
+ * one entry per instruction; aret, when non-null,
+ * is the classification of an aggregate return
+ * value; *env receives the environment reference
+ * if there is one
+ *
+ * returns the register usage in the RCall layout:
+ * gp registers used at bit 4, sse registers used
+ * at bit 8, and bit 12 set for a variadic or env
+ * call
+ */
+static int
+argsclass(Ins *i0, Ins *i1, AClass *ac, int op, AClass *aret, Ref *env)
+{
+	int varc, envc, nint, ni, nsse, ns, n, *pn;
+	AClass *a;
+	Ins *i;
+
+	/* nint and nsse count the registers still free */
+	if (aret && aret->inmem)
+		nint = 5; /* hidden argument */
+	else
+		nint = 6;
+	nsse = 8;
+	varc = 0;
+	envc = 0;
+	for (i=i0, a=ac; i<i1; i++, a++)
+		/* map the Opar* opcodes onto the Oarg* ones */
+		switch (i->op - op + Oarg) {
+		case Oarg:
+			if (KBASE(i->cls) == 0)
+				pn = &nint;
+			else
+				pn = &nsse;
+			if (*pn > 0) {
+				--*pn;
+				a->inmem = 0;
+			} else
+				a->inmem = 2;
+			a->align = 3;
+			a->size = 8;
+			a->cls[0] = i->cls;
+			break;
+		case Oargc:
+			n = i->arg[0].val;
+			typclass(a, &typ[n]);
+			if (a->inmem)
+				continue;
+			/* an aggregate goes to registers only
+			 * if all its eightbytes fit
+			 */
+			ni = ns = 0;
+			for (n=0; (uint)n*8<a->size; n++)
+				if (KBASE(a->cls[n]) == 0)
+					ni++;
+				else
+					ns++;
+			if (nint >= ni && nsse >= ns) {
+				nint -= ni;
+				nsse -= ns;
+			} else
+				a->inmem = 1;
+			break;
+		case Oarge:
+			envc = 1;
+			if (op == Opar)
+				*env = i->to;
+			else
+				*env = i->arg[0];
+			break;
+		case Oargv:
+			varc = 1;
+			break;
+		default:
+			die("unreachable");
+		}
+
+	if (varc && envc)
+		err("sysv abi does not support variadic env calls");
+
+	return ((varc|envc) << 12) | ((6-nint) << 4) | ((8-nsse) << 8);
+}
+
+/* caller-save registers, terminated by -1; the
+ * leading gp entries are also indexed by
+ * amd64_sysv_argregs() and rarg() to find the
+ * n-th integer argument register, so their order
+ * matters
+ */
+int amd64_sysv_rsave[] = {
+	RDI, RSI, RDX, RCX, R8, R9, R10, R11, RAX,
+	XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+	XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, -1
+};
+/* callee-save registers, terminated by -1 */
+int amd64_sysv_rclob[] = {RBX, R12, R13, R14, R15, -1};
+
+/* the array lengths must agree with the register
+ * counts declared next to the Amd64Reg enum
+ */
+MAKESURE(sysv_arrays_ok,
+	sizeof amd64_sysv_rsave == (NGPS_SYSV+NFPS+1) * sizeof(int) &&
+	sizeof amd64_sysv_rclob == (NCLR_SYSV+1) * sizeof(int)
+);
+
+/* layout of call's second argument (RCall)
+ *
+ *  29     12    8    4  3  0
+ *  |0...00|x|xxxx|xxxx|xx|xx|                  range
+ *          |    |    |  |  ` gp regs returned (0..2)
+ *          |    |    |  ` sse regs returned   (0..2)
+ *          |    |    ` gp regs passed         (0..6)
+ *          |    ` sse regs passed             (0..8)
+ *          ` 1 if rax is used to pass data    (0..1)
+ */
+
+/* decode the registers a call returns values in:
+ * r is an RCall reference whose bits 0-1 hold the
+ * number of gp registers and bits 2-3 the number
+ * of sse registers; returns the register set and,
+ * when p is non-null, stores the two counts in
+ * p[0] (gp) and p[1] (sse)
+ */
+bits
+amd64_sysv_retregs(Ref r, int p[2])
+{
+	bits set;
+	int ngp, nsse;
+
+	assert(rtype(r) == RCall);
+	ngp = r.val & 3;
+	nsse = (r.val >> 2) & 3;
+	if (p) {
+		p[0] = ngp;
+		p[1] = nsse;
+	}
+	set = 0;
+	if (ngp > 0) {
+		set |= BIT(RAX);
+		if (ngp > 1)
+			set |= BIT(RDX);
+	}
+	if (nsse > 0) {
+		set |= BIT(XMM0);
+		if (nsse > 1)
+			set |= BIT(XMM1);
+	}
+	return set;
+}
+
+/* decode the registers a call passes arguments in:
+ * r is an RCall reference whose bits 4-7 hold the
+ * number of gp registers, bits 8-11 the number of
+ * sse registers and bit 12 tells if rax carries
+ * data; returns the register set and, when p is
+ * non-null, stores the gp count (rax included) in
+ * p[0] and the sse count in p[1]
+ */
+bits
+amd64_sysv_argregs(Ref r, int p[2])
+{
+	bits set;
+	int ngp, nsse, rax, k;
+
+	assert(rtype(r) == RCall);
+	ngp = (r.val >> 4) & 15;
+	nsse = (r.val >> 8) & 15;
+	rax = (r.val >> 12) & 1;
+	if (p) {
+		p[0] = ngp + rax;
+		p[1] = nsse;
+	}
+	set = 0;
+	if (rax)
+		set |= BIT(RAX);
+	k = 0;
+	while (k < ngp) {
+		set |= BIT(amd64_sysv_rsave[k]);
+		k++;
+	}
+	k = 0;
+	while (k < nsse) {
+		set |= BIT(XMM0+k);
+		k++;
+	}
+	return set;
+}
+
+/* return the next free argument register for a
+ * value of class ty and advance the matching
+ * counter: *ni indexes the integer registers of
+ * amd64_sysv_rsave, *ns counts from XMM0
+ */
+static Ref
+rarg(int ty, int *ni, int *ns)
+{
+	int reg;
+
+	if (KBASE(ty) != 0) {
+		reg = XMM0 + *ns;
+		++*ns;
+	} else {
+		reg = amd64_sysv_rsave[*ni];
+		++*ni;
+	}
+	return TMP(reg);
+}
+
+/* lower the call instruction i1 together with its
+ * argument instructions [i0, i1); instructions
+ * are emitted last-first, so this function reads
+ * from the end of the generated sequence towards
+ * its beginning: stack release, return value
+ * handling, the call itself, register arguments
+ * and finally memory arguments; the alloc needed
+ * for an aggregate return value is pushed on *rap
+ */
+static void
+selcall(Fn *fn, Ins *i0, Ins *i1, RAlloc **rap)
+{
+	Ins *i;
+	AClass *ac, *a, aret;
+	int ca, ni, ns, al;
+	uint stk, off;
+	Ref r, r1, r2, reg[2], env;
+	RAlloc *ra;
+
+	env = R;
+	ac = alloc((i1-i0) * sizeof ac[0]);
+
+	/* a non-R second argument is the type of an
+	 * aggregate return value
+	 */
+	if (!req(i1->arg[1], R)) {
+		assert(rtype(i1->arg[1]) == RType);
+		typclass(&aret, &typ[i1->arg[1].val]);
+		ca = argsclass(i0, i1, ac, Oarg, &aret, &env);
+	} else
+		ca = argsclass(i0, i1, ac, Oarg, 0, &env);
+
+	/* size of the memory argument area; sizes are
+	 * multiples of 8, so adding stk & 15 pads to a
+	 * multiple of 16
+	 */
+	for (stk=0, a=&ac[i1-i0]; a>ac;)
+		if ((--a)->inmem) {
+			if (a->align > 4)
+				err("sysv abi requires alignments of 16 or less");
+			stk += a->size;
+			if (a->align == 4)
+				stk += stk & 15;
+		}
+	stk += stk & 15;
+	if (stk) {
+		/* negative size: presumably releases the
+		 * argument area after the call -- confirm
+		 * against the Osalloc emitter
+		 */
+		r = getcon(-(int64_t)stk, fn);
+		emit(Osalloc, Kl, R, r, R);
+	}
+
+	if (!req(i1->arg[1], R)) {
+		if (aret.inmem) {
+			/* get the return location from rax
+			 * it saves one callee-save reg */
+			r1 = newtmp("abi", Kl, fn);
+			emit(Ocopy, Kl, i1->to, TMP(RAX), R);
+			ca += 1;
+		} else {
+			/* todo, may read out of bounds.
+			 * gcc did this up until 5.2, but
+			 * this should still be fixed.
+			 */
+			if (aret.size > 8) {
+				r = newtmp("abi", Kl, fn);
+				aret.ref[1] = newtmp("abi", aret.cls[1], fn);
+				emit(Ostorel, 0, R, aret.ref[1], r);
+				emit(Oadd, Kl, r, i1->to, getcon(8, fn));
+			}
+			aret.ref[0] = newtmp("abi", aret.cls[0], fn);
+			emit(Ostorel, 0, R, aret.ref[0], i1->to);
+			ca += retr(reg, &aret);
+			if (aret.size > 8)
+				emit(Ocopy, aret.cls[1], aret.ref[1], reg[1], R);
+			emit(Ocopy, aret.cls[0], aret.ref[0], reg[0], R);
+			r1 = i1->to;
+		}
+		/* allocate return pad */
+		ra = alloc(sizeof *ra);
+		/* specific to NAlign == 3 */
+		al = aret.align >= 2 ? aret.align - 2 : 0;
+		ra->i = (Ins){Oalloc+al, Kl, r1, {getcon(aret.size, fn)}};
+		ra->link = (*rap);
+		*rap = ra;
+	} else {
+		/* scalar return value in rax or xmm0 */
+		ra = 0;
+		if (KBASE(i1->cls) == 0) {
+			emit(Ocopy, i1->cls, i1->to, TMP(RAX), R);
+			ca += 1;
+		} else {
+			emit(Ocopy, i1->cls, i1->to, TMP(XMM0), R);
+			ca += 1 << 2;
+		}
+	}
+
+	emit(Ocall, i1->cls, R, i1->arg[0], CALL(ca));
+
+	/* rax carries either the environment or, for a
+	 * variadic call, the number of sse registers
+	 * used
+	 */
+	if (!req(R, env))
+		emit(Ocopy, Kl, TMP(RAX), env, R);
+	else if ((ca >> 12) & 1) /* vararg call */
+		emit(Ocopy, Kw, TMP(RAX), getcon((ca >> 8) & 15, fn), R);
+
+	ni = ns = 0;
+	if (ra && aret.inmem)
+		emit(Ocopy, Kl, rarg(Kl, &ni, &ns), ra->i.to, R); /* pass hidden argument */
+
+	/* arguments passed in registers */
+	for (i=i0, a=ac; i<i1; i++, a++) {
+		if (i->op >= Oarge || a->inmem)
+			continue;
+		r1 = rarg(a->cls[0], &ni, &ns);
+		if (i->op == Oargc) {
+			if (a->size > 8) {
+				r2 = rarg(a->cls[1], &ni, &ns);
+				r = newtmp("abi", Kl, fn);
+				emit(Oload, a->cls[1], r2, r, R);
+				emit(Oadd, Kl, r, i->arg[1], getcon(8, fn));
+			}
+			emit(Oload, a->cls[0], r1, i->arg[1], R);
+		} else
+			emit(Ocopy, i->cls, r1, i->arg[0], R);
+	}
+
+	if (!stk)
+		return;
+
+	/* arguments passed in memory: r is the base of
+	 * the argument area, off the running offset
+	 */
+	r = newtmp("abi", Kl, fn);
+	for (i=i0, a=ac, off=0; i<i1; i++, a++) {
+		if (i->op >= Oarge || !a->inmem)
+			continue;
+		r1 = newtmp("abi", Kl, fn);
+		if (i->op == Oargc) {
+			if (a->align == 4)
+				off += off & 15;
+			emit(Oblit1, 0, R, INT(a->type->size), R);
+			emit(Oblit0, 0, R, i->arg[1], r1);
+		} else
+			emit(Ostorel, 0, R, i->arg[0], r1);
+		emit(Oadd, Kl, r1, r, getcon(off, fn));
+		off += a->size;
+	}
+	emit(Osalloc, Kl, r, getcon(stk, fn), R);
+}
+
+/* lower the parameter instructions [i0, i1) of
+ * function fn into the insb buffer (which is
+ * reset first); sets fn->reg to the registers
+ * carrying arguments and fn->retr to the hidden
+ * return pointer when the return value lives in
+ * memory
+ *
+ * returns the argsclass() descriptor or'ed with
+ * (s*4)<<12, where s counts in 4-byte units the
+ * stack area covered by named parameters
+ * (selvastart() reads it back as a byte offset
+ * from rbp)
+ */
+static int
+selpar(Fn *fn, Ins *i0, Ins *i1)
+{
+	AClass *ac, *a, aret;
+	Ins *i;
+	int ni, ns, s, al, fa;
+	Ref r, env;
+
+	env = R;
+	ac = alloc((i1-i0) * sizeof ac[0]);
+	curi = &insb[NIns];
+	ni = ns = 0;
+
+	if (fn->retty >= 0) {
+		typclass(&aret, &typ[fn->retty]);
+		fa = argsclass(i0, i1, ac, Opar, &aret, &env);
+	} else
+		fa = argsclass(i0, i1, ac, Opar, 0, &env);
+	fn->reg = amd64_sysv_argregs(CALL(fa), 0);
+
+	/* aggregates received in registers get a stack
+	 * allocation into which their eightbytes are
+	 * stored
+	 */
+	for (i=i0, a=ac; i<i1; i++, a++) {
+		if (i->op != Oparc || a->inmem)
+			continue;
+		if (a->size > 8) {
+			r = newtmp("abi", Kl, fn);
+			a->ref[1] = newtmp("abi", Kl, fn);
+			emit(Ostorel, 0, R, a->ref[1], r);
+			emit(Oadd, Kl, r, i->to, getcon(8, fn));
+		}
+		a->ref[0] = newtmp("abi", Kl, fn);
+		emit(Ostorel, 0, R, a->ref[0], i->to);
+		/* specific to NAlign == 3 */
+		al = a->align >= 2 ? a->align - 2 : 0;
+		emit(Oalloc+al, Kl, i->to, getcon(a->size, fn), R);
+	}
+
+	/* the hidden return pointer takes the first
+	 * integer argument register
+	 */
+	if (fn->retty >= 0 && aret.inmem) {
+		r = newtmp("abi", Kl, fn);
+		emit(Ocopy, Kl, r, rarg(Kl, &ni, &ns), R);
+		fn->retr = r;
+	}
+
+	/* s is the next stack slot holding an incoming
+	 * memory parameter, in 4-byte units
+	 */
+	for (i=i0, a=ac, s=4; i<i1; i++, a++) {
+		switch (a->inmem) {
+		case 1:
+			/* aggregate in memory: use it in place
+			 * through a negative slot number
+			 */
+			if (a->align > 4)
+				err("sysv abi requires alignments of 16 or less");
+			if (a->align == 4)
+				s = (s+3) & -4;
+			fn->tmp[i->to.val].slot = -s;
+			s += a->size / 4;
+			continue;
+		case 2:
+			/* scalar on the stack: load it */
+			emit(Oload, i->cls, i->to, SLOT(-s), R);
+			s += 2;
+			continue;
+		}
+		if (i->op == Opare)
+			continue;
+		r = rarg(a->cls[0], &ni, &ns);
+		if (i->op == Oparc) {
+			emit(Ocopy, a->cls[0], a->ref[0], r, R);
+			if (a->size > 8) {
+				r = rarg(a->cls[1], &ni, &ns);
+				emit(Ocopy, a->cls[1], a->ref[1], r, R);
+			}
+		} else
+			emit(Ocopy, i->cls, i->to, r, R);
+	}
+
+	/* the environment is received in rax */
+	if (!req(R, env))
+		emit(Ocopy, Kl, env, TMP(RAX), R);
+
+	return fa | (s*4)<<12;
+}
+
+/* create a new block holding the instructions
+ * currently accumulated in insb, reset the buffer,
+ * and link the new block right after b; the new
+ * block is named after b with a numeric suffix
+ * taken from b's incremented visit counter
+ */
+static Blk *
+split(Fn *fn, Blk *b)
+{
+	Blk *nb;
+	Ins *end;
+
+	end = &insb[NIns];
+	++fn->nblk;
+	nb = newblk();
+	idup(nb, curi, end-curi);
+	curi = end;
+	b->visit++;
+	nb->visit = b->visit;
+	nb->name = strf(PFn, "%s.%d", b->name, b->visit);
+	nb->loop = b->loop;
+	nb->link = b->link;
+	b->link = nb;
+	return nb;
+}
+
+/* in every phi of block b, replace the predecessor
+ * bp by bp1; bp must be a predecessor of each phi
+ */
+static void
+chpred(Blk *b, Blk *bp, Blk *bp1)
+{
+	Phi *phi;
+	uint n;
+
+	for (phi=b->phi; phi; phi=phi->link) {
+		n = 0;
+		while (phi->blk[n] != bp) {
+			assert(n+1<phi->narg);
+			n++;
+		}
+		phi->blk[n] = bp1;
+	}
+}
+
+/* lower the vaarg instruction i of block b; the
+ * block is split in four (see the sketch below):
+ * b keeps the code before i and tests whether a
+ * register save slot is left, @breg and @bstk
+ * compute the location of the argument in the
+ * register save area or on the stack, and @b0
+ * loads it and continues with the rest of b
+ * (instructions are emitted last-first, so each
+ * group below is written in reverse)
+ */
+static void
+selvaarg(Fn *fn, Blk *b, Ins *i)
+{
+	Ref loc, lreg, lstk, nr, r0, r1, c4, c8, c16, c, ap;
+	Blk *b0, *bstk, *breg;
+	int isint;
+
+	c4 = getcon(4, fn);
+	c8 = getcon(8, fn);
+	c16 = getcon(16, fn);
+	ap = i->arg[0];
+	isint = KBASE(i->cls) == 0;
+
+	/* @b [...]
+	       r0 =l add ap, (0 or 4)
+	       nr =l loadsw r0
+	       r1 =w cultw nr, (48 or 176)
+	       jnz r1, @breg, @bstk
+	   @breg
+	       r0 =l add ap, 16
+	       r1 =l loadl r0
+	       lreg =l add r1, nr
+	       r0 =w add nr, (8 or 16)
+	       r1 =l add ap, (0 or 4)
+	       storew r0, r1
+	   @bstk
+	       r0 =l add ap, 8
+	       lstk =l loadl r0
+	       r1 =l add lstk, 8
+	       storel r1, r0
+	   @b0
+	       %loc =l phi @breg %lreg, @bstk %lstk
+	       i->to =(i->cls) load %loc
+	*/
+
+	/* @b0: takes over the jump and successors of b */
+	loc = newtmp("abi", Kl, fn);
+	emit(Oload, i->cls, i->to, loc, R);
+	b0 = split(fn, b);
+	b0->jmp = b->jmp;
+	b0->s1 = b->s1;
+	b0->s2 = b->s2;
+	if (b->s1)
+		chpred(b->s1, b, b0);
+	if (b->s2 && b->s2 != b->s1)
+		chpred(b->s2, b, b0);
+
+	/* @breg */
+	lreg = newtmp("abi", Kl, fn);
+	nr = newtmp("abi", Kl, fn);
+	r0 = newtmp("abi", Kw, fn);
+	r1 = newtmp("abi", Kl, fn);
+	emit(Ostorew, Kw, R, r0, r1);
+	emit(Oadd, Kl, r1, ap, isint ? CON_Z : c4);
+	emit(Oadd, Kw, r0, nr, isint ? c8 : c16);
+	r0 = newtmp("abi", Kl, fn);
+	r1 = newtmp("abi", Kl, fn);
+	emit(Oadd, Kl, lreg, r1, nr);
+	emit(Oload, Kl, r1, r0, R);
+	emit(Oadd, Kl, r0, ap, c16);
+	breg = split(fn, b);
+	breg->jmp.type = Jjmp;
+	breg->s1 = b0;
+
+	/* @bstk */
+	lstk = newtmp("abi", Kl, fn);
+	r0 = newtmp("abi", Kl, fn);
+	r1 = newtmp("abi", Kl, fn);
+	emit(Ostorel, Kw, R, r1, r0);
+	emit(Oadd, Kl, r1, lstk, c8);
+	emit(Oload, Kl, lstk, r0, R);
+	emit(Oadd, Kl, r0, ap, c8);
+	bstk = split(fn, b);
+	bstk->jmp.type = Jjmp;
+	bstk->s1 = b0;
+
+	/* phi selecting the location in @b0 */
+	b0->phi = alloc(sizeof *b0->phi);
+	*b0->phi = (Phi){
+		.cls = Kl, .to = loc,
+		.narg = 2,
+		.blk = vnew(2, sizeof b0->phi->blk[0], PFn),
+		.arg = vnew(2, sizeof b0->phi->arg[0], PFn),
+	};
+	b0->phi->blk[0] = bstk;
+	b0->phi->blk[1] = breg;
+	b0->phi->arg[0] = lstk;
+	b0->phi->arg[1] = lreg;
+	/* end of @b: test the offset and branch */
+	r0 = newtmp("abi", Kl, fn);
+	r1 = newtmp("abi", Kw, fn);
+	b->jmp.type = Jjnz;
+	b->jmp.arg = r1;
+	b->s1 = breg;
+	b->s2 = bstk;
+	c = getcon(isint ? 48 : 176, fn);
+	emit(Ocmpw+Ciult, Kw, r1, nr, c);
+	emit(Oloadsw, Kl, nr, r0, R);
+	emit(Oadd, Kl, r0, ap, isint ? CON_Z : c4);
+}
+
+/* lower vastart: initialize the variable argument
+ * list at ap from the descriptor fa returned by
+ * selpar(); in program order (emission is
+ * last-first) the generated code stores
+ *   ap+0:  gp, 8 bytes per gp register used
+ *   ap+4:  fp, 48 plus 16 bytes per sse register used
+ *   ap+8:  rbp + sp, sp being fa >> 12
+ *   ap+16: rbp - 176
+ */
+static void
+selvastart(Fn *fn, int fa, Ref ap)
+{
+	Ref r0, r1;
+	int gp, fp, sp;
+
+	gp = ((fa >> 4) & 15) * 8;
+	fp = 48 + ((fa >> 8) & 15) * 16;
+	sp = fa >> 12;
+	r0 = newtmp("abi", Kl, fn);
+	r1 = newtmp("abi", Kl, fn);
+	emit(Ostorel, Kw, R, r1, r0);
+	emit(Oadd, Kl, r1, TMP(RBP), getcon(-176, fn));
+	emit(Oadd, Kl, r0, ap, getcon(16, fn));
+	r0 = newtmp("abi", Kl, fn);
+	r1 = newtmp("abi", Kl, fn);
+	emit(Ostorel, Kw, R, r1, r0);
+	emit(Oadd, Kl, r1, TMP(RBP), getcon(sp, fn));
+	emit(Oadd, Kl, r0, ap, getcon(8, fn));
+	r0 = newtmp("abi", Kl, fn);
+	emit(Ostorew, Kw, R, getcon(fp, fn), r0);
+	emit(Oadd, Kl, r0, ap, getcon(4, fn));
+	emit(Ostorew, Kw, R, getcon(gp, fn), ap);
+}
+
+/* sysv ABI lowering pass: rewrites parameters,
+ * calls, returns and variadic instructions of fn
+ * into explicit register and stack operations
+ */
+void
+amd64_sysv_abi(Fn *fn)
+{
+	Blk *b;
+	Ins *i, *i0;
+	RAlloc *ral;
+	int n0, n1, ioff, fa;
+
+	/* visit is used below to skip blocks created by
+	 * split(), which gives them a non-zero value
+	 */
+	for (b=fn->start; b; b=b->link)
+		b->visit = 0;
+
+	/* lower parameters */
+	for (b=fn->start, i=b->ins; i<&b->ins[b->nins]; i++)
+		if (!ispar(i->op))
+			break;
+	fa = selpar(fn, b->ins, i);
+	/* replace the n-th first parameter instructions
+	 * of the start block by the n0 instructions
+	 * selpar() left in insb
+	 */
+	n0 = &insb[NIns] - curi;
+	ioff = i - b->ins;
+	n1 = b->nins - ioff;
+	vgrow(&b->ins, n0+n1);
+	icpy(b->ins+n0, b->ins+ioff, n1);
+	icpy(b->ins, curi, n0);
+	b->nins = n0+n1;
+
+	/* lower calls, returns, and vararg instructions */
+	ral = 0;
+	b = fn->start;
+	do {
+		if (!(b = b->link))
+			b = fn->start; /* do it last */
+		if (b->visit)
+			continue;
+		curi = &insb[NIns];
+		selret(b, fn);
+		/* walk the block backwards, re-emitting or
+		 * lowering each instruction
+		 */
+		for (i=&b->ins[b->nins]; i!=b->ins;)
+			switch ((--i)->op) {
+			default:
+				emiti(*i);
+				break;
+			case Ocall:
+				/* gather the argument instructions
+				 * that precede the call
+				 */
+				for (i0=i; i0>b->ins; i0--)
+					if (!isarg((i0-1)->op))
+						break;
+				selcall(fn, i0, i, &ral);
+				i = i0;
+				break;
+			case Ovastart:
+				selvastart(fn, fa, i->arg[0]);
+				break;
+			case Ovaarg:
+				selvaarg(fn, b, i);
+				break;
+			case Oarg:
+			case Oargc:
+				die("unreachable");
+			}
+		/* the start block comes last, so all the
+		 * return pads collected by selcall() can be
+		 * allocated at its top
+		 */
+		if (b == fn->start)
+			for (; ral; ral=ral->link)
+				emiti(ral->i);
+		idup(b, curi, &insb[NIns]-curi);
+	} while (b != fn->start);
+
+	if (debug['A']) {
+		fprintf(stderr, "\n> After ABI lowering:\n");
+		printfn(fn, stderr);
+	}
+}
@@ -0,0 +1,67 @@
+#include "all.h"
+
+/* per-opcode amd64 properties, indexed by opcode;
+ * built by expanding the O() entries of ../ops.h,
+ * whose X(nm, zf, lf) part initializes the nmem,
+ * zflag and lflag fields in that order
+ */
+Amd64Op amd64_op[NOp] = {
+#define O(op, t, x) [O##op] =
+#define X(nm, zf, lf) { nm, zf, lf, },
+	#include "../ops.h"
+};
+
+/* return the nmem property of opcode op, as
+ * recorded in amd64_op[]
+ */
+static int
+amd64_memargs(int op)
+{
+	Amd64Op *o;
+
+	o = &amd64_op[op];
+	return o->nmem;
+}
+
+/* Target fields shared by all the amd64 targets
+ * defined below; expands to a comma-terminated
+ * list of designated initializers
+ */
+#define AMD64_COMMON \
+	.gpr0 = RAX, \
+	.ngpr = NGPR, \
+	.fpr0 = XMM0, \
+	.nfpr = NFPR, \
+	.rglob = BIT(RBP) | BIT(RSP), \
+	.nrglob = 2, \
+	.memargs = amd64_memargs, \
+	.abi0 = elimsb, \
+	.isel = amd64_isel, \
+	.cansel = 1,
+
+/* amd64 target using the sysv ABI lowering and
+ * the elf_emitfin finalizer, with ".L" as the
+ * local symbol prefix
+ */
+Target T_amd64_sysv = {
+	.name = "amd64_sysv",
+	.emitfin = elf_emitfin,
+	.asloc = ".L",
+	.abi1 = amd64_sysv_abi,
+	.rsave = amd64_sysv_rsave,
+	.nrsave = {NGPS_SYSV, NFPS},
+	.retregs = amd64_sysv_retregs,
+	.argregs = amd64_sysv_argregs,
+	.emitfn = amd64_sysv_emitfn,
+	AMD64_COMMON
+};
+
+/* amd64 target for Apple platforms: same sysv ABI
+ * lowering and emitter as T_amd64_sysv, but with
+ * the macho_emitfin finalizer, "L" as the local
+ * symbol prefix and "_" as the symbol prefix
+ */
+Target T_amd64_apple = {
+	.name = "amd64_apple",
+	.apple = 1,
+	.emitfin = macho_emitfin,
+	.asloc = "L",
+	.assym = "_",
+	.abi1 = amd64_sysv_abi,
+	.rsave = amd64_sysv_rsave,
+	.nrsave = {NGPS_SYSV, NFPS},
+	.retregs = amd64_sysv_retregs,
+	.argregs = amd64_sysv_argregs,
+	.emitfn = amd64_sysv_emitfn,
+	AMD64_COMMON
+};
+
+/* amd64 target for Windows: uses the winabi
+ * lowering, register tables and emitter, with the
+ * pe_emitfin finalizer and "L" as the local
+ * symbol prefix
+ */
+Target T_amd64_win = {
+	.name = "amd64_win",
+	.windows = 1,
+	.emitfin = pe_emitfin,
+	.asloc = "L",
+	.abi1 = amd64_winabi_abi,
+	.rsave = amd64_winabi_rsave,
+	.nrsave = {NGPS_WIN, NFPS},
+	.retregs = amd64_winabi_retregs,
+	.argregs = amd64_winabi_argregs,
+	.emitfn = amd64_winabi_emitfn,
+	AMD64_COMMON
+};
@@ -0,0 +1,763 @@
+#include "all.h"
+
+#include <stdbool.h>
+
+// How a single argument (or parameter) instruction is handled under the
+// win x64 convention, as decided by classify_arguments().
+typedef enum ArgPassStyle {
+  // Zero value. The classifier never assigns it, and both lowering switches
+  // treat it as unreachable.
+  APS_Invalid = 0,
+  // The value itself travels in an argument register.
+  APS_Register,
+  // No register left: the value is stored in a stack slot.
+  APS_InlineOnStack,
+  // The aggregate is copied to a fresh alloca and a pointer to that copy is
+  // passed in a register.
+  APS_CopyAndPointerInRegister,
+  // Same copy, but the pointer is passed in a stack slot.
+  APS_CopyAndPointerOnStack,
+  // Marker for Oargv (variadic call); no value is moved for it.
+  APS_VarargsTag,
+  // Marker for Oarge/Opare (env); the env value is handled separately via RAX.
+  APS_EnvTag,
+} ArgPassStyle;
+
+// Classification of one argument/parameter instruction, filled in by
+// classify_arguments() and consumed by lower_call() and
+// lower_func_parameters().
+typedef struct ArgClass {
+  // NOTE(review): no code in this file appears to assign this field.
+  Typ* type;
+  // How the value is passed (register, stack slot, copy + pointer, or tag).
+  ArgPassStyle style;
+  // classify_arguments() always stores 3 here; lower_call() rejects values
+  // greater than 4.
+  int align;
+  // 8 for basic values, the aggregate's type->size for Oargc/Oparc.
+  uint size;
+  // Class (Kw/Kl/Ks/Kd) used when moving the value.
+  int cls;
+  // Temporary created in lower_func_parameters() for Oparc parameters.
+  Ref ref;
+} ArgClass;
+
+// Singly linked list node holding one alloc instruction that call lowering
+// needs (return pads, struct copies). New nodes are pushed at the head, and
+// the whole list is emitted when the start block is processed.
+typedef struct ExtraAlloc ExtraAlloc;
+struct ExtraAlloc {
+  Ins instr;         // the alloc instruction to emit in the start block
+  ExtraAlloc* link;  // next node, or NULL at the end of the list
+};
+
+// Round n down / up to a multiple of a. The mask arithmetic is only correct
+// when a is a power of two. ALIGN_UP evaluates a twice.
+#define ALIGN_DOWN(n, a) ((n) & ~((a)-1))
+#define ALIGN_UP(n, a) ALIGN_DOWN((n) + (a)-1, (a))
+
+// Number of stack bytes required to be reserved for the callee.
+#define SHADOW_SPACE_SIZE 32
+
+// Caller-save registers, terminated by -1. register_for_arg() indexes the
+// first four entries as the integer argument registers, so RCX, RDX, R8, R9
+// must stay first and in this order.
+// NOTE(review): the Microsoft x64 convention documents XMM6-XMM15 as
+// nonvolatile (callee-saved); here XMM6-XMM14 are listed as caller-save and
+// amd64_winabi_rclob has no XMM entries -- confirm this is intended.
+int amd64_winabi_rsave[] = {RCX,  RDX,   R8,    R9,    R10,   R11,   RAX,  XMM0,
+                            XMM1, XMM2,  XMM3,  XMM4,  XMM5,  XMM6,  XMM7, XMM8,
+                            XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, -1};
+// Callee-save registers, terminated by -1. Unlike SysV, RSI and RDI are
+// callee-save on win.
+int amd64_winabi_rclob[] = {RBX, R12, R13, R14, R15, RSI, RDI, -1};
+
+// Compile-time check that both arrays have exactly the number of entries the
+// NGPS_WIN/NFPS/NCLR_WIN counts promise (plus the -1 terminator).
+MAKESURE(winabi_arrays_ok,
+         sizeof amd64_winabi_rsave == (NGPS_WIN + NFPS + 1) * sizeof(int) &&
+             sizeof amd64_winabi_rclob == (NCLR_WIN + 1) * sizeof(int));
+
+// Layout of a call's second argument (the RCall value):
+//
+// bit 0:       rax returned
+// bit 1:       xmm0 returned
+// bits 2-3:    0
+// bits 4-7:    rcx, rdx, r8, r9 passed
+// bits 8-11:   xmm0, xmm1, xmm2, xmm3 passed
+// bit 12:      env call (rax passed)
+// bits 13-31:  0
+
+// Returns the set of registers that hold the return value described by the
+// RCall ref `r`. If `p` is non-NULL, p[0] and p[1] receive the number of
+// integer and floating point return registers respectively (each 0 or 1).
+bits amd64_winabi_retregs(Ref r, int p[2]) {
+  assert(rtype(r) == RCall);
+
+  // Normalize both flags to 0/1. They are reported to the caller as register
+  // *counts*, so the raw `r.val & 2` (which is 0 or 2) must not be used.
+  int num_int_returns = r.val & 1;
+  int num_float_returns = (r.val >> 1) & 1;
+
+  // Mark exactly the registers whose bit is set, following the layout above.
+  bits b = 0;
+  if (num_int_returns) {
+    b |= BIT(RAX);
+  }
+  if (num_float_returns) {
+    b |= BIT(XMM0);
+  }
+  if (p) {
+    p[0] = num_int_returns;
+    p[1] = num_float_returns;
+  }
+  return b;
+}
+
+// Returns the number of set bits in `b`.
+static uint popcnt(bits b) {
+  uint count = 0;
+  // Each iteration clears the lowest set bit, so the loop runs once per
+  // set bit.
+  while (b != 0) {
+    b &= b - 1;
+    ++count;
+  }
+  return count;
+}
+
+// Returns the set of registers used to pass arguments for the RCall ref `r`
+// (see the RCall bit layout). If `p` is non-NULL, p[0] and p[1] receive the
+// number of integer and floating point argument registers in use.
+bits amd64_winabi_argregs(Ref r, int p[2]) {
+  // Argument registers in calling-convention order; bit i of the matching
+  // mask says whether register i is in use.
+  static const int int_arg_regs[4] = {RCX, RDX, R8, R9};
+  static const int float_arg_regs[4] = {XMM0, XMM1, XMM2, XMM3};
+
+  assert(rtype(r) == RCall);
+
+  // Unlike SysV these are masks, not counts: int and float registers share
+  // one position counter, so the registers in use need not be contiguous.
+  const int int_mask = (r.val >> 4) & 15;
+  const int float_mask = (r.val >> 8) & 15;
+
+  bits live = 0;
+  for (int i = 0; i < 4; ++i) {
+    if ((int_mask >> i) & 1) {
+      live |= BIT(int_arg_regs[i]);
+    }
+    if ((float_mask >> i) & 1) {
+      live |= BIT(float_arg_regs[i]);
+    }
+  }
+  // Bit 12: env call, the env value is passed in RAX.
+  if ((r.val >> 12) & 1) {
+    live |= BIT(RAX);
+  }
+
+  if (p) {
+    // TODO: The only consumer of these counts is live.c. It is unclear what
+    // should be reported given the shared int/float counter on win; for now
+    // report how many registers of each kind are actually in use.
+    p[0] = popcnt(int_mask);
+    p[1] = popcnt(float_mask);
+  }
+  return live;
+}
+
+// Running record of which registers a call (or a function's parameter list)
+// uses. It is packed into an RCall value by
+// register_usage_to_call_arg_value().
+typedef struct RegisterUsage {
+  // Position counter shared by int and float arguments, as they're counted
+  // together. Only if the matching bool is set in regs_passed is the given
+  // register *actually* needed for a value (i.e. needs to be saved, etc.).
+  int num_regs_passed;
+
+  // Indexed first by 0=int, 1=float, use KBASE(cls).
+  // Indexed second by register index in calling convention, so for integer,
+  // 0=RCX, 1=RDX, 2=R8, 3=R9, and for float XMM0, XMM1, XMM2, XMM3.
+  bool regs_passed[2][4];
+
+  // Which register carries the return value.
+  bool rax_returned;
+  bool xmm0_returned;
+
+  // Number of arguments assigned so far, in registers or on the stack.
+  // This is also used as where the va_start will start for varargs functions
+  // (there's no 'Oparv', so we need to keep track of a count here.)
+  int num_named_args_passed;
+
+  // This is set when classifying the arguments for a call (but not when
+  // classifying the parameters of a function definition).
+  bool is_varargs_call;
+
+  // Set when an Oarge/Opare (env) instruction was seen.
+  bool has_env;
+} RegisterUsage;
+
+// Packs `reg_usage` into the integer stored in an RCall ref:
+// bit 0 = rax returned, bit 1 = xmm0 returned, bits 4-7 = int argument
+// registers in use, bits 8-11 = float argument registers in use,
+// bit 12 = env passed.
+static int register_usage_to_call_arg_value(RegisterUsage reg_usage) {
+  int value = 0;
+
+  if (reg_usage.rax_returned) {
+    value |= 1 << 0;
+  }
+  if (reg_usage.xmm0_returned) {
+    value |= 1 << 1;
+  }
+
+  // kind 0 = int registers (bits 4-7), kind 1 = float registers (bits 8-11).
+  for (int kind = 0; kind < 2; ++kind) {
+    for (int idx = 0; idx < 4; ++idx) {
+      if (reg_usage.regs_passed[kind][idx]) {
+        value |= 1 << (4 + 4 * kind + idx);
+      }
+    }
+  }
+
+  if (reg_usage.has_env) {
+    value |= 1 << 12;
+  }
+  return value;
+}
+
+// Decides whether `arg` goes in a register or on the stack. While fewer than
+// four argument positions are taken, the next position is claimed for it (in
+// the int or float bank, per `is_float`) and recorded in regs_passed;
+// afterwards the argument is marked as stack-passed. `by_copy` selects the
+// copy-and-pointer variants of either style. The named-argument count is
+// bumped in both cases.
+static void assign_register_or_stack(RegisterUsage* reg_usage,
+                                     ArgClass* arg,
+                                     bool is_float,
+                                     bool by_copy) {
+  const bool register_left = reg_usage->num_regs_passed != 4;
+
+  if (register_left) {
+    const int position = reg_usage->num_regs_passed;
+    reg_usage->regs_passed[is_float][position] = true;
+    reg_usage->num_regs_passed = position + 1;
+    arg->style = by_copy ? APS_CopyAndPointerInRegister : APS_Register;
+  } else {
+    arg->style = by_copy ? APS_CopyAndPointerOnStack : APS_InlineOnStack;
+  }
+
+  reg_usage->num_named_args_passed += 1;
+}
+
+// Returns true when an aggregate of `type` must be passed or returned via a
+// copy and a pointer. Only non-dark types of exactly 1, 2, 4 or 8 bytes are
+// passed by value; e.g. a 5 byte struct would "fit" in a register but is
+// still passed by copy-and-pointer.
+static bool type_is_by_copy(Typ* type) {
+  if (type->isdark) {
+    return true;
+  }
+  switch (type->size) {
+    case 1:
+    case 2:
+    case 4:
+    case 8:
+      return false;
+    default:
+      return true;
+  }
+}
+
+// Classifies a run of argument (Oarg*) or parameter (Opar*) instructions.
+// `begin_instr` points at the first of them and `end_instr` one past the last
+// (i.e. at the Ocall for arguments, or at the first 'real' instruction of the
+// function for parameters). For instruction number i, arg_classes[i] is
+// filled in and `reg_usage` is updated. If an env instruction is present, its
+// ref is stored in `*env`.
+static void classify_arguments(RegisterUsage* reg_usage,
+                               Ins* begin_instr,
+                               Ins* end_instr,
+                               ArgClass* arg_classes,
+                               Ref* env) {
+  Ins* ins = begin_instr;
+  ArgClass* out = arg_classes;
+
+  while (ins < end_instr) {
+    const int op = ins->op;
+
+    if (op == Oarg || op == Opar) {
+      // Basic value: int or float bank according to its class.
+      assign_register_or_stack(reg_usage, out, KBASE(ins->cls),
+                               /*by_copy=*/false);
+      out->cls = ins->cls;
+      out->align = 3;
+      out->size = 8;
+    } else if (op == Oargc || op == Oparc) {
+      // Aggregate: arg[0] names its type. It always uses the integer bank,
+      // either holding the value itself or a pointer to a copy.
+      int typ_index = ins->arg[0].val;
+      Typ* aggregate = &typ[typ_index];
+      bool by_copy = type_is_by_copy(aggregate);
+      assign_register_or_stack(reg_usage, out, /*is_float=*/false, by_copy);
+      out->cls = (!by_copy && aggregate->size <= 4) ? Kw : Kl;
+      out->align = 3;
+      out->size = aggregate->size;
+    } else if (op == Oarge || op == Opare) {
+      // The env ref is an input for a call and an output for a parameter.
+      *env = (op == Oarge) ? ins->arg[0] : ins->to;
+      out->style = APS_EnvTag;
+      reg_usage->has_env = true;
+    } else if (op == Oargv) {
+      reg_usage->is_varargs_call = true;
+      out->style = APS_VarargsTag;
+    }
+    // Any other opcode leaves its ArgClass entry untouched.
+
+    ++ins;
+    ++out;
+  }
+
+  if (reg_usage->has_env && reg_usage->is_varargs_call) {
+    die("can't use env with varargs");
+  }
+
+  // During a varargs call, float arguments have to be duplicated to their
+  // associated integer register, so mark those as in use too.
+  if (reg_usage->is_varargs_call) {
+    for (int i = 0; i < 4; ++i) {
+      reg_usage->regs_passed[/*int*/ 0][i] |=
+          reg_usage->regs_passed[/*float*/ 1][i];
+    }
+  }
+}
+
+// Returns true when class `ty` (one of Kw, Kl, Ks, Kd) has base 0 according
+// to KBASE, i.e. is an integer class.
+static bool is_integer_type(int ty) {
+  assert(ty >= 0 && ty < 4 && "expecting Kw Kl Ks Kd");
+  const bool is_float = KBASE(ty) != 0;
+  return !is_float;
+}
+
+// Returns the register that carries argument position `counter` (0-3) for a
+// value of class `cls`: the counter-th entry of amd64_winabi_rsave for
+// integer classes, XMM0 + counter otherwise.
+static Ref register_for_arg(int cls, int counter) {
+  assert(counter < 4);
+  const int reg =
+      is_integer_type(cls) ? amd64_winabi_rsave[counter] : XMM0 + counter;
+  return TMP(reg);
+}
+
+// Lowers one call (the Ocall at `call_instr` plus the argument instructions
+// that immediately precede it in `block`) to the win x64 calling convention.
+// Replacement instructions are produced with emit(), i.e. in reverse
+// execution order.
+//
+// Allocas needed for struct copies and return pads are pushed onto
+// `*pextra_alloc`; the caller emits them into the start block. Returns a
+// pointer to the instruction just before the first argument instruction,
+// which is where the caller resumes its backwards walk. (When the arguments
+// start the block, that is one element before block->ins.)
+static Ins* lower_call(Fn* func,
+                       Blk* block,
+                       Ins* call_instr,
+                       ExtraAlloc** pextra_alloc) {
+  // Every stack-passed argument occupies one 8-byte slot, whatever its own
+  // size: values are written with an 8-byte store below, and the callee side
+  // (lower_func_parameters) steps 8 bytes per stack parameter.
+  enum { STACK_SLOT_SIZE = 8 };
+
+  // Call arguments are instructions. Walk through them to find the end of the
+  // call+args that we need to process (and return the instruction past the body
+  // of the instruction for continuing processing).
+  Ins* instr_past_args = call_instr - 1;
+  for (; instr_past_args >= block->ins; --instr_past_args) {
+    if (!isarg(instr_past_args->op)) {
+      break;
+    }
+  }
+  Ins* earliest_arg_instr = instr_past_args + 1;
+
+  // Don't need an ArgClass for the call itself, so one less than the total
+  // number of instructions we're dealing with.
+  uint num_args = call_instr - earliest_arg_instr;
+  ArgClass* arg_classes = alloc(num_args * sizeof(ArgClass));
+
+  RegisterUsage reg_usage = {0};
+  ArgClass ret_arg_class = {0};
+
+  // Ocall's two arguments are the function to be called in arg[0], and, if
+  // the function returns a non-basic type, a reference to the type of the
+  // return in arg[1]. req checks if Refs are equal; `R` is 0.
+  bool il_has_struct_return = !req(call_instr->arg[1], R);
+  bool is_struct_return = false;
+  if (il_has_struct_return) {
+    Typ* ret_type = &typ[call_instr->arg[1].val];
+    is_struct_return = type_is_by_copy(ret_type);
+    if (is_struct_return) {
+      // The hidden return pointer takes the first argument position.
+      assign_register_or_stack(&reg_usage, &ret_arg_class, /*is_float=*/false,
+                               /*by_copy=*/true);
+    }
+    ret_arg_class.size = ret_type->size;
+  }
+  Ref env = R;
+  classify_arguments(&reg_usage, earliest_arg_instr, call_instr, arg_classes,
+                     &env);
+
+  // We now know which arguments are on the stack and which are in registers, so
+  // we can allocate the correct amount of space to stash the stack-located ones
+  // into.
+  uint stack_usage = 0;
+  for (uint i = 0; i < num_args; ++i) {
+    ArgClass* arg = &arg_classes[i];
+    // stack_usage only accounts for values that don't have enough registers.
+    // Large struct copies are alloca'd separately, and then only add the slot
+    // for their pointer here.
+    if (arg->style == APS_InlineOnStack) {
+      if (arg->align > 4) {
+        err("win abi cannot pass alignments > 16");
+      }
+      // One full slot, even for 1/2/4 byte structs (previously arg->size,
+      // which made later stack arguments overlap this one).
+      stack_usage += STACK_SLOT_SIZE;
+    } else if (arg->style == APS_CopyAndPointerOnStack) {
+      stack_usage += STACK_SLOT_SIZE;
+    }
+  }
+  stack_usage = ALIGN_UP(stack_usage, 16);
+
+  // Note that here we're logically 'after' the call (due to emitting
+  // instructions in reverse order), so we're doing a negative stack
+  // allocation to clean up after the call.
+  Ref stack_size_ref =
+      getcon(-(int64_t)(stack_usage + SHADOW_SPACE_SIZE), func);
+  emit(Osalloc, Kl, R, stack_size_ref, R);
+
+  ExtraAlloc* return_pad = NULL;
+  if (is_struct_return) {
+    // Returned by pointer: reserve space for the callee to write into, and
+    // take the resulting pointer from RAX after the call.
+    return_pad = alloc(sizeof(ExtraAlloc));
+    Ref ret_pad_ref = newtmp("abi.ret_pad", Kl, func);
+    return_pad->instr =
+        (Ins){Oalloc8, Kl, ret_pad_ref, {getcon(ret_arg_class.size, func)}};
+    return_pad->link = (*pextra_alloc);
+    *pextra_alloc = return_pad;
+    reg_usage.rax_returned = true;
+    emit(Ocopy, call_instr->cls, call_instr->to, TMP(RAX), R);
+  } else {
+    if (il_has_struct_return) {
+      // In the case that at the IL level, a struct return was specified, but as
+      // far as the calling convention is concerned it's not actually by
+      // pointer, we need to store the return value into an alloca because
+      // subsequent IL will still be treating the function return as a pointer.
+      ExtraAlloc* return_copy = alloc(sizeof(ExtraAlloc));
+      return_copy->instr =
+          (Ins){Oalloc8, Kl, call_instr->to, {getcon(8, func)}};
+      return_copy->link = (*pextra_alloc);
+      *pextra_alloc = return_copy;
+      Ref copy = newtmp("abi.copy", Kl, func);
+      emit(Ostorel, 0, R, copy, call_instr->to);
+      emit(Ocopy, Kl, copy, TMP(RAX), R);
+      reg_usage.rax_returned = true;
+    } else if (is_integer_type(call_instr->cls)) {
+      // Only a basic type returned from the call, integer.
+      emit(Ocopy, call_instr->cls, call_instr->to, TMP(RAX), R);
+      reg_usage.rax_returned = true;
+    } else {
+      // Basic type, floating point.
+      emit(Ocopy, call_instr->cls, call_instr->to, TMP(XMM0), R);
+      reg_usage.xmm0_returned = true;
+    }
+  }
+
+  // Emit the actual call instruction. There's no 'to' value by this point
+  // because we've lowered it into register manipulation (that's the `R`),
+  // arg[0] of the call is the function, and arg[1] is the register usage,
+  // encoded as an RCall value.
+  emit(Ocall, call_instr->cls, R, call_instr->arg[0],
+       CALL(register_usage_to_call_arg_value(reg_usage)));
+
+  if (!req(R, env)) {
+    // If there's an env arg to be passed, it gets stashed in RAX.
+    emit(Ocopy, Kl, TMP(RAX), env, R);
+  }
+
+  if (reg_usage.is_varargs_call) {
+    // Any float arguments need to be duplicated to integer registers. This is
+    // required by the calling convention so that dumping to shadow space can be
+    // done without a prototype and for varargs.
+#define DUP_IF_USED(index, floatreg, intreg)        \
+  if (reg_usage.regs_passed[/*float*/ 1][index]) {  \
+    emit(Ocast, Kl, TMP(intreg), TMP(floatreg), R); \
+  }
+    DUP_IF_USED(0, XMM0, RCX);
+    DUP_IF_USED(1, XMM1, RDX);
+    DUP_IF_USED(2, XMM2, R8);
+    DUP_IF_USED(3, XMM3, R9);
+#undef DUP_IF_USED
+  }
+
+  int reg_counter = 0;
+  if (is_struct_return) {
+    // The pointer to the return pad goes in the first argument register.
+    Ref first_reg = register_for_arg(Kl, reg_counter++);
+    emit(Ocopy, Kl, first_reg, return_pad->instr.to, R);
+  }
+
+  // This is where we actually do the load of values into registers or into
+  // stack slots.
+  Ref arg_stack_slots = newtmp("abi.args", Kl, func);
+  uint slot_offset = SHADOW_SPACE_SIZE;
+  ArgClass* arg = arg_classes;
+  for (Ins* instr = earliest_arg_instr; instr != call_instr; ++instr, ++arg) {
+    switch (arg->style) {
+      case APS_Register: {
+        Ref into = register_for_arg(arg->cls, reg_counter++);
+        if (instr->op == Oargc) {
+          // This is a small struct being passed by value. The value in the
+          // instruction in this case is a pointer, but it needs to be loaded
+          // into the register.
+          emit(Oload, arg->cls, into, instr->arg[1], R);
+        } else {
+          // Otherwise, a normal value passed in a register.
+          emit(Ocopy, instr->cls, into, instr->arg[0], R);
+        }
+        break;
+      }
+      case APS_InlineOnStack: {
+        Ref slot = newtmp("abi.off", Kl, func);
+        if (instr->op == Oargc) {
+          // This is a small struct, so it's not passed by copy, but the
+          // instruction is a pointer. So we need to copy it into the stack
+          // slot. (And, remember that these are emitted backwards, so store,
+          // then load.)
+          Ref smalltmp = newtmp("abi.smalltmp", arg->cls, func);
+          emit(Ostorel, 0, R, smalltmp, slot);
+          emit(Oload, arg->cls, smalltmp, instr->arg[1], R);
+        } else {
+          // Stash the value into the stack slot.
+          emit(Ostorel, 0, R, instr->arg[0], slot);
+        }
+        emit(Oadd, Kl, slot, arg_stack_slots, getcon(slot_offset, func));
+        // Advance by a whole slot so the next stack argument does not
+        // overlap the 8 bytes just written.
+        slot_offset += STACK_SLOT_SIZE;
+        break;
+      }
+      case APS_CopyAndPointerInRegister:
+      case APS_CopyAndPointerOnStack: {
+        // Alloca a space to copy into, and blit the value from the instr to the
+        // copied location.
+        ExtraAlloc* arg_copy = alloc(sizeof(ExtraAlloc));
+        Ref copy_ref = newtmp("abi.copy", Kl, func);
+        arg_copy->instr =
+            (Ins){Oalloc8, Kl, copy_ref, {getcon(arg->size, func)}};
+        arg_copy->link = (*pextra_alloc);
+        *pextra_alloc = arg_copy;
+        emit(Oblit1, 0, R, INT(arg->size), R);
+        emit(Oblit0, 0, R, instr->arg[1], copy_ref);
+
+        // Now load the pointer into the correct register or stack slot.
+        if (arg->style == APS_CopyAndPointerInRegister) {
+          Ref into = register_for_arg(arg->cls, reg_counter++);
+          emit(Ocopy, Kl, into, copy_ref, R);
+        } else {
+          assert(arg->style == APS_CopyAndPointerOnStack);
+          Ref slot = newtmp("abi.off", Kl, func);
+          emit(Ostorel, 0, R, copy_ref, slot);
+          emit(Oadd, Kl, slot, arg_stack_slots, getcon(slot_offset, func));
+          slot_offset += STACK_SLOT_SIZE;
+        }
+        break;
+      }
+      case APS_EnvTag:
+      case APS_VarargsTag:
+        // Nothing to do here, see right before the call for reg dupe.
+        break;
+      case APS_Invalid:
+        die("unreachable");
+    }
+  }
+
+  if (stack_usage) {
+    // The last (first in call order) thing we do is allocate the stack
+    // space we're going to fill with temporaries.
+    emit(Osalloc, Kl, arg_stack_slots,
+         getcon(stack_usage + SHADOW_SPACE_SIZE, func), R);
+  } else {
+    // When there's no usage for temporaries, we can add this into the other
+    // alloca, but otherwise emit it separately (not storing into a reference)
+    // so that it doesn't get removed later for being useless.
+    emit(Osalloc, Kl, R, getcon(SHADOW_SPACE_SIZE, func), R);
+  }
+
+  return instr_past_args;
+}
+
+// If `block` ends in a value-returning jump, emits the instructions that
+// place the value in RAX or XMM0, turns the jump into Jret0, and stores the
+// resulting register usage (as an RCall value) in the jump argument. Blocks
+// that do not return, or that already end in Jret0, are left untouched.
+static void lower_block_return(Fn* func, Blk* block) {
+  const int kind = block->jmp.type;
+  if (kind == Jret0 || !isret(kind)) {
+    return;
+  }
+
+  // Remember the returned value; once lowered, the return itself carries no
+  // value because everything is done by register/stack manipulation.
+  const Ref value = block->jmp.arg;
+  RegisterUsage usage = {0};
+  block->jmp.type = Jret0;
+
+  if (kind != Jretc) {
+    // Basic type: the class is encoded in the jump type.
+    const int cls = kind - Jretw;
+    if (is_integer_type(cls)) {
+      emit(Ocopy, cls, TMP(RAX), value, R);
+      usage.rax_returned = true;
+    } else {
+      emit(Ocopy, cls, TMP(XMM0), value, R);
+      usage.xmm0_returned = true;
+    }
+  } else {
+    Typ* ret_type = &typ[func->retty];
+    if (!type_is_by_copy(ret_type)) {
+      // Small aggregate: `value` points at it, load it into RAX.
+      emit(Oload, Kl, TMP(RAX), value, R);
+    } else {
+      // Large aggregate: blit it to the caller-provided buffer (func->retr)
+      // and return that pointer in RAX. Emitted backwards, so the copy to
+      // RAX comes first here.
+      assert(rtype(func->retr) == RTmp);
+      emit(Ocopy, Kl, TMP(RAX), func->retr, R);
+      emit(Oblit1, 0, R, INT(ret_type->size), R);
+      emit(Oblit0, 0, R, value, func->retr);
+    }
+    usage.rax_returned = true;
+  }
+
+  block->jmp.arg = CALL(register_usage_to_call_arg_value(usage));
+}
+
+// Lowers a vastart: stores into `valist` the address of the first variadic
+// argument, computed as rbp + 16 + 8 * (number of named arguments).
+//
+// This relies on the following holding in varargs functions:
+// 1. the int registers are already dumped to the shadow stack space;
+// 2. parameters passed in floating point registers have been duplicated to
+//    the integer registers;
+// 3. an rbp frame pointer is always used (ensured later).
+static void lower_vastart(Fn* func,
+                          RegisterUsage* param_reg_usage,
+                          Ref valist) {
+  assert(func->vararg);
+
+  // Emitted backwards: the store below runs after the add further down.
+  Ref first_vararg = newtmp("abi.vastart", Kl, func);
+  emit(Ostorel, 0, R, first_vararg, valist);
+
+  // 8 bytes per named argument, plus 16 because the return address and rbp
+  // have been pushed by the time we get to the body of the function.
+  const int named_args = param_reg_usage->num_named_args_passed;
+  Ref byte_offset = getcon(named_args * 8 + 16, func);
+  emit(Oadd, Kl, first_vararg, TMP(RBP), byte_offset);
+}
+
+// Lowers a vaarg. The va_list is just a void** on winx64, so in execution
+// order this: loads the current pointer from the va_list, loads the argument
+// through it, adds 8 to the pointer and stores it back. The emit() calls
+// below appear in the opposite order because emission is backwards.
+static void lower_vaarg(Fn* func, Ins* vaarg_instr) {
+  const Ref va_list_ref = vaarg_instr->arg[0];
+  Ref advanced = newtmp("abi.vaarg.inc", Kl, func);
+  Ref current = newtmp("abi.vaarg.ptr", Kl, func);
+
+  emit(Ostorel, 0, R, advanced, va_list_ref);
+  emit(Oadd, Kl, advanced, current, getcon(8, func));
+  emit(Oload, vaarg_instr->cls, vaarg_instr->to, current, R);
+  emit(Oload, Kl, current, va_list_ref, R);
+}
+
+// Rewrites the instructions of `block`: its return, calls, vastart and vaarg
+// are lowered, everything else is copied through unchanged. When `block` is
+// the function's start block, the allocas collected in `*pextra_alloc` are
+// added at its beginning, so the start block must be processed last.
+static void lower_args_for_block(Fn* func,
+                                 Blk* block,
+                                 RegisterUsage* param_reg_usage,
+                                 ExtraAlloc** pextra_alloc) {
+  // emit/emiti fill the global temporary buffer from its end towards its
+  // beginning; start with an empty buffer.
+  curi = &insb[NIns];
+
+  lower_block_return(func, block);
+
+  if (block->nins) {
+    // Walk the instructions from last to first.
+    Ins* cur = &block->ins[block->nins - 1];
+    while (cur >= block->ins) {
+      if (cur->op == Ocall) {
+        // lower_call consumes the call and its arguments, and tells us where
+        // to continue.
+        cur = lower_call(func, block, cur, pextra_alloc);
+        continue;
+      }
+      if (cur->op == Oarg || cur->op == Oargc) {
+        // Arguments are always consumed together with their call.
+        die("unreachable");
+      }
+      if (cur->op == Ovastart) {
+        lower_vastart(func, param_reg_usage, cur->arg[0]);
+      } else if (cur->op == Ovaarg) {
+        lower_vaarg(func, cur);
+      } else {
+        emiti(*cur);
+      }
+      --cur;
+    }
+  }
+
+  // The start block is processed last; add any allocas that the other blocks
+  // (or this one) needed.
+  if (block == func->start) {
+    ExtraAlloc* pending = *pextra_alloc;
+    while (pending) {
+      emiti(pending->instr);
+      pending = pending->link;
+    }
+  }
+
+  // Copy the finished instruction sequence out of the temporary buffer into
+  // the block's own storage.
+  block->nins = &insb[NIns] - curi;
+  idup(block, curi, block->nins);
+}
+
+// Returns a pointer to the first instruction of `start_block` that is not a
+// parameter instruction (per ispar), or one past the last instruction when
+// the block consists only of parameters.
+static Ins* find_end_of_func_parameters(Blk* start_block) {
+  Ins* const limit = &start_block->ins[start_block->nins];
+  Ins* cur = start_block->ins;
+  while (cur < limit && ispar(cur->op)) {
+    ++cur;
+  }
+  return cur;
+}
+
+// Lowers the parameter instructions at the top of the function's start block:
+// each named parameter is copied or loaded from the register or stack slot
+// the win x64 convention assigns to it. The parameter instructions are
+// replaced by the generated ones, func->reg is set to the registers used for
+// parameters, and func->retr is set when the return value is passed by
+// pointer. Returns the register usage of the parameter list (used later by
+// vastart lowering).
+static RegisterUsage lower_func_parameters(Fn* func) {
+  // This is half-open, so end points after the last Opar.
+  Blk* start_block = func->start;
+  Ins* start_of_params = start_block->ins;
+  Ins* end_of_params = find_end_of_func_parameters(start_block);
+
+  size_t num_params = end_of_params - start_of_params;
+  ArgClass* arg_classes = alloc(num_params * sizeof(ArgClass));
+  ArgClass arg_ret = {0};
+
+  // global temporary buffer used by emit. Reset to the end, and predecremented
+  // when adding to it.
+  curi = &insb[NIns];
+
+  int reg_counter = 0;
+  RegisterUsage reg_usage = {0};
+  if (func->retty >= 0) {
+    bool by_copy = type_is_by_copy(&typ[func->retty]);
+    if (by_copy) {
+      // The caller passes a pointer to the return buffer as a hidden first
+      // argument in RCX; keep it in a temporary for the return lowering.
+      assign_register_or_stack(&reg_usage, &arg_ret, /*is_float=*/false,
+                               by_copy);
+      Ref ret_ref = newtmp("abi.ret", Kl, func);
+      emit(Ocopy, Kl, ret_ref, TMP(RCX), R);
+      func->retr = ret_ref;
+      ++reg_counter;
+    }
+  }
+  Ref env = R;
+  classify_arguments(&reg_usage, start_of_params, end_of_params, arg_classes,
+                     &env);
+  func->reg = amd64_winabi_argregs(
+      CALL(register_usage_to_call_arg_value(reg_usage)), NULL);
+
+  // Copy from the registers or stack slots into the named parameters. Depending
+  // on how they're passed, they either need to be copied or loaded.
+  ArgClass* arg = arg_classes;
+  // NOTE(review): slot offsets appear to be in 4-byte units (32/4 for the
+  // shadow space, +2 per 8-byte stack argument); the extra 4 presumably skips
+  // the saved rbp and the return address -- confirm against SLOT's definition.
+  uint slot_offset = SHADOW_SPACE_SIZE / 4 + 4;
+  for (Ins* instr = start_of_params; instr < end_of_params; ++instr, ++arg) {
+    switch (arg->style) {
+      case APS_Register: {
+        Ref from = register_for_arg(arg->cls, reg_counter++);
+        // If it's a struct at the IL level, we need to copy the register into
+        // an alloca so we have something to point at (same for InlineOnStack).
+        if (instr->op == Oparc) {
+          // Emitted backwards: executes as alloc, copy from register, store.
+          arg->ref = newtmp("abi", Kl, func);
+          emit(Ostorel, 0, R, arg->ref, instr->to);
+          emit(Ocopy, instr->cls, arg->ref, from, R);
+          emit(Oalloc8, Kl, instr->to, getcon(arg->size, func), R);
+        } else {
+          emit(Ocopy, instr->cls, instr->to, from, R);
+        }
+        break;
+      }
+      case APS_InlineOnStack:
+        if (instr->op == Oparc) {
+          // Emitted backwards: executes as alloc, copy from slot, store.
+          arg->ref = newtmp("abi", Kl, func);
+          emit(Ostorel, 0, R, arg->ref, instr->to);
+          emit(Ocopy, instr->cls, arg->ref, SLOT(-slot_offset), R);
+          emit(Oalloc8, Kl, instr->to, getcon(arg->size, func), R);
+        } else {
+          emit(Ocopy, Kl, instr->to, SLOT(-slot_offset), R);
+        }
+        slot_offset += 2;
+        break;
+      case APS_CopyAndPointerOnStack:
+        // The stack slot holds the pointer to the caller-made copy.
+        emit(Oload, Kl, instr->to, SLOT(-slot_offset), R);
+        slot_offset += 2;
+        break;
+      case APS_CopyAndPointerInRegister: {
+        // Because this has to be a copy (that we own), it is sufficient to just
+        // copy the register to the target.
+        Ref from = register_for_arg(Kl, reg_counter++);
+        emit(Ocopy, Kl, instr->to, from, R);
+        break;
+      }
+      case APS_EnvTag:
+        break;
+      case APS_VarargsTag:
+      case APS_Invalid:
+        die("unreachable");
+    }
+  }
+
+  // If there was an `env`, it was passed in RAX, so copy it into the env ref.
+  if (!req(R, env)) {
+    emit(Ocopy, Kl, env, TMP(RAX), R);
+  }
+
+  // Build the start block's new instruction array: the generated instructions
+  // first, followed by the original instructions that came after the
+  // parameters.
+  int num_created_instrs = &insb[NIns] - curi;
+  int num_other_after_instrs = (int)(start_block->nins - num_params);
+  int new_total_instrs = num_other_after_instrs + num_created_instrs;
+  Ins* new_instrs = vnew(new_total_instrs, sizeof(Ins), PFn);
+  Ins* instr_p = icpy(new_instrs, curi, num_created_instrs);
+  icpy(instr_p, end_of_params, num_other_after_instrs);
+  start_block->nins = new_total_instrs;
+  start_block->ins = new_instrs;
+
+  return reg_usage;
+}
+
+// Lowers generic IL into the specific details of how arguments are passed and
+// parameters are received on win x64. A useful reference is
+// https://learn.microsoft.com/en-us/cpp/build/x64-calling-convention .
+//
+// Some of the major differences from SysV, if you're comparing the code
+// (non-exhaustive):
+// - only 4 int and 4 float registers are used;
+// - when an int register is assigned a value, its associated float register
+//   is left unused (and vice versa), i.e. there is a single counter as
+//   arguments are assigned to registers;
+// - structs that aren't 1/2/4/8 bytes in size are passed by pointer, not by
+//   copying them onto the stack. So e.g. passing `struct { void*, int64_t }`
+//   by value first copies it to another alloca (to maintain value semantics
+//   at the language level), then the pointer to that copy is treated as a
+//   regular integer argument (which may itself end up on the stack when no
+//   integer register remains);
+// - when calling a varargs function, floating point values must be duplicated
+//   to integer registers. Along with the above restrictions, this makes
+//   varargs handling simpler for the callee than on SysV.
+void amd64_winabi_abi(Fn* func) {
+  // First, lower the incoming parameters of this function.
+  RegisterUsage param_reg_usage = lower_func_parameters(func);
+
+  // Then walk all blocks and rewrite returns, calls and varargs handling into
+  // their win x64 specific versions; other instructions pass through
+  // unchanged. The entry block is skipped here and done at the very end, so
+  // that the allocas needed by the other blocks (copies for structs passed or
+  // returned by value) can be added to it.
+  ExtraAlloc* pending_allocs = NULL;
+  Blk* blk = func->start->link;
+  while (blk) {
+    lower_args_for_block(func, blk, &param_reg_usage, &pending_allocs);
+    blk = blk->link;
+  }
+  lower_args_for_block(func, func->start, &param_reg_usage, &pending_allocs);
+
+  if (!debug['A']) {
+    return;
+  }
+  fprintf(stderr, "\n> After ABI lowering:\n");
+  printfn(func, stderr);
+}