; This is designed to be serial-line downloaded to cdcode.
;
; Our memory map:
;
;	[8c000000,8c010000)	Stack (r15 set by cdcode)
;	[8c010000,8c01????)	cdcode
;	[8c020000,8c02????)	Us

	.include "regs.s"
	.include "ta-cmds.s"
	.include "maple-bits.s"

VRAM_BASE_32 = 0xa5000000
VRAM_BASE_64 = 0xa4000000
VRAM_SIZE = 8 << 20
STOREQ_BASE = 0xe0000000
VIDREG_BASE = 0xa05f0000
FRAME_X = 640
FRAME_Y = 480
VBLANK_REG = VIDREG_BASE + 0x6900
VBLANK_VBIT = 0x08
DISPLAY_VRAM = VIDREG_BASE + 0x8050
SHORT_FRAME_OFFSET = FRAME_X*2 ; FRAME_X pixels at two bytes each

COT_FOVY = 0f1.73 ; cot(FOVy/2), field-of-view angle figure
ZNEAR = 0f1
ZFAR = 0f100
DISTANCE = 0f15
BUTTON_FACTOR = 0f3
;BIT_YMIN = 0f40
;BIT_YMAX = 0f52

; Layout of the stuff we keep in video RAM.  We double-buffer, so there
;  are two of most of these.  _a and _b suffixes indicate the pairs.
;
; Unfortunately the rendering and video hardware aren't capable of
;  making distinctions equivalent to the difference between the CPU's
;  a4xxxxxx and a5xxxxxx views of video RAM (textures always come from
;  a4xxxxxx, or, to be more precise, always access video RAM in a way
;  compatible with the CPU's a4xxxxxx view, whereas everything else
;  comes from a5xxxxxx).  So we're stuck jigsawing together a4xxxxxx
;  allocations for textures and a5xxxxxx allocations for other stuff.

. = VRAM_BASE_64
	; Textures.  We have only one texture, used for maze walls,
	;  which we use with three different palettes depending on the
	;  orientation of the wall in question.  It's an 8x8 texture,
	;  occupying 64 bytes.
texture__base = .
texture_wall:
	.space	64
texture__end = .

. = VRAM_BASE_32 + [[texture__end - texture__base] / 2]
	.align	8
	; Space to render into.  Each field takes up FRAME_X*FRAME_Y
	;  pixels at two bytes per pixel.  (If it's displayed
	;  interlaced, this is handled with the display hardware; in
	;  memory it's totally non-interlaced.)
render_buf_size = FRAME_X * FRAME_Y * 2
render_buf_a:
	.space	render_buf_size
render_buf_b:
	.space	render_buf_size

	; Tile descriptors.  There is one of these, at 6 longs, per
	;  tile; there is also a 24-long header.  Each tile is 32x32
	;  pixels.  So for a 640x480 screen, we need
	;  24+(6*(640/32)*(480/32)) longs of space.  (I don't know what
	;  happens if the screen width or height is not a multiple of
	;  32.)  Each tile also uses 64 bytes of buffer space.
ta_buffers_size_cmd_list = 512 * 1024
ta_buffers_size_tile_buffer = 64 * [FRAME_X/32] * [FRAME_Y/32]
ta_buffers_size_tile_descriptor = 4 * [24 + [6 * [FRAME_X/32] * [FRAME_Y/32]]]
. = VRAM_BASE_32 + 0x00400000
ta_buffers_cmd_list_a:
	.space	ta_buffers_size_cmd_list
ta_buffers_cmd_list_b:
	.space	ta_buffers_size_cmd_list
ta_buffers_tile_buffer_a:
	.space	ta_buffers_size_tile_buffer
ta_buffers_tile_buffer_b:
	.space	ta_buffers_size_tile_buffer
ta_buffers_tile_descriptor_a:
	.space	ta_buffers_size_tile_descriptor
ta_buffers_tile_descriptor_b:
	.space	ta_buffers_size_tile_descriptor

; End of layout of video RAM.

. = 0x8c020000

	; Entry stub.  This sits at the very start of our load area
	;  (the location counter was just set to 0x8c020000), so it is
	;  the first thing executed; all it does is jump over the
	;  "data segment" that follows to main.
	;
	; NOTE(review): .sz/.pr appear to tell the assembler what to
	;  assume about the FPU size/precision mode ("any" = unknown
	;  on entry, 0 afterwards); random_mod brackets its
	;  double-precision code with .pr 1 / .pr 0 in the same way.
	;  SETCONST presumably dumps the pending SETS.L literal pool
	;  here - confirm against the assembler's documentation.
	.sz	any
	.pr	any
	SETS.L	#main,r0
	jmp	@r0
	 nop
	SETCONST
	.sz	0
	.pr	0

; Our "data segment".

	; The maple command and response buffers.  The hardware
	;  requires these be aligned on 32-byte boundaries.
	.align	32
maple_cmd:
	.long	XDESC_LAST | [0 << XDESC_PORTSHIFT] | [1 << XDESC_LENSHIFT]
	.long	maple_resp & DMA_ADDRMASK
	MapleFrame	CMD_GETCOND, 0, ADDR_MAIN, 0, 0, 1
	.long	@BSL[FUNC_CONTROLLER]
	.align	32
	; 1024 is the largest the hardware supports, so it's a safe
	;  limit.  (The amount actually used is usually fairly small.)
maple_resp:
	.space	1024

	; The base matrix (composition of screenview, projection, and
	;  translation).
	.align	8
base_matrix:
	.space	16*4

	; The current and previous controller input state.  The
	;  patterns we initialize this to are what the controller sends
	;  when it's not being touched.
	.align	4
curistate:
	.long	0x0000ffff, 0x80808080
previstate:
	.long	0x0000ffff, 0x80808080

	; Stack pointer for abrupt return to cdcode.
	.align	4
throw_sp:
	.space	4

	; RNG state.
	.align	4
RNG_STATE_WORDS = 64
rng_state:
	.space	4 * RNG_STATE_WORDS
rng_hand:
	.space	1

	; The maze itself.
MZX = 3
MZY = 3
MZZ = 3
MZXY = MZX * MZY
MZWALLS = [MZX * MZY * MZZ * 3] - [MZX * MZY] - [MZY * MZZ] - [MZZ * MZX]
MZCELLS = MZX * MZY * MZZ
MZMAX = MZX
.if MZY > MZMAX
MZMAX = MZY
.endif
.if MZZ > MZMAX
MZMAX = MZZ
.endif
MZC_PX = 0x0001
MZC_PY = 0x0002
MZC_PZ = 0x0004
MZC_S =  0x0008
MZC_MX = 0x0010
MZC_MY = 0x0020
MZC_MZ = 0x0040
MZC_G =  0x0080
MZC_P =  0x0100
	.align	2
maze:
	.space	2 * MZCELLS
	.align	4
mwalls:
	.space	4 * MZWALLS
	.align	2
mcells:
	.space	2 * MZCELLS

	; Cookies to pass to the hardware (void *tiles[2] in tatest)
	.align	4
tiledesc_cookies:
	.space	2*4

	; Tile buffers (the 64-bytes-per-tile work space)
	.align	4
tilebuffers:
	.long	ta_buffers_tile_buffer_a
	.long	ta_buffers_tile_buffer_b

	; Tile descriptors (the spaces in which the descriptors are
	;  built)
	.align	4
tiledescs:
	.long	ta_buffers_tile_descriptor_a
	.long	ta_buffers_tile_descriptor_b

	; Camera location and orientation.  Location is (X,Y,Z);
	;  orientation is stored in the form of each of the world axes
	;  as a normalized vector in the camera's coordinate system.
	.align	4
eye_loc:
	.long	0f0.5, 0f0.5, 0f0.5
eye_x:
	.long	0f1, 0f0, 0f0
eye_y:
	.long	0f0, 0f1, 0f0
eye_z:
	.long	0f0, 0f0, 0f1

	; Scene corner coordinates.
	.align	4
vertex_coords:
	.macro	vtx_x	y, z
	.long	0f0, $(y), $(z)
	.long	0f1, $(y), $(z)
	.long	0f2, $(y), $(z)
	.long	0f3, $(y), $(z)
	.endm
	.macro	vtx_y	z
	vtx_x	0f0, $(z)
	vtx_x	0f1, $(z)
	vtx_x	0f2, $(z)
	vtx_x	0f3, $(z)
	.endm
	vtx_y	0f0
	vtx_y	0f1
	vtx_y	0f2
	vtx_y	0f3
n_vertex_coords = [. - vertex_coords] / [3*4]
xform_coords:
	.space	n_vertex_coords*3*4

	; Texture twiddling table.  Why "twiddle"?  That's the term
	;  used in tatest's comments.  It appears to be interleaving
	;  the bits of the numbers that form texture coordinates, so
	;  that the texels conceptually at (x,y) and (x+1,y), where
	;  x=ABCDEFG0 and y=abcdefgh (say), are stored at offsets
	;  aAbBcCdDeEfFgGh0 (x) and aAbBcCdDeEfFgGh1 (x+1).
	;
	; Why do it?  Because, in the words of another tatest comment,
	;  "palette based textures can not be non-twiddled".  Why
	;  design hardware that way?  MC, in email, passed along an
	;  explanation from someone who worked on the hardware, saying
	;  that twiddled textures provide higher performance, so the
	;  designers figured the only reason to use non-twiddled
	;  textures was to use a rendered frame as a texture (for, eg,
	;  reflections).  Since the renderer output is always
	;  true-colour, that's all they implemented.  (The
	;  "non-twiddled" bit got reused for a different meaning for
	;  palette-based textures.)
	;
	; tatest generates a 1024-entry table.  We reserve (and set up)
	;  that much space, but as of this writing use only 256 entries
	;  of it.
	;
	; One possible note to beware of is that this may not apply to
	;  the large dimension of non-square textures.  Done naively,
	;  doing this for non-square textures could use excessive
	;  amounts of memory; it would appear, for example, that an
	;  8x256 texture would take up almost as much memory space as a
	;  128x256 one (because of all the gaps between the address
	;  bits).  But it may be smarter than that; when I mentioned
	;  that in mail to MC, he said he had a fuzzy memory that the
	;  high bits of non-square textures aren't twiddled, that, eg,
	;  an 8x256 texture in memory consists of 32 consecutive 8x8
	;  (twiddled) blocks.  But he also warned that memory could be
	;  wrong, so test this before depending on it.
	;
	.align	2
twiddles:
	.space	1024*2

	; Current double-buffering buffer number.  Always 0 or 1.
curbuf:
	.space	1
	; When set, this causes printing of debugging info, but for
	;  only one cycle; it's cleared when the info is printed.
debug:
	.byte	0

; Our "text segment".

	; These palettes are straight from tatest; I've just
	;  reformatted them from C to assembly.  It doesn't say where,
	;  if anywhere, they came from.  They're small enough I haven't
	;  bothered trying to compress them.
	.align	4
palette_0:
	.long	0xff000000,0xff3c3c3c,0xff413c3c,0xff493c3c,0xff4d3838,0xff553838,0xff593434,0xff613434
	.long	0xff653030,0xff6d3030,0xff712c2c,0xff792c2c,0xff822828,0xff862828,0xff8e2424,0xff922424
	.long	0xff9a2020,0xff9e2020,0xffa61c1c,0xffaa1c1c,0xffb21818,0xffb61818,0xffbe1414,0xffc71414
	.long	0xffcb1010,0xffd31010,0xffd70c0c,0xffdf0c0c,0xffe30808,0xffeb0808,0xffef0404,0xfff70404
	.long	0xffff0000,0xffff0400,0xffff0c00,0xffff1400,0xffff1c00,0xffff2400,0xffff2c00,0xffff3400
	.long	0xffff3c00,0xffff4500,0xffff4d00,0xffff5500,0xffff5d00,0xffff6500,0xffff6d00,0xffff7500
	.long	0xffff7d00,0xffff8600,0xffff8e00,0xffff9600,0xffff9e00,0xffffa600,0xffffae00,0xffffb600
	.long	0xffffbe00,0xffffc700,0xffffcf00,0xffffd700,0xffffdf00,0xffffe700,0xffffef00,0xfffff700
	.long	0xffffff00,0xffffff04,0xffffff0c,0xffffff14,0xffffff1c,0xffffff24,0xffffff2c,0xffffff34
	.long	0xffffff3c,0xffffff45,0xffffff4d,0xffffff55,0xffffff5d,0xffffff65,0xffffff6d,0xffffff75
	.long	0xffffff7d,0xffffff86,0xffffff8e,0xffffff96,0xffffff9e,0xffffffa6,0xffffffae,0xffffffb6
	.long	0xffffffbe,0xffffffc7,0xffffffcf,0xffffffd7,0xffffffdf,0xffffffe7,0xffffffef,0xfffffff7
	.long	0xffffffff,0xffffffff,0xfffffbfb,0xfffffbf7,0xfffff7f3,0xfffff7ef,0xfffff3eb,0xfffff3e7
	.long	0xffffefe3,0xffffefdf,0xffffebdb,0xffffebd7,0xffffe7d3,0xffffe7cf,0xffffe3cb,0xffffe3c7
	.long	0xffffdfc3,0xffffdfbe,0xffffdbba,0xffffdbb6,0xffffd7b2,0xffffd7ae,0xffffd3aa,0xffffd3a6
	.long	0xffffcfa2,0xffffcf9e,0xffffcb9a,0xffffcb96,0xffffc792,0xffffc78e,0xffffc38a,0xffffc386
	.long	0xffffbe82,0xffffba7d,0xffffba79,0xffffb675,0xffffb671,0xffffb26d,0xffffb269,0xffffae65
	.long	0xffffae61,0xffffaa5d,0xffffaa59,0xffffa655,0xffffa651,0xffffa24d,0xffffa249,0xffff9e45
	.long	0xffff9e41,0xffff9a3c,0xffff9a38,0xffff9634,0xffff9630,0xffff922c,0xffff9228,0xffff8e24
	.long	0xffff8e20,0xffff8a1c,0xffff8a18,0xffff8614,0xffff8610,0xffff820c,0xffff8208,0xffff7d04
	.long	0xffff7900,0xffff7900,0xffff7500,0xffff7100,0xffff6d00,0xffff6900,0xffff6500,0xffff6100
	.long	0xffff5d00,0xffff5900,0xffff5500,0xffff5100,0xffff4d00,0xffff4900,0xffff4500,0xffff4100
	.long	0xffff3c00,0xffff3c00,0xffff3800,0xffff3400,0xffff3000,0xffff2c00,0xffff2800,0xffff2400
	.long	0xffff2000,0xffff1c00,0xffff1800,0xffff1400,0xffff1000,0xffff0c00,0xffff0800,0xffff0400
	.long	0xffff0000,0xffff0000,0xfffb0000,0xfff70000,0xfff70000,0xfff30000,0xffef0000,0xffeb0000
	.long	0xffeb0000,0xffe70000,0xffe30000,0xffe30000,0xffdf0000,0xffdb0000,0xffd70000,0xffd70000
	.long	0xffd30000,0xffcf0000,0xffcf0000,0xffcb0000,0xffc70000,0xffc30000,0xffc30000,0xffbe0000
	.long	0xffba0000,0xffba0000,0xffb60000,0xffb20000,0xffae0000,0xffae0000,0xffaa0000,0xffa60000
	.long	0xffa20000,0xffa20000,0xff9e0404,0xff9a0404,0xff960808,0xff920808,0xff8e0c0c,0xff8e0c0c
	.long	0xff8a1010,0xff861010,0xff821414,0xff7d1414,0xff791818,0xff791818,0xff751c1c,0xff711c1c
	.long	0xff6d2020,0xff692020,0xff652424,0xff652424,0xff612828,0xff5d2828,0xff592c2c,0xff552c2c
	.long	0xff513030,0xff513030,0xff4d3434,0xff493434,0xff453838,0xff413838,0xff3c3c3c,0xff3c3c3c
palette_1:
	.long	0xff000000,0xff000000,0xff000004,0xff00000c,0xff000010,0xff000018,0xff000020,0xff000024
	.long	0xff00002c,0xff000030,0xff000038,0xff000041,0xff000045,0xff00004d,0xff000051,0xff000059
	.long	0xff000061,0xff000065,0xff00006d,0xff000075,0xff000079,0xff000082,0xff000086,0xff00008e
	.long	0xff000096,0xff00009a,0xff0000a2,0xff0000a6,0xff0000ae,0xff0000b6,0xff0000ba,0xff0000c3
	.long	0xff0000cb,0xff0004cb,0xff000ccb,0xff0010cf,0xff0018cf,0xff001cd3,0xff0024d3,0xff0028d3
	.long	0xff0030d7,0xff0038d7,0xff003cdb,0xff0045db,0xff0049db,0xff0051df,0xff0055df,0xff005de3
	.long	0xff0065e3,0xff0069e3,0xff0071e7,0xff0075e7,0xff007deb,0xff0082eb,0xff008aeb,0xff008eef
	.long	0xff0096ef,0xff009ef3,0xff00a2f3,0xff00aaf3,0xff00aef7,0xff00b6f7,0xff00bafb,0xff00c3fb
	.long	0xff00cbff,0xff04cbff,0xff0ccbff,0xff14cfff,0xff1ccfff,0xff24d3ff,0xff2cd3ff,0xff34d3ff
	.long	0xff3cd7ff,0xff45d7ff,0xff4ddbff,0xff55dbff,0xff5ddbff,0xff65dfff,0xff6ddfff,0xff75e3ff
	.long	0xff7de3ff,0xff86e3ff,0xff8ee7ff,0xff96e7ff,0xff9eebff,0xffa6ebff,0xffaeebff,0xffb6efff
	.long	0xffbeefff,0xffc7f3ff,0xffcff3ff,0xffd7f3ff,0xffdff7ff,0xffe7f7ff,0xffeffbff,0xfff7fbff
	.long	0xffffffff,0xfffbffff,0xfff7ffff,0xfff3ffff,0xffebffff,0xffe7ffff,0xffe3ffff,0xffdbffff
	.long	0xffd7ffff,0xffd3ffff,0xffcbffff,0xffc7ffff,0xffc3ffff,0xffbaffff,0xffb6ffff,0xffb2ffff
	.long	0xffaaffff,0xffa6ffff,0xffa2ffff,0xff9effff,0xff96ffff,0xff92ffff,0xff8effff,0xff86ffff
	.long	0xff82ffff,0xff7dffff,0xff75ffff,0xff71ffff,0xff6dffff,0xff65ffff,0xff61ffff,0xff5dffff
	.long	0xff55ffff,0xff51ffff,0xff4dffff,0xff49ffff,0xff41ffff,0xff3cffff,0xff38ffff,0xff30ffff
	.long	0xff2cffff,0xff28ffff,0xff20ffff,0xff1cffff,0xff18ffff,0xff10ffff,0xff0cffff,0xff08ffff
	.long	0xff00ffff,0xff00fbff,0xff00f7ff,0xff00f3ff,0xff00ebff,0xff00e7ff,0xff00e3ff,0xff00dbff
	.long	0xff00d7ff,0xff00d3ff,0xff00cbff,0xff00c7ff,0xff00c3ff,0xff00baff,0xff00b6ff,0xff00b2ff
	.long	0xff00aaff,0xff00a6ff,0xff00a2ff,0xff009eff,0xff0096ff,0xff0092ff,0xff008eff,0xff0086ff
	.long	0xff0082ff,0xff007dff,0xff0075ff,0xff0071ff,0xff006dff,0xff0065ff,0xff0061ff,0xff005dff
	.long	0xff0055ff,0xff0051ff,0xff004dff,0xff0049ff,0xff0041ff,0xff003cff,0xff0038ff,0xff0030ff
	.long	0xff002cff,0xff0028ff,0xff0020ff,0xff001cff,0xff0018ff,0xff0010ff,0xff000cff,0xff0008ff
	.long	0xff0000ff,0xff0000fb,0xff0000f7,0xff0000f3,0xff0000ef,0xff0000eb,0xff0000e7,0xff0000e3
	.long	0xff0000df,0xff0000db,0xff0000d7,0xff0000d3,0xff0000cf,0xff0000cb,0xff0000c7,0xff0000c3
	.long	0xff0000be,0xff0000ba,0xff0000b6,0xff0000b2,0xff0000ae,0xff0000aa,0xff0000a6,0xff0000a2
	.long	0xff00009e,0xff00009a,0xff000096,0xff000092,0xff00008e,0xff00008a,0xff000086,0xff000082
	.long	0xff00007d,0xff000079,0xff000075,0xff000071,0xff00006d,0xff000069,0xff000065,0xff000061
	.long	0xff00005d,0xff000059,0xff000055,0xff000051,0xff00004d,0xff000049,0xff000045,0xff000041
	.long	0xff00003c,0xff000038,0xff000034,0xff000030,0xff00002c,0xff000028,0xff000024,0xff000020
	.long	0xff00001c,0xff000018,0xff000014,0xff000010,0xff00000c,0xff000008,0xff000000,0xff000000
palette_2:
	.long	0xff000000,0xff9208e7,0xff9208e3,0xff9608e3,0xff9a04df,0xff9e04df,0xff9e04db,0xffa204db
	.long	0xffa600d7,0xffaa00d7,0xffaa00d3,0xffae00cf,0xffb200cf,0xffb600cb,0xffb600c7,0xffba00c7
	.long	0xffbe00c3,0xffbe00be,0xffc300be,0xffc700ba,0xffc700b6,0xffcb00b6,0xffcf00b2,0xffcf00ae
	.long	0xffd300aa,0xffd700aa,0xffd700a6,0xffdb04a2,0xffdb049e,0xffdf049e,0xffdf049a,0xffe30896
	.long	0xffe30892,0xffe70892,0xffe7088e,0xffeb0c8a,0xffeb0c86,0xffef0c82,0xffef1082,0xffef107d
	.long	0xfff31479,0xfff31475,0xfff31475,0xfff71871,0xfff7186d,0xfff71c69,0xfffb1c65,0xfffb2065
	.long	0xfffb2061,0xfffb245d,0xffff2859,0xffff2859,0xffff2c55,0xffff2c51,0xffff304d,0xffff344d
	.long	0xffff3449,0xffff3845,0xffff3c45,0xffff3c41,0xffff413c,0xffff453c,0xffff4538,0xffff4934
	.long	0xffff4d34,0xffff4d30,0xffff512c,0xffff552c,0xffff5928,0xffff5928,0xfffb5d24,0xfffb6120
	.long	0xfffb6520,0xfffb651c,0xfff7691c,0xfff76d18,0xfff77118,0xfff37514,0xfff37514,0xfff37914
	.long	0xffef7d10,0xffef8210,0xffef820c,0xffeb860c,0xffeb8a0c,0xffe78e08,0xffe79208,0xffe39208
	.long	0xffe39608,0xffdf9a04,0xffdf9e04,0xffdb9e04,0xffdba204,0xffd7a600,0xffd7aa00,0xffd3aa00
	.long	0xffcfae00,0xffcfb200,0xffcbb600,0xffc7b600,0xffc7ba00,0xffc3be00,0xffbebe00,0xffbec300
	.long	0xffbac700,0xffb6c700,0xffb6cb00,0xffb2cf00,0xffaecf00,0xffaad300,0xffaad700,0xffa6d700
	.long	0xffa2db04,0xff9edb04,0xff9edf04,0xff9adf04,0xff96e308,0xff92e308,0xff92e708,0xff8ee708
	.long	0xff8aeb0c,0xff86eb0c,0xff82ef0c,0xff82ef10,0xff7def10,0xff79f314,0xff75f314,0xff75f314
	.long	0xff71f718,0xff6df718,0xff69f71c,0xff65fb1c,0xff65fb20,0xff61fb20,0xff5dfb24,0xff59ff28
	.long	0xff59ff28,0xff55ff2c,0xff51ff2c,0xff4dff30,0xff4dff34,0xff49ff34,0xff45ff38,0xff45ff3c
	.long	0xff41ff3c,0xff3cff41,0xff3cff45,0xff38ff45,0xff34ff49,0xff34ff4d,0xff30ff4d,0xff2cff51
	.long	0xff2cff55,0xff28ff59,0xff28ff59,0xff24fb5d,0xff20fb61,0xff20fb65,0xff1cfb65,0xff1cf769
	.long	0xff18f76d,0xff18f771,0xff14f375,0xff14f375,0xff14f379,0xff10ef7d,0xff10ef82,0xff0cef82
	.long	0xff0ceb86,0xff0ceb8a,0xff08e78e,0xff08e792,0xff08e392,0xff08e396,0xff04df9a,0xff04df9e
	.long	0xff04db9e,0xff04dba2,0xff00d7a6,0xff00d7aa,0xff00d3aa,0xff00cfae,0xff00cfb2,0xff00cbb6
	.long	0xff00c7b6,0xff00c7ba,0xff00c3be,0xff00bebe,0xff00bec3,0xff00bac7,0xff00b6c7,0xff00b6cb
	.long	0xff00b2cf,0xff00aecf,0xff00aad3,0xff00aad7,0xff00a6d7,0xff04a2db,0xff049edb,0xff049edf
	.long	0xff049adf,0xff0896e3,0xff0892e3,0xff0892e7,0xff088ee7,0xff0c8aeb,0xff0c86eb,0xff0c82ef
	.long	0xff1082ef,0xff107def,0xff1479f3,0xff1475f3,0xff1475f3,0xff1871f7,0xff186df7,0xff1c69f7
	.long	0xff1c65fb,0xff2065fb,0xff2061fb,0xff245dfb,0xff2859ff,0xff2859ff,0xff2c55ff,0xff2c51ff
	.long	0xff304dff,0xff344dff,0xff3449ff,0xff3845ff,0xff3c45ff,0xff3c41ff,0xff413cff,0xff453cff
	.long	0xff4538ff,0xff4934ff,0xff4d34ff,0xff4d30ff,0xff512cff,0xff552cff,0xff5928ff,0xff5928ff
	.long	0xff5d24fb,0xff6120fb,0xff6520fb,0xff651cfb,0xff691cf7,0xff6d18f7,0xff7118f7,0xff7514f3
	.long	0xff7514f3,0xff7914f3,0xff7d10ef,0xff8210ef,0xff820cef,0xff860ceb,0xff8a0ceb,0xff8e08e7

	; Video initialization parameters.  Most of these I don't
	;  understand; what documentation I have has been saved here as
	;  comments.  The comment "magic" means "meaning unknown".
	;
	; These lists are taken pretty much directly from tatest, which
	;  says of them "These values mainly from Dans 3dtest
	;  program...".
	;
	; Since these are longword stores, the offsets must always be
	;  multiples of 4; the terminator is any value which isn't.
	;  (We use 1, but set_params accepts anything whose low two
	;  bits are nonzero.)
	;
	.macro	param	offset, value
	.word	$(offset)
	.long	$(value)
	.endm
	.macro	endparam
	.word	1
	.endm
	.align	2
three_d_params:
	param	0x80a8, 0x15d1c951	; magic
	param	0x80a0, 0x00000020	; magic
	param	0x8008, 0x00000000	; TA out of reset
	param	0x8048, 0x00000009	; "alpha config" - ?
	param	0x8068, [FRAME_X<<16]|0	; pixel clipping x
	param	0x806c, [FRAME_Y<<16]|0	; pixel clipping y
	param	0x8110, 0x00093f39	; magic
	param	0x8098, 0x00800408	; magic
	param	0x804c, [FRAME_X*2]/8	; "display align" - ?
	param	0x8078, 0f1.0
	param	0x8084, 0x00000000	; magic
	param	0x8030, 0x00000101	; magic
	param	0x80b0, 0x007f7f7f	; fog table colour
	param	0x80b4, 0x007f7f7f	; fog vertex colour
	param	0x80c0, 0x00000000	; colour clamp min
	param	0x80bc, 0xffffffff	; colour clamp max
	param	0x8080, 0x00000007	; magic
	param	0x8074, 0x00000001	; "cheap shadow" - ?
	param	0x807c, 0x0027df77	; magic
	param	0x8008, 0x00000001	; TA into reset
	param	0x8008, 0x00000000	; TA out of reset
	param	0x80e4, 0x00000000	; "stride width" - ?
	param	0x6884, 0x00000000	; disable all interrupt enables
	param	0x6930, 0x00000000
	param	0x6938, 0x00000000
	param	0x6900, 0xffffffff	; reset all pending interrupts
	param	0x6908, 0xffffffff
	param	0x6930, 0x002807ec	; re-enable some events (which?)
	param	0x6938, 0x0000000e
	param	0x80b8, 0x0000ff07	; fog density (meanings?)
	param	0x80b4, 0x007f7f7f	; fog vertex colour
	param	0x80b0, 0x007f7f7f	; fog table colour
	param	0x8108, 0x00000003	; 32bit palette (?)
	endparam
screen_params:
	param	0x80e8, 0x00160000	; screen control (?)
	param	0x8044, 0x00800000	; pixel mode ("vb+0x11" - ?)
	param	0x805c, 0x00000000	; size modulo and display lines ("vb+0x17" - ?)
	param	0x80d0, 0x00000100	; interlace flags (bit meanings?)
	param	0x80d8, 0x020c0359	; magic
	param	0x80cc, 0x001501fe	; magic
	param	0x80d4, 0x007e0345	; horizontal border (meaning? - see below)
	param	0x80dc, 0x00240204	; vertical position (meaning?)
	param	0x80e0, 0x07d6c63f	; sync control (meaning?)
	param	0x80ec, 0x000000a4	; horizontal position (meaning?)
	param	0x80f0, 0x00120012	; vertical border (meanings?)
	param	0x80c8, 0x03450000	; "set to same as border H in 80d4" - ?
	param	0x8068, [FRAME_X-1]<<16	; (X resolution - 1) << 16
	param	0x806c, [FRAME_Y-1]<<16	; (Y resolution - 1) << 16
	param	0x804c, 0x000000a0	; "display align" - ?
	param	0x8118, 0x00008040	; magic
	param	0x80f4, 0x00000401	; "anti-aliasing" - ?
	param	0x8048, 0x00000009	; "alpha config" - ?
	param	0x7814, 0x00000000	; "more interrupt control stuff" - ?
	param	0x7834, 0x00000000
	param	0x7854, 0x00000000
	param	0x7874, 0x00000000
	param	0x78bc, 0x4659404f
	param	0x8040, 0x00000000	; border colour
	endparam
	; "???" here means "not documented in tatest at all"
	; The "2" in these is the offset from the beginning of the
	;  param to the place where we store the (longword) value.
cmdlist_params:
	param	0x8008, 0x00000001	; TA into reset
	param	0x8008, 0x00000000	; TA out of reset
cmdlist_param_tilebuf_a = 2 + . - cmdlist_params
	param	0x8124, 0
	param	0x812c, 0		; ???
cmdlist_param_cmdlist = 2 + . - cmdlist_params
	param	0x8128, 0
	param	0x8130, 0		; ???
	param	0x813c, [[[FRAME_Y/32]-1]<<16] | [[FRAME_X/32]-1]
cmdlist_param_tilebuf_b = 2 + . - cmdlist_params
	param	0x8164, 0
	param	0x8140, 0x00100002	; ???
	param	0x8144, 0x80000000	; confirm settings
	endparam

	.align	2
main:
; Program entry (reached from the jump stub at the load address).
; Sets up the CPU state, runs the one-time initializers, then renders
;  frames until a character arrives on the serial line, at which point
;  it falls into throw_out to return to cdcode.
;
; Things we need to do here:
;	- Set up the FPU (including clearing SR.FD)
;	- Set up the VBR (including clearing SR.BL and SR.RB)
;	- Save r10-r15 against a possible return to cdcode
; Things we do not need to do because cdcode has done them:
;	- Set up the SCIF (r14 is the SCIF's base address)
;	- Set up a stack (r15 is cdcode's stack pointer)
; We do, though, need to put r14 in the gbr; all we use the gbr for is
;  access to the SCIF.  We could use other addressing modes, but the
;  gbr-relative modes have 8 bits of offset, whereas the other
;  register-plus-offset modes have only 4.
	; Push r14..r10 on cdcode's stack; throw_out pops them in the
	;  reverse order.  r15 itself is saved in throw_sp below.
	mov.l	r14,@-r15
	mov.l	r13,@-r15
	mov.l	r12,@-r15
	mov.l	r11,@-r15
	mov.l	r10,@-r15
	ldc	r14,gbr
	; fpscr = 0
	SETS.L	#0,r1
	lds	r1,fpscr
	; vbr = intvec, then clear FD, RB and BL in sr.
	SETS.L	#intvec,r0
	ldc	r0,vbr
	stc	sr,r1
	SETS.L	#~[SR_FD|SR_RB|SR_BL],r2
	and	r2,r1
	ldc	r1,sr
	; Note that r0-r7 may have changed if we switched banks.
	; Remember the stack pointer (now pointing at the saved r10) so
	;  throw_out can unwind from any stack depth.
	SETS.L	#throw_sp,r0
	mov.l	r15,@r0
; Application code begins here.
	bsr	init_rng
	 nop
	bsr	init_maze
	 nop
	bsr	print_maze
	 nop
	bsr	clear_vram
	 nop
	bsr	init_maple
	 nop
	bsr	init_powervr
	 nop
	bsr	init_video
	 nop
	bsr	init_palette
	 nop
	bsr	init_twiddling
	 nop
	bsr	init_textures
	 nop
	bsr	init_tiledesc
	 nop
	bsr	init_3dvalues
	 nop
	; Main loop: draw a frame, then poll the serial line.  We keep
	;  looping for as long as nbgetchar returns a negative value in
	;  r0 (cmp/pz sets T for r0 >= 0; bf loops while T is clear).
1:	bsr	one_frame
	 nop
	SETS.L	#nbgetchar,r0
	jsr	@r0
	 nop
	cmp/pz	r0
	bf	1b
; Abrupt return to cdcode: reload the stack pointer saved by main,
;  pop r10-r14, and jump through r11.
; NOTE(review): this relies on the r11 that cdcode handed us being an
;  address it wants us to jump back to - confirm against cdcode.  pr
;  is not saved or restored here.
throw_out:
	SETS.L	#throw_sp,r0
	mov.l	@r0,r15
	mov.l	@r15+,r10
	mov.l	@r15+,r11
	mov.l	@r15+,r12
	mov.l	@r15+,r13
	mov.l	@r15+,r14
	jmp	@r11
	 nop

; Debugging dump of the maze.  For every cell, in x-fastest order,
;  prints "(x,y,z)" followed by six '0'/'1' digits and CR LF.  The
;  digits are bits 6,5,4,2,1,0 of the cell's maze[] word, i.e.
;  MZC_MZ, MZC_MY, MZC_MX, MZC_PZ, MZC_PY, MZC_PX in that order
;  (bit 3, MZC_S, is skipped).
; Register use: r8 = maze, r9/r10/r11 = MZX/MZY/MZZ, r12/r13/r14 =
;  x/y/z, r2 = cell bits being shifted out.  None of r8-r14 is
;  preserved for the caller.
; Assumes putchar and printdec take their argument in r1 and leave
;  r2 and r8-r14 alone - TODO confirm against their definitions.
print_maze:
	sts.l	pr,@-r15
	SETS.L	#maze,r8
	SETS.L	#MZX,r9
	SETS.L	#MZY,r10
	SETS.L	#MZZ,r11
	SETS.L	#0,r14		; z
3:	SETS.L	#0,r13		; y
2:	SETS.L	#0,r12		; x
1:	bsr	putchar
	 mov	#'(,r1
	bsr	printdec
	 mov	r12,r1
	bsr	putchar
	 mov	#',,r1
	bsr	printdec
	 mov	r13,r1
	bsr	putchar
	 mov	#',,r1
	bsr	printdec
	 mov	r14,r1
	bsr	putchar
	 mov	#'),r1
	; r0 = 2 * ((z*MZY + y)*MZX + x), the byte offset into maze[]
	mul.l	r10,r14
	sts	macl,r0
	add	r13,r0
	mul.l	r9,r0
	sts	macl,r0
	add	r12,r0
	SHLL	#1,r0
	mov.w	@(r0,r8),r2
	; Move bit 6 up to bit 31 so each shll drops the next flag
	;  into T, which movt/add turn into '0' or '1'.
	SHLL	#25,r2
	shll	r2
	movt	r1
	bsr	putchar
	 add	#'0,r1
	shll	r2
	movt	r1
	bsr	putchar
	 add	#'0,r1
	shll	r2
	movt	r1
	bsr	putchar
	 add	#'0,r1
	; extra shift: discard bit 3 (MZC_S) without printing it
	shll	r2
	shll	r2
	movt	r1
	bsr	putchar
	 add	#'0,r1
	shll	r2
	movt	r1
	bsr	putchar
	 add	#'0,r1
	shll	r2
	movt	r1
	bsr	putchar
	 add	#'0,r1
	; CR LF
	bsr	putchar
	 mov	#13,r1
	bsr	putchar
	 mov	#10,r1
	; Advance x, then y, then z; cmp/hi sets T while limit > index.
	add	#1,r12
	cmp/hi	r12,r9
	bt	1b
	add	#1,r13
	cmp/hi	r13,r10
	bt	2b
	add	#1,r14
	cmp/hi	r14,r11
	bt	3b
	lds.l	@r15+,pr
	rts
	 nop

; We assume MZC_Px can be converted to MZC_Mx by shifting by four bits.
.if [[MZC_PX << 4] != MZC_MX] | [[MZC_PY << 4] != MZC_MY] | [[MZC_PZ << 4] != MZC_MZ]
.error Update init_maze, or fix MZC_Px and MZC_Mx!
.endif
; Generate a random maze.
;
; Phase 1 clears maze[] and mcells[] (the per-cell "clump" IDs) and
;  fills mwalls[] with one entry per interior wall.  A wall entry is
;  a longword: z in bits 31-24, y in bits 23-16, x in bits 15-8, and
;  one of MZC_MX/MZC_MY/MZC_MZ in the low byte, naming the wall on
;  the minus-X/Y/Z side of cell (x,y,z).  The count is cross-checked
;  against MZWALLS; a mismatch calls panic.
;
; Phase 2 repeatedly removes a random wall from mwalls[].  If the
;  two cells it separates are already in the same clump, nothing
;  else happens; otherwise the wall is opened (MZC_Mx set in the
;  chosen cell, the matching MZC_Px in its neighbour) and the clumps
;  are merged, the numerically greater ID being renumbered to the
;  lesser.  We are done when every cell carries clump ID 1.  Running
;  out of walls first calls panic.
;
; Uses r0-r14 without preserving any of them; pr is saved on the
;  stack because of the nested bsr's.
init_maze:
	sts.l	pr,@-r15
	; Clear all maze cells and all cell clump IDs.
	SETS.L	#MZCELLS,r1
	SETS.L	#0,r2
	SETS.L	#maze+[2*MZCELLS],r3
	SETS.L	#mcells+[2*MZCELLS],r4
1:	dt	r1
	mov.w	r2,@-r3
	bf/s	1b
	 mov.w	r2,@-r4
	; Initialize the walls.
	; r11 = next free slot in mwalls, r10 = slots left plus one,
	;  r14/r13/r12 = x/y/z counting down to 0.
	SETS.L	#mwalls,r11
	SETS.L	#MZWALLS+1,r10
	SETS.L	#MZX-1,r14
4:	SETS.L	#MZY-1,r13
3:	SETS.L	#MZZ-1,r12
	; r2 = z<<24 | y<<16 | x<<8; the wall bit gets or'ed in at 9:
2:	mov	r12,r2
	SHLL	#8,r2
	or	r13,r2
	SHLL	#8,r2
	or	r14,r2
	SHLL	#8,r2
	; A cell has a minus-side wall on each axis where its
	;  coordinate is nonzero.
	tst	r12,r12
	bt	1f
	bsr	9f
	 mov	#MZC_MZ,r3
1:	tst	r13,r13
	bt	1f
	bsr	9f
	 mov	#MZC_MY,r3
1:	tst	r14,r14
	bt	1f
	bsr	9f
	 mov	#MZC_MX,r3
1:	add	#-1,r12
	cmp/pz	r12
	bt	2b
	add	#-1,r13
	cmp/pz	r13
	bt	3b
	add	#-1,r14
	cmp/pz	r14
	bt	4b
	; r10 should now be exactly 1 (we stored MZWALLS walls);
	;  anything else means MZWALLS is wrong.
	dt	r10
	bt	1f
8:	bsr	panic
	 nop
	; Local subroutine: append wall (r2 | r3) to mwalls.  Panics
	;  if that would be more than MZWALLS entries.
9:	dt	r10
	bt/s	8b
	 or	r2,r3
	mov.l	r3,@r11
	rts
	 add	#4,r11
1:	SETS.L	#1,r14		; next clump ID
	SETS.L	#MZCELLS-1,r13	; highest non-1 clump ID
	SETS.L	#mcells,r12
	SETS.L	#MZWALLS,r11	; number of walls remaining
	SETS.L	#mwalls,r10
	SETS.L	#maze,r9
	; Top of the wall-removal loop.  First scan downwards from
	;  r13 for a cell whose clump ID is not 1 (dt sets T exactly
	;  when the ID was 1); r13 never needs to go back up.
6:
1:	mov	r13,r0
	SHLL	#1,r0
	mov.w	@(r0,r12),r1
	dt	r1
	bf	1f
	add	#-1,r13
	cmp/pz	r13
	bt	1b
	; No cells with non-1 clump IDs - all done!
	lds.l	@r15+,pr
	rts
	 nop
2:	bsr	panic
	 nop
	; Still work to do; there had better be walls left.
1:	cmp/pl	r11
	bf	2b
	bsr	random_mod
	 mov	r11,r0
	mov	r0,r8		; random index to look at
	SHLL	#2,r0
	mov.l	@(r0,r10),r1	; wall in question
	; Remove it from the list: shrink the list by one and, unless
	;  the chosen entry was the last one, move the last entry
	;  into its slot (r0 still holds the chosen byte offset).
	add	#-1,r11
	cmp/hi	r8,r11
	bf	1f
	mov	r11,r2
	SHLL	#2,r2
	add	r10,r2
	mov.l	@r2,r3
	mov.l	r3,@(r0,r10)
	; Unpack the wall: r2 = z, r3 = y, r4 = x (after the extu.b's),
	;  then r6 = (z*MZY + y)*MZX + x.
1:	mov	r1,r4
	SHLR	#8,r4
	mov	r4,r3
	SHLR	#8,r3
	mov	r3,r2
	SHLR	#8,r2
	SETS.L	#MZY,r5
	mul.l	r2,r5
	sts	macl,r6
	extu.b	r3,r3
	add	r3,r6
	.if	MZX != MZY
	SETS.L	#MZX,r5
	.endif
	mul.l	r6,r5
	sts	macl,r6
	extu.b	r4,r4
	add	r4,r6		; linear index of maze cell
	; linear index of cell to look at is in r6
	; walls element is in r1
	; Work out the neighbour on the far side of the wall: one
	;  step back along whichever axis the wall bit names.
	mov	r6,r7
	extu.b	r1,r2
	mov	r2,r0
	cmp/eq	#MZC_MX,r0
	bf	1f
	bra	2f
	 add	#-1,r7
1:	cmp/eq	#MZC_MY,r0
	bf	1f
	bra	2f
	 add	#-MZX,r7
1:	cmp/eq	#MZC_MZ,r0
	bf	1f
	SETS.L	#MZX*MZY,r0
	bra	2f
	 sub	r0,r7
	; Low byte was none of the three wall bits: corrupt entry.
1:	bsr	panic
	 nop
2:	; randomly-chosen cell index is in r6
	; adjacent cell's index is in r7
	; walls element is in r1
	; bit is in r2
	mov	r6,r0
	SHLL	#1,r0
	mov.w	@(r0,r12),r3	; chosen cell clump ID
	mov	r7,r0
	SHLL	#1,r0
	mov.w	@(r0,r12),r4	; adjacent cell clump ID
	; From here until it is reloaded, r0 is the byte offset of
	;  the ADJACENT cell in mcells; two of the stores below rely
	;  on that.
	tst	r3,r3
	bt	1f
	tst	r4,r4
	bt	2f
	cmp/eq	r3,r4
	bt	9f		; both cells are the same clump; do nothing
	; opening between two different clumps
	; renumber the greater to match the lesser
	; (make r4 the greater and r3 the lesser, swapping if needed)
	cmp/hi	r3,r4
	bt	3f
	xor	r3,r4
	xor	r4,r3
	xor	r3,r4
3:	mov	r3,r1
	SETS.L	#MZCELLS,r8
	mov	r12,r5
	; Walk all of mcells, rewriting every r4 to r3.
5:	mov.w	@r5,r0
	cmp/eq	r0,r4
	bf	4f
	mov.w	r3,@r5
4:	dt	r8
	bf/s	5b
	 add	#2,r5
	bra	8f
	 nop
2:	; opening into new space - just store new clump ID
	; (chosen cell has an ID, adjacent has none: adjacent joins)
	bra	8f
	 mov.w	r3,@(r0,r12)
1:	tst	r4,r4
	bt	2f
	; opening from new space - just store new clump ID
	; (adjacent cell has an ID, chosen has none: chosen joins)
	mov	r6,r0
	SHLL	#1,r0
	bra	8f
	 mov.w	r4,@(r0,r12)
2:	; opening between two hitherto untouched cells - new clump
	; (first store goes to the adjacent cell, second to the chosen)
	mov.w	r14,@(r0,r12)
	mov	r6,r0
	SHLL	#1,r0
	mov.w	r14,@(r0,r12)
	add	#1,r14
8:	; bookkeeping done; just open wall
	mov	r6,r0
	SHLL	#1,r0
	mov.w	@(r0,r9),r1
	or	r2,r1
	mov.w	r1,@(r0,r9)
	; This shift assumes that MZC_Mx can be converted into MZC_Px
	;  by a right shift of four bits.
	SHLR	#4,r2
	mov	r7,r0
	SHLL	#1,r0
	mov.w	@(r0,r9),r1
	or	r2,r1
	mov.w	r1,@(r0,r9)
9:	; all done, including any opening
	bra	6b
	 nop

	SETCONST

; Returns a random integer modulo r0.
;
; In:	r0 = modulus (expected to be positive; zero is not handled)
; Out:	r0 = (next word from rng_state, shifted right one bit to make
;	      it non-negative) mod the modulus
;
; Words are consumed from rng_state[rng_hand], with rng_hand counting
;  down.  When the word at index 0 is consumed the pool is re-stirred
;  (which also resets rng_hand) via stir_rng_preserving.
;
; The remainder is computed in double precision as r - trunc(r/n)*n,
;  so fpscr.PR is set for the duration and the caller's fpscr is put
;  back before returning.
;
; Uses r1, r2, r3, r4, dr0, dr2, fpul.
; pr is saved on the stack around the nested call: stir_rng_preserving
;  returns with pr pointing back into this routine, so without the
;  save our own rts would never get back to the caller.
random_mod:
	lds	r0,fpul
	sts	fpscr,r1
	mov	r1,r2		; r2 = caller's fpscr, restored at exit
	SETS.L	#FPSCR_PR,r0
	or	r0,r1
	lds	r1,fpscr
	.pr	1
	float	fpul,dr0	; dr0 = modulus
	SETS.L	#rng_hand,r1
	mov.b	@r1,r4
	SETS.L	#rng_state,r3
	mov	r4,r0
	SHLL	#2,r0
	mov.l	@(r0,r3),r0	; r0 = rng_state[rng_hand]
	cmp/pl	r4
	bt	1f
	; Just used the last word (index 0): refill the pool.  The
	;  helper preserves r0-r7, so r0 is still our random word.
	sts.l	pr,@-r15
	bsr	stir_rng_preserving
	 nop
	lds.l	@r15+,pr
	bra	2f
	 nop
1:	add	#-1,r4
	mov.b	r4,@r1
2:	SHLR	#1,r0		; drop to 31 bits so float sees it as >= 0
	sts	fpul,r1
	lds	r0,fpul
	float	fpul,dr2	; dr2 = r
	fdiv	dr0,dr2		; dr2 = r / n
	ftrc	dr2,fpul
	float	fpul,dr2	; dr2 = trunc(r / n)
	fmul	dr0,dr2		; dr2 = trunc(r / n) * n
	ftrc	dr2,fpul
	sts	fpul,r3
	lds	r2,fpscr
	.pr	0
	rts
	 sub	r3,r0		; r0 = r - trunc(r / n) * n

; Seed the RNG: stir the pool, mix in the RTC (see rng_rtc), and stir
;  again so the RTC contribution is spread over the whole pool.
;
; We deliberately don't initialize rng_state here; we stir the area
;  first thing, and this way we may get some extra entropy from
;  whatever is lying around in that memory.  We don't need
;  cryptographic levels of unguessability here.
;
; Clobbers whatever stir_rng and rng_rtc clobber (r0-r7 in the code
;  visible here).
init_rng:
	sts.l	pr,@-r15
	bsr	stir_rng
	 nop
	bsr	rng_rtc
	 nop
	bsr	stir_rng
	 nop
	lds.l	@r15+,pr
	rts
	 nop

; Mix some real-time-clock entropy into the RNG pool.
;
; 1. Spin until G2DRAIN_BIT is clear in the word at G2DRAIN_ADDR.
;    NOTE(review): presumably this waits for the G2 bus FIFO to drain
;    before touching the RTC - confirm against regs.s.
; 2. Read the RTC until three consecutive reads agree with the
;    previous value, giving a stable reading in r4.
; 3. Keep reading until the RTC value changes, counting the reads in
;    r5; how far into the current RTC tick we are is the
;    unpredictable part.
; 4. Add r4 to rng_state[0] and r5 to rng_state[1].
;
; pr is parked in r3 rather than on the stack, and we return with
;  jmp @r3; this relies on read_rtc_once leaving r3 alone (it says it
;  disturbs only r0-r2).
; Uses r0-r7.
rng_rtc:
	sts	pr,r3
	SETS.L	#G2DRAIN_ADDR,r1
	SETS.L	#G2DRAIN_BIT,r2
1:	mov.l	@r1,r0
	tst	r2,r0
	bf	1b
	SETS.L	#G2RTC_BASE,r7
	SETS.L	#0,r1
	; r6 = value to match, r5 = matching reads still required.
	;  Any mismatch restarts with the new value as the reference.
1:	SETS.L	#3,r5
	mov	r1,r6
2:	bsr	read_rtc_once
	 nop			; the compare must see the NEW r1, so it
				;  cannot live in the delay slot
	cmp/eq	r1,r6
	bf	1b
	dt	r5
	bf	2b
	mov	r1,r4		; r4 = stable RTC reading
	SETS.L	#0,r5
	; Count reads (at least one) until the clock moves on.
1:	bsr	read_rtc_once
	 add	#1,r5
	cmp/eq	r1,r4
	bt	1b
	SETS.L	#rng_state,r1
	mov.l	@r1,r0
	add	r4,r0
	mov.l	r0,@r1
	mov.l	@(4,r1),r0
	add	r5,r0
	jmp	@r3
	 mov.l	r0,@(4,r1)
	; Read the RTC once.
	; Assumes G2RTC_BASE is in r7 and r0-r2 are scratch.
	; Returned value is in r1.  Disturbs nothing else.
	; The result is assembled from two longword reads: the low 16
	;  bits of the word at @r7 become the high half and the low 16
	;  bits of the word at @(4,r7) become the low half.
	; NOTE(review): the "r1/r0" operand presumably hands the SHLL
	;  macro r0 as a scratch register - confirm against the macro.
read_rtc_once:
	mov.l	@r7,r1
	mov.l	@(4,r7),r2
	SHLL	#16,r1/r0
	extu.w	r2,r2
	rts
	 or	r2,r1

; Wrapper around stir_rng for callers that cannot afford to lose
;  registers: saves r0-r7 (everything stir_rng says it uses) and pr
;  on the stack, calls stir_rng, and restores them all.
; Note that the pr restored is the one we were entered with, so on
;  return pr still points just past the caller's own bsr/jsr; a
;  caller that itself needs to rts must save pr separately.
stir_rng_preserving:
	mov.l	r0,@-r15
	mov.l	r1,@-r15
	mov.l	r2,@-r15
	mov.l	r3,@-r15
	sts.l	pr,@-r15
	mov.l	r4,@-r15
	mov.l	r5,@-r15
	mov.l	r6,@-r15
	bsr	stir_rng
	 mov.l	r7,@-r15	; last push rides in the delay slot
	; Pops mirror the pushes above, in exactly the reverse order.
	mov.l	@r15+,r7
	mov.l	@r15+,r6
	mov.l	@r15+,r5
	mov.l	@r15+,r4
	lds.l	@r15+,pr
	mov.l	@r15+,r3
	mov.l	@r15+,r2
	mov.l	@r15+,r1
	rts
	 mov.l	@r15+,r0

	; Stir the RNG pool (rng_state, RNG_STATE_WORDS longwords) in
	;  place and reset rng_hand to RNG_STATE_WORDS-1, i.e. mark
	;  the whole pool as unconsumed.
	;
	; Pass 1 folds every pool word into an accumulator (r1) with
	;  a CRC-style shift register: xor the word in, then 32 steps
	;  of "shift left; if a 1 fell out, xor in the polynomial".
	; Pass 2 walks the pool again; each word is replaced by the
	;  current accumulator, and the accumulator is advanced by
	;  adding the old word and doing 8 more shift/xor steps.
	;
	; Uses r0-r7.  Disturbs nothing else.
stir_rng:
	SETS.L	#0x12345678,r1
	SETS.L	#0x04c11db7,r2 ; edb88320 bit-reversed
	SETS.L	#rng_state,r3
	SETS.L	#RNG_STATE_WORDS,r4
	mov	r4,r5
	mov	r3,r6
	; Pass 1: r6 walks the pool, r5 counts words, r7 counts bits.
3:	mov.l	@r6+,r7
	xor	r7,r1
	SETS.L	#32,r7
2:	shll	r1
	bf	1f
	xor	r2,r1
1:	dt	r7
	bf	2b
	dt	r5
	bf	3b
	; Pass 2: r3 walks the pool, r4 counts words.
3:	mov.l	@r3,r7
	mov.l	r1,@r3
	add	r7,r1
	SETS.L	#8,r7
2:	shll	r1
	bf	1f
	xor	r2,r1
1:	dt	r7
	bf	2b
	dt	r4
	bf/s	3b
	 add	#4,r3
	; r3 now points just past the pool.  Get it pointing at
	;  rng_hand as cheaply as the data layout allows: nothing if
	;  rng_hand directly follows the pool, a short add if it is
	;  within signed-byte range, otherwise a full reload.
	.if	rng_hand == rng_state+[4*RNG_STATE_WORDS]
	; nothing
	.elif	@IS_SB[rng_hand-[rng_state+[4*RNG_STATE_WORDS]]]
	add	#rng_hand-[rng_state+[4*RNG_STATE_WORDS]],r3
	.else
	SETS.L	#rng_hand,r3
	.endif
	SETS.L	#RNG_STATE_WORDS-1,r0
	rts
	 mov.b	r0,@r3

; Zero all of video RAM (VRAM_SIZE bytes starting at VRAM_BASE_64)
;  using the SH-4 store queues.
;
; Both QACR registers are loaded with the area bits of VRAM_BASE_64,
;  the 64 bytes at STOREQ_BASE (both 32-byte queues) are filled with
;  zeros, and then one pref per 32-byte block of VRAM is issued to
;  the store-queue alias of that block, which writes the queue
;  contents out to it.
;
; Uses r0-r5.
;
; NOTE(review): the two stores at the end go to STOREQ_BASE+0 and
;  STOREQ_BASE+64.  If they are meant to touch each of the two queues
;  (e.g. to wait for them to finish), the second queue would be at
;  +32, not +64 - confirm against the SH-4 manual before changing.
clear_vram:
	SETS.L	#QACR0,r1
	SETS.L	#QACR1,r2
	SETS.L	#[[VRAM_BASE_64>>26]&7]<<2,r3
	SETS.L	#STOREQ_BASE+[4*16],r4
	SETS.L	#0,r5
	mov.l	r3,@r1
	mov.l	r3,@r2
	; Zero 16 longwords, working down from STOREQ_BASE+64; r4
	;  ends up equal to STOREQ_BASE.
	SETS.L	#16,r0
1:	dt	r0
	bf/s	1b
	 mov.l	r5,@-r4
	; One pref per 32 bytes of VRAM.
	SETS.L	#VRAM_SIZE/32,r1
	SETS.L	#[VRAM_BASE_64&0x03ffffc0]|0xe0000000,r2
1:	pref	@r2
	dt	r1
	bf/s	1b
	 add	#32,r2
	mov.l	r5,@r4
	add	#4*16,r4
	rts
	 mov.l	r5,@r4

; Apply a table of register settings built with the param/endparam
;  macros.
;
; In:	r1 = address of the table
; Each entry is a 16-bit offset followed by a 32-bit value; the value
;  is stored as a longword at VIDREG_BASE + offset (offset taken as
;  unsigned).  The table ends at the first offset whose low two bits
;  are not both zero.
; Because entries are 6 bytes long the value is not longword-aligned,
;  so it is fetched as two 16-bit halves, low half first.
; Uses r0, r2, r3, r4; r1 is left pointing just past the terminator.
set_params:
	; r1 points to params table
	SETS.L	#VIDREG_BASE,r2
1:	mov.w	@r1+,r0
	tst	#3,r0
	bf/s	1f
	 extu.w	r0,r0		; mov.w sign-extended; undo that
	mov.w	@r1+,r3		; low half of value
	mov.w	@r1+,r4		; high half of value
	SHLL	#16,r4
	extu.w	r3,r3
	or	r3,r4
	add	r2,r0
	bra	1b
	 mov.l	r4,@r0
1:
	rts
	 nop

; Initialize the maple bus by writing a fixed list of register
;  values.  The list at 9: is (address, value) longword pairs, ended
;  by an address of zero; each value is stored to its address in
;  order.
; Uses r0, r1, r2.
init_maple:
	mova	9f,r0
1:	mov.l	@r0+,r1
	tst	r1,r1
	bt	1f
	mov.l	@r0+,r2
	bra	1b
	 mov.l	r2,@r1
1:	rts
	 nop
	; mova needs a longword-aligned target
	.align	4
9:	.long	BUS_RESET,	BUS_RESET_VALUE
	.long	BUS_RESET2,	BUS_RESET2_VALUE
	.long	BUS_SPEED,	SPEED_2MBPS|[50000<<SPEED_TIMEOUT_SHIFT]
	.long	BUS_ENABLE,	BUS_ENABLE_VALUE
	.long	0

; Initialize the PowerVR: apply three_d_params, wait on the register
;  at 0xa05f810c, then apply screen_params.
;
; The wait is two spins: first until (reg & 0x7ff) is nonzero, then
;  until it is zero again.
; NOTE(review): r3 and r4 are loaded with 65536 but never used in the
;  code visible here - possibly the remains of a timeout that was
;  never wired up, so both spins are unbounded.
; Uses r0-r4 plus whatever set_params uses.
init_powervr:
	sts.l	pr,@-r15
	SETS.L	#three_d_params,r1
	bsr	set_params
	 nop
	SETS.L	#0xa05f810c,r1	; what does this point to?
	SETS.L	#0x000007ff,r2	; what does this mask mean?
	SETS.L	#65536,r4
	mov	r4,r3
	; wait for the masked field to become nonzero...
1:	mov.l	@r1,r0
	tst	r2,r0
	bt	1b
	; ...and then for it to return to zero
1:	mov.l	@r1,r0
	tst	r2,r0
	bf	1b
	SETS.L	#screen_params,r1
	bsr	set_params
	 nop
	lds.l	@r15+,pr
	rts
	 nop

; Set up the video output registers according to the attached cable.
;
; The cable type is read from port A bits 8 and 9 (after configuring
;  those pins as inputs with pullups) and kept in r9.  Bit 1 of the
;  cable type selects between the two timings used below: clear
;  gives 480 lines with the clock doubler on and no interlace; set
;  gives 240 lines per field, interlaced, with a one-line-plus-one
;  modulo.  Bit 0 selects the value written to the RGB/CVBS select
;  register at 0xa0702c00.
;
; r2 walks through the register block at VIDREG_BASE+0x8000 using
;  small relative adds; the comment on each store gives the absolute
;  address it hits.
;
; Uses r0, r2-r9; pr is saved and restored although nothing here
;  makes a call.
init_video:
	sts.l	pr,@-r15
	; Get cable type from port A bits 8 and 9
	; 0=VGA, 1=???, 2=RGB, 3=composite
	SETS.L	#PCTRA,r8
	SETS.L	#~0x000f0000,r2	; control bits for pins 8 and 9
	SETS.L	#0x000a0000,r3	; configure as inputs, pullups enabled
	mov.l	@r8,r0
	and	r2,r0
	or	r3,r0
	mov.l	r0,@r8
	.if	@IS_SB[PDTRA-PCTRA]
	add	#PDTRA-PCTRA,r8
	.else
	SETS.L	#PDTRA,r8
	.endif
	mov.w	@r8,r0
	SHXR	#8,r0
	and	#3,r0
	mov	r0,r9		; r9 = cable type from here on
	SETS.L	#VIDREG_BASE+0x8000,r8
	mov	r8,r2
	add	#8,r2
	SETS.L	#0,r6
	mov.l	r6,@r2		; 0xa05f8008, "TA out of reset"
	add	#0x40-8,r2
	mov.l	r6,@r2		; 0xa05f8040, border colour
	mov	#0x5,r3		; 5/6/5 2bpp, no scan doubling, display enabled
	SETS.L	#240,r7		; r7 = display lines (per field)
	mov	r9,r0
	tst	#2,r0
	bf	1f
	; cable bit 1 clear: 480 lines, clock doubler on
	SHLL	#1,r7
	swap.w	r3,r0		; |= 0x00800000, clock doubler
	or	#0x80,r0
	swap.w	r0,r3
1:	add	#0x44-0x40,r2
	mov.l	r3,@r2		; 0xa05f8044, display mode
	add	#0x50-0x44,r2
	mov.l	r6,@r2		; 0xa05f8050, vram base offset 1
	SETS.L	#SHORT_FRAME_OFFSET,r3 ; pixels * bytes-per-pixel
	add	#0x54-0x50,r2
	mov.l	r3,@r2		; 0xa05f8054, vram base offset 2
	SETS.L	#1<<8,r3	; VO, negative H and V sync
	SETS.L	#[FRAME_X/2],r4	; longs of (16bpp) pixel data per scanline
	SETS.L	#1,r5		; modulo: 1, or 1 + a full line if interlaced
	mov	r9,r0
	tst	#2,r0
	bt	1f
	add	r4,r5
	SETS.L	#0x10,r0	; interlaced, NTSC colour
	or	r0,r3
	; r5 = modulo<<20 | (lines-1)<<10 | (longs per line - 1)
1:	SHLL	#10,r5
	add	r7,r5
	add	#-1,r5
	SHLL	#10,r5
	add	r4,r5
	add	#-1,r5
	add	#0x5c-0x54,r2
	mov.l	r5,@r2		; 0xa05f805c, display size and modulo
	add	#0xd0-0x5c,r2
	mov.l	r3,@r2		; 0xa05f80d0, video encapsulation
	SETS.L	#0x007e0345,r8	; doesn't make sense per doc
	add	#0xd4-0xd0,r2
	mov.l	r8,@r2		; 0xa05f80d4, H border range
	SETS.L	#[524<<16]|857,r8; NTSC/VGA
	add	#0xd8-0xd4,r2
	mov.l	r8,@r2		; 0xa05f80d8, full video size
	; r3 = V position: 36 in both halves when cable bit 1 is
	;  clear, 18 in both halves when it is set (36 - (2 | 2<<3)).
	mov	r9,r0
	and	#2,r0
	mov	r0,r3
	SHLL	#3,r0
	or	r3,r0
	SETS.L	#36,r3
	sub	r0,r3
	mov	r3,r0
	SHLL	#16,r0
	or	r0,r3
	; V border range = V position with the line count added to
	;  the low half.
	mov	r3,r8
	add	r7,r8
	add	#0xdc-0xd8,r2
	mov.l	r8,@r2		; 0xa05f80dc, V border range
	SETS.L	#22<<16,r8	; N=magic, pixel duplication disabled
	add	#0xe8-0xdc,r2
	mov.l	r8,@r2		; 0xa05f80e8, additional video settings
	SETS.L	#0xa4,r8
	add	#0xec-0xe8,r2
	mov.l	r8,@r2		; 0xa05f80ec, H position
	add	#0xf0-0xec,r2
	mov.l	r3,@r2		; 0xa05f80f0, V position
	SETS.L	#260,r4
	mov	r9,r0
	tst	#2,r0
	bf	1f
	SETS.L	#510,r4
1:	SETS.L	#0x21<<16,r3
	or	r3,r4
	add	#0xcc-0xf0,r2
	mov.l	r4,@r2		; 0xa05f80cc, raster event position
	; r8 = 3 if cable bit 0 is set, else 0
	mov	r9,r0
	tst	#1,r0
	bt/s	1f
	 mov	#0,r8
	mov	#3,r8
	; This store must go through r3.  r2 still points at
	;  0xa05f80cc, so storing through it would overwrite the
	;  raster event position written just above and leave the
	;  select register untouched.
	; NOTE(review): other Dreamcast video setup code is believed
	;  to write this selector shifted left by 8 (0x300) - verify
	;  against hardware documentation before relying on the
	;  unshifted value.
1:	SETS.L	#0xa0702c00,r3
	mov.l	r8,@r3		; 0xa0702c00, "Select RGB/CVBS" (??)
	lds.l	@r15+,pr
	rts
	 nop

init_palette:
	; Copy palette_0, palette_1 and palette_2 (256 longs each)
	;  into three consecutive 256-long banks starting at
	;  0xa05f9000.  The three banks are filled in lock step, one
	;  entry of each per loop pass.
	; Sources: r2, r4, r6.  Destinations: r1, r3, r5.  Count: r7.
	SETS.L	#palette_0,r2
	SETS.L	#palette_1,r4
	SETS.L	#palette_2,r6
	SETS.L	#0xa05f9000,r1	; bank 0
	SETS.L	#256*4,r7	; bytes per bank
	mov	r1,r3
	add	r7,r3		; bank 1
	mov	r3,r5
	add	r7,r5		; bank 2
	SETS.L	#256,r7		; entries left to copy
1:	mov.l	@r2+,r0
	mov.l	r0,@r1
	add	#4,r1
	mov.l	@r4+,r0
	mov.l	r0,@r3
	add	#4,r3
	mov.l	@r6+,r0
	dt	r7
	mov.l	r0,@r5
	bf/s	1b
	 add	#4,r5
	rts
	 nop

init_twiddling:
	; Fill the 1024-entry table of 16-bit words at twiddles,
	;  working from index 1023 down to 0.  Entry i is i with its
	;  bits spread apart so that bit n ends up at bit 2n.  This is
	;  done in four steps; each step takes the bits selected by a
	;  mask and moves them left by 8, 4, 2 and then 1 places.
	; r1 = store pointer (pre-decremented), r2 = i, r3-r6 = masks,
	;  r7 = value being built, r0 and r8 = scratch.
	; NOTE(review): the store is mov.w, so only the low 16 bits of
	;  the spread value are kept; bits 8 and 9 of i land at bits
	;  16 and 18 and are therefore dropped.
	SETS.L	#twiddles+[1024*2],r1
	SETS.L	#1024,r2
	SETS.L	#0x00300,r3
	SETS.L	#0x000f0,r4
	SETS.L	#0x00c0c,r5
	SETS.L	#0x22222,r6
1:	add	#-1,r2
	; step 1: bits 8-9 move up by 8
	mov	r2,r0
	and	r3,r0
	SHLL	#8,r0
	mov	r2,r7
	not	r3,r8
	and	r8,r7
	or	r0,r7
	; step 2: bits 4-7 move up by 4
	mov	r7,r0
	and	r4,r0
	SHLL	#4,r0
	not	r4,r8
	and	r8,r7
	or	r0,r7
	; step 3: the upper two bits of each group of four move up by 2
	mov	r7,r0
	and	r5,r0
	SHLL	#2,r0
	not	r5,r8
	and	r8,r7
	or	r0,r7
	; step 4: the upper bit of each pair moves up by 1
	mov	r7,r0
	and	r6,r0
	SHLL	#1,r0
	not	r6,r8
	and	r8,r7
	or	r0,r7
	tst	r2,r2
	bf/s	1b		; loop until index 0 has been stored
	 mov.w	r7,@-r1
	rts
	 nop

	SETCONST

; The C code this is based upon (again, from tatest)
;
;		  for(i=0; i<256; i++)
;		    for(j=0; j<256; j+=2) {
;		      /* Texture 0 = Mandelbrot */
;		      tex[0][twiddletab[i]|(twiddletab[j]>>1)] =
;			compute_texture(i, j, 0) | (compute_texture(i, j+1, 0)<<8);
;		      /* Texture 1 = Julia */
;		      tex[1][twiddletab[i]|(twiddletab[j]>>1)] =
;			compute_texture(i, j, 1) | (compute_texture(i, j+1, 1)<<8);
;		    }
;
; We change some names, but it's otherwise pretty similar.  We keep a
;  lot of stuff on the stack rather than in registers; while we might
;  have enough registers, this means I don't have to think about
;  register allocation as much.  It also means the texture computation
;  functions have a much freer hand with registers.
;
; Arguably we should write these through 0x84000000 and then flush the
;  d$, but this is initialization code and hence uncached performance
;  is acceptable here.
;
init_textures:
	; Fill texture_wall with the 8x8 wall texture.  Texels are
	;  computed two at a time, (x,y) and (x+1,y), packed into one
	;  16-bit word (second texel in the high byte) and stored at
	;  the twiddled offset for (x,y).
	; Loop state lives on the stack, as described above; from the
	;  top it is: x, y, tex, twiddles, saved pr.
	sts.l	pr,@-r15
	; Wall texture.
	SETS.L	#twiddles,r7
	mov.l	r7,@-r15
	SETS.L	#texture_wall,r8
	mov.l	r8,@-r15
	mov	#0,r0
	mov.l	r0,@-r15		; y = 0
2:	mov	#0,r0
	mov.l	r0,@-r15		; x = 0
	; stack = x y tex twiddles
1:	mov.l	@r15,r1			; x
	bsr	compute_texture
	 mov.l	@(4,r15),r2		; y
	mov.l	r0,@-r15		; valA(x,y)
	mov.l	@(4,r15),r1		; x
	mov.l	@(8,r15),r2		; y
	bsr	compute_texture
	 add	#1,r1			; r0=valA(x+1,y)
	mov.l	@r15+,r1		; valA(x,y)
	SHLL	#8,r0
	or	r1,r0			; combined vals
	mov.l	r0,@-r15
	; stack = vals x y tex twiddles
	mov.l	@(16,r15),r2		; twiddles
	mov.l	@(4,r15),r1		; x
	SHLL	#1,r1			; word index -> byte offset
	add	r2,r1
	mov.w	@r1,r1			; twiddles[x]
	mov.l	@(8,r15),r3		; y
	SHLL	#1,r3
	add	r2,r3
	mov.w	@r3,r3			; twiddles[y]
	SHLL	#1,r3
	or	r1,r3
	; r3 now holds twiddled texture offset
	mov.l	@(12,r15),r2		; tex
	add	r3,r2
	mov.l	@r15,r0			; val
	mov.w	r0,@r2
	add	#4,r15			; pop vals
	SETS.L	#8,r1
	mov.l	@r15,r0			; x
	add	#2,r0
	cmp/hs	r1,r0
	bf/s	1b			; next texel pair while x < 8
	 mov.l	r0,@r15
	add	#4,r15			; pop x
	mov.l	@r15,r0			; y
	add	#1,r0
	cmp/hs	r1,r0
	bf/s	2b			; next row while y < 8
	 mov.l	r0,@r15
	; stack = y tex twiddles pr.  All three must be dropped before
	;  pr is popped.  Dropping only one long (as this used to) put
	;  the tex pointer into pr, and rts then jumped into video RAM.
	add	#12,r15			; pop y, tex, twiddles
	lds.l	@r15+,pr
	rts
	 nop

compute_texture:
	; In: r1 = x, r2 = y (integers).
	; Out: r0 = 255&(int)(20*hypot(x-3.5,y-3.5))
	; Uses fr0-fr3 and fpul; r1 and r2 are left alone.
	lds	r1,fpul
	float	fpul,fr0	; fr0 = x
	lds	r2,fpul
	float	fpul,fr1	; fr1 = y
	SETS.L	#@FLOAT[0f3.5],r0
	lds	r0,fpul
	fsts	fpul,fr2	; fr2 = 3.5
	fsub	fr2,fr0		; x-3.5
	fsub	fr2,fr1		; y-3.5
	mov	#20,r0
	lds	r0,fpul
	float	fpul,fr3	; fr3 = 20
	fmul	fr0,fr0
	fmul	fr1,fr1
	fadd	fr1,fr0		; squared distance from (3.5,3.5)
	fsqrt	fr0
	fmul	fr3,fr0
	ftrc	fr0,fpul	; truncate to integer
	sts	fpul,r0
	rts
	 extu.b	r0,r0		; (delay slot) keep the low 8 bits

init_tiledesc:
	; Call setup_tiledesc once for each of the two buffers, with
	;  ptr taken from tiledescs[i] and buf from tilebuffers[i], and
	;  save each return value in tiledesc_cookies[i].  Then clear
	;  curbuf to 0.
	sts.l	pr,@-r15
	SETS.L	#tiledesc_cookies,r4
	SETS.L	#tilebuffers,r5
	SETS.L	#tiledescs,r6
	; setup_tiledesc may destroy r0-r9, so everything needed after
	;  the first call is parked on the stack:
	;  stack = tiledescs[1] tilebuffers[1] cookies pr
	mov.l	r4,@-r15
	mov.l	@(4,r5),r0
	mov.l	r0,@-r15
	mov.l	@(4,r6),r0
	mov.l	r0,@-r15
	mov.l	@r6,r2			; ptr = tiledescs[0]
	bsr	setup_tiledesc
	 mov.l	@r5,r3			; buf = tilebuffers[0]
	mov.l	@(8,r15),r4		; cookies
	mov.l	r0,@r4			; cookies[0]
	mov.l	@r15+,r2		; ptr = tiledescs[1]
	bsr	setup_tiledesc
	 mov.l	@r15+,r3		; buf = tilebuffers[1]
	mov.l	@r15+,r4		; cookies
	mov.l	r0,@(4,r4)		; cookies[1]
	SETS.L	#curbuf,r1
	mov	#0,r0
	lds.l	@r15+,pr
	rts
	 mov.b	r0,@r1			; (delay slot) curbuf = 0
setup_tiledesc:
	; in tatest terms, this is ta_create_tile_descriptors.  ptr is
	;  r2, buf is r3, w is FRAME_X/32, and h is FRAME_Y/32.  No
	;  registers r0-r9 are important upon return; they all are
	;  available to us.
	; What gets written at ptr: a 24-long header (18 zeros,
	;  0x10000000, five 0x80000000), then six longs for each of the
	;  w*h tiles.  Returns ptr+72 in r0.  Also uses macl.
	; vr = ptr
	mov	r2,r4		; vr is r4
	; bf = ((unsigned int)buf)&0x007fffff  (buf is dead after this)
	SETS.L	#0x007fffff,r0
	and	r0,r3		; bf is r3 from here on
	; strbase = (((unsigned int)ptr)&0x007fffff)|0x80000000
	; ptr is _not_ dead here, but 0x007fffff is.
	SETS.L	#0x80000000,r7
	and	r2,r0
	or	r0,r7		; strbase is r7
	; for (18 loops) *vr++ = 0
	mov	#18,r1
	mov	#0,r0
1:	mov.l	r0,@r4
	dt	r1
	bf/s	1b
	 add	#4,r4
	; *vr++ = 0x10000000
	; *vr++ = 0x80000000 (five times)
	SETS.L	#0x10000000,r1
	mov.l	r1,@r4
	SETS.L	#0x80000000,r1
	mov.l	r1,@(4,r4)
	mov.l	r1,@(8,r4)
	mov.l	r1,@(12,r4)
	mov.l	r1,@(16,r4)
	mov.l	r1,@(20,r4)
	add	#24,r4
	SETS.L	#FRAME_X/32,r8	; w is r8
	SETS.L	#FRAME_Y/32,r9	; h is r9
	; for (x=0;x<w;x++)
	mov	#0,r5		; x is r5
2:	; for (y=0;y<h;y++)
	mov	#0,r6		; y is r6
1:	; *vr++ = (y << 8) | (x << 2)
	mov	r6,r0
	SHLL	#8,r0
	mov	r5,r1
	SHLL	#2,r1
	or	r1,r0
	mov.l	r0,@r4
	; *vr++ = bf+((x+y*w)<<6)
	mul.l	r8,r6
	sts	macl,r0
	add	r5,r0
	SHLL	#6,r0/r1
	add	r3,r0
	mov.l	r0,@(4,r4)
	; *vr++ = strbase (four times)
	mov.l	r7,@(8,r4)
	mov.l	r7,@(12,r4)
	mov.l	r7,@(16,r4)
	mov.l	r7,@(20,r4)
	; end of y loop
	add	#1,r6
	cmp/hi	r6,r9		; T = (h > y)
	bt/s	1b
	 add	#24,r4		; (delay slot) vr += 6 longs, taken or not
	; end of x loop
	add	#1,r5
	cmp/hi	r5,r8		; T = (w > x)
	bt/s	2b
	 nop
	; vr[-6] |= 0x80000000
	add	#-4*6,r4
	SETS.L	#0x80000000,r1
	mov.l	@r4,r0
	or	r1,r0
	mov.l	r0,@r4
	; return ((char *)ptr)+72
	mov	r2,r0
	rts
	 add	#72,r0

	; tatest uses fschg, eight fmovs from drX to xdX, fschg to set
	;  the matrix.  I'm not convinced this is safe; is it
	;  impossible to have two single-floats that, when the bits are
	;  reinterpreted as a double-float, turn into a signaling NaN?
	;  Programmer's PDF page 128 seems to imply that it's supposed
	;  to work, and pages 271ff do not list any possible exceptions
	;  for fmov, so it probably just does the move regardless of
	;  NaNs.  However, frchg is a much faster way to do basically
	;  the same thing; tatest doesn't use it because libgcc does
	;  not get along with FPSCR.FR=1, something we don't care
	;  about.  (We do, however, depend on fmoving two singles as a
	;  double when moving to/from memory.  And it'd take more
	;  thought to be certain, but I *think* the double move is safe
	;  in all cases even if the hardware does treat them as numbers
	;  instead of uninterpreted bags of bits, possibly excepting
	;  the FPSCR.DN=1 case - and that's a setting we don't use.)
init_3dvalues:
	; Build base_matrix.  xmtrx starts as the identity; the
	;  screenview, projection and translation matrices are then
	;  applied in turn.  "Applying" a matrix means loading it into
	;  fr0-fr15, running ftrv xmtrx over each of its four
	;  4-vectors, and (for all but the last) swapping banks with
	;  frchg so the product becomes the new xmtrx.  The final
	;  product is stored as 16 floats at base_matrix.
	; Destroys r0, r1, fpul and both floating-point banks.
	; clear_matrix()
	fldi1	fr0
	fldi0	fr1
	fldi0	fr2
	fldi0	fr3
	fldi0	fr4
	fldi1	fr5
	fldi0	fr6
	fldi0	fr7
	fldi0	fr8
	fldi0	fr9
	fldi1	fr10
	fldi0	fr11
	fldi0	fr12
	fldi0	fr13
	fldi0	fr14
	fldi1	fr15
	frchg			; the identity is now in xmtrx
	; apply_matrix(&screenview_matrix)
	SETS.L	#FRAME_X/0f2,r0
	SETS.L	#FRAME_Y/0f2,r1
	lds	r0,fpul
	fsts	fpul,fr0	; FRAME_X/2
	fldi0	fr1
	fldi0	fr2
	fldi0	fr3
	fldi0	fr4
	lds	r1,fpul
	fsts	fpul,fr5	; FRAME_Y/2
	fldi0	fr6
	fldi0	fr7
	fldi0	fr8
	fldi0	fr9
	fldi1	fr10
	fldi0	fr11
	fmov	fr0,fr12	; FRAME_X/2 again
	fmov	fr5,fr13	; FRAME_Y/2 again
	fldi0	fr14
	fldi1	fr15
	ftrv	xmtrx,fv0
	ftrv	xmtrx,fv4
	ftrv	xmtrx,fv8
	ftrv	xmtrx,fv12
	frchg			; product becomes the new xmtrx
	; apply_matrix(&projection_matrix)
	SETS.L	#@FLOAT[COT_FOVY],r0
	lds	r0,fpul
	fsts	fpul,fr0
	fldi0	fr1
	fldi0	fr2
	fldi0	fr3
	fldi0	fr4
	fmov	fr0,fr5
	fldi0	fr6
	fldi0	fr7
	fldi0	fr8
	fldi0	fr9
	SETS.L	#[ZFAR+ZNEAR]/@FLOAT[ZNEAR-ZFAR],r0
	lds	r0,fpul
	fsts	fpul,fr10
	fldi1	fr11
	fneg	fr11		; -1
	fldi0	fr12
	fldi0	fr13
	SETS.L	#[2*ZFAR*ZNEAR]/@FLOAT[ZNEAR-ZFAR],r0
	lds	r0,fpul
	fsts	fpul,fr14
	fldi1	fr15
	ftrv	xmtrx,fv0
	ftrv	xmtrx,fv4
	ftrv	xmtrx,fv8
	ftrv	xmtrx,fv12
	frchg			; product becomes the new xmtrx
	; apply_matrix(&translation_matrix)
	fldi1	fr0
	fldi0	fr1
	fldi0	fr2
	fldi0	fr3
	fldi0	fr4
	fldi1	fr5
	fldi0	fr6
	fldi0	fr7
	fldi0	fr8
	fldi0	fr9
	fldi1	fr10
	fldi0	fr11
	fldi0	fr12
	fldi0	fr13
	SETS.L	#@FLOAT[DISTANCE],r0
	lds	r0,fpul
	fsts	fpul,fr14
	fldi1	fr15
	ftrv	xmtrx,fv0
	ftrv	xmtrx,fv4
	ftrv	xmtrx,fv8
	ftrv	xmtrx,fv12
	; No frchg this time: the product stays in fr0-fr15 and is
	;  stored from there, highest pair first, with r0
	;  pre-decrementing down from the end of base_matrix.
	SETS.L	#base_matrix+[16*4],r0
	fschg			; pair (64-bit) moves on
	fmov	dr14,@-r0
	fmov	dr12,@-r0
	fmov	dr10,@-r0
	fmov	dr8,@-r0
	fmov	dr6,@-r0
	fmov	dr4,@-r0
	fmov	dr2,@-r0
	fmov	dr0,@-r0
	fschg			; back to single moves
	rts
	 nop

one_frame:
	; Do everything for one frame, by calling each stage in order.
	;  The controller read is kicked off first (start_maple) and
	;  its result is collected only after the scene has been
	;  submitted (handle_maple).
	sts.l	pr,@-r15
	bsr	start_maple
	 nop
	bsr	update_rotations
	 nop
	bsr	apply_transform
	 nop
	bsr	transform_coords
	 nop
	bsr	setup_cmd_list
	 nop
	bsr	draw_scene
	 nop
	bsr	handle_maple
	 nop
	bsr	await_video
	 nop
	bsr	next_frame
	 nop
	lds.l	@r15+,pr
	rts
	 nop
start_maple:
	; Start a maple transfer: write the (masked) address of
	;  maple_cmd to BUS_DMAADDR, then write BUS_STATE_GO to
	;  BUS_STATE.  Does not wait for completion; handle_maple does
	;  that.  Uses r0-r3.
	SETS.L	#maple_cmd&DMA_ADDRMASK,r2
	SETS.L	#BUS_DMAADDR,r1
	mov.l	r2,@r1		; where the command block is
	SETS.L	#BUS_STATE_GO,r0
	SETS.L	#BUS_STATE,r3
	rts
	 mov.l	r0,@r3		; (delay slot) go
update_rotations:
	; Turn the controller state in curistate into camera
	;  rotations.
	;  - The joystick deflection (X,Y), scaled by a speed factor,
	;    gives a rotation about the axis (Y,-X,0) by an angle equal
	;    to the length of the scaled deflection.
	;  - The difference between the two triggers, scaled likewise,
	;    gives a rotation about (0,0,+-1).
	;  - The speed factor is 1, divided by BUTTON_FACTOR while A is
	;    down and multiplied by it while Y is down.  Buttons read
	;    as 0 when pressed.
	; The trigger rotation is applied first, then the joystick one.
	; The first word of curistate is also copied to previstate.
	;
	; Stack discipline: rotate_camera destroys fr4-fr9 and fpul, so
	;  everything needed after the first call is parked on the
	;  stack.  From the top, while the first call runs:
	;	fr4 fr5 fr6 (joystick axis), joystick angle, fr9, pr
	sts.l	pr,@-r15
	SETS.L	#curistate,r0
	mov.l	@(4,r0),r5
	mov.l	@r0,r0
	extu.b	r5,r1		; joystick X
	extu.w	r5,r2
	SHXR	#8,r2		; joystick Y
	SETS.L	#0x80,r5
	mov	r5,r3
	sub	r1,r5		; X, center-zero
	sub	r2,r3		; Y, center-zero
	lds	r5,fpul
	float	fpul,fr14	; X
	lds	r3,fpul
	float	fpul,fr15	; Y
	SETS.L	#@FLOAT[BUTTON_FACTOR],r5
	lds	r5,fpul
	fsts	fpul,fr8
	fldi1	fr9
	tst	#4,r0		; A (slow down)
	bf	1f
	fdiv	fr8,fr9
1:	SHXR	#8,r0
	tst	#2,r0		; Y (speed up)
	bf	1f
	fmul	fr8,fr9
	; Save the speed factor here, on every path and underneath the
	;  joystick axis and angle, so that the pops below come off in
	;  the right order.  (It used to be pushed only when the trigger
	;  difference was negative, and on top of the joystick values,
	;  which unbalanced the stack either way.)
1:	fmov.s	fr9,@-r15	; speed factor, for after the rotations
	fmul	fr9,fr14	; fr14 = (accelerated/retarded) X
	fmul	fr9,fr15	; fr15 = (accelerated/retarded) Y
	fmov	fr15,fr4
	fmov	fr14,fr5
	fneg	fr5
	fldi0	fr6		; (fr4,fr5,fr6) = axis vector
	fmov	fr14,fr0
	fmov	fr15,fr13
	fmul	fr13,fr13
	fmul	fr0,fr0
	fadd	fr0,fr13
	fmov	fr13,fr1
	fsqrt	fr13		; hypot(X,Y)
	ftrc	fr13,fpul
	sts	fpul,r0
	tst	r0,r0
	bt	1f		; zero angle: leave the axis unnormalized
	fsrra	fr1		; 1/hypot(X,Y)
	fmul	fr1,fr4		; normalize axis vector
	fmul	fr1,fr5
	fmul	fr1,fr6
1:	mov.l	r0,@-r15	; rotation angle from joystick
	fmov.s	fr6,@-r15	; rotation axis from joystick
	fmov.s	fr5,@-r15
	fmov.s	fr4,@-r15
	SETS.L	#curistate,r1
	mov.l	@r1,r0
	SHXR	#16,r0/r1
	extu.b	r0,r1		; right trigger
	SHXR	#8,r0/r2
	extu.b	r0,r0		; left trigger
	sub	r0,r1
	fldi0	fr4
	fldi0	fr5
	fldi1	fr6
	cmp/pz	r1
	bt	1f
	; Negative difference: use its magnitude about the opposite
	;  axis.
	neg	r1,r1
	fneg	fr6
1:	lds	r1,fpul		; rotation amount from triggers
	float	fpul,fr0
	fmul	fr9,fr0
	bsr	rotate_camera
	 ftrc	fr0,fpul
	fmov.s	@r15+,fr4
	fmov.s	@r15+,fr5
	fmov.s	@r15+,fr6
	bsr	rotate_camera
	 lds.l	@r15+,fpul
	SETS.L	#curistate,r2
	SETS.L	#previstate,r3
	mov.w	@r2,r0
	mov.w	@r3,r1
	mov.w	r0,@r3
	not	r0,r0
	and	r0,r1
	; set bits in r0 indicate currently-pressed buttons
	; set bits in r1 indicate newly-pressed buttons
	; bits are ---- -XY- RLDU sAB-
	; (s = Start)
	; fr0 = 100 * speed factor.  The constant is a float bit
	;  pattern, so it is moved with fsts; "float" would have
	;  converted those bits as if they were an integer.  r2 is used
	;  as the scratch register so that r1 survives.
	SETS.L	#@FLOAT[0f100],r2
	lds	r2,fpul
	fsts	fpul,fr0
	fmov.s	@r15+,fr9	; speed factor saved above
	fmul	fr9,fr0
	tst	#0x10,r0	; U
	bf	1f
	tst	#0x20,r0	; D
	bt	2f
	fneg	fr0
1:	; move camera forward by fr0
	; NOTE(review): nothing is done with fr0 yet; this is a stub.
2:	lds.l	@r15+,pr
	rts
	 nop
; Rotate the camera by fpul (int) fsca units about (fr4,fr5,fr6)
; Does nothing at all when fpul is zero.  Otherwise:
;  - eye_x is rotated and renormalized;
;  - eye_y is rotated, has its component along the new eye_x removed,
;    and is renormalized;
;  - eye_z is recomputed as the cross product eye_x × eye_y.
; All three are written back to memory.
; Uses r0-r2; destroys fr0-fr12 (including the axis in fr4-fr6).
rotate_camera:
	sts	fpul,r0
	tst	r0,r0
	bt	1f
	sts.l	pr,@-r15
	SETS.L	#eye_x,r0
	fmov.s	@r0+,fr0
	fmov.s	@r0+,fr1
	bsr	rotate_around_axis
	 fmov.s	@r0+,fr2
	bsr	normalize
	 nop
	; r0 is eye_x+12 here; point r1 at eye_y as cheaply as the
	;  layout allows.
	.if	eye_x+12 == eye_y
	mov	r0,r1
	.elif	@IS_SB[eye_y-[eye_x+12]]
	mov	r0,r1
	add	#eye_y-[eye_x+12],r1
	.else
	SETS.L	#eye_y,r1
	.endif
	fmov.s	fr2,@-r0	; write back eye_x; r0 ends up at eye_x
	fmov.s	fr1,@-r0
	fmov.s	fr0,@-r0
	fmov.s	@r1+,fr0
	fmov.s	@r1+,fr1
	bsr	rotate_around_axis
	 fmov.s	@r1+,fr2
	fmov.s	@r0+,fr8	; (fr8,fr9,fr10) = new eye_x
	fmov.s	@r0+,fr9
	bsr	subtract_component
	 fmov.s	@r0+,fr10
	bsr	normalize
	 nop
	; r1 is eye_y+12 here; point r2 just past the end of eye_z.
	.if	@IS_SB[[eye_z+12]-[eye_y+12]]
	mov	r1,r2
	add	#[eye_z+12]-[eye_y+12],r2
	.else
	SETS.L	#eye_z+12,r2
	.endif
	fmov.s	fr2,@-r1	; write back eye_y
	fmov.s	fr1,@-r1
	fmov.s	fr0,@-r1
	fmov	fr8,fr3		; (fr3,fr4,fr5) = eye_x, for crossproduct
	fmov	fr9,fr4
	bsr	crossproduct
	 fmov	fr10,fr5
	fmov.s	fr2,@-r2	; write back eye_z
	fmov.s	fr1,@-r2
	fmov.s	fr0,@-r2
2:	lds.l	@r15+,pr
1:	rts
	 nop

; Apply the transformation implied by the camera location and
;  orientation to the base matrix.  The result is left in xmtrx.
;  All other fr* registers are destroyed, but fpul is untouched.
apply_transform:
	; Steps: load base_matrix into fr0-fr15 and swap banks so it
	;  becomes xmtrx; load eye_x, eye_y and eye_z (each with a
	;  fourth component of 0) into fv0, fv4 and fv8 and transform
	;  each by xmtrx; copy xmtrx's own last vector into fv12; swap
	;  banks again so the result is in xmtrx.  Uses r0.
	SETS.L	#base_matrix,r0
	fschg			; pair (64-bit) moves on
	fmov	@r0+,dr0
	fmov	@r0+,dr2
	fmov	@r0+,dr4
	fmov	@r0+,dr6
	fmov	@r0+,dr8
	fmov	@r0+,dr10
	fmov	@r0+,dr12
	fmov	@r0+,dr14
	fschg			; back to single moves
	frchg			; base_matrix is now xmtrx
	SETS.L	#eye_x,r0
	fmov.s	@r0+,fr0
	fmov.s	@r0+,fr1
	fmov.s	@r0+,fr2
	fldi0	fr3
	; r0 is eye_x+12 here; move it to eye_y if they aren't adjacent.
	.if eye_y == eye_x+12
	; nothing
	.elif @IS_SB[eye_y-[eye_x+12]]
	add	#eye_y-[eye_x+12],r0
	.else
	SETS.L	#eye_y,r0
	.endif
	fmov.s	@r0+,fr4
	fmov.s	@r0+,fr5
	fmov.s	@r0+,fr6
	fldi0	fr7
	; Likewise from eye_y+12 to eye_z.
	.if eye_z == eye_y+12
	; nothing
	.elif @IS_SB[eye_z-[eye_y+12]]
	add	#eye_z-[eye_y+12],r0
	.else
	SETS.L	#eye_z,r0
	.endif
	fmov.s	@r0+,fr8
	fmov.s	@r0+,fr9
	fmov.s	@r0+,fr10
	fldi0	fr11
	ftrv	xmtrx,fv0
	ftrv	xmtrx,fv4
	ftrv	xmtrx,fv8
	; We could fldi0 on fr12, fr13, fr14 and fldi1 on fr15, then
	;  ftrv xmtrx,fv12.  That would be the conceptually clean
	;  answer.  But that costs 8 cycles; this way costs only half
	;  that, and is one instruction shorter to boot.
	;
	; We can't do four single moves; there is no way to fmov just
	;  one of the xmtrx registers anywhere.  (While talking about
	;  the speed of nonexistent instructions is always dubious, it
	;  feels like a one-cycle instruction, in which case it
	;  wouldn't be any faster than this - but no slower either.)
	fschg
	fmov	xd12,dr12
	fmov	xd14,dr14
	fschg
	rts
	 frchg			; (delay slot) result becomes xmtrx

	SETCONST

; Apply the transformation in xmtrx to scene coordinates (the array
;  in vertex_coords), leaving the result in xform_coords.
transform_coords:
	; Main loop: for each of the n_vertex_coords vertices, read
	;  three floats from vertex_coords, append a 1, transform by
	;  xmtrx, divide the first three results by the fourth, and
	;  write them to xform_coords.
	;	r1 = source pointer	r0 = destination pointer
	;	r2 = vertices left	r3,r4 = offsets 4 and 8
	;	r5 = prefetch pointer, kept ahead of r1
	; If the byte at debug is nonzero it is cleared, and the three
	;  eye vectors and every vertex before and after transformation
	;  are printed on the serial line.
	SETS.L	#vertex_coords,r1
	pref	@r1
	mov	r1,r5
	add	#[3*4]-1,r5
	pref	@r5
	SETS.L	#xform_coords,r0
	SETS.L	#n_vertex_coords,r2
	SETS.L	#4,r3
	SETS.L	#8,r4
1:	add	#3*4,r5
	pref	@r5
	fmov.s	@r1+,fr0
	fmov.s	@r1+,fr1
	fmov.s	@r1+,fr2
	fldi1	fr3
	ftrv	xmtrx,fv0
	dt	r2		; T survives until the bf/s below
	; fr3 should always be exactly 1 here; this is paranoia
	fdiv	fr3,fr0
	fdiv	fr3,fr1
	fdiv	fr3,fr2
	fmov.s	fr0,@r0
	fmov.s	fr1,@(r0,r3)
	fmov.s	fr2,@(r0,r4)
	bf/s	1b
	 add	#3*4,r0
	; dump stuff if debug
	SETS.L	#debug,r1
	mov.b	@r1,r0
	tst	r0,r0
	bt	1f
	mov	#0,r0
	mov.b	r0,@r1		; one-shot: clear the flag
	sts.l	pr,@-r15
	SETS.L	#eye_x,r0
	bsr	2f
	 nop
	SETS.L	#eye_y,r0
	bsr	2f
	 nop
	SETS.L	#eye_z,r0
	bsr	2f
	 nop
	bra	3f
	 nop
	; Local subroutine: print the three floats at r0 as "a,b,c"
	;  followed by CR LF.  The second and third are held in r4 and
	;  r5 across the calls.
2:	sts.l	pr,@-r15
	mov.l	@r0+,r1
	mov.l	@r0+,r4
	bsr	print_float
	 mov.l	@r0+,r5
	bsr	putchar
	 mov	#',,r1
	bsr	print_float
	 mov	r4,r1
	bsr	putchar
	 mov	#',,r1
	bsr	print_float
	 mov	r5,r1
	bsr	putchar
	 mov	#13,r1
	lds.l	@r15+,pr
	bra	putchar		; tail call; putchar returns to our caller
	 mov	#10,r1
	; One line per vertex: "x,y,z -> x',y',z'".
	;  r9 = vertex_coords cursor, r8 = xform_coords cursor,
	;  r7 = vertices left.
3:	SETS.L	#vertex_coords,r9
	SETS.L	#xform_coords,r8
	SETS.L	#n_vertex_coords,r7
2:	bsr	print_float
	 mov.l	@r9+,r1
	bsr	putchar
	 mov	#',,r1
	bsr	print_float
	 mov.l	@r9+,r1
	bsr	putchar
	 mov	#',,r1
	bsr	print_float
	 mov.l	@r9+,r1
	mova	9f,r0
	bsr	putstr
	 mov	r0,r1
	bsr	print_float
	 mov.l	@r8+,r1
	bsr	putchar
	 mov	#',,r1
	bsr	print_float
	 mov.l	@r8+,r1
	bsr	putchar
	 mov	#',,r1
	bsr	print_float
	 mov.l	@r8+,r1
	bsr	putchar
	 mov	#13,r1
	bsr	putchar
	 mov	#10,r1
	dt	r7
	bf	2b
	lds.l	@r15+,pr
1:	rts
	 nop
	.align	4		; mova needs a longword-aligned target
9:	.asciz	" -> "
	.align	2
setup_cmd_list:
	; In tatest terms, this is ta_set_target, but with args
	;  computed here based on curbuf rather than being passed in.
	; The masked addresses of cmdlists[curbuf] and
	;  tilebuffers[curbuf] are patched into the cmdlist_params
	;  table, low 16 bits first and high 16 bits two bytes later;
	;  the table is then handed to set_params.
	sts.l	pr,@-r15
	SETS.L	#curbuf,r1
	mov.b	@r1,r1
	SHLL	#2,r1		; curbuf as an offset into tables of longs
	SETS.L	#cmdlists,r2
	SETS.L	#tilebuffers,r3
	add	r1,r2
	mov.l	@r2,r2		; cmdlists[curbuf]
	add	r1,r3
	mov.l	@r3,r3		; tilebuffers[curbuf]
	SETS.L	#0x007fffff,r4
	and	r4,r2
	and	r4,r3
	swap.w	r2,r4		; r4 = cmdlist with its halves swapped
	swap.w	r3,r5		; r5 = tilebuf with its halves swapped
	SETS.L	#cmdlist_params,r0
	SETS.L	#cmdlist_param_tilebuf_a,r1
	mov.w	r3,@(r0,r1)
	add	#2,r1
	mov.w	r5,@(r0,r1)
	SETS.L	#cmdlist_param_tilebuf_b,r1
	mov.w	r3,@(r0,r1)
	add	#2,r1
	mov.w	r5,@(r0,r1)
	SETS.L	#cmdlist_param_cmdlist,r1
	mov.w	r2,@(r0,r1)
	add	#2,r1
	mov.w	r4,@(r0,r1)
	bsr	set_params
	 mov	r0,r1		; (delay slot) r1 = cmdlist_params
	; Read 0xa05f8144 and throw the value away.
	SETS.L	#VIDREG_BASE+0x8144,r0
	mov.l	@r0,r0
	lds.l	@r15+,pr
	rts
	 nop

draw_scene:
	; Submit every face in scene_faces as a textured four-vertex
	;  strip, followed by an all-zero command.  Each command is
	;  built in the eight longs at ta_cmd and sent with
	;  commit_ta_cmd.
	;
	; A face record is 56 bytes, read at these offsets:
	;	0,12,24,36	index into xform_coords (12-byte entries)
	;	4,16,28,40	u
	;	8,20,32,44	v
	;	48		palette number
	;	52		texture number (index into textures, whose
	;			 entries are a pointer then size bits)
	;
	; Registers held across commit_ta_cmd (which destroys r0, r1 and
	;  r10-r14):
	;	r9 = current face	r8 = faces left
	;	r7 = ta_cmd		r6 = 0
	;	r4 = xform_coords	r3 = 12
	; r5 is loaded but not used below.
	sts.l	pr,@-r15
	SETS.L	#scene_faces,r9
	SETS.L	#n_scene_faces,r8
	SETS.L	#ta_cmd,r7
	SETS.L	#0,r6
	SETS.L	#0f1,r5
	SETS.L	#xform_coords,r4
	SETS.L	#3*4,r3
	; Polygon header for this face.
1:	SETS.L	#TA_CMD_POLYGON|TA_CMD_POLYGON_TYPE_OPAQUE|TA_CMD_POLYGON_SUBLIST|TA_CMD_POLYGON_STRIPLENGTH_2|TA_CMD_POLYGON_TEXTURED,r0
	mov.l	r0,@r7		; cmd
	SETS.L	#TA_POLYMODE1_Z_GREATER|TA_POLYMODE1_CULL_CCW,r0
	mov.l	r0,@(4,r7)	; mode1
	SETS.L	#TA_TEXTUREMODE_CLUT8,r1
	mov.l	@(48,r9),r0	; palette number
	SHLL	#TA_TEXTUREMODE_CLUTBANK8_SHIFT,r0/r2
	or	r0,r1
	mov.l	@(52,r9),r0	; texture number
	SETS.L	#textures,r2
	SHLL	#3,r0
	add	r0,r2
	mov.l	@r2,r0		; texture pointer
	mov.l	@(4,r2),r2	; size bits
	SETS.L	#cur_texture_mode,r10
	mov.l	@r10,r10
	or	r2,r10
	mov.l	r10,@(8,r7)	; mode2
	SHXR	#TA_TEXTUREMODE_ADDRESS_SHIFT,r0
	SETS.L	#TA_TEXTUREMODE_ADDRESS_MASK,r2
	and	r2,r0
	or	r0,r1
	mov.l	r1,@(12,r7)	; texture
	mov.l	r6,@(16,r7)	; alpha
	mov.l	r6,@(20,r7)	; red
	mov.l	r6,@(24,r7)	; green
	bsr	commit_ta_cmd
	 mov.l	r6,@(28,r7)	; blue
	; First vertex.  cmd, colour and ocolour are set once here and
	;  left in ta_cmd for the second and third vertices.
	SETS.L	#TA_CMD_VERTEX,r1
	mov.l	r1,@r7		; cmd
	mov.l	r6,@(28,r7)	; ocolour
	not	r6,r1
	mov.l	r1,@(24,r7)	; colour
	mov.l	@r9,r1
	mulu.w	r1,r3
	sts	macl,r0
	add	r4,r0
	mov.l	@r0,r2
	mov.l	r2,@(4,r7)	; x
	mov.l	@(4,r0),r2
	mov.l	r2,@(8,r7)	; y
	mov.l	@(8,r0),r2
	mov.l	r2,@(12,r7)	; z
	mov.l	@(4,r9),r0
	mov.l	r0,@(16,r7)	; u
	mov.l	@(8,r9),r0
	bsr	commit_ta_cmd
	 mov.l	r0,@(20,r7)	; v
	; Second vertex.
	mov.l	@(12,r9),r0
	mulu.w	r0,r3
	sts	macl,r0
	add	r4,r0
	mov.l	@r0,r2
	mov.l	r2,@(4,r7)	; x
	mov.l	@(4,r0),r2
	mov.l	r2,@(8,r7)	; y
	mov.l	@(8,r0),r2
	mov.l	r2,@(12,r7)	; z
	mov.l	@(16,r9),r0
	mov.l	r0,@(16,r7)	; u
	mov.l	@(20,r9),r0
	bsr	commit_ta_cmd
	 mov.l	r0,@(20,r7)	; v
	; Third vertex.
	mov.l	@(24,r9),r0
	mulu.w	r0,r3
	sts	macl,r0
	add	r4,r0
	mov.l	@r0,r2
	mov.l	r2,@(4,r7)	; x
	mov.l	@(4,r0),r2
	mov.l	r2,@(8,r7)	; y
	mov.l	@(8,r0),r2
	mov.l	r2,@(12,r7)	; z
	mov.l	@(28,r9),r0
	mov.l	r0,@(16,r7)	; u
	mov.l	@(32,r9),r0
	bsr	commit_ta_cmd
	 mov.l	r0,@(20,r7)	; v
	; Fourth vertex, flagged as the end of the strip.
	mov.l	@(36,r9),r0
	mulu.w	r0,r3
	sts	macl,r0
	add	r4,r0
	mov.l	@r0,r2
	mov.l	r2,@(4,r7)	; x
	mov.l	@(4,r0),r2
	mov.l	r2,@(8,r7)	; y
	mov.l	@(8,r0),r2
	mov.l	r2,@(12,r7)	; z
	mov.l	@(40,r9),r0
	mov.l	r0,@(16,r7)	; u
	mov.l	@(44,r9),r0
	mov.l	r0,@(20,r7)	; v
	SETS.L	#TA_CMD_VERTEX|TA_CMD_VERTEX_EOS,r1
	bsr	commit_ta_cmd
	 mov.l	r1,@r7		; cmd
	dt	r8
	bf/s	1b
	 add	#56,r9		; (delay slot) next face record
	; All faces sent; finish with a command of eight zero longs.
	; making this a loop saves only one instruction and adds time.
	mov.l	r6,@r7
	mov.l	r6,@(4,r7)
	mov.l	r6,@(8,r7)
	mov.l	r6,@(12,r7)
	mov.l	r6,@(16,r7)
	mov.l	r6,@(20,r7)
	mov.l	r6,@(24,r7)
	bsr	commit_ta_cmd
	 mov.l	r6,@(28,r7)
	lds.l	@r15+,pr
	rts
	 nop
commit_ta_cmd:
	; In tatest terms, this is ta_commit_list(), with the argument
	;  always being ta_cmd.
	; Sets QACR0 from TA_CMD_BASE, copies the eight longs at ta_cmd
	;  to STOREQ_BASE, and issues a pref on STOREQ_BASE.
	;  (Presumably, per the SH-4 store queue mechanism, the pref is
	;  what sends the 32 bytes out to the TA - confirm against the
	;  hardware manual.)
	; Destroys r0, r1 and r10-r14.
	SETS.L	#QACR0,r1
	SETS.L	#STOREQ_BASE,r14
	SETS.L	#[[TA_CMD_BASE>>26]&7]<<2,r13
	SETS.L	#ta_cmd,r12
	SETS.L	#8,r11
	mov.l	r13,@r1
	mov	r14,r10		; keep the base address for the pref
1:	mov.l	@r12+,r0
	dt	r11
	mov.l	r0,@r14
	bf/s	1b
	 add	#4,r14
	rts
	 pref	@r10
handle_maple:
	; Wait for the maple transfer begun by start_maple to finish,
	;  then copy the longs at offsets 8 and 12 of maple_resp into
	;  the two longs of curistate.  Uses r0-r3.
	SETS.L	#BUS_STATE,r3
1:	mov.l	@r3,r0
	tst	#BUS_STATE_RUNNING,r0
	bf	1b		; spin while the running bit is set
	SETS.L	#maple_resp,r0
	; We ocbi only one cache line, because the parts of the
	;  response we care about fit in a single cache line.  The
	;  hardware's alignment requirements for maple buffers match
	;  cache line alignments, and we access only 8 bytes of it at
	;  low offsets.
	;
	; We arguably should ocbi the line back just before we kick off
	;  the maple operation rather than waiting until here.  Since
	;  we never write to this cache line, the only difference I see
	;  is whether it sits around in the cache in the interim.  This
	;  might conceivably affect something, but even if it does I
	;  have trouble seeing the difference being more than one cache
	;  line fill penalty.
	ocbi	@r0
	mov.l	@(8,r0),r1
	mov.l	@(12,r0),r2
	SETS.L	#curistate,r0
	mov.l	r1,@r0
	mov.l	r2,@(4,r0)
	rts
	 nop
await_video:
	; In tatest terms, this is everything in the main loop after
	;  the call to ta_commit_end().
	; Two waits in a row, each of which polls a register until a
	;  bit is set and then writes that bit back to the same
	;  register.  Uses r0-r2.
	; ta_wait_render()
	SETS.L	#TA_RENDER_EVENT,r1
	SETS.L	#TA_RENDER_BIT,r2
1:	mov.l	@r1,r0
	tst	r2,r0
	bt	1b		; spin until the bit is set
	mov.l	r2,@r1
	; wait_bovp()
	SETS.L	#VBLANK_REG,r1
	SETS.L	#VBLANK_VBIT,r2
	mov.l	r2,@r1		; write the bit first, so a stale one isn't seen
1:	mov.l	@r1,r0
	tst	r2,r0
	bt	1b
	rts
	 mov.l	r2,@r1
next_frame:
	; Point the display at render_buf[curbuf], start rendering
	;  cmdlists[curbuf] into the other render buffer, and then flip
	;  curbuf between 0 and 1.
	;	r10 = &curbuf		r11 = render_buf table
	;	r12 = 0x007fffff	r5 = video register being written
	; Switch to the previously-rendered screen
	SETS.L	#curbuf,r10
	SETS.L	#render_buf,r11
	mov.b	@r10,r0
	SHLL	#2,r0
	mov.l	@(r0,r11),r1
	SETS.L	#0x007fffff,r12
	SETS.L	#DISPLAY_VRAM,r3
	and	r12,r1
	mov.l	r1,@r3
	SETS.L	#SHORT_FRAME_OFFSET,r0
	add	r0,r1
	mov.l	r1,@(4,r3)
	; Kick off rendering to the screen we just stopped displaying
	; In tatest terms, this is ta_begin_render.
	mov.b	@r10,r0			; curbuf
	SETS.L	#cmdlists,r1
	SHLL	#2,r0
	SETS.L	#tiledesc_cookies,r2
	mov.l	@(r0,r1),r1		; cmdlist
	mov.l	@(r0,r2),r2		; tiles
	xor	#4,r0			; index of the other buffer
	mov.l	@(r0,r11),r3		; scrn
	SETS.L	#VIDREG_BASE+0x8138,r4
	SETS.L	#0x12,r5
	SETS.L	#0,r6
	mov.l	@r4,r4
	SETS.L	#VRAM_BASE_32,r0
	or	r0,r4			; taend
	; Zero 0x12 longs starting at taend, then restore r4.
1:	mov.l	r6,@r4
	dt	r5
	bf/s	1b
	 add	#4,r4
	add	#-0x12*4,r4
	; We could use set_params here, but between the number of
	;  values to store and the need to break longs into two words,
	;  it's less pain to do it this way.
	;
	; Do we have to do all these in exactly this order?  I suspect
	;  not, but, absent documentation, it's hard to tell how much
	;  deviation is OK.  We stick strictly to tatest's order.
	SETS.L	#VIDREG_BASE+0x802c,r5
	and	r12,r2
	mov.l	r2,@r5			; 0xa05f802c
	add	#0x8020-0x802c,r5
	mov	r1,r0
	and	r12,r0
	mov.l	r0,@r5			; 0xa05f8020
	add	#0x8060-0x8020,r5
	and	r12,r3
	mov.l	r3,@r5			; 0xa05f8060
	add	#0x808c-0x8060,r5
	sub	r1,r4			; taend - cmdlist, in bytes
	SHLL	#1,r4
	SETS.L	#0x01000000,r0
	or	r4,r0
	mov.l	r0,@r5			; 0xa05f808c
	add	#0x8088-0x808c,r5
	SETS.L	#0x3e4cccc0,r0		; tatest says "zclip"
	mov.l	r0,@r5			; 0xa05f8088
	add	#0x8068-0x8088,r5
	SETS.L	#[FRAME_X-1]<<16,r0	; tatest calls it "clipw"
	mov.l	r0,@r5			; 0xa05f8068
	add	#0x806c-0x8068,r5
	SETS.L	#[FRAME_Y-1]<<16,r0	; tatest calls it "cliph"
	mov.l	r0,@r5			; 0xa05f806c
	add	#0x804c-0x806c,r5
	SETS.L	#[FRAME_X*2]>>3,r0	; tatest calls it "modulo"
	mov.l	r0,@r5			; 0xa05f804c
	add	#0x8048-0x804c,r5
	SETS.L	#TA_PIXFMT_RGB565|TA_PIXFMT_DITHER,r0	; tatest calls it "pixfmt"
	mov.l	r0,@r5			; 0xa05f8048
	add	#0x8014-0x8048,r5
	SETS.L	#0xffffffff,r0		; tatest says "Launch!"
	mov.l	r0,@r5			; 0xa05f8014
	; curbuf = ! curbuf
	; The add in the delay slot runs whether or not the branch is
	;  taken: 0 becomes 1 and is stored; nonzero is incremented and
	;  then replaced by 0.
	mov.b	@r10,r0
	tst	r0,r0
	bt/s	1f
	 add	#1,r0
	mov	#0,r0
1:
	rts
	 mov.b	r0,@r10

	SETCONST

; Rotate (fr0,fr1,fr2) by fpul fsca units around axis (fr4,fr5,fr6).
; fpul is in integer format (as required by fsca).
; The axis vector must be normalized already.
; Output in (fr0,fr1,fr2).
; Preserves fr4-fr6, fr13-fr15, fpul, all CPU registers.
; Destroys fr3, fr7-fr12.
; Let s = sin(fpul), c = cos(fpul); output in terms of input is
;
;	fr0 =	(fr0 * ((fr4 * fr4 * (1-c)) + c)) +		A
;		(fr1 * ((fr4 * fr5 * (1-c)) - (fr6 * s))) +	B
;		(fr2 * ((fr4 * fr6 * (1-c)) + (fr5 * s)))	C
;
;	fr1 =	(fr0 * ((fr5 * fr4 * (1-c)) + (fr6 * s))) +	D
;		(fr1 * ((fr5 * fr5 * (1-c)) + c)) +		E
;		(fr2 * ((fr5 * fr6 * (1-c)) - (fr4 * s)))	F
;
;	fr2 =	(fr0 * ((fr6 * fr4 * (1-c)) - (fr5 * s))) +	G
;		(fr1 * ((fr6 * fr5 * (1-c)) + (fr4 * s))) +	H
;		(fr2 * ((fr6 * fr6 * (1-c)) + c))		I
rotate_around_axis:
	; The nine terms A-I named in the formulas above are computed
	;  one after another.  The new fr0 and fr1 are held in fr12 and
	;  fr11 until the end, because the old fr0 and fr1 are still
	;  needed for the later terms; the new fr2 is accumulated in
	;  place once the old fr2 is no longer needed.
	fsca	fpul,fr8	; fr8 = s, fr9 = c
	fldi1	fr3
	fsub	fr9,fr3		; fr3 = 1-c
	fmov	fr4,fr7		; fr4
	fmul	fr4,fr7		; fr4 * fr4
	fmul	fr3,fr7		; fr4 * fr4 * (1-c)
	fadd	fr9,fr7		; (fr4 * fr4 * (1-c)) + c
	fmul	fr0,fr7		; A
	fmov	fr4,fr10	; fr4
	fmul	fr5,fr10	; fr4 * fr5
	fmul	fr3,fr10	; fr4 * fr5 * (1-c)
	fmov	fr6,fr11	; fr6
	fmul	fr8,fr11	; fr6 * s
	fsub	fr11,fr10	; (fr4 * fr5 * (1-c)) - (fr6 * s)
	fmul	fr1,fr10	; B
	fadd	fr10,fr7	; A + B
	fmov	fr4,fr12	; fr4
	fmul	fr6,fr12	; fr4 * fr6
	fmul	fr3,fr12	; fr4 * fr6 * (1-c)
	fmov	fr5,fr11	; fr5
	fmul	fr8,fr11	; fr5 * s
	fadd	fr11,fr12	; (fr4 * fr6 * (1-c)) + (fr5 * s)
	fmul	fr2,fr12	; C
	fadd	fr7,fr12	; output fr0
	fmov	fr5,fr7		; fr5
	fmul	fr4,fr7		; fr5 * fr4
	fmul	fr3,fr7		; fr5 * fr4 * (1-c)
	fmov	fr6,fr10	; fr6
	fmul	fr8,fr10	; fr6 * s
	fadd	fr10,fr7	; (fr5 * fr4 * (1-c)) + (fr6 * s)
	fmul	fr0,fr7		; D
	fmov	fr5,fr11	; fr5
	fmul	fr6,fr11	; fr5 * fr6
	fmul	fr3,fr11	; fr5 * fr6 * (1-c)
	fmov	fr4,fr10	; fr4
	fmul	fr8,fr10	; fr4 * s
	; This is our point of maximum register use.
	; We have the following, all live, at this point:
	; fr0,fr1,fr2 = input values
	; fr3 = 1-c
	; fr4,fr5,fr6,fpul = input values to be preserved
	; fr7 = D
	; fr8 = s
	; fr9 = c
	; fr10 = fr4 * s
	; fr11 = fr5 * fr6 * (1-c)
	; fr12 = output fr0
	fsub	fr10,fr11	; (fr5 * fr6 * (1-c)) - (fr4 * s)
	fmul	fr2,fr11	; F
	fadd	fr7,fr11	; D + F
	fmov	fr5,fr10	; fr5
	fmul	fr5,fr10	; fr5 * fr5
	fmul	fr3,fr10	; fr5 * fr5 * (1-c)
	fadd	fr9,fr10	; (fr5 * fr5 * (1-c)) + c
	fmul	fr1,fr10	; E
	fadd	fr10,fr11	; output fr1
	fmov	fr6,fr7		; fr6
	fmul	fr6,fr7		; fr6 * fr6
	fmul	fr3,fr7		; fr6 * fr6 * (1-c)
	fadd	fr9,fr7		; (fr6 * fr6 * (1-c)) + c [fr9 dead]
	fmul	fr7,fr2		; I [fr2 dead]
	fmov	fr6,fr7		; fr6
	fmul	fr5,fr7		; fr6 * fr5
	fmul	fr3,fr7		; fr6 * fr5 * (1-c)
	fmov	fr4,fr10	; fr4
	fmul	fr8,fr10	; fr4 * s
	fadd	fr10,fr7	; (fr6 * fr5 * (1-c)) + (fr4 * s)
	fmul	fr1,fr7		; H [fr1 dead]
	fadd	fr7,fr2		; H + I
	fmov	fr6,fr7		; fr6
	fmul	fr4,fr7		; fr6 * fr4
	fmul	fr3,fr7		; fr6 * fr4 * (1-c) [fr3 dead]
	fmul	fr5,fr8		; fr5 * s [fr8 dead]
	fsub	fr8,fr7		; (fr6 * fr4 * (1-c)) - (fr5 * s)
	fmul	fr0,fr7		; G [fr0 dead]
	fadd	fr7,fr2		; output fr2
	fmov	fr11,fr1	; output fr1
	rts
	 fmov	fr12,fr0	; output fr0

; Modifies (fr0,fr1,fr2) by subtracting off the component in the
;  direction of (fr8,fr9,fr10).
; (fr8,fr9,fr10) must be normalized already.
; Output in (fr0,fr1,fr2).
; Preserves fr4-fr6, fr8-fr15, fpul, all CPU registers.
; Destroys fr3, fr7.
; Output in terms of input is
;
;	Let dp = (fr0 * fr8) + (fr1 * fr9) + (fr2 * fr10)
;
;	fr0 = fr0 - (dp * fr8)
;	fr1 = fr1 - (dp * fr9)
;	fr2 = fr2 - (dp * fr10)
subtract_component:
	; fr3 is zeroed first so that the fourth term of the
	;  four-element inner product contributes nothing; fipr then
	;  leaves dp in fr3.
	fldi0	fr3
	fipr	fv8,fv0		; fr3 = dp
	fmov	fr3,fr7
	fmul	fr8,fr7		; dp * fr8
	fsub	fr7,fr0
	fmov	fr3,fr7
	fmul	fr9,fr7		; dp * fr9
	fsub	fr7,fr1
	fmov	fr3,fr7
	fmul	fr10,fr7	; dp * fr10
	rts
	 fsub	fr7,fr2

; Normalize the vector in (fr0,fr1,fr2).
; Output in (fr0,fr1,fr2).
; Preserves fr4-fr15, fpul, all integer registers.
; Destroys fr3.
normalize:
	; fr3 is cleared so that fipr yields just the squared length
	;  of (fr0,fr1,fr2); fsrra then turns that into the reciprocal
	;  of the length, and each component is scaled by it.
	fldi0	fr3
	fipr	fv0,fv0		; fr3 = squared length
	fsrra	fr3		; fr3 = 1/length
	fmul	fr3,fr2
	fmul	fr3,fr1
	rts
	 fmul	fr3,fr0

; computes (fr3,fr4,fr5) × (fr0,fr1,fr2) -> (fr0,fr1,fr2)
; uses fr6 as temporary; destroys fr3/fr4/fr5 inputs too
; ( (fr4*fr2)-(fr5*fr1) , (fr5*fr0)-(fr3*fr2) , (fr3*fr1)-(fr4*fr0) )
;       A    B    C           D    E    F           G    H    I
crossproduct:
	; The six products are formed in an order chosen so that each
	;  input is overwritten only after its last use; fr6 carries
	;  the one product that has nowhere else to live.  The letters
	;  refer to the labels under the formula above.
	fmov	fr0,fr6
	fmul	fr5,fr6	; D
	fmul	fr1,fr5	; C, input fr1 now dead
	fmul	fr3,fr1	; G, input fr3 now dead
	fmul	fr2,fr3	; F, input fr2 now dead
	fmul	fr4,fr2	; A, input fr4 now dead
	fmul	fr0,fr4	; I, input fr0 and fr5 now dead
	fmov	fr2,fr0	; A, temporary fr2 now dead
	fsub	fr5,fr0	; B, A and C now dead
	fmov	fr1,fr2	; G, temporary fr1 now dead
	fsub	fr4,fr2	; H, G and I now dead
	fmov	fr6,fr1	; D, temporary fr6 now dead
	rts
	 fsub	fr3,fr1	; E, D and F now dead

; Input value in r1
; Destroys r0, r1
printdec01:
	; Wrapper around printdec that saves and restores r2, r3 and
	;  r4, so that only r0 and r1 are destroyed.
	mov.l	r2,@-r15
	sts.l	pr,@-r15
	mov.l	r3,@-r15
	bsr	printdec
	 mov.l	r4,@-r15	; (delay slot) last save
	mov.l	@r15+,r4
	mov.l	@r15+,r3
	lds.l	@r15+,pr
	rts
	 mov.l	@r15+,r2	; (delay slot) last restore
; Input value in r1
; Destroys r0, r1, r2, r3, r4
printdec:
	; Print r1 as an unsigned decimal number with no leading
	;  zeros, using putchar.  Each digit is found by repeatedly
	;  subtracting a power of ten taken from p10table.
	;	r2 = what is left of the value
	;	r3 = cursor into p10table
	;	r4 = current power of ten
	;	r1 = digit character being built
	sts.l	pr,@-r15
	tst	r1,r1
	bf	1f
	; Zero is special: the skip loop below would never find a
	;  power of ten that fits.
	lds.l	@r15+,pr
	bra	putchar		; tail call
	 mov	#'0,r1
1:	mov	r1,r2
	SETS.L	#p10table,r3
	; Skip powers of ten larger than the value.
1:	mov.l	@r3+,r4
	cmp/hs	r4,r2
	bf	1b
	; One digit per pass: count how many times r4 fits in r2.
2:	mov	#'0,r1
1:	cmp/hs	r4,r2
	bf	1f
	sub	r4,r2
	bra	1b
	 add	#1,r1
1:	bsr	putchar
	 nop
	mov.l	@r3+,r4
	tst	r4,r4
	bf	2b		; the zero entry ends the table
	lds.l	@r15+,pr
	rts
	 nop
	.align	4
	; Powers of ten, largest first, ending with a zero entry.
p10table:
	.long	1000000000
	.long	100000000
	.long	10000000
	.long	1000000
	.long	100000
	.long	10000
	.long	1000
	.long	100
	.long	10
	.long	1
	.long	0
	.align	2
	; printhex8: print r1 as eight hex digits.
	; printhexN: print the low r0 hex digits of r1.
	; Both preserve r2-r4 and destroy r0 and r1.
printhex8:
	mov	#8,r0
printhexN:
	mov.l	r4,@-r15
	mov	r0,r4		; r4 = digits left to print
	; Shift the value left by 4*(8-N) bits, so the first digit to
	;  print sits in the top nibble.
	add	#-8,r0
	neg	r0,r0
	SHLL	#2,r0
	shld	r0,r1
	mov.l	r3,@-r15
	mov.l	r2,@-r15
	sts.l	pr,@-r15
	mova	9f,r0
	mov	r0,r3		; r3 = digit table
	mov	r1,r2		; r2 = value, consumed from the top
1:	mov	r2,r0
	SHLR	#28,r0/r1
	SHLL	#4,r2
	add	r3,r0
	bsr	putchar
	 mov.b	@r0,r1		; (delay slot) fetch the digit character
	dt	r4
	bf	1b
	lds.l	@r15+,pr
	mov.l	@r15+,r2
	mov.l	@r15+,r3
	rts
	 mov.l	@r15+,r4
	.align	4		; mova needs a longword-aligned target
9:	.ascii	"0123456789abcdef"
	.align	2
putchar:
	; Send the character in r1 through the serial port registers
	;  addressed off gbr: wait while the transmit count field of
	;  SCFDR2 reads 16, write the byte to SCFTDR2, then wait until
	;  that field reads zero.  Destroys r0 only.
1:	mov.w	@(SCFDR2-SCIF_BASE,gbr),r0
	SHXR	#SCFDR2_TX_SHIFT,r0
	and	#SCFDR2_TX_MASK,r0
	cmp/eq	#16,r0
	bt	1b
	mov	r1,r0		; gbr-relative stores must come from r0
	mov.b	r0,@(SCFTDR2-SCIF_BASE,gbr)
1:	mov.w	@(SCFDR2-SCIF_BASE,gbr),r0
	SHXR	#SCFDR2_TX_SHIFT,r0
	tst	#SCFDR2_TX_MASK,r0
	bf	1b
	rts
	 nop
putstr:
	; Send the NUL-terminated string at r1.  Before each byte, wait
	;  while the transmit count field of SCFDR2 reads 16.
	;  Destroys r0; r1 is left just past the terminating NUL.
1:	mov.w	@(SCFDR2-SCIF_BASE,gbr),r0
	SHXR	#SCFDR2_TX_SHIFT,r0
	and	#SCFDR2_TX_MASK,r0
	cmp/eq	#16,r0
	bt	1b
	mov.b	@r1+,r0
	tst	r0,r0
	bt	1f		; NUL: done
	bra	1b
	 mov.b	r0,@(SCFTDR2-SCIF_BASE,gbr)
1:	; don't bother waiting for drain here; we do a putchar call,
	;  which will drain everything, after all putstr calls and
	;  before anything for which it matters.
	rts
	 nop
print_float:
	; float in r1
	; uses r0, r1, r2, fr0, fr1, fr2, fpul
	; Prints in plain decimal with no exponent: an optional '-',
	;  the integer digits, and then, only if a fraction remains,
	;  a '.' and fraction digits until the remainder is exactly
	;  zero.
	; NOTE(review): nothing here special-cases infinities or NaNs.
	sts.l	pr,@-r15
	; check for negative; if so, print - and negate
	lds	r1,fpul
	fsts	fpul,fr0
	fldi0	fr1
	fcmp/gt	fr0,fr1		; T = (0 > value)
	bf	1f
	bsr	putchar
	 mov	#'-,r1
	fneg	fr0
1:	; divide by 10 until it's less than 10, and keep count
	mov	#10,r0
	lds	r0,fpul
	float	fpul,fr1	; fr1 = 10 from here on
	mov	#0,r2
1:	fcmp/gt	fr0,fr1		; T = (10 > value)
	bt	1f
	fdiv	fr1,fr0
	bra	1b
	 add	#1,r2
1:	; now fr0 < 10 and r2 is the number of divisions we did
	; print the first (possibly only) digit before the .
	ftrc	fr0,fpul
	sts	fpul,r1
	bsr	putchar
	 add	#'0,r1
	float	fpul,fr2
	fsub	fr2,fr0		; drop the digit just printed
	; now, for r2 loops, print next digit
1:	cmp/pl	r2
	bf	1f
	fmul	fr1,fr0
	ftrc	fr0,fpul
	sts	fpul,r1
	float	fpul,fr2
	bsr	putchar
	 add	#'0,r1
	fsub	fr2,fr0
	bra	1b
	 add	#-1,r2
1:	; print as many digits as necessary to reach 0
	; print a . before the first one, if there are any
	mov	#'.,r1
	SETS.L	#0f0,r0
	lds	r0,fpul
1:	; Invariants at this point:
	;  - fpul contains integer part to be subtracted from fr0
	;  - r1 contains next character to print
	;  - loop if fr0 != 0 at this point
	fldi0	fr2
	fcmp/eq	fr0,fr2
	bt	2f
	float	fpul,fr2
	fsub	fr2,fr0
	fmul	fr1,fr0
	bsr	putchar
	 ftrc	fr0,fpul	; (delay slot) next digit, as an integer
	sts	fpul,r1
	bra	1b
	 add	#'0,r1
2:	; Done.
	lds.l	@r15+,pr
	rts
	 nop
nbgetchar:
	; Non-blocking serial read.  Returns in r0 the next received
	;  byte (0-255), or -1 if the receive count field of SCFDR2 is
	;  zero.  Destroys r1.
	mov.w	@(SCFDR2-SCIF_BASE,gbr),r0
	SHXR	#SCFDR2_RX_SHIFT,r0/r1
	tst	#SCFDR2_RX_MASK,r0
	bt	1f
	mov.b	@(SCFRDR2-SCIF_BASE,gbr),r0
	extu.b	r0,r1		; hold the byte while r0 is reused
	; Read SCLSR2, discard the value, and write zero back to it.
	mov.w	@(SCLSR2-SCIF_BASE,gbr),r0
	mov	#0,r0
	mov.w	r0,@(SCLSR2-SCIF_BASE,gbr)
	rts
	 mov	r1,r0
1:	rts
	 mov	#-1,r0

panic:
	; Print "panic" and CR LF, then sixteen saved values as eight
	;  hex digits each, one per line: r0 through r14 as they were
	;  on entry, then pr.  Finally reload r15 from throw_sp and
	;  jump to throw_out.  Does not return to the caller.
	sts.l	pr,@-r15
	mov.l	r14,@-r15
	mov.l	r13,@-r15
	mov.l	r12,@-r15
	mov.l	r11,@-r15
	mov.l	r10,@-r15
	mov.l	r9,@-r15
	mov.l	r8,@-r15
	mov.l	r7,@-r15
	mov.l	r6,@-r15
	mov.l	r5,@-r15
	mov.l	r4,@-r15
	mov.l	r3,@-r15
	mov.l	r2,@-r15
	mov.l	r1,@-r15
	mov.l	r0,@-r15
	SETS.L	#panic_msg,r1
	bsr	putstr
	 nop
	SETS.L	#16,r2		; printhex8 preserves r2
1:	bsr	printhex8
	 mov.l	@r15+,r1	; (delay slot) pop the next saved value
	SETS.L	#panic_crlf,r1
	bsr	putstr
	 nop
	dt	r2
	bf	1b
	SETS.L	#throw_sp,r0
	mov.l	@r0,r15
	SETS.L	#throw_out,r0
	jmp	@r0
	 nop
	; panic_msg has no terminator of its own; it runs straight on
	;  into panic_crlf, so printing it also prints the CR LF.
panic_msg:
	.ascii	"panic"
panic_crlf:
	.asciz	(13,10)

	.align	2
	SETCONST

	.include "crash-handler.s"