; Debugging flags.  Set these to 1 to turn on various debugging output.
;  These are assemble-time constants: set_params tests debug_set_params
;  with .if, so its tracing code is only assembled when the flag is
;  nonzero.  (The other flags are presumably tested the same way by the
;  routines they name; those routines are not in the part of the file
;  where this was checked - confirm.)
; set_params: trace each register store
debug_set_params = 0
; rendering cycle kickoff
debug_start_render = 0
; texture setup
debug_texture = 0
; TA command commits
debug_ta_commit = 0

; This is designed to be serial-line downloaded to cdcode.
;
; This matters mostly in the interfaces it means we expect.  In
;  particular, we are not called with a bsr/jsr; we are entered with a
;  jmp, and our return-to address, to the extent that we have one, is
;  in r11, not pr.  We also expect registers set up the way cdcode sets
;  them; in particular, we expect r15 to be set to point to a stack,
;  8c010000 (or a little below that if cdcode happens to have anything
;  on the stack), and r14 set to the SCIF's base address.  If we return
;  to cdcode, it expects those two, and r11 and r10, to be preserved.
;  It doesn't mind if we trash r12/r13, but we preserve them too.
;
; Our memory map:
;
;	[8c000000,8c010000)	Stack (r15 set by cdcode)
;	[8c010000,8c01????)	cdcode
;	[8c020000,8c0?????)	Us
;
; Our entry point is 8c020000.  We don't set an entry point here with
;  .entry because then send-s would jump to us directly, and I'd rather
;  do that manually.

; Throughout this file, MC, used as a name in the comments, means
;  Marcus Comstedt, and tatest is a C program of his which he
;  distributes as an example of a 3D rendering program.  This file
;  started out as tatest.s, a (manual) rewriting in assembly of tatest,
;  but it's evolving in its own directions.

	.include "regs.s"
	.include "ta-cmds.s"
	.include "maple-bits.s"

; The CPU's two views of video RAM.  The layout section below places
;  textures relative to VRAM_BASE_64 (the a4xxxxxx view) and everything
;  else relative to VRAM_BASE_32 (the a5xxxxxx view).
VRAM_BASE_32 = 0xa5000000
VRAM_BASE_64 = 0xa4000000
VRAM_SIZE = 8 << 20 ; 8MB; the amount clear_vram zeroes
STOREQ_BASE = 0xe0000000 ; clear_vram's store-queue write/pref window
VIDREG_BASE = 0xa05f0000 ; set_params adds each param offset to this
X_SIZE = 640
Y_SIZE = 480
; NOTE(review): VBLANK_REG, VBLANK_VBIT and DISPLAY_VRAM are presumably
;  the vertical-blank status register, its "in vblank" bit, and the
;  display start-address register; their users are outside the part of
;  the file examined here - confirm.
VBLANK_REG = VIDREG_BASE + 0x6900
VBLANK_VBIT = 0x08
DISPLAY_VRAM = VIDREG_BASE + 0x8050
SHORT_FRAME_OFFSET = X_SIZE*2 ; X_SIZE pixels at two bytes each

; Scene parameters (floating-point literals).  ZNEAR/ZFAR are
;  presumably the near and far clipping distances, DISTANCE the eye
;  distance, and BUTTON_FACTOR a scale applied to controller input -
;  confirm against the code that uses them.
COT_FOVY = 0f1.73 ; cot(FOVy/2), field-of-view angle figure
ZNEAR = 0f1
ZFAR = 0f100
BUTTON_FACTOR = 0f3
DISTANCE = 0f15

; Layout of some things in video RAM.  We double-buffer, so there are
;  two of most of these.  _a and _b suffixes indicate the pairs.
;
; We copy tatest's layout in video RAM.  This means we put rendered
;  scenes at [a5000000,a512c000) and TA tile buffers, tile descriptors,
;  and command lists at [a5400000,a550cf00) (these values are for
;  X_SIZE=640 Y_SIZE=480), with textures in [a4400000-a4420000), the
;  textures using the same memory as [a5200000-a5210000) and
;  [a5600000-a5610000).
;
; Unfortunately the rendering and video hardware aren't capable of
;  making distinctions equivalent to the difference between the CPU's
;  a4xxxxxx and a5xxxxxx views of video RAM (textures always come from
;  a4xxxxxx, or, to be more precise, always access video RAM in a way
;  compatible with the CPU's a4xxxxxx view, whereas everything else
;  comes from a5xxxxxx).  So we're stuck jigsawing together a4xxxxxx
;  allocations for textures and a5xxxxxx allocations for other stuff.

	; Texture space.
	; Each ". =" below moves the location counter to an absolute
	;  video RAM address so that the labels that follow take on
	;  those addresses.  (Presumably nothing is emitted into the
	;  downloaded image for these .space regions - confirm against
	;  the assembler; clear_vram zeroes all of video RAM at startup
	;  in any case.)
. = VRAM_BASE_64 + 0x00400000
	; Cube faces.  Each one uses 256*256 bytes.
texture_cubeface_1:
	.space	65536
texture_cubeface_2:
	.space	65536
	; Characters.  We need only 0 and 1.  Each one is an 8x8
	;  texture, taking 64 bytes.
texture_font_0:
	.space	64
texture_font_1:
	.space	64

	; Space to render into.  Each field takes up X_SIZE*Y_SIZE
	;  pixels at two bytes per pixel.  (If it's displayed
	;  interlaced, this is handled with the display hardware; in
	;  memory it's totally non-interlaced.)
	; For 640x480 this is 0x96000 bytes per buffer.
render_buf_size = X_SIZE * Y_SIZE * 2
. = VRAM_BASE_32
render_buf_a:
	.space	render_buf_size
render_buf_b:
	.space	render_buf_size

	; Tile descriptors.  There is one of these, at 6 longs, per
	;  tile; there is also a 24-long header.  Each tile is 32x32
	;  pixels.  So for a 640x480 screen, we need
	;  24+(6*(640/32)*(480/32)) longs of space.  (I don't know what
	;  happens if the screen width or height is not a multiple of
	;  32.)  Each tile also uses 64 bytes of buffer space.
	; For 640x480 the sizes below come to 0x80000 (command list),
	;  0x4b00 (tile buffer) and 0x1c80 (tile descriptor) bytes each.
ta_buffers_size_cmd_list = 512 * 1024
ta_buffers_size_tile_buffer = 64 * [X_SIZE/32] * [Y_SIZE/32]
ta_buffers_size_tile_descriptor = 4 * [24 + [6 * [X_SIZE/32] * [Y_SIZE/32]]]
. = VRAM_BASE_32 + 0x00400000
ta_buffers_cmd_list_a:
	.space	ta_buffers_size_cmd_list
ta_buffers_cmd_list_b:
	.space	ta_buffers_size_cmd_list
ta_buffers_tile_buffer_a:
	.space	ta_buffers_size_tile_buffer
ta_buffers_tile_buffer_b:
	.space	ta_buffers_size_tile_buffer
ta_buffers_tile_descriptor_a:
	.space	ta_buffers_size_tile_descriptor
ta_buffers_tile_descriptor_b:
	.space	ta_buffers_size_tile_descriptor

; End of layout of video RAM.

; Everything from here on is assembled at our load address, which is
;  also our entry point (see "Our memory map" above).
. = 0x8c020000

	; Entry stub: the data segment comes next, so hop over it to
	;  main.  r0 is used for the jump target.
	; NOTE(review): ".sz any" / ".pr any" presumably tell the
	;  assembler that the FPSCR SZ and PR bits are unknown at this
	;  point (main issues ".sz 0" / ".pr 0" once it has cleared
	;  FPSCR), and SETCONST presumably emits the constant pool for
	;  the preceding SETS.L - confirm against the assembler's
	;  documentation.
	.sz	any
	.pr	any
	SETS.L	#main,r0
	jmp	@r0
	 nop
	SETCONST

	; Our "data segment".  We don't really have segments the way
	;  the term implies.  The data is here rather than at the end
	;  so the symbols' values are known by the time the assembler
	;  sees them later.  This is not critical, but does produce
	;  slightly better code.
	;
	; Things here are ordered approximately by decreasing alignment
	;  requirement.  Not essential, just avoids needless gaps.

	; The maple command and response buffers.  The hardware
	;  requires they be aligned on 32-byte boundaries.
	;
	; maple_cmd is a single prebuilt request: a descriptor word
	;  (flagged XDESC_LAST, port 0, length 1), the address the
	;  response is to be stored at, a frame header for
	;  CMD_GETCOND, and one data longword naming the controller
	;  function.  (The XDESC_*, CMD_*, ADDR_*, FUNC_* names and the
	;  MapleFrame macro are not defined in this file - presumably
	;  they come from maple-bits.s - so the meaning of MapleFrame's
	;  individual arguments should be checked there.)
	.align	32
maple_cmd:
	.long	XDESC_LAST | [0 << XDESC_PORTSHIFT] | [1 << XDESC_LENSHIFT]
	; Response address, masked down to what the DMA hardware takes.
	.long	maple_resp & DMA_ADDRMASK
	MapleFrame	CMD_GETCOND, 0, ADDR_MAIN, 0, 0, 1
	; NOTE(review): @BSL[] is presumably a byte-swapped longword -
	;  confirm against the assembler's documentation.
	.long	@BSL[FUNC_CONTROLLER]
	.align	32
	; 1024 is the largest the hardware supports, so it's a safe
	;  limit.  (The amount actually used is usually fairly small.)
maple_resp:
	.space	1024

	; The base matrix (composition of screenview, projection, and
	;  translation).
	.align	8
base_matrix:
	.space	16*4

	; Font setup table.
font_setup:
	.long	init_font_0, texture_font_0
1:	.long	init_font_1, texture_font_1
n_font_setup = [. - font_setup] / [1b - font_setup]

	; The current and previous controller input state.  The
	;  patterns we initialize this to are what the controller sends
	;  when it's not being touched.
	.align	4
curistate:
	.long	0x0000ffff, 0x80808080
previstate:
	.long	0x0000ffff, 0x80808080

	; Pointers to the texture memory.  We have two different
	;  cube-face textures (each of which is used with three
	;  different palettes, though that doesn't matter here), and
	;  two font character textures.
	.align	4
textures:
	.long	texture_font_0, TA_POLYMODE2_U_SIZE_8 | TA_POLYMODE2_V_SIZE_8
	.long	texture_font_1, TA_POLYMODE2_U_SIZE_8 | TA_POLYMODE2_V_SIZE_8
	.long	texture_cubeface_1, TA_POLYMODE2_U_SIZE_256 | TA_POLYMODE2_V_SIZE_256
	.long	texture_cubeface_2, TA_POLYMODE2_U_SIZE_256 | TA_POLYMODE2_V_SIZE_256

	; Pointers to the two places screens get rendered into.
	.align	4
render_buf:
	.long	render_buf_a
	.long	render_buf_b

	; Cookies to pass to the hardware (void *tiles[2] in tatest)
	.align	4
tiledesc_cookies:
	.space	2*4

	; Command lists (ta_buffers cmd_list arrays in tatest; we point
	;  to them rather than using a struct to generate offsets)
	.align	4
cmdlists:
	.long	ta_buffers_cmd_list_a
	.long	ta_buffers_cmd_list_b

	; Tile buffers (the 64-bytes-per-tile work space)
	.align	4
tilebuffers:
	.long	ta_buffers_tile_buffer_a
	.long	ta_buffers_tile_buffer_b

	; Tile descriptors (the spaces in which the descriptors are
	;  built)
	.align	4
tiledescs:
	.long	ta_buffers_tile_descriptor_a
	.long	ta_buffers_tile_descriptor_b

	; Current orientation, stored in the form of the world axes in
	;  eye coordinates.  This is just the rotation.
	.align	4
eye_x:
	.long	0f1, 0f0, 0f0
eye_y:
	.long	0f0, 0f1, 0f0
eye_z:
	.long	0f0, 0f0, 0f1

	; Texture modes.
	.align	4
texture_mode_base = TA_POLYMODE2_BLEND_DEFAULT|TA_POLYMODE2_FOG_DISABLED|TA_POLYMODE2_BILINEAR_FILTER|TA_POLYMODE2_MIPMAP_D_1_00|TA_POLYMODE2_TEXTURE_REPLACE
cur_texture_mode:
	.long	texture_mode_base

	; Scene corner coordinates.
	.align	4
vertex_coords:
	.long	-0f1, -0f1, -0f1	;  0
	.long	-0f1, -0f1,  0f1	;  1
	.long	-0f1,  0f1,  0f1	;  2
	.long	-0f1,  0f1, -0f1	;  3
	.long	 0f1,  0f1, -0f1	;  4
	.long	 0f1,  0f1,  0f1	;  5
	.long	 0f1, -0f1,  0f1	;  6
	.long	 0f1, -0f1, -0f1	;  7
	.long	-0f3, -0f3, -0f3	;  8
	.long	 0f3, -0f3, -0f3	;  9
	.long	-0f3, -0f3, -0f2	; 10
	.long	 0f3, -0f3, -0f2	; 11
	.long	-0f3, -0f2, -0f2	; 12
	.long	 0f3, -0f2, -0f2	; 13
	.long	-0f3, -0f2, -0f3	; 14
	.long	 0f3, -0f2, -0f3	; 15
n_vertex_coords = [. - vertex_coords] / [3*4]
xform_coords:
	.space	n_vertex_coords*3*4
	; These vertices don't get transformed; they are fixed in terms
	;  of screen location, not in terms of 3D scene location.  The
	;  z coordinate (0f11) here is noncritical; it just needs to be
	;  larger than anything the scene produces.  (The right fix
	;  would be to store a flag indicating use of Z_ALWAYS rather
	;  than Z_GREATER, but that's more hair than I want to bother
	;  with.)
	.long	[0f16* 0]+ 0, 0f20, 0f11	; 16
	.long	[0f16* 0]+ 0, 0f32, 0f11	; 17
	.long	[0f16* 1]+ 0, 0f20, 0f11	; 18
	.long	[0f16* 1]+ 0, 0f32, 0f11	; 19
	.long	[0f16* 2]+ 0, 0f20, 0f11	; 20
	.long	[0f16* 2]+ 0, 0f32, 0f11	; 21
	.long	[0f16* 3]+ 0, 0f20, 0f11	; 22
	.long	[0f16* 3]+ 0, 0f32, 0f11	; 23
	.long	[0f16* 4]+ 0, 0f20, 0f11	; 24
	.long	[0f16* 4]+ 0, 0f32, 0f11	; 25
	.long	[0f16* 5]+ 0, 0f20, 0f11	; 26
	.long	[0f16* 5]+ 0, 0f32, 0f11	; 27
	.long	[0f16* 6]+ 0, 0f20, 0f11	; 28
	.long	[0f16* 6]+ 0, 0f32, 0f11	; 29
	.long	[0f16* 7]+ 0, 0f20, 0f11	; 30
	.long	[0f16* 7]+ 0, 0f32, 0f11	; 31
	.long	[0f16* 8]+ 0, 0f20, 0f11	; 32
	.long	[0f16* 8]+ 0, 0f32, 0f11	; 33

	.long	[0f16* 8]+ 8, 0f20, 0f11	; 34
	.long	[0f16* 8]+ 8, 0f32, 0f11	; 35
	.long	[0f16* 9]+ 8, 0f20, 0f11	; 36
	.long	[0f16* 9]+ 8, 0f32, 0f11	; 37
	.long	[0f16*10]+ 8, 0f20, 0f11	; 38
	.long	[0f16*10]+ 8, 0f32, 0f11	; 39
	.long	[0f16*11]+ 8, 0f20, 0f11	; 40
	.long	[0f16*11]+ 8, 0f32, 0f11	; 41
	.long	[0f16*12]+ 8, 0f20, 0f11	; 42
	.long	[0f16*12]+ 8, 0f32, 0f11	; 43
	.long	[0f16*13]+ 8, 0f20, 0f11	; 44
	.long	[0f16*13]+ 8, 0f32, 0f11	; 45
	.long	[0f16*14]+ 8, 0f20, 0f11	; 46
	.long	[0f16*14]+ 8, 0f32, 0f11	; 47
	.long	[0f16*15]+ 8, 0f20, 0f11	; 48
	.long	[0f16*15]+ 8, 0f32, 0f11	; 49
	.long	[0f16*16]+ 8, 0f20, 0f11	; 50
	.long	[0f16*16]+ 8, 0f32, 0f11	; 51

	.long	[0f16*16]+16, 0f20, 0f11	; 52
	.long	[0f16*16]+16, 0f32, 0f11	; 53
	.long	[0f16*17]+16, 0f20, 0f11	; 54
	.long	[0f16*17]+16, 0f32, 0f11	; 55
	.long	[0f16*18]+16, 0f20, 0f11	; 56
	.long	[0f16*18]+16, 0f32, 0f11	; 57
	.long	[0f16*19]+16, 0f20, 0f11	; 58
	.long	[0f16*19]+16, 0f32, 0f11	; 59
	.long	[0f16*20]+16, 0f20, 0f11	; 60
	.long	[0f16*20]+16, 0f32, 0f11	; 61
	.long	[0f16*21]+16, 0f20, 0f11	; 62
	.long	[0f16*21]+16, 0f32, 0f11	; 63
	.long	[0f16*22]+16, 0f20, 0f11	; 64
	.long	[0f16*22]+16, 0f32, 0f11	; 65
	.long	[0f16*23]+16, 0f20, 0f11	; 66
	.long	[0f16*23]+16, 0f32, 0f11	; 67
	.long	[0f16*24]+16, 0f20, 0f11	; 68
	.long	[0f16*24]+16, 0f32, 0f11	; 69

	.long	[0f16*24]+24, 0f20, 0f11	; 70
	.long	[0f16*24]+24, 0f32, 0f11	; 71
	.long	[0f16*25]+24, 0f20, 0f11	; 72
	.long	[0f16*25]+24, 0f32, 0f11	; 73
	.long	[0f16*26]+24, 0f20, 0f11	; 74
	.long	[0f16*26]+24, 0f32, 0f11	; 75
	.long	[0f16*27]+24, 0f20, 0f11	; 76
	.long	[0f16*27]+24, 0f32, 0f11	; 77
	.long	[0f16*28]+24, 0f20, 0f11	; 78
	.long	[0f16*28]+24, 0f32, 0f11	; 79
	.long	[0f16*29]+24, 0f20, 0f11	; 80
	.long	[0f16*29]+24, 0f32, 0f11	; 81
	.long	[0f16*30]+24, 0f20, 0f11	; 82
	.long	[0f16*30]+24, 0f32, 0f11	; 83
	.long	[0f16*31]+24, 0f20, 0f11	; 84
	.long	[0f16*31]+24, 0f32, 0f11	; 85
	.long	[0f16*32]+24, 0f20, 0f11	; 86
	.long	[0f16*32]+24, 0f32, 0f11	; 87

	; Coordinate numbers of the various faces' corners, with
	;  palette numbers and texture numbers.
	;
	; Arguments: four corners, each given as a vertex number (cN)
	;  and integer texture coordinates (cNu, cNv), then a palette
	;  number and a texture number.  The corners are written in the
	;  order c1,c2,c3,c4 but emitted in the order c2,c1,c3,c4
	;  (presumably so they can be sent as a strip - confirm in the
	;  drawing code).  Each corner becomes three longwords - the
	;  vertex number and u,v converted with @FLOAT - so one face
	;  record is 4*3+2 = 14 longwords (56 bytes).
	.macro	face	c1,c1u,c1v,c2,c2u,c2v,c3,c3u,c3v,c4,c4u,c4v,pal,tex
	.long	$(c2),@FLOAT[$(c2u)],@FLOAT[$(c2v)]
	.long	$(c1),@FLOAT[$(c1u)],@FLOAT[$(c1v)]
	.long	$(c3),@FLOAT[$(c3u)],@FLOAT[$(c3v)]
	.long	$(c4),@FLOAT[$(c4u)],@FLOAT[$(c4v)]
	.long	$(pal),$(tex)
	.endm
	.align	4
scene_faces:
	face	 0,0,0,  1,1,0,  2,1,1,  3,0,1,  0,  2
1:	face	 0,0,0,  7,1,0,  6,1,1,  1,0,1,  1,  2
	face	 0,0,0,  3,1,0,  4,1,1,  7,0,1,  2,  2
	face	 5,0,0,  6,1,0,  7,1,1,  4,0,1,  0,  3
	face	 5,0,0,  4,1,0,  3,1,1,  2,0,1,  1,  3
	face	 5,0,0,  2,1,0,  1,1,1,  6,0,1,  2,  3
	face	 8,0,0, 10,1,0, 12,1,1, 14,0,1,  0,  2
	face	 8,0,0,  9,6,0, 11,6,1, 10,0,1,  1,  2
	face	 8,0,0, 14,1,0, 15,1,6,  9,0,6,  2,  2
	face	13,0,0, 11,1,0,  9,1,1, 15,0,1,  0,  3
	face	13,0,0, 15,1,0, 14,1,6, 12,0,6,  1,  3
	face	13,0,0, 12,6,0, 10,6,1, 11,0,1,  2,  3

	face	16,0,0, 17,1,0, 19,1,1, 18,0,1,  0,  0
bits_base_bit = . - 4
bits_base_pal = . - 8
bits_inc = 1b - scene_faces
	face	18,0,0, 19,1,0, 21,1,1, 20,0,1,  0,  0
	face	20,0,0, 21,1,0, 23,1,1, 22,0,1,  0,  0
	face	22,0,0, 23,1,0, 25,1,1, 24,0,1,  0,  0
	face	24,0,0, 25,1,0, 27,1,1, 26,0,1,  0,  0
	face	26,0,0, 27,1,0, 29,1,1, 28,0,1,  0,  0
	face	28,0,0, 29,1,0, 31,1,1, 30,0,1,  0,  0
	face	30,0,0, 31,1,0, 33,1,1, 32,0,1,  0,  0

	face	34,0,0, 35,1,0, 37,1,1, 36,0,1,  0,  0
	face	36,0,0, 37,1,0, 39,1,1, 38,0,1,  0,  0
	face	38,0,0, 39,1,0, 41,1,1, 40,0,1,  0,  0
	face	40,0,0, 41,1,0, 43,1,1, 42,0,1,  0,  0
	face	42,0,0, 43,1,0, 45,1,1, 44,0,1,  0,  0
	face	44,0,0, 45,1,0, 47,1,1, 46,0,1,  0,  0
	face	46,0,0, 47,1,0, 49,1,1, 48,0,1,  0,  0
	face	48,0,0, 49,1,0, 51,1,1, 50,0,1,  0,  0

	face	52,0,0, 53,1,0, 55,1,1, 54,0,1,  0,  0
	face	54,0,0, 55,1,0, 57,1,1, 56,0,1,  0,  0
	face	56,0,0, 57,1,0, 59,1,1, 58,0,1,  0,  0
	face	58,0,0, 59,1,0, 61,1,1, 60,0,1,  0,  0
	face	60,0,0, 61,1,0, 63,1,1, 62,0,1,  0,  0
	face	62,0,0, 63,1,0, 65,1,1, 64,0,1,  0,  0
	face	64,0,0, 65,1,0, 67,1,1, 66,0,1,  0,  0
	face	66,0,0, 67,1,0, 69,1,1, 68,0,1,  0,  0

	face	70,0,0, 71,1,0, 73,1,1, 72,0,1,  0,  0
	face	72,0,0, 73,1,0, 75,1,1, 74,0,1,  0,  0
	face	74,0,0, 75,1,0, 77,1,1, 76,0,1,  0,  0
	face	76,0,0, 77,1,0, 79,1,1, 78,0,1,  0,  0
	face	78,0,0, 79,1,0, 81,1,1, 80,0,1,  0,  0
	face	80,0,0, 81,1,0, 83,1,1, 82,0,1,  0,  0
	face	82,0,0, 83,1,0, 85,1,1, 84,0,1,  0,  0
	face	84,0,0, 85,1,0, 87,1,1, 86,0,1,  0,  0
n_scene_faces = [. - scene_faces] / [1b - scene_faces]

	; A command to be sent to the TA.  There are two kinds of
	;  commands, one 32 bytes and one 64 bytes.  We reserve space
	;  for the larger against future need (we don't currently use
	;  64-byte commands).
	.align	4
ta_cmd:
	.space	64

	; These palettes are straight from tatest; I've just
	;  reformatted them from C to assembly.  It doesn't say where,
	;  if anywhere, they came from.  They're small enough I haven't
	;  bothered trying to compress them.
	.align	4
palette_0:
	.long	0xff000000,0xff3c3c3c,0xff413c3c,0xff493c3c,0xff4d3838,0xff553838,0xff593434,0xff613434
	.long	0xff653030,0xff6d3030,0xff712c2c,0xff792c2c,0xff822828,0xff862828,0xff8e2424,0xff922424
	.long	0xff9a2020,0xff9e2020,0xffa61c1c,0xffaa1c1c,0xffb21818,0xffb61818,0xffbe1414,0xffc71414
	.long	0xffcb1010,0xffd31010,0xffd70c0c,0xffdf0c0c,0xffe30808,0xffeb0808,0xffef0404,0xfff70404
	.long	0xffff0000,0xffff0400,0xffff0c00,0xffff1400,0xffff1c00,0xffff2400,0xffff2c00,0xffff3400
	.long	0xffff3c00,0xffff4500,0xffff4d00,0xffff5500,0xffff5d00,0xffff6500,0xffff6d00,0xffff7500
	.long	0xffff7d00,0xffff8600,0xffff8e00,0xffff9600,0xffff9e00,0xffffa600,0xffffae00,0xffffb600
	.long	0xffffbe00,0xffffc700,0xffffcf00,0xffffd700,0xffffdf00,0xffffe700,0xffffef00,0xfffff700
	.long	0xffffff00,0xffffff04,0xffffff0c,0xffffff14,0xffffff1c,0xffffff24,0xffffff2c,0xffffff34
	.long	0xffffff3c,0xffffff45,0xffffff4d,0xffffff55,0xffffff5d,0xffffff65,0xffffff6d,0xffffff75
	.long	0xffffff7d,0xffffff86,0xffffff8e,0xffffff96,0xffffff9e,0xffffffa6,0xffffffae,0xffffffb6
	.long	0xffffffbe,0xffffffc7,0xffffffcf,0xffffffd7,0xffffffdf,0xffffffe7,0xffffffef,0xfffffff7
	.long	0xffffffff,0xffffffff,0xfffffbfb,0xfffffbf7,0xfffff7f3,0xfffff7ef,0xfffff3eb,0xfffff3e7
	.long	0xffffefe3,0xffffefdf,0xffffebdb,0xffffebd7,0xffffe7d3,0xffffe7cf,0xffffe3cb,0xffffe3c7
	.long	0xffffdfc3,0xffffdfbe,0xffffdbba,0xffffdbb6,0xffffd7b2,0xffffd7ae,0xffffd3aa,0xffffd3a6
	.long	0xffffcfa2,0xffffcf9e,0xffffcb9a,0xffffcb96,0xffffc792,0xffffc78e,0xffffc38a,0xffffc386
	.long	0xffffbe82,0xffffba7d,0xffffba79,0xffffb675,0xffffb671,0xffffb26d,0xffffb269,0xffffae65
	.long	0xffffae61,0xffffaa5d,0xffffaa59,0xffffa655,0xffffa651,0xffffa24d,0xffffa249,0xffff9e45
	.long	0xffff9e41,0xffff9a3c,0xffff9a38,0xffff9634,0xffff9630,0xffff922c,0xffff9228,0xffff8e24
	.long	0xffff8e20,0xffff8a1c,0xffff8a18,0xffff8614,0xffff8610,0xffff820c,0xffff8208,0xffff7d04
	.long	0xffff7900,0xffff7900,0xffff7500,0xffff7100,0xffff6d00,0xffff6900,0xffff6500,0xffff6100
	.long	0xffff5d00,0xffff5900,0xffff5500,0xffff5100,0xffff4d00,0xffff4900,0xffff4500,0xffff4100
	.long	0xffff3c00,0xffff3c00,0xffff3800,0xffff3400,0xffff3000,0xffff2c00,0xffff2800,0xffff2400
	.long	0xffff2000,0xffff1c00,0xffff1800,0xffff1400,0xffff1000,0xffff0c00,0xffff0800,0xffff0400
	.long	0xffff0000,0xffff0000,0xfffb0000,0xfff70000,0xfff70000,0xfff30000,0xffef0000,0xffeb0000
	.long	0xffeb0000,0xffe70000,0xffe30000,0xffe30000,0xffdf0000,0xffdb0000,0xffd70000,0xffd70000
	.long	0xffd30000,0xffcf0000,0xffcf0000,0xffcb0000,0xffc70000,0xffc30000,0xffc30000,0xffbe0000
	.long	0xffba0000,0xffba0000,0xffb60000,0xffb20000,0xffae0000,0xffae0000,0xffaa0000,0xffa60000
	.long	0xffa20000,0xffa20000,0xff9e0404,0xff9a0404,0xff960808,0xff920808,0xff8e0c0c,0xff8e0c0c
	.long	0xff8a1010,0xff861010,0xff821414,0xff7d1414,0xff791818,0xff791818,0xff751c1c,0xff711c1c
	.long	0xff6d2020,0xff692020,0xff652424,0xff652424,0xff612828,0xff5d2828,0xff592c2c,0xff552c2c
	.long	0xff513030,0xff513030,0xff4d3434,0xff493434,0xff453838,0xff413838,0xff3c3c3c,0xff3c3c3c
palette_1:
	.long	0xff000000,0xff000000,0xff000004,0xff00000c,0xff000010,0xff000018,0xff000020,0xff000024
	.long	0xff00002c,0xff000030,0xff000038,0xff000041,0xff000045,0xff00004d,0xff000051,0xff000059
	.long	0xff000061,0xff000065,0xff00006d,0xff000075,0xff000079,0xff000082,0xff000086,0xff00008e
	.long	0xff000096,0xff00009a,0xff0000a2,0xff0000a6,0xff0000ae,0xff0000b6,0xff0000ba,0xff0000c3
	.long	0xff0000cb,0xff0004cb,0xff000ccb,0xff0010cf,0xff0018cf,0xff001cd3,0xff0024d3,0xff0028d3
	.long	0xff0030d7,0xff0038d7,0xff003cdb,0xff0045db,0xff0049db,0xff0051df,0xff0055df,0xff005de3
	.long	0xff0065e3,0xff0069e3,0xff0071e7,0xff0075e7,0xff007deb,0xff0082eb,0xff008aeb,0xff008eef
	.long	0xff0096ef,0xff009ef3,0xff00a2f3,0xff00aaf3,0xff00aef7,0xff00b6f7,0xff00bafb,0xff00c3fb
	.long	0xff00cbff,0xff04cbff,0xff0ccbff,0xff14cfff,0xff1ccfff,0xff24d3ff,0xff2cd3ff,0xff34d3ff
	.long	0xff3cd7ff,0xff45d7ff,0xff4ddbff,0xff55dbff,0xff5ddbff,0xff65dfff,0xff6ddfff,0xff75e3ff
	.long	0xff7de3ff,0xff86e3ff,0xff8ee7ff,0xff96e7ff,0xff9eebff,0xffa6ebff,0xffaeebff,0xffb6efff
	.long	0xffbeefff,0xffc7f3ff,0xffcff3ff,0xffd7f3ff,0xffdff7ff,0xffe7f7ff,0xffeffbff,0xfff7fbff
	.long	0xffffffff,0xfffbffff,0xfff7ffff,0xfff3ffff,0xffebffff,0xffe7ffff,0xffe3ffff,0xffdbffff
	.long	0xffd7ffff,0xffd3ffff,0xffcbffff,0xffc7ffff,0xffc3ffff,0xffbaffff,0xffb6ffff,0xffb2ffff
	.long	0xffaaffff,0xffa6ffff,0xffa2ffff,0xff9effff,0xff96ffff,0xff92ffff,0xff8effff,0xff86ffff
	.long	0xff82ffff,0xff7dffff,0xff75ffff,0xff71ffff,0xff6dffff,0xff65ffff,0xff61ffff,0xff5dffff
	.long	0xff55ffff,0xff51ffff,0xff4dffff,0xff49ffff,0xff41ffff,0xff3cffff,0xff38ffff,0xff30ffff
	.long	0xff2cffff,0xff28ffff,0xff20ffff,0xff1cffff,0xff18ffff,0xff10ffff,0xff0cffff,0xff08ffff
	.long	0xff00ffff,0xff00fbff,0xff00f7ff,0xff00f3ff,0xff00ebff,0xff00e7ff,0xff00e3ff,0xff00dbff
	.long	0xff00d7ff,0xff00d3ff,0xff00cbff,0xff00c7ff,0xff00c3ff,0xff00baff,0xff00b6ff,0xff00b2ff
	.long	0xff00aaff,0xff00a6ff,0xff00a2ff,0xff009eff,0xff0096ff,0xff0092ff,0xff008eff,0xff0086ff
	.long	0xff0082ff,0xff007dff,0xff0075ff,0xff0071ff,0xff006dff,0xff0065ff,0xff0061ff,0xff005dff
	.long	0xff0055ff,0xff0051ff,0xff004dff,0xff0049ff,0xff0041ff,0xff003cff,0xff0038ff,0xff0030ff
	.long	0xff002cff,0xff0028ff,0xff0020ff,0xff001cff,0xff0018ff,0xff0010ff,0xff000cff,0xff0008ff
	.long	0xff0000ff,0xff0000fb,0xff0000f7,0xff0000f3,0xff0000ef,0xff0000eb,0xff0000e7,0xff0000e3
	.long	0xff0000df,0xff0000db,0xff0000d7,0xff0000d3,0xff0000cf,0xff0000cb,0xff0000c7,0xff0000c3
	.long	0xff0000be,0xff0000ba,0xff0000b6,0xff0000b2,0xff0000ae,0xff0000aa,0xff0000a6,0xff0000a2
	.long	0xff00009e,0xff00009a,0xff000096,0xff000092,0xff00008e,0xff00008a,0xff000086,0xff000082
	.long	0xff00007d,0xff000079,0xff000075,0xff000071,0xff00006d,0xff000069,0xff000065,0xff000061
	.long	0xff00005d,0xff000059,0xff000055,0xff000051,0xff00004d,0xff000049,0xff000045,0xff000041
	.long	0xff00003c,0xff000038,0xff000034,0xff000030,0xff00002c,0xff000028,0xff000024,0xff000020
	.long	0xff00001c,0xff000018,0xff000014,0xff000010,0xff00000c,0xff000008,0xff000000,0xff000000
palette_2:
	.long	0xff000000,0xff9208e7,0xff9208e3,0xff9608e3,0xff9a04df,0xff9e04df,0xff9e04db,0xffa204db
	.long	0xffa600d7,0xffaa00d7,0xffaa00d3,0xffae00cf,0xffb200cf,0xffb600cb,0xffb600c7,0xffba00c7
	.long	0xffbe00c3,0xffbe00be,0xffc300be,0xffc700ba,0xffc700b6,0xffcb00b6,0xffcf00b2,0xffcf00ae
	.long	0xffd300aa,0xffd700aa,0xffd700a6,0xffdb04a2,0xffdb049e,0xffdf049e,0xffdf049a,0xffe30896
	.long	0xffe30892,0xffe70892,0xffe7088e,0xffeb0c8a,0xffeb0c86,0xffef0c82,0xffef1082,0xffef107d
	.long	0xfff31479,0xfff31475,0xfff31475,0xfff71871,0xfff7186d,0xfff71c69,0xfffb1c65,0xfffb2065
	.long	0xfffb2061,0xfffb245d,0xffff2859,0xffff2859,0xffff2c55,0xffff2c51,0xffff304d,0xffff344d
	.long	0xffff3449,0xffff3845,0xffff3c45,0xffff3c41,0xffff413c,0xffff453c,0xffff4538,0xffff4934
	.long	0xffff4d34,0xffff4d30,0xffff512c,0xffff552c,0xffff5928,0xffff5928,0xfffb5d24,0xfffb6120
	.long	0xfffb6520,0xfffb651c,0xfff7691c,0xfff76d18,0xfff77118,0xfff37514,0xfff37514,0xfff37914
	.long	0xffef7d10,0xffef8210,0xffef820c,0xffeb860c,0xffeb8a0c,0xffe78e08,0xffe79208,0xffe39208
	.long	0xffe39608,0xffdf9a04,0xffdf9e04,0xffdb9e04,0xffdba204,0xffd7a600,0xffd7aa00,0xffd3aa00
	.long	0xffcfae00,0xffcfb200,0xffcbb600,0xffc7b600,0xffc7ba00,0xffc3be00,0xffbebe00,0xffbec300
	.long	0xffbac700,0xffb6c700,0xffb6cb00,0xffb2cf00,0xffaecf00,0xffaad300,0xffaad700,0xffa6d700
	.long	0xffa2db04,0xff9edb04,0xff9edf04,0xff9adf04,0xff96e308,0xff92e308,0xff92e708,0xff8ee708
	.long	0xff8aeb0c,0xff86eb0c,0xff82ef0c,0xff82ef10,0xff7def10,0xff79f314,0xff75f314,0xff75f314
	.long	0xff71f718,0xff6df718,0xff69f71c,0xff65fb1c,0xff65fb20,0xff61fb20,0xff5dfb24,0xff59ff28
	.long	0xff59ff28,0xff55ff2c,0xff51ff2c,0xff4dff30,0xff4dff34,0xff49ff34,0xff45ff38,0xff45ff3c
	.long	0xff41ff3c,0xff3cff41,0xff3cff45,0xff38ff45,0xff34ff49,0xff34ff4d,0xff30ff4d,0xff2cff51
	.long	0xff2cff55,0xff28ff59,0xff28ff59,0xff24fb5d,0xff20fb61,0xff20fb65,0xff1cfb65,0xff1cf769
	.long	0xff18f76d,0xff18f771,0xff14f375,0xff14f375,0xff14f379,0xff10ef7d,0xff10ef82,0xff0cef82
	.long	0xff0ceb86,0xff0ceb8a,0xff08e78e,0xff08e792,0xff08e392,0xff08e396,0xff04df9a,0xff04df9e
	.long	0xff04db9e,0xff04dba2,0xff00d7a6,0xff00d7aa,0xff00d3aa,0xff00cfae,0xff00cfb2,0xff00cbb6
	.long	0xff00c7b6,0xff00c7ba,0xff00c3be,0xff00bebe,0xff00bec3,0xff00bac7,0xff00b6c7,0xff00b6cb
	.long	0xff00b2cf,0xff00aecf,0xff00aad3,0xff00aad7,0xff00a6d7,0xff04a2db,0xff049edb,0xff049edf
	.long	0xff049adf,0xff0896e3,0xff0892e3,0xff0892e7,0xff088ee7,0xff0c8aeb,0xff0c86eb,0xff0c82ef
	.long	0xff1082ef,0xff107def,0xff1479f3,0xff1475f3,0xff1475f3,0xff1871f7,0xff186df7,0xff1c69f7
	.long	0xff1c65fb,0xff2065fb,0xff2061fb,0xff245dfb,0xff2859ff,0xff2859ff,0xff2c55ff,0xff2c51ff
	.long	0xff304dff,0xff344dff,0xff3449ff,0xff3845ff,0xff3c45ff,0xff3c41ff,0xff413cff,0xff453cff
	.long	0xff4538ff,0xff4934ff,0xff4d34ff,0xff4d30ff,0xff512cff,0xff552cff,0xff5928ff,0xff5928ff
	.long	0xff5d24fb,0xff6120fb,0xff6520fb,0xff651cfb,0xff691cf7,0xff6d18f7,0xff7118f7,0xff7514f3
	.long	0xff7514f3,0xff7914f3,0xff7d10ef,0xff8210ef,0xff820cef,0xff860ceb,0xff8a0ceb,0xff8e08e7

	; Video initialization parameters.  Most of these I don't
	;  understand; what documentation I have has been saved here as
	;  comments.  The comment "magic" means "meaning unknown".
	;
	; These lists are taken pretty much directly from tatest, which
	;  says of them "These values mainly from Dans 3dtest
	;  program...".
	;
	; Since these are longword stores, the offsets must always be
	;  multiples of 4; the terminator is any value which isn't.
	;  (We use 1, but set_params accepts anything whose low two
	;  bits are not both zero.)
	;
	; param: one table entry, a 16-bit register offset (added to
	;  VIDREG_BASE by set_params) followed by the 32-bit value to
	;  store there.  Entries are 6 bytes, so the value is only
	;  word-aligned; set_params accordingly fetches it as two
	;  16-bit halves.
	.macro	param	offset, value
	.word	$(offset)
	.long	$(value)
	.endm
	; endparam: table terminator, an "offset" that is not a
	;  multiple of 4.
	.macro	endparam
	.word	1
	.endm
	.align	2
three_d_params:
	param	0x80a8, 0x15d1c951	; magic
	param	0x80a0, 0x00000020	; magic
	param	0x8008, 0x00000000	; TA out of reset
	param	0x8048, 0x00000009	; "alpha config" - ?
	param	0x8068, [X_SIZE<<16]|0	; pixel clipping x
	param	0x806c, [Y_SIZE<<16]|0	; pixel clipping y
	param	0x8110, 0x00093f39	; magic
	param	0x8098, 0x00800408	; magic
	param	0x804c, [X_SIZE*2]/8	; "display align" - ?
	param	0x8078, 0f1.0
	param	0x8084, 0x00000000	; magic
	param	0x8030, 0x00000101	; magic
	param	0x80b0, 0x007f7f7f	; fog table colour
	param	0x80b4, 0x007f7f7f	; fog vertex colour
	param	0x80c0, 0x00000000	; colour clamp min
	param	0x80bc, 0xffffffff	; colour clamp max
	param	0x8080, 0x00000007	; magic
	param	0x8074, 0x00000001	; "cheap shadow" - ?
	param	0x807c, 0x0027df77	; magic
	param	0x8008, 0x00000001	; TA into reset
	param	0x8008, 0x00000000	; TA out of reset
	param	0x80e4, 0x00000000	; "stride width" - ?
	param	0x6884, 0x00000000	; disable all interrupt enables
	param	0x6930, 0x00000000
	param	0x6938, 0x00000000
	param	0x6900, 0xffffffff	; reset all pending interrupts
	param	0x6908, 0xffffffff
	param	0x6930, 0x002807ec	; re-enable some events (which?)
	param	0x6938, 0x0000000e
	param	0x80b8, 0x0000ff07	; fog density (meanings?)
	param	0x80b4, 0x007f7f7f	; fog vertex colour
	param	0x80b0, 0x007f7f7f	; fog table colour
	param	0x8108, 0x00000003	; 32bit palette (?)
	endparam
screen_params:
	param	0x80e8, 0x00160000	; screen control (?)
	param	0x8044, 0x00800000	; pixel mode ("vb+0x11" - ?)
	param	0x805c, 0x00000000	; size modulo and display lines ("vb+0x17" - ?)
	param	0x80d0, 0x00000100	; interlace flags (bit meanings?)
	param	0x80d8, 0x020c0359	; magic
	param	0x80cc, 0x001501fe	; magic
	param	0x80d4, 0x007e0345	; horizontal border (meaning? - see below)
	param	0x80dc, 0x00240204	; vertical position (meaning?)
	param	0x80e0, 0x07d6c63f	; sync control (meaning?)
	param	0x80ec, 0x000000a4	; horizontal position (meaning?)
	param	0x80f0, 0x00120012	; vertical border (meanings?)
	param	0x80c8, 0x03450000	; "set to same as border H in 80d4" - ?
	param	0x8068, [X_SIZE-1]<<16	; (X resolution - 1) << 16
	param	0x806c, [Y_SIZE-1]<<16	; (Y resolution - 1) << 16
	param	0x804c, 0x000000a0	; "display align" - ?
	param	0x8118, 0x00008040	; magic
	param	0x80f4, 0x00000401	; "anti-aliasing" - ?
	param	0x8048, 0x00000009	; "alpha config" - ?
	param	0x7814, 0x00000000	; "more interrupt control stuff" - ?
	param	0x7834, 0x00000000
	param	0x7854, 0x00000000
	param	0x7874, 0x00000000
	param	0x78bc, 0x4659404f
	param	0x8040, 0x00000000	; border colour
	endparam
	; "???" here means "not documented in tatest at all"
	; The "2" in these is the offset from the beginning of the
	;  param to the place where we store the (longword) value.
	;
	; The three cmdlist_param_* symbols are therefore byte offsets
	;  from cmdlist_params to the value field of the param that
	;  follows each of them.  Those values are assembled as 0 here;
	;  presumably they are filled in at run time with the current
	;  tile buffer and command list addresses before this table is
	;  handed to set_params - confirm in the code that uses them.
	;  Note the value fields are only 2-byte aligned.
cmdlist_params:
	param	0x8008, 0x00000001	; TA into reset
	param	0x8008, 0x00000000	; TA out of reset
cmdlist_param_tilebuf_a = 2 + . - cmdlist_params
	param	0x8124, 0
	param	0x812c, 0		; ???
cmdlist_param_cmdlist = 2 + . - cmdlist_params
	param	0x8128, 0
	param	0x8130, 0		; ???
	; (tile rows - 1) in the high half, (tile columns - 1) in the
	;  low half, with 32-pixel tiles.
	param	0x813c, [[[Y_SIZE/32]-1]<<16] | [[X_SIZE/32]-1]
cmdlist_param_tilebuf_b = 2 + . - cmdlist_params
	param	0x8164, 0
	param	0x8140, 0x00100002	; ???
	param	0x8144, 0x80000000	; confirm settings
	endparam

	; Texture twiddling table.  Why "twiddle"?  That's the term
	;  used in tatest's comments.  It appears to be interleaving
	;  the bits of the numbers that form texture coordinates, so
	;  that the texels conceptually at (x,y) and (x+1,y), where
	;  x=ABCDEFG0 and y=abcdefgh (say), are stored at offsets
	;  aAbBcCdDeEfFgGh0 (x) and aAbBcCdDeEfFgGh1 (x+1).
	;
	; Why do it?  Because, in the words of another tatest comment,
	;  "palette based textures can not be non-twiddled".  Why
	;  design hardware that way?  MC, in email, passed along an
	;  explanation from someone who worked on the hardware, saying
	;  that twiddled textures provide higher performance, so the
	;  designers figured the only reason to use non-twiddled
	;  textures was to use a rendered frame as a texture (for, eg,
	;  reflections).  Since the renderer output is always
	;  true-colour, that's all they implemented.  (The
	;  "non-twiddled" bit got reused for a different meaning for
	;  palette-based textures.)
	;
	; tatest generates a 1024-entry table.  We reserve (and set up)
	;  that much space, but as of this writing use only 256 entries
	;  of it.
	;
	; One possible note to beware of is that this may not apply to
	;  the large dimension of non-square textures.  Done naively,
	;  doing this for non-square textures could use excessive
	;  amounts of memory; it would appear, for example, that an
	;  8x256 texture would take up almost as much memory space as a
	;  128x256 one (because of all the gaps between the address
	;  bits).  But it may be smarter than that; when I mentioned
	;  that in mail to MC, he said he had a fuzzy memory that the
	;  high bits of non-square textures aren't twiddled, that, eg,
	;  an 8x256 texture in memory consists of 32 consecutive 8x8
	;  (twiddled) blocks.  But he also warned that memory could be
	;  wrong, so test this before depending on it.
	;
	.align	2
twiddles:
	.space	1024*2

	; Current double-buffering buffer number.  Always 0 or 1.
curbuf:
	.space	1
	; When set, this causes printing of debugging info, but for
	;  only one cycle; it's cleared when the info is printed.
debug:
	.byte	0
	; Character bitmaps.  These are used to set up texture_font_0
	;  and texture_font_1.  These are non-twiddled versions; they
	;  also contain 0 and 1, one bit per texel, whereas the
	;  versions in video RAM contain 0 and 96, one byte per texel.
	;  (0 and 96 because those are where the palettes keep the
	;  colours we use for font characters.)
	; fontrow: pack one 8-texel row into a byte.  The first
	;  argument lands in bit 0 and the last in bit 7, so the
	;  leftmost texel as written here is the least significant bit.
	.macro	fontrow	a,b,c,d,e,f,g,h
	.byte	[$(a)<<0] | [$(b)<<1] | [$(c)<<2] | [$(d)<<3] | [$(e)<<4] | [$(f)<<5] | [$(g)<<6] | [$(h)<<7]
	.endm
	; Glyph for "0": eight rows, one byte each.
init_font_0:
	fontrow	0,0,1,1,1,0,0,0
	fontrow	0,1,0,0,0,1,0,0
	fontrow	0,1,0,0,0,1,0,0
	fontrow	0,1,0,0,0,1,0,0
	fontrow	0,1,0,0,0,1,0,0
	fontrow	0,1,0,0,0,1,0,0
	fontrow	0,1,0,0,0,1,0,0
	fontrow	0,0,1,1,1,0,0,0
	; Glyph for "1": eight rows, one byte each.
init_font_1:
	fontrow	0,0,0,1,0,0,0,0
	fontrow	0,0,1,1,0,0,0,0
	fontrow	0,0,0,1,0,0,0,0
	fontrow	0,0,0,1,0,0,0,0
	fontrow	0,0,0,1,0,0,0,0
	fontrow	0,0,0,1,0,0,0,0
	fontrow	0,0,0,1,0,0,0,0
	fontrow	0,0,1,1,1,0,0,0
	; One byte of state, initially 0.  NOTE(review): its users are
	;  outside the part of the file examined here; the name
	;  suggests a cursor over the on-screen bit display - confirm.
texture_bit_cursor:
	.byte	0

	.align	2
main:
; Entry point proper (the stub at 8c020000 jumps here).  Saves r10-r14,
;  sets up SR, GBR, FPSCR and VBR, runs the one-time initialization
;  routines, then calls one_frame repeatedly until nbgetchar returns a
;  non-negative value: 'd' sets the one-shot debug flag and the loop
;  continues; anything else falls into done, which restores state and
;  returns to cdcode via the address that was passed in r11.
;
; The only things startup.s sets up that cdcode hasn't already done for
;  us are (1) fpscr and (2) clearing bss.  We don't have bss because we
;  aren't linked by a conventional linker.  FPSCR needs setup too.  So
;  does the VBR.  Make sure FD, RB, and BL are clear in the SR.  We
;  don't need to copy r10-r15, even for the sake of returning to
;  cdcode, because only r0-r7 are banked.  We save r10-r14 on the stack
;  so that we can use them; they matter only on return to cdcode, which
;  happens only controlledly.
	; Pushed in the order r14..r10; done pops them in the reverse
	;  order.
	mov.l	r14,@-r15
	mov.l	r13,@-r15
	mov.l	r12,@-r15
	mov.l	r11,@-r15
	mov.l	r10,@-r15
	; GBR = r14, which cdcode has set to the SCIF's base address.
	ldc	r14,gbr
	stc	sr,r1
	SETS.L	#~[SR_FD|SR_RB|SR_BL],r2
	and	r2,r1
	ldc	r1,sr
	; Note that r0-r7 may have just changed if we switched banks.
	mov	#0,r1
	lds	r1,fpscr
	; FPSCR is now 0.  (.sz 0 / .pr 0 presumably record that for
	;  the assembler - confirm against its documentation.)
	.sz	0
	.pr	0
	SETS.L	#intvec,r0
	ldc	r0,vbr
; Real code begins here.
	bsr	clear_vram
	 nop
	bsr	init_maple
	 nop
	bsr	init_powervr
	 nop
	bsr	init_video
	 nop
	bsr	init_palette
	 nop
	bsr	init_twiddling
	 nop
	bsr	init_textures
	 nop
	bsr	init_tiledesc
	 nop
	bsr	init_3dvalues
	 nop
	; Main loop: one frame, then poll for a character.
1:	bsr	one_frame
	 nop
	bsr	nbgetchar
	 nop
	; Negative r0 (presumably "no character available") means keep
	;  rendering.
	cmp/pz	r0
	bf	1b
	cmp/eq	#'d,r0
	bt	setdebug
	; Any other character: fall through and leave.
done:
	; Send CR, LF; the character goes in r1, loaded in the delay
	;  slot of each call.
	bsr	putchar
	 mov	#13,r1
	bsr	putchar
	 mov	#10,r1
	; Turn SR.BL (back) on before returning to cdcode.
	stc	sr,r1
	SETS.L	#SR_BL,r2
	or	r2,r1
	ldc	r1,sr
	mov.l	@r15+,r10
	mov.l	@r15+,r11
	mov.l	@r15+,r12
	mov.l	@r15+,r13
	; We were entered with jmp, with the return address in r11
	;  rather than pr, so move it to pr to return with rts.  r14
	;  is restored in the delay slot.
	lds	r11,pr
	rts
	 mov.l	@r15+,r14
setdebug:
	; debug = 1, then back to the top of the main loop (the 1:
	;  label at the one_frame call).  The store is in the delay
	;  slot of the branch.
	SETS.L	#debug,r1
	SETS.L	#1,r0
	bra	1b
	 mov.b	r0,@r1

clear_vram:
	; Zero all VRAM_SIZE bytes of video RAM through the SH-4 store
	;  queues.  Takes no arguments; uses r0-r5 and returns nothing.
	;
	; Point both store queues at the external-memory area that holds
	;  VRAM_BASE_64 (area bits go in QACRn bits 4:2).
	SETS.L	#QACR0,r1
	SETS.L	#QACR1,r2
	SETS.L	#[[VRAM_BASE_64>>26]&7]<<2,r3
	SETS.L	#STOREQ_BASE+[4*16],r4
	SETS.L	#0,r5
	mov.l	r3,@r1
	mov.l	r3,@r2
	; Fill both 32-byte queues (16 longs total) with zeros, walking r4
	;  down from STOREQ_BASE+64 to STOREQ_BASE.
	SETS.L	#16,r0
1:	dt	r0
	bf/s	1b
	 mov.l	r5,@-r4
	; Each pref flushes one 32-byte queue to memory; stepping the
	;  address by 32 alternates between the two queues.
	SETS.L	#VRAM_SIZE/32,r1
	SETS.L	#[VRAM_BASE_64&0x03ffffc0]|0xe0000000,r2
1:	pref	@r2
	dt	r1
	bf/s	1b
	 add	#32,r2
	; Wait for both queues to drain by storing to each of them: r4 is
	;  STOREQ_BASE here (SQ0) and SQ1 is 32 bytes (8 longs) above it.
	;  Address bit 5 selects the queue, so the previous +64 offset
	;  aliased SQ0 a second time and never touched SQ1.
	mov.l	r5,@r4
	add	#4*8,r4
	rts
	 mov.l	r5,@r4

set_params:
	; Write a table of 32-bit values to registers relative to
	;  VIDREG_BASE.
	; In:  r1 = pointer to the table.  Each entry is three 16-bit
	;      words: register offset, low half of value, high half of
	;      value.  An offset word with either of its low two bits set
	;      terminates the table.
	; Uses r0, r2, r3, r4; r1 is advanced past the terminator.
	; With debug_set_params set, each write is also printed as
	;  "*address=value" followed by CR LF.
.if debug_set_params
		sts.l	pr,@-r15
.endif
	; r1 points to params table
	SETS.L	#VIDREG_BASE,r2
1:	mov.w	@r1+,r0
	tst	#3,r0
	bf/s	1f
	 extu.w	r0,r0
	; mov.w sign-extends, so the offset (above, in the delay slot) and
	;  the low half (below) are zero-extended before being combined.
	mov.w	@r1+,r3
	mov.w	@r1+,r4
	SHLL	#16,r4
	extu.w	r3,r3
	or	r3,r4
	add	r2,r0
.if debug_set_params
		mov.l	r0,@-r15
		mov.l	r1,@-r15
		sts.l	pr,@-r15
		bsr	putchar
		 mov	#'*,r1
		bsr	printhex8
		 mov.l	@(8,r15),r1
		bsr	putchar
		 mov	#'=,r1
		bsr	printhex8
		 mov	r4,r1
		bsr	putchar
		 mov	#13,r1
		bsr	putchar
		 mov	#10,r1
		lds.l	@r15+,pr
		mov.l	@r15+,r1
		mov.l	@r15+,r0
.endif
	; The register store itself happens in the branch delay slot.
	bra	1b
	 mov.l	r4,@r0
1:
.if debug_set_params
		lds.l	@r15+,pr
.endif
	rts
	 nop

init_maple:
	; Initialize the maple bus by walking the (address, value) table
	;  at 9: below, storing each value as a long to its address.  An
	;  address of 0 ends the table.  Uses r0, r1, r2.
	mova	9f,r0
1:	mov.l	@r0+,r1
	tst	r1,r1
	bt	1f
	mov.l	@r0+,r2
	bra	1b
	 mov.l	r2,@r1
1:	rts
	 nop
	; mova needs a longword-aligned target.
	.align	4
9:	.long	BUS_RESET,	BUS_RESET_VALUE
	.long	BUS_RESET2,	BUS_RESET2_VALUE
	.long	BUS_SPEED,	SPEED_2MBPS|[50000<<SPEED_TIMEOUT_SHIFT]
	.long	BUS_ENABLE,	BUS_ENABLE_VALUE
	.long	0

init_powervr:
	; Program the PowerVR: write the three_d_params table, wait for
	;  the masked field of 0xa05f810c to become nonzero and then zero
	;  again, then write the screen_params table.
	; Uses r0-r4 (r3/r4 via set_params); pr is saved and restored.
	;
	; The previous version also loaded 65536 into r3 and r4 here.
	;  Nothing read those values before set_params overwrote them, so
	;  the dead loads have been dropped.  NOTE(review): they may have
	;  been the start of a timeout for the two polling loops below,
	;  which can still spin forever if the register never changes.
	sts.l	pr,@-r15
	SETS.L	#three_d_params,r1
	bsr	set_params
	 nop
	SETS.L	#0xa05f810c,r1	; what does this point to?
	SETS.L	#0x000007ff,r2	; what does this mask mean?
	; Spin while the masked field is zero...
1:	mov.l	@r1,r0
	tst	r2,r0
	bt	1b
	; ...then spin while it is nonzero.
1:	mov.l	@r1,r0
	tst	r2,r0
	bf	1b
	SETS.L	#screen_params,r1
	bsr	set_params
	 nop
	lds.l	@r15+,pr
	rts
	 nop

init_video:
	; Detect the cable type and program the video output registers
	;  for a 640-wide, 16bpp (5/6/5) display: 480 lines progressive on
	;  VGA, 240 lines per field interlaced otherwise.
	; Uses r0, r2-r9.  r9 holds the cable type throughout.
	;
	; Fix relative to the previous version: the final "Select
	;  RGB/CVBS" store went through r2, which still pointed at
	;  0xa05f80cc, so it clobbered the raster event position just
	;  written and never touched 0xa0702c00.  It now stores through r3.
	;  The value is also shifted into bits 8-9, where the reference
	;  code this is derived from puts it.  NOTE(review): confirm the
	;  shift against that reference; it is not derivable from this
	;  file alone.
	sts.l	pr,@-r15
	; Get cable type from port A bits 8 and 9
	; 0=VGA, 1=???, 2=RGB, 3=composite
	SETS.L	#PCTRA,r8
	SETS.L	#~0x000f0000,r2	; control bits for pins 8 and 9
	SETS.L	#0x000a0000,r3	; configure as inputs, pullups enabled
	mov.l	@r8,r0
	and	r2,r0
	or	r3,r0
	mov.l	r0,@r8
	; Point r8 at PDTRA, with a short add when the distance fits in a
	;  signed byte.
	.if	@IS_SB[PDTRA-PCTRA]
	add	#PDTRA-PCTRA,r8
	.else
	SETS.L	#PDTRA,r8
	.endif
	mov.w	@r8,r0
	SHXR	#8,r0
	and	#3,r0
	mov	r0,r9		; r9 = cable type (0..3)
	; From here r2 walks the video register block; each add steps it
	;  to the register named in the comment on the following store.
	SETS.L	#VIDREG_BASE+0x8000,r8
	mov	r8,r2
	add	#8,r2
	SETS.L	#0,r6
	mov.l	r6,@r2		; 0xa05f8008, "TA out of reset"
	add	#0x40-8,r2
	mov.l	r6,@r2		; 0xa05f8040, border colour
	mov	#0x5,r3		; 5/6/5 2bpp, no scan doubling, display enabled
	SETS.L	#240,r7		; r7 = displayed lines (per field)
	mov	r9,r0
	tst	#2,r0
	bf	1f
	; VGA (cable bit 1 clear): 480 lines and the clock doubler.
	SHLL	#1,r7
	swap.w	r3,r0		; |= 0x00800000, clock doubler
	or	#0x80,r0
	swap.w	r0,r3
1:	add	#0x44-0x40,r2
	mov.l	r3,@r2		; 0xa05f8044, display mode
	add	#0x50-0x44,r2
	mov.l	r6,@r2		; 0xa05f8050, vram base offset 1
	SETS.L	#SHORT_FRAME_OFFSET,r3 ; pixels * bytes-per-pixel
	add	#0x54-0x50,r2
	mov.l	r3,@r2		; 0xa05f8054, vram base offset 2
	SETS.L	#1<<8,r3	; VO, negative H and V sync
	SETS.L	#[X_SIZE/2],r4	; longs of (16bpp) pixel data per scanline
	SETS.L	#1,r5		; r5 = modulo
	mov	r9,r0
	tst	#2,r0
	bt	1f
	; Non-VGA: modulo skips one extra scanline, and r3 gets the
	;  interlace bit.
	add	r4,r5
	SETS.L	#0x10,r0	; interlaced, NTSC colour
	or	r0,r3
	; r5 = (((modulo << 10) + lines - 1) << 10) + longs_per_line - 1
1:	SHLL	#10,r5
	add	r7,r5
	add	#-1,r5
	SHLL	#10,r5
	add	r4,r5
	add	#-1,r5
	add	#0x5c-0x54,r2
	mov.l	r5,@r2		; 0xa05f805c, display size and modulo
	add	#0xd0-0x5c,r2
	mov.l	r3,@r2		; 0xa05f80d0, video encapsulation
	SETS.L	#0x007e0345,r8	; doesn't make sense per doc
	add	#0xd4-0xd0,r2
	mov.l	r8,@r2		; 0xa05f80d4, H border range
	SETS.L	#[524<<16]|857,r8; NTSC/VGA
	add	#0xd8-0xd4,r2
	mov.l	r8,@r2		; 0xa05f80d8, full video size
	; vpos = 36 for VGA, 18 otherwise: 36 - ((cable&2) | ((cable&2)<<3))
	mov	r9,r0
	and	#2,r0
	mov	r0,r3
	SHLL	#3,r0
	or	r3,r0
	SETS.L	#36,r3
	sub	r0,r3
	; r3 = (vpos << 16) | vpos; r8 = (vpos << 16) | (vpos + lines)
	mov	r3,r0
	SHLL	#16,r0
	or	r0,r3
	mov	r3,r8
	add	r7,r8
	add	#0xdc-0xd8,r2
	mov.l	r8,@r2		; 0xa05f80dc, V border range
	SETS.L	#22<<16,r8	; N=magic, pixel duplication disabled
	add	#0xe8-0xdc,r2
	mov.l	r8,@r2		; 0xa05f80e8, additional video settings
	SETS.L	#0xa4,r8
	add	#0xec-0xe8,r2
	mov.l	r8,@r2		; 0xa05f80ec, H position
	add	#0xf0-0xec,r2
	mov.l	r3,@r2		; 0xa05f80f0, V position
	; Raster event line: 260 normally, 510 on VGA.
	SETS.L	#260,r4
	mov	r9,r0
	tst	#2,r0
	bf	1f
	SETS.L	#510,r4
1:	SETS.L	#0x21<<16,r3
	or	r3,r4
	add	#0xcc-0xf0,r2
	mov.l	r4,@r2		; 0xa05f80cc, raster event position
	; RGB/CVBS select: 3 if cable bit 0 is set, else 0, placed in
	;  bits 8-9 and written to 0xa0702c00 (through r3, not r2).
	mov	r9,r0
	tst	#1,r0
	bt/s	1f
	 mov	#0,r8
	mov	#3,r8
1:	SETS.L	#0xa0702c00,r3
	SHLL	#8,r8
	mov.l	r8,@r3		; 0xa0702c00, "Select RGB/CVBS" (??)
	lds.l	@r15+,pr
	rts
	 nop

	SETCONST

init_palette:
	; Copy three 256-entry palettes (palette_0, palette_1, palette_2)
	;  into three consecutive 1KB banks starting at 0xa05f9000.
	; Uses r0-r7.
	;
	; Sources: r2, r4, r6 (read with post-increment).
	SETS.L	#palette_0,r2
	SETS.L	#palette_1,r4
	SETS.L	#palette_2,r6
	; Destinations: r1 = bank 0, r3 = bank 1, r5 = bank 2.
	SETS.L	#0xa05f9000,r1
	SETS.L	#256*4,r7
	mov	r1,r3
	add	r7,r3
	mov	r3,r5
	add	r7,r5
	; r7 is reused as the entry counter.
	SETS.L	#256,r7
1:	mov.l	@r2+,r0
	mov.l	r0,@r1
	add	#4,r1
	mov.l	@r4+,r0
	mov.l	r0,@r3
	add	#4,r3
	mov.l	@r6+,r0
	mov.l	r0,@r5
	dt	r7
	bf/s	1b
	 add	#4,r5
	rts
	 nop

init_twiddling:
	; Build the 1024-entry twiddle table at twiddles.  Entry i is i
	;  with a zero bit inserted above each of its bits (bit k moves to
	;  bit 2k), computed by four mask/shift/merge steps.  The table is
	;  filled from the last entry down to entry 0.
	; Uses r0-r8.
	; NOTE(review): entries are stored as 16-bit words, so result bits
	;  16 and 18 (set for i >= 256) are dropped by the mov.w.  The
	;  users visible here only index with values below 256.
	SETS.L	#twiddles+[1024*2],r1
	SETS.L	#1024,r2
	SETS.L	#0x00300,r3
	SETS.L	#0x000f0,r4
	SETS.L	#0x00c0c,r5
	SETS.L	#0x22222,r6
1:	add	#-1,r2
	; Step 1: move bits 8-9 up by 8.
	mov	r2,r0
	and	r3,r0
	SHLL	#8,r0
	mov	r2,r7
	not	r3,r8
	and	r8,r7
	or	r0,r7
	; Step 2: move bits 4-7 up by 4.
	mov	r7,r0
	and	r4,r0
	SHLL	#4,r0
	not	r4,r8
	and	r8,r7
	or	r0,r7
	; Step 3: move the upper pair of each low nibble group up by 2.
	mov	r7,r0
	and	r5,r0
	SHLL	#2,r0
	not	r5,r8
	and	r8,r7
	or	r0,r7
	; Step 4: move the upper bit of each pair up by 1.
	mov	r7,r0
	and	r6,r0
	SHLL	#1,r0
	not	r6,r8
	and	r8,r7
	or	r0,r7
	; Loop until index 0 has been processed; the store is in the
	;  delay slot.
	tst	r2,r2
	bf/s	1b
	 mov.w	r7,@-r1
	rts
	 nop

; The C code this is based upon (again, from tatest)
;
;		  for(i=0; i<256; i++)
;		    for(j=0; j<256; j+=2) {
;		      /* Texture 0 = Mandelbrot */
;		      tex[0][twiddletab[i]|(twiddletab[j]>>1)] =
;			compute_texture(i, j, 0) | (compute_texture(i, j+1, 0)<<8);
;		      /* Texture 1 = Julia */
;		      tex[1][twiddletab[i]|(twiddletab[j]>>1)] =
;			compute_texture(i, j, 1) | (compute_texture(i, j+1, 1)<<8);
;		    }
;
; We change some names (eg, compute_texture_a and compute_texture_b
;  rather than a third arg to compute_texture), but it's otherwise
;  pretty similar.  We keep a lot of stuff on the stack rather than in
;  registers; while we might have enough registers, this means I don't
;  have to think about register allocation as much.  It also means the
;  texture computation functions have a much freer hand with registers.
;
; Arguably we should write these through 0x84000000 and then flush the
;  d$, but this is initialization code and hence uncached performance
;  is acceptable here.
;
init_textures:
	; Generate all textures.
	;  Part 1: fill texture_cubeface_1 (from compute_texture_a) and
	;   texture_cubeface_2 (from compute_texture_b), both 256x256 at
	;   8 bits per texel, stored twiddled.  Two horizontally adjacent
	;   texels are computed per step and written as one 16-bit word.
	;  Part 2: expand each 8x8 font bitmap listed in font_setup into
	;   its texture area (texel value 96 for a set bit, 0 for clear).
	; Loop state for part 1 lives on the stack; see the "stack ="
	;  comments for the layout at each point.
	sts.l	pr,@-r15
	; Cube-face textures.
	SETS.L	#twiddles,r7
	mov.l	r7,@-r15
	SETS.L	#texture_cubeface_1,r8
	SETS.L	#texture_cubeface_2,r6
	mov.l	r6,@-r15
	mov.l	r8,@-r15
	; y = 0
	mov	#0,r0
	mov.l	r0,@-r15
	; Top of the y loop: x = 0
2:	mov	#0,r0
	mov.l	r0,@-r15
	; stack = x y tex0 tex1 twiddles
1:	mov.l	@r15,r1			; x
	bsr	compute_texture_a
	 mov.l	@(4,r15),r2		; y
	mov.l	r0,@-r15		; valA(x,y)
	mov.l	@(4,r15),r1		; x
	mov.l	@(8,r15),r2		; y
	bsr	compute_texture_a
	 add	#1,r1			; r0=valA(x+1,y)
	mov.l	@r15+,r1		; valA(x,y)
	SHLL	#8,r0
	or	r1,r0			; combined vals
	mov.l	r0,@-r15
	; stack = valsA x y tex0 tex1 twiddles
	mov.l	@(4,r15),r1		; x
	bsr	compute_texture_b
	 mov.l	@(8,r15),r2		; y
	mov.l	r0,@-r15		; valB(x,y)
	mov.l	@(8,r15),r1		; x
	mov.l	@(12,r15),r2		; y
	bsr	compute_texture_b
	 add	#1,r1			; r0=valB(x+1,y)
	mov.l	@r15+,r1		; valB(x,y)
	SHLL	#8,r0
	or	r1,r0			; combined vals
	mov.l	r0,@-r15
	; stack = valsB valsA x y tex0 tex1 twiddles
	; Byte offset into the texture = twiddles[x] | (twiddles[y] << 1)
	mov.l	@(24,r15),r2		; twiddles
	mov.l	@(8,r15),r1		; x
	SHLL	#1,r1
	add	r2,r1
	mov.w	@r1,r1
	mov.l	@(12,r15),r3		; y
	SHLL	#1,r3
	add	r2,r3
	mov.w	@r3,r3
	SHLL	#1,r3
	or	r1,r3
	; r3 now holds twiddled texture offset
	mov.l	@(16,r15),r2		; tex0
	add	r3,r2
.if debug_texture
		bsr	printhex8
		 mov.l	@(8,r15),r1
		bsr	putchar
		 mov	#' ,r1
		bsr	printhex8
		 mov.l	@(12,r15),r1
		bsr	putchar2
		 mov	#' ,r1
		bsr	printhex8
		 mov	r2,r1
		bsr	putchar
		 mov	#' ,r1
		bsr	printhex8
		 mov.l	@(4,r15),r1
		bsr	putchar2
		 mov	#' ,r1
.endif
	mov.l	@(4,r15),r0		; valA
	mov.w	r0,@r2
	mov.l	@(20,r15),r2		; tex1
	add	r3,r2
.if debug_texture
		bsr	printhex8
		 mov	r2,r1
		bsr	putchar
		 mov	#' ,r1
		bsr	printhex8
		 mov.l	@r15,r1
		bsr	putchar
		 mov	#13,r1
		bsr	putchar
		 mov	#10,r1
.endif
	mov.l	@r15,r0			; valB
	mov.w	r0,@r2
	add	#8,r15			; pop valA, valB
	; x += 2; loop while x < 256.  The updated x is stored back in the
	;  delay slot whether or not the branch is taken.
	SETS.L	#256,r1
	mov.l	@r15,r0			; x
	add	#2,r0
	cmp/hs	r1,r0
	bf/s	1b
	 mov.l	r0,@r15
	add	#4,r15			; pop x
	; y += 1; loop while y < 256 (r1 still holds 256).
	mov.l	@r15,r0			; y
	add	#1,r0
	cmp/hs	r1,r0
	bf/s	2b
	 mov.l	r0,@r15
	add	#16,r15			; pop remaining
	; Font textures.
	; font_setup holds n_font_setup pairs of longs: pointer to the
	;  8-byte bitmap, then pointer to the destination texture area.
	SETS.L	#font_setup,r9
	SETS.L	#n_font_setup,r8
	SETS.L	#twiddles,r7
5:	mov.l	@r9+,r1			; init data
	mov.l	@r9+,r2			; texture area
	; x counts 8,6,4,2 and y counts 8..1; each pass handles texels
	;  (x-2,y-1) and (x-1,y-1).
	SETS.L	#8,r3			; x
4:	SETS.L	#8,r4			; y
	; r0 = bitmap row y-1, shifted so that bit x-2 lands in bit 0 and
	;  bit x-1 in bit 1 (shld with a negative count shifts right).
3:	mov	r1,r0
	add	#-1,r0
	mov.b	@(r0,r4),r0
	SHLL	#2,r0
	neg	r3,r5
	shld	r5,r0
	tst	#1,r0
	bt	1f
	bra	2f
	 mov	#96,r5
1:	mov	#0,r5
2:	tst	#2,r0
	bt	1f
	bra	2f
	 mov	#96,r0
1:	mov	#0,r0
2:	; r0 holds (x-1,y-1); r5 holds (x-2,y-1)
	SHLL	#8,r0
	or	r0,r5
	; r5 now holds combined (x-2,y-1) and (x-1,y-1) values
	mov	r3,r0
	add	#-2,r0
	SHLL	#1,r0
	mov.w	@(r0,r7),r6		; twiddled x-2
	mov	r4,r0
	add	#-1,r0
	SHLL	#1,r0
	mov.w	@(r0,r7),r0		; twiddled y-1
	SHLL	#1,r0
	or	r6,r0
	; Store the texel pair (in the delay slot) and step y down.
	dt	r4
	bf/s	3b
	 mov.w	r5,@(r0,r2)
	; x -= 2
	dt	r3
	dt	r3
	bf	4b
	; Next font_setup entry.
	dt	r8
	bf	5b
	lds.l	@r15+,pr
	rts
	 nop

; Texture A is diagonal stripes; texture B is concentric circles
;  centred on (0,80).

compute_texture_a:
	; Texel value for texture A (diagonal stripes).
	; In:  r1 = x, r2 = y
	; Out: r0 = (x + y) & 255
	; Clobbers r1 (left holding x + y).
	; return(255&(x+y))
	add	r2,r1
	rts
	 extu.b	r1,r0

compute_texture_b:
	; Texel value for texture B (concentric circles about (0,80)).
	; In:  r1 = x, r2 = y
	; Out: r0 = 255 & (int)sqrt(x*x + (y-80)*(y-80))
	; Clobbers r2 (left holding y-80), fr0, fr1, fpul.
	add	#-80,r2			; dy = y - 80
	lds	r1,fpul
	float	fpul,fr0		; fr0 = (float)x
	lds	r2,fpul
	float	fpul,fr1		; fr1 = (float)dy
	fmul	fr1,fr1			; dy*dy
	fmul	fr0,fr0			; x*x
	fadd	fr1,fr0
	fsqrt	fr0
	ftrc	fr0,fpul		; truncate to integer
	sts	fpul,r0
	rts
	 extu.b	r0,r0			; keep the low 8 bits

	SETCONST

init_tiledesc:
	; Build the two tile descriptor lists (one per frame buffer) and
	;  record the value setup_tiledesc returns for each in
	;  tiledesc_cookies[0] and [1].  Also resets curbuf to 0.
	; Arguments for the second call are parked on the stack while the
	;  first call runs, since setup_tiledesc may use r0-r9 freely.
	sts.l	pr,@-r15
	SETS.L	#tiledesc_cookies,r4
	SETS.L	#tilebuffers,r5
	SETS.L	#tiledescs,r6
	; stack (after these pushes) = tiledescs[1] tilebuffers[1] cookies
	mov.l	r4,@-r15
	mov.l	@(4,r5),r0
	mov.l	r0,@-r15
	mov.l	@(4,r6),r0
	mov.l	r0,@-r15
	; First call: r2 = tiledescs[0], r3 = tilebuffers[0]
	mov.l	@r6,r2
	bsr	setup_tiledesc
	 mov.l	@r5,r3
	mov.l	@(8,r15),r4
	mov.l	r0,@r4
	; Second call: pop r2 = tiledescs[1], r3 = tilebuffers[1]
	mov.l	@r15+,r2
	bsr	setup_tiledesc
	 mov.l	@r15+,r3
	mov.l	@r15+,r4
	mov.l	r0,@(4,r4)
	; curbuf = 0 (the byte store is in the rts delay slot)
	SETS.L	#curbuf,r1
	mov	#0,r0
	lds.l	@r15+,pr
	rts
	 mov.b	r0,@r1
setup_tiledesc:
	; in tatest terms, this is ta_create_tile_descriptors.  ptr is
	;  r2, buf is r3, w is X_SIZE/32, and h is Y_SIZE/32.  No
	;  registers r0-r9 are important upon return; they all are
	;  available to us.
	;
	; Layout written at ptr: 18 zero longs (72 bytes), one 6-long
	;  header record, then one 6-long record per tile.
	; Out: r0 = ptr + 72, the address of the header record.
	; vr = ptr
	mov	r2,r4		; vr is r4
	; bf = ((unsigned int)buf)&0x007fffff  (buf is dead after this)
	SETS.L	#0x007fffff,r0
	and	r0,r3		; bf is r3 from here on
	; strbase = (((unsigned int)ptr)&0x007fffff)|0x80000000
	; ptr is _not_ dead here, but 0x007fffff is.
	SETS.L	#0x80000000,r7
	and	r2,r0
	or	r0,r7		; strbase is r7
	; for (18 loops) *vr++ = 0
	mov	#18,r1
	mov	#0,r0
1:	mov.l	r0,@r4
	dt	r1
	bf/s	1b
	 add	#4,r4
	; *vr++ = 0x10000000
	; *vr++ = 0x80000000 (five times)
	SETS.L	#0x10000000,r1
	mov.l	r1,@r4
	SETS.L	#0x80000000,r1
	mov.l	r1,@(4,r4)
	mov.l	r1,@(8,r4)
	mov.l	r1,@(12,r4)
	mov.l	r1,@(16,r4)
	mov.l	r1,@(20,r4)
	add	#24,r4
	SETS.L	#X_SIZE/32,r8	; w is r8
	SETS.L	#Y_SIZE/32,r9	; h is r9
	; for (x=0;x<w;x++)
	mov	#0,r5		; x is r5
2:	; for (y=0;y<h;y++)
	mov	#0,r6		; y is r6
1:	; *vr++ = (y << 8) | (x << 2)
	mov	r6,r0
	SHLL	#8,r0
	mov	r5,r1
	SHLL	#2,r1
	or	r1,r0
	mov.l	r0,@r4
	; *vr++ = bf+((x+y*w)<<6)
	mul.l	r8,r6
	sts	macl,r0
	add	r5,r0
	; (the third SHLL operand is used as a scratch register elsewhere
	;  in this file; r1 is dead here either way)
	SHLL	#6,r0,r1
	add	r3,r0
	mov.l	r0,@(4,r4)
	; *vr++ = strbase (four times)
	mov.l	r7,@(8,r4)
	mov.l	r7,@(12,r4)
	mov.l	r7,@(16,r4)
	mov.l	r7,@(20,r4)
	; end of y loop
	; (vr advances by one 24-byte record in the delay slot)
	add	#1,r6
	cmp/hi	r6,r9
	bt/s	1b
	 add	#24,r4
	; end of x loop
	add	#1,r5
	cmp/hi	r5,r8
	bt/s	2b
	 nop
	; vr[-6] |= 0x80000000
	; (i.e. flag the first word of the last tile record written)
	add	#-4*6,r4
	SETS.L	#0x80000000,r1
	mov.l	@r4,r0
	or	r1,r0
	mov.l	r0,@r4
	; return ((char *)ptr)+72
	mov	r2,r0
	rts
	 add	#72,r0

	; tatest uses fschg, eight fmovs from drX to xdX, fschg to set
	;  the matrix.  I'm not convinced this is safe; is it
	;  impossible to have two single-floats that, when the bits are
	;  reinterpreted as a double-float, turn into a signaling NaN?
	;  Programmer's PDF page 128 seems to imply that it's supposed
	;  to work, and pages 271ff do not list any possible exceptions
	;  for fmov, so it probably just does the move regardless of
	;  NaNs.  However, frchg is a much faster way to do basically
	;  the same thing; tatest doesn't use it because libgcc does
	;  not get along with FPSCR.FR=1, something we don't care
	;  about.  (We do, however, depend on fmoving two singles as a
	;  double when moving to/from memory.  And it'd take more
	;  thought to be certain, but I *think* the double move is safe
	;  in all cases even if the hardware does treat them as numbers
	;  instead of uninterpreted bags of bits, possibly excepting
	;  the FPSCR.DN=1 case - and that's a setting we don't use.)
init_3dvalues:
	; Compute base_matrix = screenview * projection * translation and
	;  store it (16 floats) at base_matrix.
	;
	; Technique: the running product is kept in XMTRX (the back FP
	;  bank).  Each factor is loaded into fr0-fr15 of the front bank,
	;  four ftrv instructions replace it in place with XMTRX times
	;  it, and frchg then makes that product the new XMTRX.
	; Matrices are held with fr0-fr3 as the first 4-vector, fr4-fr7
	;  the second, and so on.
	; Uses r0, r1, fpul and all FP registers in both banks.  Executes
	;  an odd number of frchg, so FPSCR.FR is toggled on return.
	; clear_matrix()
	fldi1	fr0
	fldi0	fr1
	fldi0	fr2
	fldi0	fr3
	fldi0	fr4
	fldi1	fr5
	fldi0	fr6
	fldi0	fr7
	fldi0	fr8
	fldi0	fr9
	fldi1	fr10
	fldi0	fr11
	fldi0	fr12
	fldi0	fr13
	fldi0	fr14
	fldi1	fr15
	; Identity now becomes XMTRX.
	frchg
	; apply_matrix(&screenview_matrix)
	; Scale x and y by half the screen size and offset them by the
	;  same amounts (fr12/fr13).
	SETS.L	#X_SIZE/0f2,r0
	SETS.L	#Y_SIZE/0f2,r1
	lds	r0,fpul
	fsts	fpul,fr0
	fldi0	fr1
	fldi0	fr2
	fldi0	fr3
	fldi0	fr4
	lds	r1,fpul
	fsts	fpul,fr5
	fldi0	fr6
	fldi0	fr7
	fldi0	fr8
	fldi0	fr9
	fldi1	fr10
	fldi0	fr11
	fmov	fr0,fr12
	fmov	fr5,fr13
	fldi0	fr14
	fldi1	fr15
	ftrv	xmtrx,fv0
	ftrv	xmtrx,fv4
	ftrv	xmtrx,fv8
	ftrv	xmtrx,fv12
	frchg
	; apply_matrix(&projection_matrix)
	; COT_FOVY on the x and y diagonal; the third vector is
	;  ((ZFAR+ZNEAR)/(ZNEAR-ZFAR), -1) and the fourth is
	;  (2*ZFAR*ZNEAR/(ZNEAR-ZFAR), 1) in their z and w slots.
	SETS.L	#@FLOAT[COT_FOVY],r0
	lds	r0,fpul
	fsts	fpul,fr0
	fldi0	fr1
	fldi0	fr2
	fldi0	fr3
	fldi0	fr4
	fmov	fr0,fr5
	fldi0	fr6
	fldi0	fr7
	fldi0	fr8
	fldi0	fr9
	SETS.L	#[ZFAR+ZNEAR]/@FLOAT[ZNEAR-ZFAR],r0
	lds	r0,fpul
	fsts	fpul,fr10
	; fr11 = -1
	fldi1	fr11
	fneg	fr11
	fldi0	fr12
	fldi0	fr13
	SETS.L	#[2*ZFAR*ZNEAR]/@FLOAT[ZNEAR-ZFAR],r0
	lds	r0,fpul
	fsts	fpul,fr14
	fldi1	fr15
	ftrv	xmtrx,fv0
	ftrv	xmtrx,fv4
	ftrv	xmtrx,fv8
	ftrv	xmtrx,fv12
	frchg
	; apply_matrix(&translation_matrix)
	; Identity except for DISTANCE in the z slot of the fourth vector.
	fldi1	fr0
	fldi0	fr1
	fldi0	fr2
	fldi0	fr3
	fldi0	fr4
	fldi1	fr5
	fldi0	fr6
	fldi0	fr7
	fldi0	fr8
	fldi0	fr9
	fldi1	fr10
	fldi0	fr11
	fldi0	fr12
	fldi0	fr13
	SETS.L	#@FLOAT[DISTANCE],r0
	lds	r0,fpul
	fsts	fpul,fr14
	fldi1	fr15
	ftrv	xmtrx,fv0
	ftrv	xmtrx,fv4
	ftrv	xmtrx,fv8
	ftrv	xmtrx,fv12
	; No frchg this time: the final product is in fr0-fr15.  Store it
	;  to base_matrix with pair moves (fschg turns 64-bit fmov on and
	;  then off again), working down from the end of the matrix.
	SETS.L	#base_matrix+[16*4],r0
	fschg
	fmov	dr14,@-r0
	fmov	dr12,@-r0
	fmov	dr10,@-r0
	fmov	dr8,@-r0
	fmov	dr6,@-r0
	fmov	dr4,@-r0
	fmov	dr2,@-r0
	fmov	dr0,@-r0
	fschg
	rts
	 nop

	SETCONST

	; Because we do trig with fsca, the most convenient units are
	;  not radians, but rather something in which a full circle is
	;  64K.  That's the units we use here.  This makes reduction
	;  modulo "2pi" really really easy.

one_frame:
	; Run one iteration of the main loop.  The maple (controller)
	;  request is started first and its response is collected in
	;  handle_maple after the scene has been submitted, so the
	;  rotation update uses the input state stored by the previous
	;  frame's handle_maple.
	sts.l	pr,@-r15
	bsr	start_maple
	 nop
	bsr	update_rotations
	 nop
	bsr	apply_rotations
	 nop
	bsr	transform_coords
	 nop
	bsr	setup_cmd_list
	 nop
	bsr	setup_bits
	 nop
	bsr	draw_scene
	 nop
	bsr	handle_maple
	 nop
	bsr	await_video
	 nop
	bsr	next_frame
	 nop
	lds.l	@r15+,pr
	rts
	 nop
start_maple:
	; Kick off a maple bus transfer: point the bus DMA address
	;  register at maple_cmd, then write the "go" value to the bus
	;  state register.  The DMA address is written first.
	; Uses r0-r3.
	SETS.L	#maple_cmd&DMA_ADDRMASK,r2
	SETS.L	#BUS_DMAADDR,r1
	mov.l	r2,@r1
	SETS.L	#BUS_STATE_GO,r0
	SETS.L	#BUS_STATE,r3
	rts
	 mov.l	r0,@r3
update_rotations:
	; Apply controller input held in curistate:
	;  - the two low bytes of curistate[1] (X, Y; centre 0x80) rotate
	;    the scene about the axis (Y, -X, 0) by hypot(X,Y) units;
	;  - the difference of the two trigger bytes rotates it about
	;    the z axis;
	;  - button bit 2 (A, per the bit map below) divides the rate by
	;    BUTTON_FACTOR and bit 9 (Y) multiplies it by BUTTON_FACTOR;
	;  - with B held, newly pressed Up/Down sets/clears a bit of
	;    cur_texture_mode; without B, newly pressed bit 6 / bit 7
	;    (L / R) moves texture_bit_cursor by -1 / +1.
	; Button bits read 0 when pressed (see the "not" further down).
	; NOTE(review): the exact byte layout of curistate is inferred
	;  from how it is unpacked here; confirm against the maple docs.
	sts.l	pr,@-r15
	SETS.L	#curistate,r0
	mov.l	@(4,r0),r5
	mov.l	@r0,r0
	extu.b	r5,r1		; X
	extu.w	r5,r2
	SHXR	#8,r2		; Y
	; Re-centre: fr14 = 0x80 - X, fr15 = 0x80 - Y
	SETS.L	#0x80,r5
	mov	r5,r3
	sub	r1,r5		; X
	sub	r2,r3		; Y
	lds	r5,fpul
	float	fpul,fr14	; X
	lds	r3,fpul
	float	fpul,fr15	; Y
	; fr9 = rate factor, starting at 1
	SETS.L	#@FLOAT[BUTTON_FACTOR],r5
	lds	r5,fpul
	fsts	fpul,fr8
	fldi1	fr9
	; bit 2 clear (pressed): fr9 /= BUTTON_FACTOR
	tst	#4,r0
	bf	1f
	fdiv	fr8,fr9
	; bit 9 clear (pressed): fr9 *= BUTTON_FACTOR
1:	SHXR	#8,r0
	tst	#2,r0
	bf	1f
	fmul	fr8,fr9
1:	fmul	fr9,fr14
	fmul	fr9,fr15
	fmov	fr15,fr4
	fmov	fr14,fr5
	fneg	fr5
	fldi0	fr6		; (fr4,fr5,fr6) = axis vector
	; fr1 = X*X + Y*Y, fr13 = its square root
	fmov	fr14,fr0
	fmov	fr15,fr13
	fmul	fr13,fr13
	fmul	fr0,fr0
	fadd	fr0,fr13
	fmov	fr13,fr1
	fsqrt	fr13		; hypot(X,Y)
	ftrc	fr13,fpul
	sts	fpul,r0
	; Skip normalization when the integer angle is 0; rotate_scene
	;  does nothing for a zero angle anyway.
	tst	r0,r0
	bt	1f
	fsrra	fr1
	fmul	fr1,fr4		; normalize axis vector
	fmul	fr1,fr5
	fmul	fr1,fr6
	; Park the stick rotation (angle, then axis) on the stack while
	;  the trigger rotation is applied first.
1:	mov.l	r0,@-r15
	fmov.s	fr6,@-r15
	fmov.s	fr5,@-r15
	fmov.s	fr4,@-r15
	SETS.L	#curistate,r1
	mov.l	@r1,r0
	SHXR	#16,r0,r1
	extu.b	r0,r1		; right trigger
	SHXR	#8,r0,r2
	extu.b	r0,r0		; left trigger
	sub	r0,r1
	; Axis is +z; for a negative difference use -z and a positive
	;  angle instead.
	fldi0	fr4
	fldi0	fr5
	fldi1	fr6
	cmp/pz	r1
	bt	1f
	neg	r1,r1
	fneg	fr6
1:	lds	r1,fpul
	float	fpul,fr0
	fmul	fr9,fr0
	; fpul = (int)(|difference| * rate), set in the delay slot
	bsr	rotate_scene
	 ftrc	fr0,fpul
	; Now the stick rotation saved above (fpul popped in delay slot).
	fmov.s	@r15+,fr4
	fmov.s	@r15+,fr5
	fmov.s	@r15+,fr6
	bsr	rotate_scene
	 lds.l	@r15+,fpul
	; Button edge detection: previstate gets the raw current word.
	SETS.L	#curistate,r2
	SETS.L	#previstate,r3
	mov.w	@r2,r0
	mov.w	@r3,r1
	mov.w	r0,@r3
	not	r0,r0
	and	r0,r1
	; set bits in r0 indicate currently-pressed buttons
	; set bits in r1 indicate newly-pressed buttons
	; bits are ---- -XY- RLDU sAB-
	; (s = Start)
	; From the delay slot on, r0 holds the newly-pressed set.
	tst	#2,r0
	bt/s	1f
	 mov	r1,r0
	; B is down; maybe set or clr bit
	tst	#0x10,r0
	bt	2f
	SETS.L	#setbit,r1
	bra	4f
	 nop
2:	tst	#0x20,r0
	bt	3f
	SETS.L	#clrbit,r1
	; r2 = 1 << (31 - cursor); call setbit/clrbit on cur_texture_mode
	;  (loaded into r4 in the jsr delay slot, stored back in the bra
	;  delay slot).
4:	SETS.L	#texture_bit_cursor,r2
	mov.b	@r2,r2
	not	r2,r0
	and	#31,r0
	SETS.L	#1,r2
	shld	r0,r2
	SETS.L	#cur_texture_mode,r3
	jsr	@r1
	 mov.l	@r3,r4
	bra	3f
	 mov.l	r4,@r3
1:	; B is not down; maybe move cursor
	tst	#0x40,r0
	bf/s	4f
	 mov	#-1,r1
	tst	#0x80,r0
	bt	3f
	mov	#1,r1
	; New cursor = old + r1, stored only if it is still within 0..31
	;  (i.e. unchanged by masking with 31).
4:	SETS.L	#texture_bit_cursor,r2
	mov.b	@r2,r0
	add	r0,r1
	mov	#31,r0
	and	r1,r0
	cmp/eq	r0,r1
	bf	3f
	mov.b	r0,@r2
3:	lds.l	@r15+,pr
	rts
	 nop
setbit:
	; r4 |= r2.  Called through a register from update_rotations.
	rts
	 or	r2,r4
clrbit:
	; r4 &= ~r2.  Clobbers r2 (left inverted).  Called through a
	;  register from update_rotations.
	not	r2,r2
	rts
	 and	r2,r4
rotate_scene:
	; Rotate the eye basis (eye_x, eye_y, eye_z; three floats each)
	;  by fpul fsca units about the axis (fr4,fr5,fr6), keeping the
	;  basis orthonormal:
	;    eye_x = normalize(rotate(eye_x))
	;    eye_y = normalize(rotate(eye_y) - its component along eye_x)
	;    eye_z = eye_x cross eye_y
	; Returns immediately if fpul is 0.
	; In:  fpul = angle, (fr4,fr5,fr6) = normalized axis
	; Uses r0-r2 and fr0-fr12.
	sts	fpul,r0
	tst	r0,r0
	bt	1f
	sts.l	pr,@-r15
	SETS.L	#eye_x,r0
	fmov.s	@r0+,fr0
	fmov.s	@r0+,fr1
	bsr	rotate_around_axis
	 fmov.s	@r0+,fr2
	bsr	normalize
	 nop
	; r0 is now eye_x+12.  Get eye_y into r1 as cheaply as the
	;  layout allows.
	.if	eye_x+12 == eye_y
	mov	r0,r1
	.elif	@IS_SB[eye_y-[eye_x+12]]
	mov	r0,r1
	add	#eye_y-[eye_x+12],r1
	.else
	SETS.L	#eye_y,r1
	.endif
	; Store the new eye_x (r0 walks back down to eye_x).
	fmov.s	fr2,@-r0
	fmov.s	fr1,@-r0
	fmov.s	fr0,@-r0
	fmov.s	@r1+,fr0
	fmov.s	@r1+,fr1
	bsr	rotate_around_axis
	 fmov.s	@r1+,fr2
	; (fr8,fr9,fr10) = new eye_x, reloaded after the rotation since
	;  rotate_around_axis destroys fr8-fr10.
	fmov.s	@r0+,fr8
	fmov.s	@r0+,fr9
	bsr	subtract_component
	 fmov.s	@r0+,fr10
	bsr	normalize
	 nop
	; r1 is now eye_y+12.  Get eye_z+12 into r2.
	.if	@IS_SB[[eye_z+12]-[eye_y+12]]
	mov	r1,r2
	add	#[eye_z+12]-[eye_y+12],r2
	.else
	SETS.L	#eye_z+12,r2
	.endif
	; Store the new eye_y.
	fmov.s	fr2,@-r1
	fmov.s	fr1,@-r1
	fmov.s	fr0,@-r1
	; eye_z = (fr3,fr4,fr5) cross (fr0,fr1,fr2) = eye_x cross eye_y
	fmov	fr8,fr3
	fmov	fr9,fr4
	bsr	crossproduct
	 fmov	fr10,fr5
	fmov.s	fr2,@-r2
	fmov.s	fr1,@-r2
	fmov.s	fr0,@-r2
2:	lds.l	@r15+,pr
1:	rts
	 nop
apply_rotations:
	; Load XMTRX with base_matrix combined with the current eye basis,
	;  ready for transform_coords.
	; The eye vectors are loaded as the first three 4-vectors (fourth
	;  component 0) and multiplied by base_matrix; the fourth 4-vector
	;  is copied from base_matrix unchanged.
	; Uses r0 and all FP registers.  frchg is executed twice, so
	;  FPSCR.FR is the same on exit as on entry.
	SETS.L	#base_matrix,r0
	; base_matrix -> fr0-fr15 using 64-bit pair moves
	fschg
	fmov	@r0+,dr0
	fmov	@r0+,dr2
	fmov	@r0+,dr4
	fmov	@r0+,dr6
	fmov	@r0+,dr8
	fmov	@r0+,dr10
	fmov	@r0+,dr12
	fmov	@r0+,dr14
	fschg
	; base_matrix is now XMTRX
	frchg
	SETS.L	#eye_x,r0
	fmov.s	@r0+,fr0
	fmov.s	@r0+,fr1
	fmov.s	@r0+,fr2
	; Step r0 from eye_x+12 to eye_y (nothing to do if contiguous).
	.if eye_y == eye_x+12
	; nothing
	.elif @IS_SB[eye_y-[eye_x+12]]
	add	#eye_y-[eye_x+12],r0
	.else
	SETS.L	#eye_y,r0
	.endif
	fmov.s	@r0+,fr4
	fmov.s	@r0+,fr5
	fmov.s	@r0+,fr6
	; Likewise from eye_y+12 to eye_z.
	.if eye_z == eye_y+12
	; nothing
	.elif @IS_SB[eye_z-[eye_y+12]]
	add	#eye_z-[eye_y+12],r0
	.else
	SETS.L	#eye_z,r0
	.endif
	fmov.s	@r0+,fr8
	fmov.s	@r0+,fr9
	fmov.s	@r0+,fr10
	fldi0	fr3
	fldi0	fr7
	fldi0	fr11
	ftrv	xmtrx,fv0
	ftrv	xmtrx,fv4
	ftrv	xmtrx,fv8
	; We could fldi0 on fr12, fr13, fr14 and fldi1 on fr15, then
	;  ftrv xmtrx,fv12.  That would be the conceptually clean
	;  answer.  But that costs 8 cycles; this way costs only half
	;  that, and is one instruction shorter to boot.
	;
	; We can't do four single moves; there is no way to fmov just
	;  one of the xmtrx registers anywhere.  (While talking about
	;  the speed of nonexistent instructions is always dubious, it
	;  feels like a one-cycle instruction, in which case it
	;  wouldn't be any faster than this - but no slower either.)
	fschg
	fmov	xd12,dr12
	fmov	xd14,dr14
	fschg
	; The final frchg (in the delay slot) makes the combined matrix
	;  XMTRX.
	rts
	 frchg

	SETCONST

transform_coords:
	; Transform the n_vertex_coords (x,y,z) triples at vertex_coords
	;  by XMTRX (with w = 1) and store x/w, y/w, z/w as triples at
	;  xform_coords.
	; If the debug byte is set, it is cleared and the eye vectors
	;  plus every vertex ("input -> output") are printed.
	; Uses r0-r5, fr0-fr3; the debug path also uses r7-r9 and assumes
	;  print_float/putchar/putstr preserve r4, r5 and r7-r9.
	SETS.L	#vertex_coords,r1
	pref	@r1
	; r5 runs ahead of r1 purely to issue prefetches for the input.
	mov	r1,r5
	add	#[3*4]-1,r5
	pref	@r5
	SETS.L	#xform_coords,r0
	SETS.L	#n_vertex_coords,r2
	; r3/r4 are the constant offsets for the indexed stores below.
	SETS.L	#4,r3
	SETS.L	#8,r4
1:	add	#3*4,r5
	pref	@r5
	fmov.s	@r1+,fr0
	fmov.s	@r1+,fr1
	fmov.s	@r1+,fr2
	fldi1	fr3
	ftrv	xmtrx,fv0
	; dt sets T here; none of the FP instructions before the bf/s
	;  below change it.
	dt	r2
	; fr3 should always be exactly 1 here; this is paranoia
	; NOTE(review): that claim looks wrong.  The projection matrix
	;  built in init_3dvalues carries a -1 in its w slot, so fr3 here
	;  appears to depend on z and this is a real perspective divide.
	;  Confirm before removing the divides.
	fdiv	fr3,fr0
	fdiv	fr3,fr1
	fdiv	fr3,fr2
	fmov.s	fr0,@r0
	fmov.s	fr1,@(r0,r3)
	fmov.s	fr2,@(r0,r4)
	bf/s	1b
	 add	#3*4,r0
	; dump stuff if debug
	SETS.L	#debug,r1
	mov.b	@r1,r0
	tst	r0,r0
	bt	1f
	; One-shot: clear the flag before dumping.
	mov	#0,r0
	mov.b	r0,@r1
	sts.l	pr,@-r15
	SETS.L	#eye_x,r0
	bsr	2f
	 nop
	SETS.L	#eye_y,r0
	bsr	2f
	 nop
	SETS.L	#eye_z,r0
	bsr	2f
	 nop
	bra	3f
	 nop
	; Local subroutine: print the three floats at r0 as "a,b,c" CR LF.
	;  It ends by tail-branching to putchar for the LF.
2:	sts.l	pr,@-r15
	mov.l	@r0+,r1
	mov.l	@r0+,r4
	bsr	print_float
	 mov.l	@r0+,r5
	bsr	putchar
	 mov	#',,r1
	bsr	print_float
	 mov	r4,r1
	bsr	putchar
	 mov	#',,r1
	bsr	print_float
	 mov	r5,r1
	bsr	putchar
	 mov	#13,r1
	lds.l	@r15+,pr
	bra	putchar
	 mov	#10,r1
	; Dump every vertex: r9 walks the inputs, r8 the outputs, r7
	;  counts down.
3:	SETS.L	#vertex_coords,r9
	SETS.L	#xform_coords,r8
	SETS.L	#n_vertex_coords,r7
2:	bsr	print_float
	 mov.l	@r9+,r1
	bsr	putchar
	 mov	#',,r1
	bsr	print_float
	 mov.l	@r9+,r1
	bsr	putchar
	 mov	#',,r1
	bsr	print_float
	 mov.l	@r9+,r1
	mova	9f,r0
	bsr	putstr
	 mov	r0,r1
	bsr	print_float
	 mov.l	@r8+,r1
	bsr	putchar
	 mov	#',,r1
	bsr	print_float
	 mov.l	@r8+,r1
	bsr	putchar
	 mov	#',,r1
	bsr	print_float
	 mov.l	@r8+,r1
	bsr	putchar
	 mov	#13,r1
	bsr	putchar
	 mov	#10,r1
	dt	r7
	bf	2b
	lds.l	@r15+,pr
1:	rts
	 nop
	; Separator string for the dump; longword-aligned for mova.
	.align	4
9:	.asciz	" -> "
	.align	2
setup_cmd_list:
	; In tatest terms, this is ta_set_target, but with args
	;  computed here based on curbuf rather than being passed in.
	;
	; Patches the value halves of three entries in the cmdlist_params
	;  table (two tile-buffer entries and one command-list entry)
	;  with the addresses for the current buffer, masked to
	;  0x007fffff, then writes the table with set_params and does one
	;  read of VIDREG_BASE+0x8144 (result discarded).
	; Uses r0-r5.
	sts.l	pr,@-r15
	; r1 = curbuf * 4, the index into the per-buffer pointer tables
	SETS.L	#curbuf,r1
	mov.b	@r1,r1
	SHLL	#2,r1
	SETS.L	#cmdlists,r2
	SETS.L	#tilebuffers,r3
	add	r1,r2
	mov.l	@r2,r2
	add	r1,r3
	mov.l	@r3,r3
	SETS.L	#0x007fffff,r4
	and	r4,r2
	and	r4,r3
	; set_params tables hold values as low word then high word;
	;  swap.w brings the high halves down into r4/r5.
	swap.w	r2,r4
	swap.w	r3,r5
	SETS.L	#cmdlist_params,r0
	; The cmdlist_param_* symbols are used as byte offsets from
	;  cmdlist_params to each entry's low value word.
	SETS.L	#cmdlist_param_tilebuf_a,r1
	mov.w	r3,@(r0,r1)
	add	#2,r1
	mov.w	r5,@(r0,r1)
	SETS.L	#cmdlist_param_tilebuf_b,r1
	mov.w	r3,@(r0,r1)
	add	#2,r1
	mov.w	r5,@(r0,r1)
	SETS.L	#cmdlist_param_cmdlist,r1
	mov.w	r2,@(r0,r1)
	add	#2,r1
	mov.w	r4,@(r0,r1)
	; r1 = table pointer, set in the delay slot
	bsr	set_params
	 mov	r0,r1
	SETS.L	#VIDREG_BASE+0x8144,r0
	mov.l	@r0,r0
	lds.l	@r15+,pr
	rts
	 nop

setup_bits:
	; Publish the 32 bits of cur_texture_mode, most significant bit
	;  first, into 32 records spaced bits_inc bytes apart: bit value
	;  (0 or 1) to the long at bits_base_bit + i*bits_inc and the
	;  value 2 to the long at bits_base_pal + i*bits_inc.  Then
	;  overwrite the bits_base_pal long of record texture_bit_cursor
	;  with 0.
	; NOTE(review): presumably these are the texture-number and
	;  palette-number fields of faces in scene_faces, so each bit is
	;  drawn as a "0"/"1" glyph with the cursor position highlighted;
	;  confirm against the data definitions.
	; Uses r0-r8.
	SETS.L	#bits_base_bit,r1
	SETS.L	#bits_base_pal,r2
	SETS.L	#bits_inc,r3
	SETS.L	#cur_texture_mode,r4
	mov.l	@r4,r4
	SETS.L	#32,r5
	SETS.L	#2,r6
	SETS.L	#bits_inc,r7
	SETS.L	#0,r8
	; shll moves the top bit of r4 into T; movt turns T into 0/1.
1:	shll	r4
	movt	r0
	mov.l	r0,@r1
	mov.l	r6,@r2
	add	r3,r1
	dt	r5
	bf/s	1b
	 add	r3,r2
	; r0 = bits_base_pal + cursor * bits_inc
	SETS.L	#texture_bit_cursor,r5
	mov.b	@r5,r0
	mul.l	r0,r7
	sts	macl,r0
	SETS.L	#bits_base_pal,r2
	add	r2,r0
	rts
	 mov.l	r8,@r0

draw_scene:
	; Submit every face in scene_faces to the TA as a textured
	;  four-vertex strip, followed by an all-zero command.
	;
	; Each face record is 56 bytes:
	;    +0,+12,+24,+36  vertex index into xform_coords (low 16 bits)
	;    +4,+16,+28,+40  u for that vertex
	;    +8,+20,+32,+44  v for that vertex
	;    +48             palette number
	;    +52             texture number (index into textures, whose
	;                    entries are 8 bytes: pointer, size bits)
	; Commands are built in the 32-byte ta_cmd buffer and sent with
	;  commit_ta_cmd, which clobbers r0, r1 and r10-r14.
	; Register use: r9 = face pointer, r8 = faces remaining,
	;  r7 = ta_cmd, r6 = 0, r4 = xform_coords, r3 = 12 (bytes per
	;  transformed vertex).  r5 is loaded but not read in this
	;  routine.
	sts.l	pr,@-r15
	SETS.L	#scene_faces,r9
	SETS.L	#n_scene_faces,r8
	SETS.L	#ta_cmd,r7
	SETS.L	#0,r6
	SETS.L	#0f1,r5
	SETS.L	#xform_coords,r4
	SETS.L	#3*4,r3
	; --- polygon header ---
1:	SETS.L	#TA_CMD_POLYGON|TA_CMD_POLYGON_TYPE_OPAQUE|TA_CMD_POLYGON_SUBLIST|TA_CMD_POLYGON_STRIPLENGTH_2|TA_CMD_POLYGON_TEXTURED,r0
	mov.l	r0,@r7		; cmd
	SETS.L	#TA_POLYMODE1_Z_GREATER|TA_POLYMODE1_CULL_CCW,r0
	mov.l	r0,@(4,r7)	; mode1
	; r1 accumulates the texture word: CLUT8 | palette bank | address
	SETS.L	#TA_TEXTUREMODE_CLUT8,r1
	mov.l	@(48,r9),r0	; palette number
	SHLL	#TA_TEXTUREMODE_CLUTBANK8_SHIFT,r0,r2
	or	r0,r1
	mov.l	@(52,r9),r0	; texture number
	SETS.L	#textures,r2
	SHLL	#3,r0
	add	r0,r2
	mov.l	@r2,r0		; texture pointer
	mov.l	@(4,r2),r2	; size bits
	; mode2 = cur_texture_mode | size bits
	SETS.L	#cur_texture_mode,r10
	mov.l	@r10,r10
	or	r2,r10
	mov.l	r10,@(8,r7)	; mode2
	SHXR	#TA_TEXTUREMODE_ADDRESS_SHIFT,r0
	SETS.L	#TA_TEXTUREMODE_ADDRESS_MASK,r2
	and	r2,r0
	or	r0,r1
	mov.l	r1,@(12,r7)	; texture
	mov.l	r6,@(16,r7)	; alpha
	mov.l	r6,@(20,r7)	; red
	mov.l	r6,@(24,r7)	; green
	bsr	commit_ta_cmd
	 mov.l	r6,@(28,r7)	; blue
	; --- vertex 0 ---
	; cmd, colour and ocolour are set once here and stay in ta_cmd
	;  for the following vertices.
	SETS.L	#TA_CMD_VERTEX,r1
	mov.l	r1,@r7		; cmd
	mov.l	r6,@(28,r7)	; ocolour
	not	r6,r1
	mov.l	r1,@(24,r7)	; colour
	; r0 = &xform_coords[index]
	mov.l	@r9,r1
	mulu.w	r1,r3
	sts	macl,r0
	add	r4,r0
	mov.l	@r0,r2
	mov.l	r2,@(4,r7)	; x
	mov.l	@(4,r0),r2
	mov.l	r2,@(8,r7)	; y
	mov.l	@(8,r0),r2
	mov.l	r2,@(12,r7)	; z
	mov.l	@(4,r9),r0
	mov.l	r0,@(16,r7)	; u
	mov.l	@(8,r9),r0
	bsr	commit_ta_cmd
	 mov.l	r0,@(20,r7)	; v
	; --- vertex 1 ---
	mov.l	@(12,r9),r0
	mulu.w	r0,r3
	sts	macl,r0
	add	r4,r0
	mov.l	@r0,r2
	mov.l	r2,@(4,r7)	; x
	mov.l	@(4,r0),r2
	mov.l	r2,@(8,r7)	; y
	mov.l	@(8,r0),r2
	mov.l	r2,@(12,r7)	; z
	mov.l	@(16,r9),r0
	mov.l	r0,@(16,r7)	; u
	mov.l	@(20,r9),r0
	bsr	commit_ta_cmd
	 mov.l	r0,@(20,r7)	; v
	; --- vertex 2 ---
	mov.l	@(24,r9),r0
	mulu.w	r0,r3
	sts	macl,r0
	add	r4,r0
	mov.l	@r0,r2
	mov.l	r2,@(4,r7)	; x
	mov.l	@(4,r0),r2
	mov.l	r2,@(8,r7)	; y
	mov.l	@(8,r0),r2
	mov.l	r2,@(12,r7)	; z
	mov.l	@(28,r9),r0
	mov.l	r0,@(16,r7)	; u
	mov.l	@(32,r9),r0
	bsr	commit_ta_cmd
	 mov.l	r0,@(20,r7)	; v
	; --- vertex 3 (end of strip) ---
	mov.l	@(36,r9),r0
	mulu.w	r0,r3
	sts	macl,r0
	add	r4,r0
	mov.l	@r0,r2
	mov.l	r2,@(4,r7)	; x
	mov.l	@(4,r0),r2
	mov.l	r2,@(8,r7)	; y
	mov.l	@(8,r0),r2
	mov.l	r2,@(12,r7)	; z
	mov.l	@(40,r9),r0
	mov.l	r0,@(16,r7)	; u
	mov.l	@(44,r9),r0
	mov.l	r0,@(20,r7)	; v
	SETS.L	#TA_CMD_VERTEX|TA_CMD_VERTEX_EOS,r1
	bsr	commit_ta_cmd
	 mov.l	r1,@r7		; cmd
	; Next face (r9 advances in the delay slot).
	dt	r8
	bf/s	1b
	 add	#56,r9
	; Finish with an all-zero command.
	; making this a loop saves only one instruction and adds time.
	mov.l	r6,@r7
	mov.l	r6,@(4,r7)
	mov.l	r6,@(8,r7)
	mov.l	r6,@(12,r7)
	mov.l	r6,@(16,r7)
	mov.l	r6,@(20,r7)
	mov.l	r6,@(24,r7)
	bsr	commit_ta_cmd
	 mov.l	r6,@(28,r7)
	lds.l	@r15+,pr
	rts
	 nop
commit_ta_cmd:
	; In tatest terms, this is ta_commit_list(), with the argument
	;  always being ta_cmd.
	;
	; Copies the 8 longs at ta_cmd into the store queue at
	;  STOREQ_BASE, with QACR0 set from TA_CMD_BASE, and issues a
	;  pref on the queue to send them.
	; Clobbers r0, r1 and r10-r14.  With debug_ta_commit set, the
	;  eight words are also printed in hex between "(" and ")".
.if debug_ta_commit
		sts.l	pr,@-r15
		bsr	putchar2
		 mov	#'(,r1
.endif
	SETS.L	#QACR0,r1
	SETS.L	#STOREQ_BASE,r14
	SETS.L	#[[TA_CMD_BASE>>26]&7]<<2,r13
	SETS.L	#ta_cmd,r12
	SETS.L	#8,r11
.if debug_ta_commit
		mov.l	r1,@-r15
		bsr	printhex8
		 mov.l	@r12+,r1
		bsr	putchar
		 mov	#' ,r1
		bsr	printhex8
		 mov.l	@r12+,r1
		bsr	putchar
		 mov	#' ,r1
		bsr	printhex8
		 mov.l	@r12+,r1
		bsr	putchar
		 mov	#' ,r1
		bsr	printhex8
		 mov.l	@r12+,r1
		bsr	putchar
		 mov	#' ,r1
		bsr	printhex8
		 mov.l	@r12+,r1
		bsr	putchar
		 mov	#' ,r1
		bsr	printhex8
		 mov.l	@r12+,r1
		bsr	putchar
		 mov	#' ,r1
		bsr	printhex8
		 mov.l	@r12+,r1
		bsr	putchar
		 mov	#' ,r1
		bsr	printhex8
		 mov.l	@r12+,r1
		add	#-8*4,r12
		mov.l	@r15+,r1
.endif
	mov.l	r13,@r1
	; r10 keeps the queue base for the pref; r14 walks the queue.
	mov	r14,r10
1:	mov.l	@r12+,r0
	dt	r11
	mov.l	r0,@r14
	bf/s	1b
	 add	#4,r14
.if debug_ta_commit
		pref	@r10
		bsr	putchar2
		 mov	#'),r1
		lds.l	@r15+,pr
	rts
	 nop
.else
	; The pref that flushes the queue sits in the rts delay slot.
	rts
	 pref	@r10
.endif
handle_maple:
	; Wait for the maple transfer begun by start_maple to finish,
	;  then copy the longs at maple_resp+8 and maple_resp+12 into
	;  curistate[0] and curistate[1].
	; Uses r0-r3.
	SETS.L	#BUS_STATE,r3
	; Spin until the RUNNING bit clears.
1:	mov.l	@r3,r0
	tst	#BUS_STATE_RUNNING,r0
	bf	1b
	SETS.L	#maple_resp,r0
	; We ocbi only one cache line, because the parts of the
	;  response we care about fit in a single cache line.  The
	;  hardware's alignment requirements for maple buffers match
	;  cache line alignments, and we access only 8 bytes of it at
	;  low offsets.
	;
	; We arguably should ocbi the line back just before we kick off
	;  the maple operation rather than waiting until here.  Since
	;  we never write to this cache line, the only difference I see
	;  is whether it sits around in the cache in the interim.  This
	;  might conceivably affect something, but even if it does I
	;  have trouble seeing the difference being more than one cache
	;  line fill penalty.
	ocbi	@r0
	mov.l	@(8,r0),r1
	mov.l	@(12,r0),r2
	SETS.L	#curistate,r0
	mov.l	r1,@r0
	mov.l	r2,@(4,r0)
	rts
	 nop
await_video:
	; In tatest terms, this is everything in the main loop after
	;  the call to ta_commit_end().
	;
	; Blocks until TA_RENDER_BIT is set in TA_RENDER_EVENT, then
	;  until VBLANK_VBIT is next set in VBLANK_REG.  Each bit is
	;  acknowledged by writing it back to its register.
	; Uses r0-r2.
	; ta_wait_render()
	SETS.L	#TA_RENDER_EVENT,r1
	SETS.L	#TA_RENDER_BIT,r2
1:	mov.l	@r1,r0
	tst	r2,r0
	bt	1b
	mov.l	r2,@r1
	; wait_bovp()
	; Write the bit first to discard any event that is already
	;  pending, so that we wait for a fresh one.
	SETS.L	#VBLANK_REG,r1
	SETS.L	#VBLANK_VBIT,r2
	mov.l	r2,@r1
1:	mov.l	@r1,r0
	tst	r2,r0
	bt	1b
	rts
	 mov.l	r2,@r1
next_frame:
	; Flip buffers and start the next render:
	;  1. point the display at render_buf[curbuf];
	;  2. zero 0x12 longs at the address read from VIDREG_BASE+0x8138
	;     (taend);
	;  3. program the render registers and launch rendering of
	;     cmdlists[curbuf] / tiledesc_cookies[curbuf] into
	;     render_buf[curbuf ^ 1];
	;  4. toggle curbuf between 0 and 1.
	; Uses r0-r6 and r10-r12.  With debug_start_render set, every
	;  register write is printed first via the helper at 9: below.
.if debug_start_render
		sts.l	pr,@-r15
.endif
	; Switch to the previously-rendered screen
	SETS.L	#curbuf,r10
	SETS.L	#render_buf,r11
	mov.b	@r10,r0
	SHLL	#2,r0
	mov.l	@(r0,r11),r1
	SETS.L	#0x007fffff,r12
	SETS.L	#DISPLAY_VRAM,r3
	and	r12,r1
	mov.l	r1,@r3
	; The second base register gets the same address plus one
	;  scanline (SHORT_FRAME_OFFSET).
	SETS.L	#SHORT_FRAME_OFFSET,r0
	add	r0,r1
	mov.l	r1,@(4,r3)
	; Kick off rendering to the screen we just stopped displaying
	; In tatest terms, this is ta_begin_render.
	mov.b	@r10,r0			; curbuf
	SETS.L	#cmdlists,r1
	SHLL	#2,r0
	SETS.L	#tiledesc_cookies,r2
	mov.l	@(r0,r1),r1		; cmdlist
	mov.l	@(r0,r2),r2		; tiles
	; r0 ^= 4 selects the other buffer's table entry
	xor	#4,r0
	mov.l	@(r0,r11),r3		; scrn
	SETS.L	#VIDREG_BASE+0x8138,r4
	SETS.L	#0x12,r5
	SETS.L	#0,r6
	mov.l	@r4,r4
	SETS.L	#VRAM_BASE_32,r0
	or	r0,r4			; taend
	; Zero 0x12 longs at taend, then restore r4 to taend.
1:	mov.l	r6,@r4
	dt	r5
	bf/s	1b
	 add	#4,r4
	add	#-0x12*4,r4
	; We could use set_params here, but between the number of
	;  values to store and the need to break longs into two words,
	;  it's less pain to do it this way.
	;
	; Do we have to do all these in exactly this order?  I suspect
	;  not, but, absent documentation, it's hard to tell how much
	;  deviation is OK.  We stick strictly to tatest's order.
	;
	; r5 walks between registers using short relative adds; the
	;  comment on each store gives the absolute address.
	SETS.L	#VIDREG_BASE+0x802c,r5
	and	r12,r2
.if debug_start_render
		bsr	9f
		 mov	r2,r0
.endif
	mov.l	r2,@r5			; 0xa05f802c
	add	#0x8020-0x802c,r5
	mov	r1,r0
	and	r12,r0
.if debug_start_render
		bsr	9f
		 nop
.endif
	mov.l	r0,@r5			; 0xa05f8020
	add	#0x8060-0x8020,r5
	and	r12,r3
.if debug_start_render
		bsr	9f
		 mov	r3,r0
.endif
	mov.l	r3,@r5			; 0xa05f8060
	add	#0x808c-0x8060,r5
	; value = 0x01000000 | ((taend - cmdlist) << 1)
	sub	r1,r4
	SHLL	#1,r4
	SETS.L	#0x01000000,r0
	or	r4,r0
.if debug_start_render
		bsr	9f
		 nop
.endif
	mov.l	r0,@r5			; 0xa05f808c
	add	#0x8088-0x808c,r5
	SETS.L	#0x3e4cccc0,r0		; tatest says "zclip"
.if debug_start_render
		bsr	9f
		 nop
.endif
	mov.l	r0,@r5			; 0xa05f8088
	add	#0x8068-0x8088,r5
	SETS.L	#[X_SIZE-1]<<16,r0	; tatest calls it "clipw"
.if debug_start_render
		bsr	9f
		 nop
.endif
	mov.l	r0,@r5			; 0xa05f8068
	add	#0x806c-0x8068,r5
	SETS.L	#[Y_SIZE-1]<<16,r0	; tatest calls it "cliph"
.if debug_start_render
		bsr	9f
		 nop
.endif
	mov.l	r0,@r5			; 0xa05f806c
	add	#0x804c-0x806c,r5
	SETS.L	#[X_SIZE*2]>>3,r0	; tatest calls it "modulo"
.if debug_start_render
		bsr	9f
		 nop
.endif
	mov.l	r0,@r5			; 0xa05f804c
	add	#0x8048-0x804c,r5
	SETS.L	#TA_PIXFMT_RGB565|TA_PIXFMT_DITHER,r0	; tatest calls it "pixfmt"
.if debug_start_render
		bsr	9f
		 nop
.endif
	mov.l	r0,@r5			; 0xa05f8048
	add	#0x8014-0x8048,r5
	SETS.L	#0xffffffff,r0		; tatest says "Launch!"
.if debug_start_render
		bsr	9f
		 nop
.endif
	mov.l	r0,@r5			; 0xa05f8014
	; curbuf = ! curbuf
	; The add in the delay slot always runs: a zero curbuf becomes 1
	;  and branches; anything else falls through and is reset to 0.
	mov.b	@r10,r0
	tst	r0,r0
	bt/s	1f
	 add	#1,r0
	mov	#0,r0
1:
.if debug_start_render
		lds.l	@r15+,pr
.endif
	rts
	 mov.b	r0,@r10
.if debug_start_render
	; Debug helper: prints "<r5 in hex>=<r0 in hex>" CR LF.
9:	; about to mov.l r0,@r5; print it
	; must preserve all input registers except pr
	mov.l	r0,@-r15
	mov.l	r1,@-r15
	sts.l	pr,@-r15
	bsr	printhex8
	 mov	r5,r1
	bsr	putchar
	 mov	#'=,r1
	bsr	printhex8
	 mov.l	@(8,r15),r1
	bsr	putchar
	 mov	#13,r1
	bsr	putchar
	 mov	#10,r1
	lds.l	@r15+,pr
	mov.l	@r15+,r1
	rts
	 mov.l	@r15+,r0
.endif

	SETCONST

; computes (fr3,fr4,fr5) x (fr0,fr1,fr2) -> (fr0,fr1,fr2)
;  (vector cross product)
; uses fr6 as temporary; destroys fr3/fr4/fr5 inputs too
; ( (fr4*fr2)-(fr5*fr1) , (fr5*fr0)-(fr3*fr2) , (fr3*fr1)-(fr4*fr0) )
;       A    B    C           D    E    F           G    H    I
; In the instruction comments below, A/C/D/F/G/I name the six
;  products and B/E/H the three subtractions, as lettered above.
crossproduct:
	fmov	fr0,fr6
	fmul	fr5,fr6	; D
	fmul	fr1,fr5	; C, input fr1 now dead
	fmul	fr3,fr1	; G, input fr3 now dead
	fmul	fr2,fr3	; F, input fr2 now dead
	fmul	fr4,fr2	; A, input fr4 now dead
	fmul	fr0,fr4	; I, input fr0 and fr5 now dead
	fmov	fr2,fr0	; A, temporary fr2 now dead
	fsub	fr5,fr0	; B, A and C now dead
	fmov	fr1,fr2	; G, temporary fr1 now dead
	fsub	fr4,fr2	; H, G and I now dead
	fmov	fr6,fr1	; D, temporary fr6 now dead
	rts
	 fsub	fr3,fr1	; E, D and F now dead

; Rotate (fr0,fr1,fr2) by fpul fsca units around axis (fr4,fr5,fr6).
; The axis vector must be normalized already.
; Output in (fr0,fr1,fr2).
; Preserves fr4-fr6, fr13-fr15, fpul, all CPU registers.
; Destroys fr3, fr7-fr12.
; Let s = sin(fpul), c = cos(fpul); output in terms of input is
;
;	fr0 =	(fr0 * ((fr4 * fr4 * (1-c)) + c)) +		A
;		(fr1 * ((fr4 * fr5 * (1-c)) - (fr6 * s))) +	B
;		(fr2 * ((fr4 * fr6 * (1-c)) + (fr5 * s)))	C
;
;	fr1 =	(fr0 * ((fr5 * fr4 * (1-c)) + (fr6 * s))) +	D
;		(fr1 * ((fr5 * fr5 * (1-c)) + c)) +		E
;		(fr2 * ((fr5 * fr6 * (1-c)) - (fr4 * s)))	F
;
;	fr2 =	(fr0 * ((fr6 * fr4 * (1-c)) - (fr5 * s))) +	G
;		(fr1 * ((fr6 * fr5 * (1-c)) + (fr4 * s))) +	H
;		(fr2 * ((fr6 * fr6 * (1-c)) + c))		I
rotate_around_axis:
	; Rotates (fr0,fr1,fr2) by fpul fsca units about the normalized
	;  axis (fr4,fr5,fr6); result in (fr0,fr1,fr2).
	; The nine matrix terms are accumulated one output component at
	;  a time.  Outputs 0 and 1 are held in fr12 and fr11 until the
	;  end, because the inputs fr0-fr2 stay live throughout.
	fsca	fpul,fr8	; fr8 = s, fr9 = c
	fldi1	fr3
	fsub	fr9,fr3		; fr3 = 1-c
	; --- output fr0 = A + B + C ---
	fmov	fr4,fr7		; fr4
	fmul	fr4,fr7		; fr4 * fr4
	fmul	fr3,fr7		; fr4 * fr4 * (1-c)
	fadd	fr9,fr7		; (fr4 * fr4 * (1-c)) + c
	fmul	fr0,fr7		; A
	fmov	fr4,fr10	; fr4
	fmul	fr5,fr10	; fr4 * fr5
	fmul	fr3,fr10	; fr4 * fr5 * (1-c)
	fmov	fr6,fr11	; fr6
	fmul	fr8,fr11	; fr6 * s
	fsub	fr11,fr10	; (fr4 * fr5 * (1-c)) - (fr6 * s)
	fmul	fr1,fr10	; B
	fadd	fr10,fr7	; A + B
	fmov	fr4,fr12	; fr4
	fmul	fr6,fr12	; fr4 * fr6
	fmul	fr3,fr12	; fr4 * fr6 * (1-c)
	fmov	fr5,fr11	; fr5
	fmul	fr8,fr11	; fr5 * s
	fadd	fr11,fr12	; (fr4 * fr6 * (1-c)) + (fr5 * s)
	fmul	fr2,fr12	; C
	fadd	fr7,fr12	; output fr0
	; --- output fr1 = D + F + E ---
	fmov	fr5,fr7		; fr5
	fmul	fr4,fr7		; fr5 * fr4
	fmul	fr3,fr7		; fr5 * fr4 * (1-c)
	fmov	fr6,fr10	; fr6
	fmul	fr8,fr10	; fr6 * s
	fadd	fr10,fr7	; (fr5 * fr4 * (1-c)) + (fr6 * s)
	fmul	fr0,fr7		; D
	fmov	fr5,fr11	; fr5
	fmul	fr6,fr11	; fr5 * fr6
	fmul	fr3,fr11	; fr5 * fr6 * (1-c)
	fmov	fr4,fr10	; fr4
	fmul	fr8,fr10	; fr4 * s
	; This is our point of maximum register use.
	; We have the following, all live, at this point:
	; fr0,fr1,fr2 = input values
	; fr3 = 1-c
	; fr4,fr5,fr6,fpul = input values to be preserved
	; fr7 = D
	; fr8 = s
	; fr9 = c
	; fr10 = fr4 * s
	; fr11 = fr5 * fr6 * (1-c)
	; fr12 = output fr0
	fsub	fr10,fr11	; (fr5 * fr6 * (1-c)) - (fr4 * s)
	fmul	fr2,fr11	; F
	fadd	fr7,fr11	; D + F
	fmov	fr5,fr10	; fr5
	fmul	fr5,fr10	; fr5 * fr5
	fmul	fr3,fr10	; fr5 * fr5 * (1-c)
	fadd	fr9,fr10	; (fr5 * fr5 * (1-c)) + c
	fmul	fr1,fr10	; E
	fadd	fr10,fr11	; output fr1
	; --- output fr2 = I + H + G, accumulated in fr2 itself ---
	fmov	fr6,fr7		; fr6
	fmul	fr6,fr7		; fr6 * fr6
	fmul	fr3,fr7		; fr6 * fr6 * (1-c)
	fadd	fr9,fr7		; (fr6 * fr6 * (1-c)) + c [fr9 dead]
	fmul	fr7,fr2		; I [fr2 dead]
	fmov	fr6,fr7		; fr6
	fmul	fr5,fr7		; fr6 * fr5
	fmul	fr3,fr7		; fr6 * fr5 * (1-c)
	fmov	fr4,fr10	; fr4
	fmul	fr8,fr10	; fr4 * s
	fadd	fr10,fr7	; (fr6 * fr5 * (1-c)) + (fr4 * s)
	fmul	fr1,fr7		; H [fr1 dead]
	fadd	fr7,fr2		; H + I
	fmov	fr6,fr7		; fr6
	fmul	fr4,fr7		; fr6 * fr4
	fmul	fr3,fr7		; fr6 * fr4 * (1-c) [fr3 dead]
	fmul	fr5,fr8		; fr5 * s [fr8 dead]
	fsub	fr8,fr7		; (fr6 * fr4 * (1-c)) - (fr5 * s)
	fmul	fr0,fr7		; G [fr0 dead]
	fadd	fr7,fr2		; output fr2
	; Move the held outputs into place (the last one in the delay
	;  slot).
	fmov	fr11,fr1	; output fr1
	rts
	 fmov	fr12,fr0	; output fr0

; Remove from (fr0,fr1,fr2) its projection onto the unit vector
;  (fr8,fr9,fr10), leaving the part perpendicular to that vector.
; (fr8,fr9,fr10) must be normalized already.
; Result replaces (fr0,fr1,fr2).
; Preserves fr4-fr6, fr8-fr15, fpul, all CPU registers.
; Destroys fr3 (left holding the dot product) and fr7.
; With
;
;	dp = (fr0 * fr8) + (fr1 * fr9) + (fr2 * fr10)
;
; the result is
;
;	fr0 = fr0 - (dp * fr8)
;	fr1 = fr1 - (dp * fr9)
;	fr2 = fr2 - (dp * fr10)
subtract_component:
	fldi0	fr3		; zero the 4th lane so fr11 * fr3 contributes 0
	fipr	fv8,fv0		; fr3 = dp
	fmov	fr8,fr7
	fmul	fr3,fr7		; fr8 * dp
	fsub	fr7,fr0		; output fr0
	fmov	fr9,fr7
	fmul	fr3,fr7		; fr9 * dp
	fsub	fr7,fr1		; output fr1
	fmov	fr10,fr7
	fmul	fr3,fr7		; fr10 * dp
	rts
	 fsub	fr7,fr2		; output fr2

; Scale the vector in (fr0,fr1,fr2) to unit length, in place.
; Preserves fr4-fr15, fpul, all integer registers.
; Destroys fr3 (left holding 1/sqrt of the squared length).
; No special handling of a zero-length input: the scale factor is
;  whatever fsrra produces for 0.
normalize:
	fldi0	fr3		; 4th lane = 0 so it adds nothing below
	fipr	fv0,fv0		; fr3 = fr0*fr0 + fr1*fr1 + fr2*fr2
	fsrra	fr3		; fr3 = 1/sqrt(fr3)
	fmul	fr3,fr2
	fmul	fr3,fr1
	rts
	 fmul	fr3,fr0

; printhex8: print the value in r1 as 8 hex digits.
; printhexN: print the low r0 hex digits of the value in r1
;  (r0 = digit count on entry).
; Digits go out through putchar, most significant first, using the
;  lowercase table at 9: below.
; Preserves r2-r4 and pr (saved on the stack); destroys r0 and r1.
; The digit loop is a do-while on dt, so the count is expected to be
;  at least 1; NOTE(review): counts of 0 or above 8 are not handled -
;  confirm no caller passes them.
printhex8:
	mov	#8,r0
printhexN:
	mov.l	r4,@-r15
	mov	r0,r4		; r4 = digits remaining
	; Move the wanted digits to the top of r1: shift left by
	;  (8 - count) * 4 bits.
	add	#-8,r0
	neg	r0,r0		; r0 = 8 - count
	SHLL	#2,r0
	shld	r0,r1
	mov.l	r3,@-r15
	mov.l	r2,@-r15
	sts.l	pr,@-r15
	mova	9f,r0
	mov	r0,r3		; r3 = address of the digit table
	mov	r1,r2		; r2 = value, next digit in the top nibble
1:	mov	r2,r0
	; presumably leaves the top nibble of r0 in r0, with r1 as the
	;  macro's scratch register - macro is defined elsewhere; confirm
	SHLR	#28,r0,r1
	SHLL	#4,r2		; bring the following digit to the top
	add	r3,r0		; r0 = address of this digit's character
	bsr	putchar
	 mov.b	@r0,r1		; (delay slot) r1 = character to print
	dt	r4
	bf	1b
	lds.l	@r15+,pr
	mov.l	@r15+,r2
	mov.l	@r15+,r3
	rts
	 mov.l	@r15+,r4
	; mova yields a longword-aligned address, hence the alignment
	;  ahead of the table.
	.align	4
9:	.ascii	"0123456789abcdef"
	.align	2
; Print the character in r1 twice.
; Calls putchar once, with r1 and pr saved on the stack around the
;  call, then falls through into putchar (which must immediately
;  follow) for the second copy; that second pass's rts returns to
;  our caller.
; Destroys r0 (in putchar).
putchar2:
	sts.l	pr,@-r15
	bsr	putchar
	 mov.l	r1,@-r15	; (delay slot) save the character
	mov.l	@r15+,r1
	 lds.l	@r15+,pr	; not a delay slot, despite the indent
	; fall through into putchar
; Send the character in r1 to the SCIF and wait for it to go out.
; The SCIF registers are addressed relative to gbr, so gbr must hold
;  SCIF_BASE.
; Destroys r0.  r1 is not modified.
; (SHXR is a macro defined elsewhere; it is used here to bring the
;  transmit-count field of SCFDR2 down to where SCFDR2_TX_MASK
;  applies - confirm against its definition.)
putchar:
	; Wait while the transmit count reads 16, which this code
	;  treats as "FIFO full".
1:	mov.w	@(SCFDR2-SCIF_BASE,gbr),r0
	SHXR	#SCFDR2_TX_SHIFT,r0
	and	#SCFDR2_TX_MASK,r0
	cmp/eq	#16,r0
	bt	1b
	; gbr-relative byte stores must come from r0.
	mov	r1,r0
	mov.b	r0,@(SCFTDR2-SCIF_BASE,gbr)
	; Wait until the transmit count reads zero.
1:	mov.w	@(SCFDR2-SCIF_BASE,gbr),r0
	SHXR	#SCFDR2_TX_SHIFT,r0
	tst	#SCFDR2_TX_MASK,r0
	bf	1b
	rts
	 nop
; Send the NUL-terminated string at r1 to the SCIF (addressed
;  relative to gbr, as in putchar).
; On return r1 points just past the terminating NUL.
; Destroys r0, r1.
; Unlike putchar, this does not wait for the transmit FIFO to empty
;  before returning.
putstr:
	; Wait while the transmit count reads 16 (treated as full).
	; This wait happens before every byte fetch, including the
	;  fetch of the terminator.
1:	mov.w	@(SCFDR2-SCIF_BASE,gbr),r0
	SHXR	#SCFDR2_TX_SHIFT,r0
	and	#SCFDR2_TX_MASK,r0
	cmp/eq	#16,r0
	bt	1b
	mov.b	@r1+,r0
	tst	r0,r0
	bt	1f		; NUL: end of string
	bra	1b
	 mov.b	r0,@(SCFTDR2-SCIF_BASE,gbr)	; (delay slot) queue the byte
1:	; don't bother waiting for drain here; we do a putchar call,
	;  which will drain everything, after all putstr calls and
	;  before anything for which it matters.
	rts
	 nop
; Print the single-precision float whose bit pattern is in r1, in
;  plain decimal with no exponent, through putchar.
; Output is an optional -, the integer digits, and then, only if the
;  fraction is nonzero, a . followed by fraction digits until the
;  remaining fraction is exactly zero.
; An infinity or NaN is printed as a single ? instead.
; Preserves pr (saved on the stack).
print_float:
	; float in r1
	; uses r0, r1, r2, fr0, fr1, fr2, fpul
	sts.l	pr,@-r15
	; Infinities and NaNs have an all-ones exponent field.  They
	;  would never leave the divide-by-10 loop below (10 > fr0 is
	;  never true for them), so catch them here, print a ? and
	;  return.
	SETS.L	#0x7f800000,r0
	mov	r1,r2
	and	r0,r2
	cmp/eq	r0,r2
	bf	3f
	bsr	putchar
	 mov	#63,r1		; ASCII code of ?
	bra	2f
	 nop
3:	; check for negative; if so, print - and negate
	lds	r1,fpul
	fsts	fpul,fr0
	fldi0	fr1
	fcmp/gt	fr0,fr1		; T = (0 > fr0)
	bf	1f
	bsr	putchar
	 mov	#'-,r1
	fneg	fr0
1:	; divide by 10 until it's less than 10, and keep count
	mov	#10,r0
	lds	r0,fpul
	float	fpul,fr1	; fr1 = 10.0 for the rest of the routine
	mov	#0,r2
1:	fcmp/gt	fr0,fr1		; T = (10 > fr0)
	bt	1f
	fdiv	fr1,fr0
	bra	1b
	 add	#1,r2
1:	; now fr0 < 10 and r2 is the number of divisions we did
	; print the first (possibly only) digit before the .
	ftrc	fr0,fpul
	sts	fpul,r1
	bsr	putchar
	 add	#'0,r1
	float	fpul,fr2	; fpul still holds the digit just printed
	fsub	fr2,fr0		; drop that digit, leaving the fraction
	; now, for r2 loops, print next digit
1:	cmp/pl	r2
	bf	1f
	fmul	fr1,fr0
	ftrc	fr0,fpul
	sts	fpul,r1
	float	fpul,fr2
	bsr	putchar
	 add	#'0,r1
	fsub	fr2,fr0
	bra	1b
	 add	#-1,r2
1:	; print as many digits as necessary to reach 0
	; print a . before the first one, if there are any
	mov	#'.,r1
	SETS.L	#0f0,r0
	lds	r0,fpul		; nothing to subtract on the first pass
1:	; Invariants at this point:
	;  - fpul contains integer part to be subtracted from fr0
	;  - r1 contains next character to print
	;  - loop if fr0 != 0 at this point
	fldi0	fr2
	fcmp/eq	fr0,fr2
	bt	2f
	float	fpul,fr2
	fsub	fr2,fr0
	fmul	fr1,fr0
	bsr	putchar
	 ftrc	fr0,fpul	; (delay slot) next digit, for the next pass
	sts	fpul,r1
	bra	1b
	 add	#'0,r1
2:	; Done.
	lds.l	@r15+,pr
	rts
	 nop
; Non-blocking read of one character from the SCIF (addressed
;  relative to gbr).
; Returns in r0 the received byte, zero-extended, or -1 if the
;  receive count in SCFDR2 is zero.
; Destroys r1.
nbgetchar:
	mov.w	@(SCFDR2-SCIF_BASE,gbr),r0
	; three-operand macro form; r1 is presumably its scratch
	;  register - macro is defined elsewhere; confirm
	SHXR	#SCFDR2_RX_SHIFT,r0,r1
	tst	#SCFDR2_RX_MASK,r0
	bt	1f		; nothing waiting
	mov.b	@(SCFRDR2-SCIF_BASE,gbr),r0
	extu.b	r0,r1		; mov.b sign-extends; keep 0..255
	; Read SCLSR2, then write it as zero.  NOTE(review): this looks
	;  like the read-then-clear sequence for the overrun flag -
	;  confirm against the SCIF documentation.
	mov.w	@(SCLSR2-SCIF_BASE,gbr),r0
	mov	#0,r0
	mov.w	r0,@(SCLSR2-SCIF_BASE,gbr)
	rts
	 mov	r1,r0
1:	rts
	 mov	#-1,r0

	SETCONST

	; Not sure we actually need to align the VBR; the only reason I
	;  have to suspect we might is that it's the kind of thing I've
	;  seen relatively often before - interrupt/trap vector tables
	;  often need to be aligned, not infrequently to a remarkably
	;  strict boundary.  I see no indication in the manuals that
	;  the SH requires _any_ alignment, but it's easy to do and
	;  definitely won't hurt anything.  (No explicit indication,
	;  that is.  It is implicit in the execution of code at
	;  VBR+0x100, VBR+0x400, and VBR+0x600 that VBR must be even.)
	.align	0x10000
	; Exception handling consists of:
	;	- Save PC and SR in SPC and SSR
	;	- Set SR bit BL to 1 (block exceptions/interrupts)
	;	- Set SR bit MD to 1 (privileged mode)
	;	- Set SR bit RB to 1 (r0-r7 bank 1)
	;	- Write code to EXPEVT or INTEVT
	;	- Set PC to vector addr, resume execution
	;
	; intvec is the value intended for VBR.  Each of the three
	;  entry points below does the same thing: put its own vector
	;  offset in r2, the current EXPEVT in r3 and the current INTEVT
	;  in r4, and jump to regdump, which prints everything and
	;  resets.  Since RB is 1 on entry, the r0-r4 used here are the
	;  bank 1 registers; the interrupted code's r0-r7 are untouched.
	; Each stub is followed by its own SETCONST so that its
	;  constants are emitted right after it.
intvec = .
. = intvec + 0x100
	; VBR+0x100 entry
	SETS.L	#0x100,r2
	SETS.L	#EXPEVT,r0
	mov.l	@r0,r3
	SETS.L	#INTEVT,r0
	SETS.L	#regdump,r1
	jmp	@r1
	 mov.l	@r0,r4
	SETCONST
. = intvec + 0x400
	; VBR+0x400 entry
	SETS.L	#0x400,r2
	SETS.L	#EXPEVT,r0
	mov.l	@r0,r3
	SETS.L	#INTEVT,r0
	SETS.L	#regdump,r1
	jmp	@r1
	 mov.l	@r0,r4
	SETCONST
. = intvec + 0x600
	; VBR+0x600 entry
	SETS.L	#0x600,r2
	SETS.L	#EXPEVT,r0
	mov.l	@r0,r3
	SETS.L	#INTEVT,r0
	SETS.L	#regdump,r1
	jmp	@r1
	 mov.l	@r0,r4
	SETCONST
. = intvec + 0x1000
; Labels for the register dump printed by regdump.  regdump walks
;  the crash_msgs table below in order and, for each entry, prints
;  the label, then crash_msg_equal, then one 32-bit value popped
;  from its stack, in hex.  The order of the table therefore has to
;  match the reverse of the order in which regdump pushes values.
; Four values are printed per line; the labels that start a new line
;  begin with CR LF.
crash_msg_0:
	.asciz	(13,10,10)"FATAL TRAP"(13,10)"R0  "
crash_msg_1:
	.asciz	"   R1  "
crash_msg_2:
	.asciz	"   R2  "
crash_msg_3:
	.asciz	"   R3  "
crash_msg_4:
	.asciz	(13,10)"R4  "
crash_msg_5:
	.asciz	"   R5  "
crash_msg_6:
	.asciz	"   R6  "
crash_msg_7:
	.asciz	"   R7  "
crash_msg_8:
	.asciz	(13,10)"R8  "
crash_msg_9:
	.asciz	"   R9  "
crash_msg_10:
	.asciz	"   R10 "
crash_msg_11:
	.asciz	"   R11 "
crash_msg_12:
	.asciz	(13,10)"R12 "
crash_msg_13:
	.asciz	"   R13 "
crash_msg_14:
	.asciz	"   R14 "
crash_msg_15:
	.asciz	"   R15 "
crash_msg_gbr:
	.asciz	(13,10)"GBR "
crash_msg_sr:
	.asciz	"   SR  "
crash_msg_pc:
	.asciz	"   PC  "
crash_msg_mach:
	.asciz	(13,10)"MACH"
crash_msg_macl:
	.asciz	"   MACL"
crash_msg_pr:
	.asciz	"   PR  "
crash_msg_vec:
	.asciz	(13,10)"vector"
crash_msg_expevt:
	.asciz	"   EXPEVT"
crash_msg_intevt:
	.asciz	"   INTEVT"
; Printed once after the last value.
crash_msg_done:
	.asciz	(13,10)
; Printed between each label and its value.
crash_msg_equal:
	.asciz	" = "
	.align	4
; Pointers to the labels above, in print order, ended by a zero
;  longword.
crash_msgs:
	.long	crash_msg_0
	.long	crash_msg_1
	.long	crash_msg_2
	.long	crash_msg_3
	.long	crash_msg_4
	.long	crash_msg_5
	.long	crash_msg_6
	.long	crash_msg_7
	.long	crash_msg_8
	.long	crash_msg_9
	.long	crash_msg_10
	.long	crash_msg_11
	.long	crash_msg_12
	.long	crash_msg_13
	.long	crash_msg_14
	.long	crash_msg_15
	.long	crash_msg_gbr
	.long	crash_msg_sr
	.long	crash_msg_pc
	.long	crash_msg_mach
	.long	crash_msg_macl
	.long	crash_msg_pr
	.long	crash_msg_vec
	.long	crash_msg_expevt
	.long	crash_msg_intevt
	.long	0
	.align	2
; Fatal-trap handler, entered by jmp from the vector stubs with
;	r2 = vector offset (0x100, 0x400 or 0x600)
;	r3 = EXPEVT
;	r4 = INTEVT
;  in the bank 1 registers.
; Switches to a private stack, pushes the interrupted state onto it,
;  prints it over the serial line using the crash_msgs table, and
;  then jumps to 0xa0000000.  Never returns.
; Values are pushed in the reverse of the order in which crash_msgs
;  labels them, so the print loop can simply pop one per label.
regdump:
	mov	r15,r5		; keep the interrupted r15 for the dump
	SETS.L	#intstacktop,r15
	mov.l	r4,@-r15	; INTEVT
	mov.l	r3,@-r15	; EXPEVT
	mov.l	r2,@-r15	; vector
	sts.l	pr,@-r15
	sts.l	macl,@-r15
	sts.l	mach,@-r15
	stc.l	spc,@-r15	; printed as PC
	stc.l	ssr,@-r15	; printed as SR
	stc.l	gbr,@-r15
	mov.l	r5,@-r15	; interrupted r15
	mov.l	r14,@-r15
	mov.l	r13,@-r15
	mov.l	r12,@-r15
	mov.l	r11,@-r15
	mov.l	r10,@-r15
	mov.l	r9,@-r15
	mov.l	r8,@-r15
	; The interrupted code's r0-r7 are in the other register bank.
	stc.l	r7_bank,@-r15
	stc.l	r6_bank,@-r15
	stc.l	r5_bank,@-r15
	stc.l	r4_bank,@-r15
	stc.l	r3_bank,@-r15
	stc.l	r2_bank,@-r15
	stc.l	r1_bank,@-r15
	stc.l	r0_bank,@-r15
	SETS.L	#SCIF_BASE,r14
	; putstr, printhex8 and putchar all reach the SCIF through gbr.
	; The interrupted gbr is already saved above, so point gbr at
	;  the SCIF ourselves instead of trusting whatever value it had
	;  when the trap happened.
	ldc	r14,gbr
	SETS.L	#crash_msgs,r9	; r9 walks the label table
	SETS.L	#putstr,r8
	SETS.L	#printhex8,r7
	SETS.L	#putchar,r6
1:	mov.l	@r9+,r1
	tst	r1,r1
	bt	1f		; zero entry ends the table
	jsr	@r8		; print the label
	 nop
	SETS.L	#crash_msg_equal,r1
	jsr	@r8
	 nop
	jsr	@r7		; print the value
	 mov.l	@r15+,r1	; (delay slot) pop the next saved value
	bra	1b
	 nop
1:	SETS.L	#crash_msg_done,r1
	jsr	@r8
	 nop
	; putchar waits for the transmit FIFO to empty, so this call
	;  makes sure the whole dump is out before we reset.
	jsr	@r6
	 mov	#0,r1
	SETS.L	#0xa0000000,r0	; hard-reset vector
	jmp	@r0
	 nop
	SETCONST
	; Private stack for the dump (0x1000 bytes, growing down from
	;  intstacktop).
	.align	4
	.space	0x1000
intstacktop = .