; This is designed to be serial-line downloaded to cdcode.
;
; Our memory map:
;
;	[8c000000,8c010000)	Stack (r15 set by cdcode)
;	[8c010000,8c01????)	cdcode
;	[8c020000,8c0?????)	Us

TRIG_TABLE_SIZE = 2048 ; must be a power of two

	.include "regs.s"

; Start of our image: 0x8c020000 is the "Us" region in the memory map
;  at the top of this file.  The first thing assembled here is a small
;  stub, so presumably cdcode starts execution at this address.
. = 0x8c020000

	; Entry stub: jump over the constant data and the trig table,
	;  which are assembled immediately after it, to main.
	SETS.L	#main,r0
	jmp	@r0
	 nop
	; NOTE(review): SETCONST is defined outside this file; presumably
	;  it emits the literals queued by the preceding SETS.L uses.
	SETCONST

	; 2*pi as an IEEE double, stored most-significant word first.
	;  init_trig loads it into dr12 with a single fmov.
	.align	8
twopi:
	.long	0x401921fb ; s=0, exp=1023+2, mant=(1.)921fbxxxxxxxx
	.long	0x54442d18 ; mant=(1.)xxxxx54442d18(469898cc...)

	; The trig table: TRIG_TABLE_SIZE entries of 4 bytes each.
	;  init_trig fills it, from the last entry down to the first,
	;  with single-float sin((i * 2pi) / TRIG_TABLE_SIZE).
	.align	4
sincos_table:
	.space	TRIG_TABLE_SIZE*4

	.align	2
; main -- body of the program, reached from the jump stub at the load
;  address.  It saves r10-r14, loads GBR from r14, clears SR.FD/RB/BL,
;  zeroes FPSCR, points VBR at intvec, runs init_trig (which builds and
;  dumps the trig table), prints CR LF, sets SR.BL again, restores
;  r10-r14 and returns.
;
; NOTE(review): r14 is used as passed in by the caller; since putchar
;  and friends address SCIF registers GBR-relative, this presumably
;  relies on cdcode entering us with r14 = SCIF_BASE -- confirm
;  against cdcode.
; NOTE(review): PR is never saved here although the bsr calls below
;  overwrite it.  The return address is instead taken from r11 after
;  r11 has been restored to the value it had on entry, so this
;  presumably relies on cdcode entering us with its return address in
;  r11 -- confirm against cdcode.
main:
	; Save the caller's r10-r14.
	mov.l	r14,@-r15
	mov.l	r13,@-r15
	mov.l	r12,@-r15
	mov.l	r11,@-r15
	mov.l	r10,@-r15
	; GBR = r14, for the GBR-relative SCIF accesses in the I/O routines.
	ldc	r14,gbr
	; Clear SR.FD, SR.RB and SR.BL.  With BL clear, exceptions are no
	;  longer blocked, so our handlers at intvec can run.
	stc	sr,r1
	SETS.L	#~[SR_FD|SR_RB|SR_BL],r2
	and	r2,r1
	ldc	r1,sr
	; Note that r0-r7 may have just changed if we switched banks.
	; FPSCR = 0 (init_trig sets the mode bits it needs itself).
	mov	#0,r1
	lds	r1,fpscr
	; Install our exception vector table.
	SETS.L	#intvec,r0
	ldc	r0,vbr
; Real code begins here.
	bsr	init_trig
	 nop
	; Finish with CR LF; the character is loaded in each delay slot.
	bsr	putchar
	 mov	#13,r1
	bsr	putchar
	 mov	#10,r1
	; Turn SR.BL (back) on before returning to cdcode.
	stc	sr,r1
	SETS.L	#SR_BL,r2
	or	r2,r1
	ldc	r1,sr
	; Restore r10-r13; r14 is restored in the rts delay slot.
	mov.l	@r15+,r10
	mov.l	@r15+,r11
	mov.l	@r15+,r12
	mov.l	@r15+,r13
	; Return address comes from the (restored) r11 -- see NOTE above.
	lds	r11,pr
	rts
	 mov.l	@r15+,r14

; Load the trig table.  We do this by summing the infinite series for
;  sin(x), carrying it far enough that the next term doesn't change the
;  sum.  We compute in a double and then convert to single for the
;  table, the extra precision just to ensure single-float accuracy.
;
; To work out the sum for sin(x),
;
; e^x = 1 + x + x^2/2! + x^3/3! + x^4/4! + x^5/5! + ...
; (which can be deduced from d/dx e^x = e^x)
;
; e^(ia) = cos(a) + i sin(a)  (Euler's formula)
;
; Let x = ia in the series above and collect real and imaginary terms:
;
; cos(a) = 1 - a^2/2! + a^4/4! - a^6/6! + a^8/8! - a^10/10! + ...
; sin(a) = a - a^3/3! + a^5/5! - a^7/7! + a^9/9! - a^11/11! + ...
;
; We compute sin(a) using the second of these.
;
; Pseudocode:
;	for i = TRIG_TABLE_SIZE .. 1
;		x = ((i-1) * 2pi) / TRIG_TABLE_SIZE
;		x2 = - x * x
;		s = x
;		p = x
;		n = 2
;		do
;			prevs = s
;			p = (p * x2) / (n * (n+1))
;			s += p
;			n += 2
;		while s != prevs
;		table[i-1] is in s
;
; We don't store x in the inner loop, since it's dead once x2, s, and p
;  are set.  We also develop the x value in p.
;
; We have 8 registers available, since we're running with PR=1.
; We use them for
;	dr0	scratch
;	dr2	p
;	dr4	s
;	dr6	x2
;	dr8	1
;	dr10	n
;	dr12	2pi/TRIG_TABLE_SIZE
;	dr14	prevs
; Other values in the pseudocode above are in
;	r0	scratch
;	r1	scratch
;	r2	i
;	r3	ptr to table[i]
; init_trig -- fill sincos_table, then dump it over the serial line.
;
; Phase 1 (FPSCR = SZ|PR, i.e. double precision with 64-bit fmov):
;  for i = TRIG_TABLE_SIZE down to 1, sum the sine series for
;  x = (i-1) * 2pi / TRIG_TABLE_SIZE in double precision until the sum
;  stops changing, convert to single and store at table[i-1].
; Phase 2 (FPSCR = 0): for each table entry print, separated by
;  spaces, the table value, the fsca result for the same angle, their
;  difference, and the XOR of the two bit patterns in hex, then CR LF.
;
; Takes no arguments.  Uses r0-r4, fpul and the FP registers noted
;  below; saves and restores pr around phase 2.  Leaves FPSCR = 0.
init_trig:
	SETS.L	#FPSCR_SZ|FPSCR_PR,r0
	lds	r0,fpscr
	; Tell the assembler which FPSCR mode the following code runs in.
	.pr	1
	.sz	1
	SETS.L	#twopi,r0		; dr12 = 2pi/TRIG_TABLE_SIZE
	fmov	@r0,dr12
	SETS.L	#TRIG_TABLE_SIZE,r0
	lds	r0,fpul
	float	fpul,dr0
	fdiv	dr0,dr12
	; grr, fldi1 is single-only!
	SETS.L	#1,r2			; dr8 = 1
	lds	r2,fpul
	float	fpul,dr8
	SETS.L	#TRIG_TABLE_SIZE,r2	; i = TRIG_TABLE_SIZE
	; r3 starts one past the end of the table; entries are stored
	;  with pre-decrement, so the table is filled from the top down.
	SETS.L	#[sincos_table+[4*TRIG_TABLE_SIZE]],r3
	; Outer loop (label 5): one table entry per iteration.
5:	mov	r2,r0			; p = (i-1) * (2pi / TRIG_TABLE_SIZE)
	add	#-1,r0
	lds	r0,fpul
	float	fpul,dr2
	fmul	dr12,dr2
	fmov	dr2,dr6			; x2 = - p * p
	fmul	dr2,dr6
	fneg	dr6
	fmov	dr2,dr4			; s = p
	SETS.L	#2,r0			; n = 2
	lds	r0,fpul
	float	fpul,dr10
	; Inner loop (label 4): add series terms until s stops changing.
4:	fmov	dr4,dr14		; prevs = s
	fmul	dr6,dr2			; p = (p * x2) / (n * (n+1))
	fmov	dr10,dr0		; merged with n += 2
	; dr0 = n; dr10 becomes n+1, dr0 becomes n*(n+1), dr10 becomes n+2.
	fadd	dr8,dr10
	fmul	dr10,dr0
	fadd	dr8,dr10
	fdiv	dr0,dr2
	fadd	dr2,dr4			; s += p
	fcmp/eq	dr4,dr14		; compare s vs prevs
	bf	4b
	fcnvds	dr4,fpul		; *--r3 = s (single-float)
	sts.l	fpul,@-r3
	dt	r2
	bf	5b
	; Phase 2: back to single precision / 32-bit fmov for the dump.
1:	SETS.L	#0,r0
	lds	r0,fpscr
	.pr	0
	.sz	0
	; We call subroutines from here on, so save pr.
	sts.l	pr,@-r15
	; r2 = entries left, r3 = pointer to next entry, r4 = entry index.
	SETS.L	#TRIG_TABLE_SIZE,r2
	SETS.L	#sincos_table,r3
	SETS.L	#0,r4
	; Constants print_float expects: fr15 = 10.0, fr14 = 0.0.
	SETS.L	#10,r0
	lds	r0,fpul
	float	fpul,fr15
	fldi0	fr14
	; Dump loop: fr2 = table value, fr4 = fsca result for index r4.
1:	fmov.s	@r3+,fr2
	mov	r4,r0
	; Scale the index to fsca's angle units.  The shift count 16-11
	;  hard-codes TRIG_TABLE_SIZE = 2^11; it must be changed by hand
	;  if TRIG_TABLE_SIZE changes.
	SHLL	#16-11,r0,r1	; 2K to 64K
	lds	r0,fpul
	fsca	fpul,fr4
	; Print the table value (argument loaded in the delay slot).
	bsr	print_float
	 fmov	fr2,fr0
	bsr	putchar
	 mov	#' ,r1
	; Print the fsca value.
	bsr	print_float
	 fmov	fr4,fr0
	bsr	putchar
	 mov	#' ,r1
	; Print fsca value minus table value (the subtract is in the
	;  delay slot).
	fmov	fr4,fr0
	bsr	print_float
	 fsub	fr2,fr0
	bsr	putchar
	 mov	#' ,r1
	; Print the XOR of the two raw bit patterns as 8 hex digits; the
	;  xor is in the delay slot, before printhex8 overwrites r0.
	flds	fr2,fpul
	sts	fpul,r1
	flds	fr4,fpul
	sts	fpul,r0
	bsr	printhex8
	 xor	r0,r1
	bsr	putchar
	 mov	#13,r1
	bsr	putchar
	 mov	#10,r1
	; Next entry; the index increment rides in the delay slot.
	dt	r2
	bf/s	1b
	 add	#1,r4
	lds.l	@r15+,pr
	rts
	 nop
; print_float -- print the single-precision value in fr0 in decimal
;  via putchar, as an optional '-', one character for the integer
;  part, '.', then fraction digits.
;
; In:	fr0  = value to print
;	fr14 = 0.0 and fr15 = 10.0 (set up by the caller, see init_trig)
; Uses:	r1, fr0, fr1, fpul, plus whatever putchar uses (r0).
;
; The integer part is emitted as the single character '0'+int, so the
;  output is only meaningful for magnitudes below 10.  At least one
;  fraction digit is always printed, and digits continue until the
;  remaining fraction compares equal to 0.0.
print_float:
	sts.l	pr,@-r15
	; T = (fr14 > fr0), i.e. the value is negative.
	fcmp/gt	fr0,fr14
	bf	1f
	bsr	putchar
	 mov	#'-,r1
	fneg	fr0
	; Integer part: truncate, print as one digit, keep the fraction.
1:	ftrc	fr0,fpul
	float	fpul,fr1
	fsub	fr1,fr0
	sts	fpul,r1
	bsr	putchar
	 add	#'0,r1
	bsr	putchar
	 mov	#'.,r1
	; Fraction: multiply by 10, print and remove the integer part,
	;  repeat until nothing is left.
1:	fmul	fr15,fr0
	ftrc	fr0,fpul
	float	fpul,fr1
	sts	fpul,r1
	bsr	putchar
	 add	#'0,r1
	fsub	fr1,fr0
	fcmp/eq	fr0,fr14
	bf	1b
	lds.l	@r15+,pr
	rts
	 nop

	SETCONST

; printhex8 -- print r1 as 8 lower-case hex digits via putchar.
; printhexN -- print the low r0 hex digits of r1 (printhex8 simply
;  sets r0 = 8 and falls into printhexN).
;
; In:	r1 = value; for printhexN also r0 = number of digits
; Out:	nothing.  r0 and r1 are clobbered; r2-r4 and pr are preserved.
printhex8:
	mov	#8,r0
printhexN:
	mov.l	r4,@-r15
	mov	r0,r4			; r4 = digits left to print
	; Shift the value left by (8-N)*4 bits so the first digit to
	;  print sits in the top nibble.
	add	#-8,r0
	neg	r0,r0
	SHLL	#2,r0
	shld	r0,r1
	mov.l	r3,@-r15
	mov.l	r2,@-r15
	sts.l	pr,@-r15
	mova	9f,r0			; r3 = address of the digit table
	mov	r0,r3
	mov	r1,r2			; r2 = remaining value, left-justified
	; Per digit: r0 = top nibble of r2, r2 <<= 4, then print the
	;  table character (fetched in the delay slot).
1:	mov	r2,r0
	SHLR	#28,r0,r1
	SHLL	#4,r2
	add	r3,r0
	bsr	putchar
	 mov.b	@r0,r1
	dt	r4
	bf	1b
	lds.l	@r15+,pr
	mov.l	@r15+,r2
	mov.l	@r15+,r3
	rts
	 mov.l	@r15+,r4
	; Digit table; aligned because it is addressed with mova above.
	.align	4
9:	.ascii	"0123456789abcdef"
	.align	2
; putchar2 -- send the character in r1 twice: call putchar once with
;  r1 and pr saved on the stack, restore both, then fall through into
;  putchar for the second copy (which returns to our caller).
putchar2:
	sts.l	pr,@-r15
	bsr	putchar
	 mov.l	r1,@-r15
	mov.l	@r15+,r1
	 lds.l	@r15+,pr
; putchar -- send the character in r1 out the SCIF and wait until the
;  transmit FIFO is empty.
;
; In:	r1 = character; GBR must hold SCIF_BASE
; Out:	nothing.  r0 is clobbered; r1 is left unchanged.
putchar:
	; Wait while the transmit count field of SCFDR2 reads 16
	;  (presumably "FIFO full").
1:	mov.w	@(SCFDR2-SCIF_BASE,gbr),r0
	SHXR	#SCFDR2_TX_SHIFT,r0
	and	#SCFDR2_TX_MASK,r0
	cmp/eq	#16,r0
	bt	1b
	; GBR-relative byte stores must go through r0.
	mov	r1,r0
	mov.b	r0,@(SCFTDR2-SCIF_BASE,gbr)
	; Wait until the transmit count field reads zero.
1:	mov.w	@(SCFDR2-SCIF_BASE,gbr),r0
	SHXR	#SCFDR2_TX_SHIFT,r0
	tst	#SCFDR2_TX_MASK,r0
	bf	1b
	rts
	 nop
; putstr -- send the NUL-terminated string at r1 out the SCIF.
;
; In:	r1 = address of string; GBR must hold SCIF_BASE
; Out:	r1 points just past the terminating NUL; r0 is clobbered.
;
; Unlike putchar, this only waits for room in the transmit FIFO
;  before each byte; it does not wait for the FIFO to empty at the end.
putstr:
	; Wait while the transmit count field of SCFDR2 reads 16
	;  (presumably "FIFO full").
1:	mov.w	@(SCFDR2-SCIF_BASE,gbr),r0
	SHXR	#SCFDR2_TX_SHIFT,r0
	and	#SCFDR2_TX_MASK,r0
	cmp/eq	#16,r0
	bt	1b
	; Fetch the next byte; a NUL ends the string and is not sent.
	mov.b	@r1+,r0
	tst	r0,r0
	bt	1f
	; Otherwise send it (store in the delay slot) and loop.
	bra	1b
	 mov.b	r0,@(SCFTDR2-SCIF_BASE,gbr)
1:	; don't bother waiting for drain here; we do a putchar call,
	;  which will drain everything, after all putstr calls and
	;  before anything for which it matters.
	rts
	 nop
; nbgetchar -- non-blocking read of one character from the SCIF.
;
; In:	GBR must hold SCIF_BASE
; Out:	r0 = the received byte, zero-extended (0..255), or -1 if the
;	receive count field of SCFDR2 is zero.  r1 is clobbered.
nbgetchar:
	mov.w	@(SCFDR2-SCIF_BASE,gbr),r0
	SHXR	#SCFDR2_RX_SHIFT,r0,r1
	tst	#SCFDR2_RX_MASK,r0
	bt	1f			; nothing received
	mov.b	@(SCFRDR2-SCIF_BASE,gbr),r0
	extu.b	r0,r1			; keep the byte in r1; r0 is reused below
	; Read SCLSR2, then write zero to it.
	; NOTE(review): presumably the read-then-write-zero sequence that
	;  clears the register's status flag(s) -- confirm against the
	;  SCIF documentation.
	mov.w	@(SCLSR2-SCIF_BASE,gbr),r0
	mov	#0,r0
	mov.w	r0,@(SCLSR2-SCIF_BASE,gbr)
	rts
	 mov	r1,r0
1:	rts
	 mov	#-1,r0

	SETCONST

	; Not sure we actually need to align the VBR; the only reason I
	;  have to suspect we might is that it's the kind of thing I've
	;  seen relatively often before - interrupt/trap vector tables
	;  often need to be aligned, not infrequently to a remarkably
	;  strict boundary.  I see no indication in the manuals that
	;  the SH requires _any_ alignment, but it's easy to do and
	;  definitely won't hurt anything.  (No explicit indication,
	;  that is.  It is implicit in the execution of code at
	;  VBR+0x100, VBR+0x400, and VBR+0x600 that VBR must be even.)
	.align	0x10000
	; Exception handling consists of:
	;	- Save PC and SR in SPC and SSR
	;	- Set SR bit BL to 1 (block exceptions/interrupts)
	;	- Set SR bit MD to 1 (privileged mode)
	;	- Set SR bit RB to 1 (r0-r7 bank 1)
	;	- Write code to EXPEVT or INTEVT
	;	- Set PC to vector addr, resume execution
; Exception vector area.  main points VBR at intvec, so the CPU enters
;  the code at intvec+0x100, intvec+0x400 or intvec+0x600.
;  NOTE(review): per the SH-4 manuals these are presumably the general
;  exception, TLB miss and interrupt entry points respectively.
;
; All three stubs are identical apart from the offset they report.
;  Each hands control to regdump with
;	r1 = address of regdump
;	r2 = vector offset we were entered at
;	r3 = contents of EXPEVT
;	r4 = contents of INTEVT (loaded in the jmp delay slot)
;  EXPEVT is always read before INTEVT.
intvec = .

; ---- entry at VBR+0x100 ----
. = intvec + 0x100
	SETS.L	#regdump,r1
	SETS.L	#0x100,r2
	SETS.L	#EXPEVT,r0
	mov.l	@r0,r3
	SETS.L	#INTEVT,r0
	jmp	@r1
	 mov.l	@r0,r4
	SETCONST

; ---- entry at VBR+0x400 ----
. = intvec + 0x400
	SETS.L	#regdump,r1
	SETS.L	#0x400,r2
	SETS.L	#EXPEVT,r0
	mov.l	@r0,r3
	SETS.L	#INTEVT,r0
	jmp	@r1
	 mov.l	@r0,r4
	SETCONST

; ---- entry at VBR+0x600 ----
. = intvec + 0x600
	SETS.L	#regdump,r1
	SETS.L	#0x600,r2
	SETS.L	#EXPEVT,r0
	mov.l	@r0,r3
	SETS.L	#INTEVT,r0
	jmp	@r1
	 mov.l	@r0,r4
	SETCONST
; Text for the register dump printed by regdump.  Each crash_msg_* is
;  the label printed in front of one value; regdump follows each label
;  with crash_msg_equal and the value as 8 hex digits.  The labels are
;  laid out so the dump comes out four (or three) values per line.
. = intvec + 0x1000
crash_msg_0:
	.asciz	(13,10,10)"FATAL TRAP"(13,10)"R0  "
crash_msg_1:
	.asciz	"   R1  "
crash_msg_2:
	.asciz	"   R2  "
crash_msg_3:
	.asciz	"   R3  "
crash_msg_4:
	.asciz	(13,10)"R4  "
crash_msg_5:
	.asciz	"   R5  "
crash_msg_6:
	.asciz	"   R6  "
crash_msg_7:
	.asciz	"   R7  "
crash_msg_8:
	.asciz	(13,10)"R8  "
crash_msg_9:
	.asciz	"   R9  "
crash_msg_10:
	.asciz	"   R10 "
crash_msg_11:
	.asciz	"   R11 "
crash_msg_12:
	.asciz	(13,10)"R12 "
crash_msg_13:
	.asciz	"   R13 "
crash_msg_14:
	.asciz	"   R14 "
crash_msg_15:
	.asciz	"   R15 "
crash_msg_gbr:
	.asciz	(13,10)"GBR "
crash_msg_sr:
	.asciz	"   SR  "
crash_msg_pc:
	.asciz	"   PC  "
crash_msg_mach:
	.asciz	(13,10)"MACH"
crash_msg_macl:
	.asciz	"   MACL"
crash_msg_pr:
	.asciz	"   PR  "
crash_msg_vec:
	.asciz	(13,10)"vector"
crash_msg_expevt:
	.asciz	"   EXPEVT"
crash_msg_intevt:
	.asciz	"   INTEVT"
; Printed once after the last value.
crash_msg_done:
	.asciz	(13,10)
; Printed between each label and its value.
crash_msg_equal:
	.asciz	" = "
	.align	4
; Zero-terminated list of label pointers, walked by regdump.  The
;  order here must match the order in which regdump pops the saved
;  values off its stack (i.e. the reverse of its push order).
crash_msgs:
	.long	crash_msg_0
	.long	crash_msg_1
	.long	crash_msg_2
	.long	crash_msg_3
	.long	crash_msg_4
	.long	crash_msg_5
	.long	crash_msg_6
	.long	crash_msg_7
	.long	crash_msg_8
	.long	crash_msg_9
	.long	crash_msg_10
	.long	crash_msg_11
	.long	crash_msg_12
	.long	crash_msg_13
	.long	crash_msg_14
	.long	crash_msg_15
	.long	crash_msg_gbr
	.long	crash_msg_sr
	.long	crash_msg_pc
	.long	crash_msg_mach
	.long	crash_msg_macl
	.long	crash_msg_pr
	.long	crash_msg_vec
	.long	crash_msg_expevt
	.long	crash_msg_intevt
	.long	0
	.align	2
; regdump -- common tail of the three exception stubs.  Prints a
;  "FATAL TRAP" register dump on the serial line and then jumps to the
;  hard-reset vector; it never returns.
;
; In (from the vector stubs; we are running on register bank 1):
;	r2 = vector offset entered at (0x100, 0x400 or 0x600)
;	r3 = EXPEVT contents
;	r4 = INTEVT contents
;	r15 = the interrupted code's stack pointer
;
; Everything to be displayed is pushed onto a private stack
;  (intstacktop) in the reverse of the order of crash_msgs, so that
;  the print loop can simply pop one value per label.
regdump:
	mov	r15,r5			; remember the interrupted r15
	SETS.L	#intstacktop,r15	; switch to our own stack
	mov.l	r4,@-r15		; INTEVT
	mov.l	r3,@-r15		; EXPEVT
	mov.l	r2,@-r15		; vector offset
	sts.l	pr,@-r15
	sts.l	macl,@-r15
	sts.l	mach,@-r15
	stc.l	spc,@-r15		; displayed as PC
	stc.l	ssr,@-r15		; displayed as SR
	stc.l	gbr,@-r15
	mov.l	r5,@-r15		; displayed as R15
	mov.l	r14,@-r15
	mov.l	r13,@-r15
	mov.l	r12,@-r15
	mov.l	r11,@-r15
	mov.l	r10,@-r15
	mov.l	r9,@-r15
	mov.l	r8,@-r15
	; The interrupted code's r0-r7 live in the other bank.
	stc.l	r7_bank,@-r15
	stc.l	r6_bank,@-r15
	stc.l	r5_bank,@-r15
	stc.l	r4_bank,@-r15
	stc.l	r3_bank,@-r15
	stc.l	r2_bank,@-r15
	stc.l	r1_bank,@-r15
	stc.l	r0_bank,@-r15
	; putstr, putchar and printhex8 address the SCIF GBR-relative, so
	;  make sure GBR really is SCIF_BASE rather than trusting whatever
	;  the crashed code left there.  The old GBR is already saved above
	;  for display.
	SETS.L	#SCIF_BASE,r14
	ldc	r14,gbr
	; r9 walks crash_msgs; r8/r7/r6 hold the output routine addresses
	;  so they can be called with jsr.
	SETS.L	#crash_msgs,r9
	SETS.L	#putstr,r8
	SETS.L	#printhex8,r7
	SETS.L	#putchar,r6
	; For each label: print label, " = ", and the next saved value.
1:	mov.l	@r9+,r1
	tst	r1,r1
	bt	1f			; zero pointer ends the list
	jsr	@r8
	 nop
	SETS.L	#crash_msg_equal,r1
	jsr	@r8
	 nop
	; Pop the value in the delay slot; printhex8's own pushes then
	;  only overwrite stack slots that have already been printed.
	jsr	@r7
	 mov.l	@r15+,r1
	bra	1b
	 nop
1:	SETS.L	#crash_msg_done,r1
	jsr	@r8
	 nop
	; putchar waits for the transmit FIFO to empty, so sending one
	;  more character (a NUL) makes sure the whole dump has gone out
	;  before we reset.
	jsr	@r6
	 mov	#0,r1
	SETS.L	#0xa0000000,r0	; hard-reset vector
	jmp	@r0
	 nop
	SETCONST
	; Private stack used by regdump: 0x1000 bytes, growing down from
	;  intstacktop.
	.align	4
	.space	0x1000
intstacktop = .