; Set the location counter: everything below is assembled starting here.
. = 0x8c010000

; Definitions not visible in this file.  Presumably the source of the
; CCR_*, SCIF/SC*2, Px_*, DCAA_* names and of the SETS/SHLL/SHLR/SHXR/
; SETCONST macros used below -- confirm against regs.s.
.include "regs.s"

; Peripheral clock rate.  Used below to compute the SCIF bit-rate
; register value and the delay that follows setting it.
DC_PCLOCK = 49900000 ; Hz
; Serial line speed, in bits per second.
BAUDRATE = 57600

; Iteration count of the busy-wait loop that progress runs after it has
; drawn its dot.
PROGRESS_DELAY = 50100100

; ASCII line feed and carriage return.
LF = 10
CR = 13

; Address that putdot adds its dot offsets to, and the number of bytes
; from one display line to the next.
VIDBEG = 0xa520a000
VIDSTRIDE = 640 * 4

DOTWD = 16	; bytes (putdot assumes DOTWD is a multiple of 4)
DOTHT = 5	; lines

; Number of DOTWD-byte dots that fit in one display line.
XDOT = VIDSTRIDE / DOTWD

; Value written to CCR by enable_cache.
; Not set in CCR_CONFIG: IIX OIX ORA WT
CCR_CONFIG = CCR_ICI | CCR_ICE | CCR_OCI | CCR_CB | CCR_OCE

	.entry	.

; Program entry.  The stacktop label marks the first instruction; its
; address is also what enable_cache later loads into r15.
; This stretch draws an all-0s dot at every position of a 128x64 dot
; grid, shows the first progress dot, and then jumps to enable_cache
; through that label's P2 address.
stacktop:
; r0 = scratch
; r1 = video ram base
; r2 = vidstride in bytes
; r3 = next dot X
; r4 = next dot Y
; r5 = dot X
; r6 = dot Y
; r7 = dot V
; r8 = scratch
; r9 = scratch
; r10 = scratch
; r11 = scratch
; r12 = scratch
	SETS.L	#VIDBEG,r1
	SETS.L	#VIDSTRIDE,r2
	mov	#64-1,r4	; start with the last dot row
2:	mov	#128-1,r3	; start with the last dot column of this row
	mov	#0,r7		; low bit 0: putdot stores all 0s
1:	mov	r4,r6
	bsr	putdot
	 mov	r3,r5		; (delay slot) dot X for putdot
	cmp/pl	r3		; columns left in this row?
	bt/s	1b
	 add	#-1,r3		; (delay slot) executed whether or not we loop
	cmp/pl	r4		; rows left?
	bt/s	2b
	 add	#-1,r4		; (delay slot) executed whether or not we loop
	bsr	progress
	 mov	#1,r0		; (delay slot) progress dot at X=1

	; We want to be running in P2, to fiddle CCR.  (The hardware
	; PDF, page 77, says that "CCR modifications must only be made
	; by a program in the non-cached P2 area".)
	mova	enable_cache,r0
	SETS.L	#Px_MASKOFF,r8	; mask off Px-selecting bits
	SETS.L	#P2_BITS,r9	; bits for P2
	and	r8,r0
	or	r9,r0		; r0 = P2 alias of enable_cache
	jmp	@r0
	 nop

	.align	4
; Entered through its P2 address from the entry code.  Writes CCR_CONFIG
; to CCR, burns eight instructions, loads r15 with the address of
; stacktop, and jumps to cacheon through that label's P1 address.
; Expects r8 = Px_MASKOFF, as left by the entry code.
enable_cache:
	; Now running in P2, so we can turn on the cache.
	SETS.L	#CCR,r0
	SETS.L	#CCR_CONFIG,r9
	mov.l	r9,@r0
	; The hardware PDF, page 77, says that "After CCR is updated,
	; an instruction that performs data access to the P0, P1, P3,
	; or U0 area should be located at least four instructions after
	; the CCR update instruction.  Also, a branch instruction to
	; the P0, P1, P3, or U0 area should be located at least eight
	; instructions after the CCR update instruction."  It doesn't
	; say why this is "should" rather than "must", nor does it
	; describe the consequences if this is not done, nor does it
	; say whether this "after" refers to address space or
	; instruction execution order (eg, does a three-instruction
	; loop that's executed three times count as nine instructions
	; or three? does a branch seven instructions forward count?).
	; We treat it pessimistically, making sure we burn eight
	; instructions by any of these measures.
	;
	; Gotta love incomplete doc.
	;
	mova	cacheon,r0	; #1
	SETS.L	#P1_BITS,r9	; #2
	and	r8,r0		; #3
	or	r9,r0		; #4  r0 = P1 alias of cacheon
	mov.l	9f,r15		; #5  r15 = address of stacktop
	nop			; #6
	nop			; #7
	nop			; #8
	jmp	@r0
	 nop

	.align	4
9:	.long	stacktop

	.align	2
progress:
; Draws an all-1s dot at dot column r0 of dot row 2 via putdot, then
; busy-waits for PROGRESS_DELAY loop iterations before returning.
; In:       r0 = dot X, r1 = VIDBEG, r2 = VIDSTRIDE
; Clobbers: r0, r5, r6, r7, r8, r9, r10
	mov	#1,r7		; low bit 1: putdot stores all 1s
	mov	#2,r6		; dot Y
	sts	pr,r10		; the bsr below overwrites pr; putdot leaves r10 alone
	bsr	putdot
	 mov	r0,r5		; (delay slot) dot X
	SETS.L	#PROGRESS_DELAY,r6
1:	dt	r6		; count r6 down to zero
	bf	1b
	jmp	@r10		; back to our caller
	 nop

putdot:
; Stores dot of all 0s or all 1s, depending on low bit of r7, at (r5,r6)
; Requires VIDBEG in r1 and VIDSTRIDE in r2.
; Destroys r0, r8, r9
; The dot is DOTWD bytes wide and DOTHT lines tall; its first byte is at
;   r1 + (r5+1)*DOTWD + (r6+1)*DOTHT*VIDSTRIDE.
; r5 and r6 are incremented on entry and decremented again on exit, so
; the caller sees them unchanged.
	add	#1,r5
	add	#1,r6
	mov	#DOTWD,r0
	mul.l	r0,r5
	SETS.L	#DOTHT*VIDSTRIDE,r0
	sts	macl,r8		; X offset in bytes
	mul.l	r0,r6
	mov	r7,r9
	sts	macl,r0		; Y offset in bytes
	shlr	r9		; T = low bit of r7
	add	r8,r0
	bt/s	1f		; T set: all-1s dot
	 add	r1,r0		; (delay slot) r0 = address of the dot's first byte
	bra	2f
	 mov	#0,r8		; (delay slot) fill pattern: all 0s
1:	mov	#-1,r8		; fill pattern: all 1s
2:	mov	#DOTHT,r9	; r9 = lines left to store
3:	; We want to do DOTWD/4 mov.l instructions, with offsets
	; increasing by 4 each time.  This recursive macro does this.
	; This code assumes DOTWD is a multiple of 4.
	.macro	foo	b, o
	.if	$(b) >= 4
	mov.l	r8,@($(o),r0)
	foo	$(b)-4,$(o)+4
	.endif
	.endm
	foo	DOTWD,0
	dt	r9
	bf/s	3b
	 add	r2,r0		; (delay slot) advance to the next display line
	add	#-1,r5		; undo the entry adjustment of r5
	rts
	 add	#-1,r6		; (delay slot) undo the entry adjustment of r6

	.align	4
cacheon:
; Reached from enable_cache through this label's P1 address.
; Sets up the SCIF for BAUDRATE bits/s, 8 data bits, no parity, 1 stop
; bit, then falls through into main with gbr = SCIF_BASE.
; A progress dot (with its delay) is shown after each step; progress
; relies on r1 = VIDBEG and r2 = VIDSTRIDE, which are still as the entry
; code left them.
	bsr	progress
	 mov	#4,r0
	; Initialize the SCIF.  Mostly follows hardware PDF figure
	; 16.6, but not entirely (eg, 16.6 shows turning on CKE1, but
	; we don't want external clock, so we don't).
	SETS.L	#SCIF_BASE,r0
	ldc	r0,gbr
	bsr	progress
	 mov	#6,r0
	; Clear SCSCR2 (in particular, clear TE and RE).
	mov	#0,r0
	mov.w	r0,@(SCSCR2-SCIF_BASE,gbr)
	bsr	progress
	 mov	#8,r0
	; Clear out the FIFOs.
	SETS.W	#SCFCR2_TFRST|SCFCR2_RFRST,r0
	mov.w	r0,@(SCFCR2-SCIF_BASE,gbr)
	bsr	progress
	 mov	#10,r0
	; Configure for 8N1.
	SETS.W	#SCSMR2_CHR_8|SCSMR2_PE_DIS|SCSMR2_STOP_1|SCSMR2_CKS_DIV1,r0
	mov.w	r0,@(SCSMR2-SCIF_BASE,gbr)
	bsr	progress
	 mov	#12,r0
	; Set the BRG constant.  In asynchronous mode with the
	; divide-by-1 clock source, the bit rate the hardware produces
	; is Pclk/(32*(SCBRR2+1)), so the register value has to be
	; Pclk/(32*baud) - 1.  The bracketed expression is
	; Pclk/(32*baud) rounded to nearest; the trailing -1 accounts
	; for the register's offset-by-one encoding.  (Without it the
	; line runs about 3% slow at these settings.)
	mov	#[[[[DC_PCLOCK*2]/[32*BAUDRATE]]+1]/2]-1,r0
	mov.b	r0,@(SCBRR2-SCIF_BASE,gbr)
	bsr	progress
	 mov	#14,r0
	; Delay at least one bit time.
	; The value for delaytime assumes this loop takes only one
	; clock per iteration.  This seems implausible, but it's about
	; what I see when testing; perhaps the CPU clock is twice the
	; BRG divisor chain's clock.  Or perhaps this is superscalarity
	; in action.  Whatever - if it takes more than one clock per
	; loop, we delay longer than expected, but that's harmless.
	SETS.L	#[2*DC_PCLOCK]/BAUDRATE,r0	; 2 bit times, in clocks
1:	dt	r0
	bf	1b
	bsr	progress
	 mov	#16,r0
	; Set the FIFO interrupt trigger points and clear the reset
	; bits.  We don't actually care about the trigger points,
	; because we don't use interrupts; we might be able to skip
	; this step, but it's easy and harmless.
	SETS.W	#SCFCR2_RXT_8|SCFCR2_TXT_8,r0
	mov.w	r0,@(SCFCR2-SCIF_BASE,gbr)
	bsr	progress
	 mov	#18,r0
	; Setup complete.  Enable transmitter and receiver.
	SETS.W	#SCSCR2_TE|SCSCR2_RE,r0
	mov.w	r0,@(SCSCR2-SCIF_BASE,gbr)
	bsr	progress
	 mov	#20,r0
	; Flush any lingering statuses: each status register is read
	; and then written with 0.
	mov.w	@(SCFSR2-SCIF_BASE,gbr),r0
	mov	#0,r0
	mov.w	r0,@(SCFSR2-SCIF_BASE,gbr)
	mov.w	@(SCLSR2-SCIF_BASE,gbr),r0
	bsr	progress
	 mov	#22,r0
	mov	#0,r0
	mov.w	r0,@(SCLSR2-SCIF_BASE,gbr)
	bsr	progress
	 mov	#24,r0

main:
; Monitor start-up; entered by falling out of the SCIF initialization.
; Saves the SCIF base, announces itself with "~" CR LF, selects size 1,
; and enters the command loop at maintop.
; Register roles from here on:
;   r15     conventional stack pointer (already set up)
;   r14     SCIF base pointer (arrives in gbr)
;   r13     shift register, high half
;   r12     shift register, low half
;   r11     top-of-loop pointer
;   r10     current size (1, 2, or 4)
;   r0-r9   scratch
;   gbr     scratch
	stc	gbr,r14		; putchar reloads gbr from r14
	mov	#1,r10		; initial size; putchar leaves r10 alone
	bsr	putchar
	 mov	#'~,r1
	bsr	putchar
	 mov	#CR,r1
	bsr	putchar
	 mov	#LF,r1
	mova	maintop,r0
	jmp	@r0
	 mov	r0,r11		; (delay slot) handlers come back through r11
; Idle path of the command loop, taken when there is nothing to
; dispatch.  Bumps the counter in r4 and redraws the dot at (20,5) from
; it, so the dot changes as the counter runs.
mainloop:
	add	#1,r4
	SETS.L	#VIDBEG,r1
	mov	r4,r7
	SETS.L	#VIDSTRIDE,r2
	SHLR	#17,r7/r6	; presumably r7 >>= 17 with r6 as scratch; putdot looks at the low bit
	mov	#20,r5
	bsr	putdot
	 mov	#5,r6
	; Pad so that maintop lands on a 4-byte boundary; main takes its
	; address with mova.
	.if	. & 2
	nop
	.endif
; Top of the command loop (r11 points here).  Checks the receive FIFO
; count; with nothing waiting, goes to mainloop.  Otherwise reads one
; byte, discards it if SCFSR2 showed FER or PER, and else jumps to the
; byte's handler from chartbl with the byte in r1 and chartbl's address
; in r0.
maintop:
	ldc	r14,gbr
	mov.w	@(SCFDR2-SCIF_BASE,gbr),r0
	SHXR	#SCFDR2_RX_SHIFT,r0/r1
	tst	#SCFDR2_RX_MASK,r0
	bt	mainloop	; receive count is zero
	mov.w	@(SCFSR2-SCIF_BASE,gbr),r0
	tst	#SCFSR2_FER|SCFSR2_PER,r0	; T = 1 when neither error bit is set
	mov.b	@(SCFRDR2-SCIF_BASE,gbr),r0	; take the byte out of the FIFO either way
	extu.b	r0,r1
	mov.w	@(SCLSR2-SCIF_BASE,gbr),r0	; read SCLSR2, then write 0 to it
	mov	#0,r0
	mov.w	r0,@(SCLSR2-SCIF_BASE,gbr)
	bf	mainloop	; T still from the tst above: drop a byte received in error
	mov	r1,r2
	mova	chartbl,r0
	SHLL	#2,r2		; 4 bytes per table entry
	mov.l	@(r0,r2),r2
	jmp	@r2
	 nop
	SETCONST

	.align	4
; Command dispatch table: 256 handler addresses, indexed by the received
; byte.  maintop loads entry [byte] and jumps to it.  Bytes that are not
; commands point at char_default.
chartbl:
	.long	char_default	; 0x00
	.long	char_default	; 0x01
	.long	char_default	; 0x02
	.long	char_default	; 0x03
	.long	char_default	; 0x04
	.long	char_default	; 0x05
	.long	char_default	; 0x06
	.long	char_default	; 0x07
	.long	char_default	; 0x08
	.long	char_default	; 0x09
	.long	char_default	; 0x0a
	.long	char_default	; 0x0b
	.long	char_default	; 0x0c
	.long	char_default	; 0x0d
	.long	char_default	; 0x0e
	.long	char_default	; 0x0f
	.long	char_default	; 0x10
	.long	char_default	; 0x11
	.long	char_default	; 0x12
	.long	char_default	; 0x13
	.long	char_default	; 0x14
	.long	char_default	; 0x15
	.long	char_default	; 0x16
	.long	char_default	; 0x17
	.long	char_default	; 0x18
	.long	char_default	; 0x19
	.long	char_default	; 0x1a
	.long	char_default	; 0x1b
	.long	char_default	; 0x1c
	.long	char_default	; 0x1d
	.long	char_default	; 0x1e
	.long	char_default	; 0x1f
	.long	char_default	; 0x20 = space
	.long	char_store	; 0x21 = !
	.long	char_default	; 0x22 = "
	.long	char_default	; 0x23 = #
	.long	char_default	; 0x24 = $
	.long	char_default	; 0x25 = %
	.long	char_default	; 0x26 = &
	.long	char_default	; 0x27 = '
	.long	char_default	; 0x28 = (
	.long	char_default	; 0x29 = )
	.long	char_indir	; 0x2a = *
	.long	char_plus	; 0x2b = +
	.long	char_default	; 0x2c = ,
	.long	char_minus	; 0x2d = -
	.long	char_show_d	; 0x2e = .
	.long	char_default	; 0x2f = /
	.long	char_digit	; 0x30 = 0
	.long	char_digit	; 0x31 = 1
	.long	char_digit	; 0x32 = 2
	.long	char_digit	; 0x33 = 3
	.long	char_digit	; 0x34 = 4
	.long	char_digit	; 0x35 = 5
	.long	char_digit	; 0x36 = 6
	.long	char_digit	; 0x37 = 7
	.long	char_digit	; 0x38 = 8
	.long	char_digit	; 0x39 = 9
	.long	char_default	; 0x3a = :
	.long	char_default	; 0x3b = ;
	.long	char_default	; 0x3c = <
	.long	char_default	; 0x3d = =
	.long	char_default	; 0x3e = >
	.long	char_show	; 0x3f = ?
	.long	char_fetch	; 0x40 = @
	.long	char_default	; 0x41 = A
	.long	char_size_1	; 0x42 = B
	.long	char_default	; 0x43 = C
	.long	char_default	; 0x44 = D
	.long	char_default	; 0x45 = E
	.long	char_default	; 0x46 = F
	.long	char_default	; 0x47 = G
	.long	char_default	; 0x48 = H
	.long	char_default	; 0x49 = I
	.long	char_jmp	; 0x4a = J
	.long	char_default	; 0x4b = K
	.long	char_size_4	; 0x4c = L
	.long	char_default	; 0x4d = M
	.long	char_default	; 0x4e = N
	.long	char_default	; 0x4f = O
	.long	char_default	; 0x50 = P
	.long	char_default	; 0x51 = Q
	.long	char_default	; 0x52 = R
	.long	char_default	; 0x53 = S
	.long	char_tbl	; 0x54 = T
	.long	char_default	; 0x55 = U
	.long	char_vid	; 0x56 = V
	.long	char_size_2	; 0x57 = W
	.long	char_default	; 0x58 = X
	.long	char_default	; 0x59 = Y
	.long	char_default	; 0x5a = Z
	.long	char_default	; 0x5b = [
	.long	char_default	; 0x5c = \
	.long	char_default	; 0x5d = ]
	.long	char_default	; 0x5e = ^
	.long	char_default	; 0x5f = _
	.long	char_default	; 0x60 = `
	.long	char_xdigit	; 0x61 = a
	.long	char_xdigit	; 0x62 = b
	.long	char_xdigit	; 0x63 = c
	.long	char_xdigit	; 0x64 = d
	.long	char_xdigit	; 0x65 = e
	.long	char_xdigit	; 0x66 = f
	.long	char_default	; 0x67 = g
	.long	char_default	; 0x68 = h
	.long	char_default	; 0x69 = i
	.long	char_default	; 0x6a = j
	.long	char_default	; 0x6b = k
	.long	char_default	; 0x6c = l
	.long	char_default	; 0x6d = m
	.long	char_default	; 0x6e = n
	.long	char_default	; 0x6f = o
	.long	char_default	; 0x70 = p
	.long	char_default	; 0x71 = q
	.long	char_default	; 0x72 = r
	.long	char_default	; 0x73 = s
	.long	char_default	; 0x74 = t
	.long	char_upload	; 0x75 = u
	.long	char_default	; 0x76 = v
	.long	char_default	; 0x77 = w
	.long	char_default	; 0x78 = x
	.long	char_default	; 0x79 = y
	.long	char_default	; 0x7a = z
	.long	char_default	; 0x7b = {
	.long	char_default	; 0x7c = |
	.long	char_default	; 0x7d = }
	.long	char_default	; 0x7e = ~
	.long	char_default	; 0x7f = DEL
	.long	char_default	; 0x80
	.long	char_default	; 0x81
	.long	char_default	; 0x82
	.long	char_default	; 0x83
	.long	char_default	; 0x84
	.long	char_default	; 0x85
	.long	char_default	; 0x86
	.long	char_default	; 0x87
	.long	char_default	; 0x88
	.long	char_default	; 0x89
	.long	char_default	; 0x8a
	.long	char_default	; 0x8b
	.long	char_default	; 0x8c
	.long	char_default	; 0x8d
	.long	char_default	; 0x8e
	.long	char_default	; 0x8f
	.long	char_default	; 0x90
	.long	char_default	; 0x91
	.long	char_default	; 0x92
	.long	char_default	; 0x93
	.long	char_default	; 0x94
	.long	char_default	; 0x95
	.long	char_default	; 0x96
	.long	char_default	; 0x97
	.long	char_default	; 0x98
	.long	char_default	; 0x99
	.long	char_default	; 0x9a
	.long	char_default	; 0x9b
	.long	char_default	; 0x9c
	.long	char_default	; 0x9d
	.long	char_default	; 0x9e
	.long	char_default	; 0x9f
	.long	char_default	; 0xa0 = non-break space
	.long	char_default	; 0xa1 = ¡
	.long	char_default	; 0xa2 = ¢
	.long	char_default	; 0xa3 = £
	.long	char_default	; 0xa4 = ¤
	.long	char_default	; 0xa5 = ¥
	.long	char_default	; 0xa6 = ¦
	.long	char_default	; 0xa7 = §
	.long	char_default	; 0xa8 = ¨
	.long	char_default	; 0xa9 = ©
	.long	char_default	; 0xaa = ª
	.long	char_default	; 0xab = «
	.long	char_default	; 0xac = ¬
	.long	char_default	; 0xad = soft hyphen
	.long	char_default	; 0xae = ®
	.long	char_default	; 0xaf = ¯
	.long	char_default	; 0xb0 = °
	.long	char_default	; 0xb1 = ±
	.long	char_default	; 0xb2 = ²
	.long	char_default	; 0xb3 = ³
	.long	char_default	; 0xb4 = ´
	.long	char_default	; 0xb5 = µ
	.long	char_default	; 0xb6 = ¶
	.long	char_default	; 0xb7 = ·
	.long	char_default	; 0xb8 = ¸
	.long	char_default	; 0xb9 = ¹
	.long	char_default	; 0xba = º
	.long	char_default	; 0xbb = »
	.long	char_default	; 0xbc = ¼
	.long	char_default	; 0xbd = ½
	.long	char_default	; 0xbe = ¾
	.long	char_default	; 0xbf = ¿
	.long	char_default	; 0xc0 = À
	.long	char_default	; 0xc1 = Á
	.long	char_default	; 0xc2 = Â
	.long	char_default	; 0xc3 = Ã
	.long	char_default	; 0xc4 = Ä
	.long	char_default	; 0xc5 = Å
	.long	char_default	; 0xc6 = Æ
	.long	char_default	; 0xc7 = Ç
	.long	char_default	; 0xc8 = È
	.long	char_default	; 0xc9 = É
	.long	char_default	; 0xca = Ê
	.long	char_default	; 0xcb = Ë
	.long	char_default	; 0xcc = Ì
	.long	char_default	; 0xcd = Í
	.long	char_default	; 0xce = Î
	.long	char_default	; 0xcf = Ï
	.long	char_default	; 0xd0 = Ð
	.long	char_default	; 0xd1 = Ñ
	.long	char_default	; 0xd2 = Ò
	.long	char_default	; 0xd3 = Ó
	.long	char_default	; 0xd4 = Ô
	.long	char_default	; 0xd5 = Õ
	.long	char_default	; 0xd6 = Ö
	.long	char_default	; 0xd7 = ×
	.long	char_default	; 0xd8 = Ø
	.long	char_default	; 0xd9 = Ù
	.long	char_default	; 0xda = Ú
	.long	char_default	; 0xdb = Û
	.long	char_default	; 0xdc = Ü
	.long	char_default	; 0xdd = Ý
	.long	char_default	; 0xde = Þ
	.long	char_default	; 0xdf = ß
	.long	char_default	; 0xe0 = à
	.long	char_default	; 0xe1 = á
	.long	char_default	; 0xe2 = â
	.long	char_default	; 0xe3 = ã
	.long	char_default	; 0xe4 = ä
	.long	char_default	; 0xe5 = å
	.long	char_default	; 0xe6 = æ
	.long	char_default	; 0xe7 = ç
	.long	char_default	; 0xe8 = è
	.long	char_default	; 0xe9 = é
	.long	char_default	; 0xea = ê
	.long	char_default	; 0xeb = ë
	.long	char_default	; 0xec = ì
	.long	char_default	; 0xed = í
	.long	char_default	; 0xee = î
	.long	char_default	; 0xef = ï
	.long	char_default	; 0xf0 = ð
	.long	char_default	; 0xf1 = ñ
	.long	char_default	; 0xf2 = ò
	.long	char_default	; 0xf3 = ó
	.long	char_default	; 0xf4 = ô
	.long	char_default	; 0xf5 = õ
	.long	char_default	; 0xf6 = ö
	.long	char_default	; 0xf7 = ÷
	.long	char_default	; 0xf8 = ø
	.long	char_default	; 0xf9 = ù
	.long	char_default	; 0xfa = ú
	.long	char_default	; 0xfb = û
	.long	char_default	; 0xfc = ü
	.long	char_default	; 0xfd = ý
	.long	char_default	; 0xfe = þ
	.long	char_default	; 0xff = ÿ

; Message strings handed to putstr, which prints up to a 0 byte.
; NOTE(review): the parenthesized (CR,LF) after a string presumably
; appends those two byte values to it -- confirm against the assembler.
alignment_error:
	.asciz	"Alignment error"(CR,LF)
size_error:
	.asciz	"Internal size error"(CR,LF)
crlf:
	.asciz	(CR,LF)

	.align	2
; Print char from r1.
; Destroys r0, gbr.
; Requires the SCIF base in r14 (it is copied into gbr here).
; Spins for as long as the transmit count field read from SCFDR2 equals
; 16, then writes the low byte of r1 to SCFTDR2.  r1 itself is preserved.
putchar:
	ldc	r14,gbr
1:	mov.w	@(SCFDR2-SCIF_BASE,gbr),r0
	SHXR	#SCFDR2_TX_SHIFT,r0	; presumably moves the transmit count down to bit 0
	and	#SCFDR2_TX_MASK,r0
	cmp/eq	#16,r0		; a count of 16 is treated as "no room yet"
	bt	1b
	mov	r1,r0		; gbr-relative stores take their data from r0
	rts
	 mov.b	r0,@(SCFTDR2-SCIF_BASE,gbr)	; (delay slot) queue the byte

; Print .asciz string pointed to by r0.
; Destroys r0-r3, gbr, pr.
; This code is a bit of an abuse of the call/return paradigm pr is
; designed for; we stuff a loop top address in pr (with bsr) and then
; have putchar return to it multiple times.  This depends on the
; definitions of bsr as "pr=.+4; bra" and rts as "jmp @pr".
; Note that pr is left pointing at the loop top below when putstr
; finishes; the exit goes through the copy of the entry pr kept in r3.
putstr:
	sts	pr,r3		; remember where to go when the string ends
	bsr	1f		; pr = address of 1: below
	 mov	r0,r2		; (delay slot) r2 = string cursor
1:	mov.b	@r2+,r0
	cmp/eq	#0,r0
	bf/s	putchar		; not the terminator: print it; putchar's rts lands on 1:
	 mov	r0,r1		; (delay slot) character for putchar
	jmp	@r3		; terminator reached
	 nop

; Prints the low 1+r2 nibbles of r1 in hex, most significant of them
; first; r2 >= 0 (r2 = 0 prints just the lowest nibble).
; Destroys r0-r5, gbr, pr.
puthex:
	sts	pr,r5		; the bsr below overwrites pr
	mov.l	9f,r4		; r4 = address of the digit characters
	mov	r1,r3		; r3 = value; r1 is reused for each character
1:	mov	r2,r1
	shll2	r1		; r1 = 4*r2 = bit position of the current nibble
	neg	r1,r1		; a negative count makes shld shift right
	mov	r3,r0
	shld	r1,r0
	and	#0xf,r0
	mov.b	@(r0,r4),r1	; look up the character for this nibble
	bsr	putchar
	 add	#-1,r2		; (delay slot) step to the next lower nibble
	cmp/pz	r2		; continue while r2 >= 0
	bt	1b
	jmp	@r5
	 nop
	.align	4
9:	.long	xdigits
xdigits:
	.ascii	"0123456789abcdef"
	.align	2

; Size-checked load.
; In:   r0 = access size in bytes (1, 2, or 4); r1 = address.
; Out:  r1 = the loaded value; for sizes below 4 the rest of r1 is
;       unspecified.
; If r1 is not a multiple of r0, or r0 is not a supported size, a message
; is printed and control leaves through r11 instead of returning.
; Destroys r0; the error paths also destroy whatever putstr destroys.
do_ifetch:
	add	#-1,r0		; size-1 doubles as the alignment mask
	tst	r0,r1
	bf	7f		; address bits inside the mask: misaligned
	cmp/eq	#3,r0		; 4-byte access?
	bt	4f
	cmp/eq	#1,r0		; 2-byte access?
	bt	2f
	cmp/eq	#0,r0		; 1-byte access?
	bt	1f
	mov.l	8f,r0		; none of the above: bad size
	bra	putstr
	 lds	r11,pr		; (delay slot) putstr will exit through r11
7:	mov.l	9f,r0
	bra	putstr
	 lds	r11,pr		; (delay slot) putstr will exit through r11
1:	rts
	 mov.b	@r1,r1
2:	rts
	 mov.w	@r1,r1
4:	rts
	 mov.l	@r1,r1
	.align	4
9:	.long	alignment_error
8:	.long	size_error
	.align	2

; Size-checked store.
; In:   r0 = access size in bytes (1, 2, or 4); r1 = address;
;       r2 = data (the low r0 bytes are stored).
; If r1 is not a multiple of r0, or r0 is not a supported size, a message
; is printed and control leaves through r11 instead of returning.
; Destroys r0; the error paths also destroy whatever putstr destroys.
do_istore:
	add	#-1,r0		; size-1 doubles as the alignment mask
	tst	r0,r1
	bf	7f		; address bits inside the mask: misaligned
	cmp/eq	#3,r0		; 4-byte access?
	bt	4f
	cmp/eq	#1,r0		; 2-byte access?
	bt	2f
	cmp/eq	#0,r0		; 1-byte access?
	bt	1f
	mov.l	8f,r0		; none of the above: bad size
	bra	putstr
	 lds	r11,pr		; (delay slot) putstr will exit through r11
7:	mov.l	9f,r0
	bra	putstr
	 lds	r11,pr		; (delay slot) putstr will exit through r11
1:	rts
	 mov.b	r2,@r1
2:	rts
	 mov.w	r2,@r1
4:	rts
	 mov.l	r2,@r1
	.align	4
9:	.long	alignment_error
8:	.long	size_error
	.align	2

; Handler for every byte that has no command of its own: sends byte 7
; back and resumes the command loop through r11.
char_default:
	lds	r11,pr		; putchar's rts goes to the loop top
	bra	putchar
	 mov	#7,r1		; (delay slot) the byte to send

; Handlers for 'a'-'f' (char_xdigit) and '0'-'9' (char_digit); the
; received character is in r1.  The 64-bit shift register r13:r12 is
; shifted left by four bits and the digit's value becomes its new low
; nibble.  Returns to the command loop through r11.
; Destroys r0, r1, r2.
char_xdigit:
	add	#9,r1		; 0x61..0x66 + 9 leaves 0xa..0xf in the low nibble
char_digit:
	mov	#0xf,r2
	and	r2,r1		; r1 = digit value
	SHLL	#4,r13/r2	; presumably r13 <<= 4 with r2 as scratch
	mov	r12,r0
	SHLR	#28,r0/r2	; r0 = the nibble leaving the top of r12
	or	r0,r13		; ... which enters the bottom of r13
	SHLL	#4,r12/r2
	jmp	@r11
	 or	r1,r12		; (delay slot) new digit into the low nibble

; Handlers for 'B', 'W' and 'L': set the current size in r10 to 1, 2 or
; 4 respectively (the assignment sits in the jmp's delay slot) and
; return to the command loop through r11.
char_size_1:
	jmp	@r11
	 mov	#1,r10

char_size_2:
	jmp	@r11
	 mov	#2,r10

char_size_4:
	jmp	@r11
	 mov	#4,r10

; Handler for '?': prints "addr = ", r12 as eight hex digits and
; " value = ", then falls into char_show_d.
char_show:
	mova	9f,r0
	bsr	putstr
	 nop
	mov	r12,r1
	bsr	puthex
	 mov	#8-1,r2		; (delay slot) eight nibbles
	mova	8f,r0
	bsr	putstr
	 nop
; Handler for '.': prints the low 2*r10 hex digits of r13 followed by
; CR LF, then returns to the command loop through r11.
char_show_d:
	mov	r10,r2
	mov	r13,r1
	shll	r2		; two nibbles per byte of the current size
	bsr	puthex
	 add	#-1,r2		; (delay slot) puthex wants the nibble count minus 1
	mov.l	7f,r0
	bra	putstr
	 lds	r11,pr		; (delay slot) putstr exits to the loop top
	.align	4
9:	.asciz	"addr = "
	.align	4
8:	.asciz	" value = "
	.align	4
7:	.long	crlf
	.align	2

; Handler for '@': loads r10 bytes from the address in r12 (via
; do_ifetch, which does not come back here on an alignment or size
; error), copies the result to r13, prints its low 2*r10 hex digits and
; CR LF, and returns to the command loop through r11.
char_fetch:
	mov	r12,r1
	bsr	do_ifetch
	 mov	r10,r0		; (delay slot) access size
	mov	r10,r2
	mov	r1,r13		; keep the fetched value
	shll	r2		; two nibbles per byte
	bsr	puthex
	 add	#-1,r2		; (delay slot) puthex wants the nibble count minus 1
	mov.l	9f,r0
	bra	putstr
	 lds	r11,pr		; (delay slot) putstr exits to the loop top
	.align	4
9:	.long	crlf
	.align	2

; Handler for '*': loads r10 bytes from the address in r12 (via
; do_ifetch, which does not come back here on an alignment or size
; error) and puts the result in r13 without printing anything.  Returns
; to the command loop through r11.
char_indir:
	mov	r12,r1
	bsr	do_ifetch
	 mov	r10,r0		; (delay slot) access size
	jmp	@r11
	 mov	r1,r13		; (delay slot) keep the fetched value

; Handler for '!': stores the low r10 bytes of r13 at the address in
; r12.  do_istore is tail-called with pr = r11, so both its normal
; return and its error exits end up at the top of the command loop.
char_store:
	mov	r13,r2		; data
	mov	r12,r1		; address
	lds	r11,pr
	bra	do_istore
	 mov	r10,r0		; (delay slot) access size

; Handlers for '+' and '-': add the current size (r10) to r12, or
; subtract it from r12 (the arithmetic sits in the jmp's delay slot),
; and return to the command loop through r11.
char_plus:
	jmp	@r11
	 add	r10,r12

char_minus:
	jmp	@r11
	 sub	r10,r12

; Prints r1 as eight hex digits followed by CR LF.
; Control continues at whatever address pr holds on entry: pr is parked
; in r6 across puthex and restored before the tail call to putstr.
; Destroys r0-r6, gbr.
printval:
	sts	pr,r6		; puthex destroys pr and r0-r5, but not r6
	bsr	puthex
	 mov	#8-1,r2		; (delay slot) eight nibbles
	mov.l	9f,r0
	bra	putstr
	 lds	r6,pr		; (delay slot) putstr exits to our caller's pr
	.align	4
9:	.long	crlf

; Handler for 'T': prints the address of chartbl, which maintop left in
; r0 when it dispatched here, as eight hex digits and CR LF.
; Handlers are entered with jmp, so pr holds a leftover value from some
; earlier call.  printval continues at pr when it is done, so pr must be
; pointed at the loop top (r11) first; otherwise the leftover pr is used,
; which after any earlier putstr is putstr's own internal loop.
char_tbl:
	lds	r11,pr		; printval/putstr finish at the loop top
	bra	printval
	 mov	r0,r1		; (delay slot) value to print

; Handler for 'V': prints VIDBEG as eight hex digits and CR LF.
; Handlers are entered with jmp, so pr holds a leftover value from some
; earlier call.  printval continues at pr when it is done, so pr must be
; pointed at the loop top (r11) first.
char_vid:
	SETS.L	#VIDBEG,r1
	bra	printval
	 lds	r11,pr		; (delay slot) printval/putstr finish at the loop top

; Handler for 'u'.
char_upload:
	; Shifts r13/r12 right by r10 bytes, putting the shifted-out
	; data in r13.  Then does a store, a la char_store; then does
	; an increment, a la char_plus.  The idea is that you can set
	; up the base address and size, then just stream
	; (data) u (data) u ... to upload bulk data.
	; Echoes a . in response, to allow lockstep operation.
	; A size in r10 other than 1, 2 or 4 prints the size-error
	; message instead.  Either way control ends up at the loop top
	; (r11).
	mov	r10,r0
	cmp/eq	#4,r0
	bt	4f
	cmp/eq	#2,r0
	bt	2f
	cmp/eq	#1,r0
	bt	1f
	mov.l	9f,r0
	bra	putstr
	 lds	r11,pr
	; Size 4: the two halves simply trade places.
4:	mov	r12,r0
	mov	r13,r12
	bra	3f
	 mov	r0,r13
	; Size 2: r12 = middle 32 bits of r13:r12, r13 = old low 16 bits.
2:	mov	r12,r0
	xtrct	r13,r12
	bra	3f
	 extu.w	r0,r13
	; Size 1: r12 = (r13 << 24) | (r12 >> 8), r13 = old low 8 bits.
1:	mov	r12,r0
	SHLR	#8,r12/r2
	SHLL	#24,r13/r3
	or	r13,r12
	extu.b	r0,r13
3:	bsr	putchar
	 mov	#'.,r1
	mov	r12,r1		; store address (before the increment below)
	mov	r13,r2		; data
	mov	r10,r0		; size
	add	r10,r12		; advance the address for the next unit
	bra	do_istore
	 lds	r11,pr		; (delay slot) do_istore returns to the loop top
	.align	4
9:	.long	size_error
	.align	2

; Handler for 'J': pushes all dirty d$ lines to memory, invalidates the
; i$, and jumps to the address in r12.  Does not return to the command
; loop by itself.
; Destroys r0-r5.
char_jmp:
	; We'd like to just "jmp @r12" here.  But we've probably just
	; stuffed code into RAM and want to run it, and that won't work
	; unless either there's a lot of it or we push all dirty d$
	; blocks first.  (We could configure the cache write-through
	; instead, which works, but we'd prefer the performance of CB.
	; And we may have to invalidate the i$ even for WT.)

	; Must be running in P2 for all this.
	mova	1f,r0
	SETS.L	#Px_MASKOFF,r1
	SETS.L	#P2_BITS,r2
	and	r1,r0
	or	r2,r0
	jmp	@r0
	 nop

	.align	4
1:	; Now running in P2.

	; First, push all dirty d$ blocks.
	; Read each d$ address-array entry; if it is both valid and
	; dirty, write the same entry back with only the U bit cleared.
	; Fortunately writing an entry through the array provokes a
	; push of a dirty line; I don't relish the prospect of working
	; out addresses for ocbp.  The tag and V bit are written back
	; unchanged, so the line stays valid for the address it really
	; holds and merely becomes clean.
	; The d$ contents are frozen during this, because the loop does
	; no data accesses except to the d$ array view (which of course
	; is itself uncached).
	; NOTE(review): r5 is used as the number of entries to visit;
	; confirm that DCAA_ENTRY_MASK in regs.s is suitable as a count.
	SETS.L	#DCAA_ENTRY_MASK,r5
	SETS.L	#~DCAA_U,r4
	SETS.L	#DCAA_BASE,r3
1:	mov.l	@r3,r0
	mov	r0,r1		; keep the whole entry; r0 is about to lose its tag
	and	#DCAA_U|DCAA_V,r0
	cmp/eq	#DCAA_U|DCAA_V,r0
	bf	2f		; not (valid and dirty): leave the entry alone
	and	r4,r1		; same tag and V, U cleared
	mov.l	r1,@r3
2:	dt	r5
	bf/s	1b
	 add	#32,r3		; (delay slot) next entry

	; Now that we've pushed all d$ blocks, invalidate the i$.
	; We could use the cache array as we did for the d$, but since
	; there are no dirty line pushes to worry about in i$, we can
	; just use the CCR to invalidate the whole i$ at once.  Of
	; course, the i$ will fill up again as we execute code, but (a)
	; that doesn't happen until we leave P2 and (b) that doesn't
	; matter, since it will fill with the current values from
	; external RAM, which are what we want - now that the d$ is
	; flushed, external RAM is synced.
	;
	; This does assume that the d$ flush pushes finish before we
	; try to read them into the i$.  The 7750 does have buffering
	; here; see sections 4.3.4 and 4.3.5 of the programmer's PDF
	; (page 69), but, as far as I can see, there is no way to wait
	; until data has been flushed from them to external RAM.  We
	; just have to assume it will happen fast enough - or that
	; clearing the U bit through the array view ignores these
	; buffers, or, equivalently from our point of view, waits until
	; data has left them and hit main memory.
	SETS.L	#CCR,r3
	SETS.L	#CCR_ICI,r4
	mov.l	@r3,r0
	or	r4,r0
	mov.l	r0,@r3
	; We have the same eight-instruction dance to do here as above
	; (see the code at the "enable_cache" label).  We might not
	; actually need to do this, if r12 points to P2, but it's cheap
	; and easy to do.
	nop			; #1
	nop			; #2
	nop			; #3
	nop			; #4
	nop			; #5
	nop			; #6
	nop			; #7
	nop			; #8
	jmp	@r12
	 nop
	SETCONST