; Load address, shared constants and early start-up: blank a patch of video
; RAM, hop to P2 to switch the caches on, then continue in P1 at cacheon.
. = 0x8c010000

.include "regs.s"

DC_PCLOCK = 49900000		; Hz
BAUDRATE = 57600
PROGRESS_DELAY = 50100100	; iteration count of the delay loop in progress
LF = 10
CR = 13

VIDBEG = 0xa520a000
VIDSTRIDE = 640 * 4
DOTWD = 16			; bytes (putdot assumes DOTWD is a multiple of 4)
DOTHT = 5			; lines
XDOT = VIDSTRIDE / DOTWD
CLRCOLS = 128			; dot columns blanked at start-up
CLRROWS = 64			; dot rows blanked at start-up

; Not set in CCR_CONFIG: IIX OIX ORA WT
CCR_CONFIG = CCR_ICI | CCR_ICE | CCR_OCI | CCR_CB | CCR_OCE

.entry .
; r15 is loaded with this address in enable_cache, below.
stacktop:

; Register use while drawing the start-up dots:
;	r0 = scratch
;	r1 = video ram base
;	r2 = vidstride in bytes
;	r3 = next dot X
;	r4 = next dot Y
;	r5 = dot X
;	r6 = dot Y
;	r7 = dot V
;	r8 through r12 = scratch
	SETS.L	#VIDBEG,r1
	SETS.L	#VIDSTRIDE,r2

	; Store all-0s dots over a CLRCOLS x CLRROWS grid, counting both
	; coordinates down to zero.
	mov	#CLRROWS-1,r4
2:	mov	#CLRCOLS-1,r3
	mov	#0,r7			; low bit clear selects an all-0s dot
1:	mov	r4,r6
	bsr	putdot
	mov	r3,r5			; (delay slot)
	cmp/pl	r3
	bt/s	1b
	add	#-1,r3			; (delay slot)
	cmp/pl	r4
	bt/s	2b
	add	#-1,r4			; (delay slot)

	bsr	progress
	mov	#1,r0			; (delay slot) progress dot column

	; CCR may only be fiddled with from P2.  (The hardware PDF, page 77,
	; says that "CCR modifications must only be made by a program in the
	; non-cached P2 area".)  So rebase the address of enable_cache into
	; P2 and jump there.
	mova	enable_cache,r0
	SETS.L	#Px_MASKOFF,r8		; mask off Px-selecting bits
	SETS.L	#P2_BITS,r9		; bits for P2
	and	r8,r0
	or	r9,r0
	jmp	@r0
	nop

	.align	4
enable_cache:
	; Now running in P2, so we can turn on the cache.
	SETS.L	#CCR,r0
	SETS.L	#CCR_CONFIG,r9
	mov.l	r9,@r0

	; The hardware PDF, page 77, says that "After CCR is updated, an
	; instruction that performs data access to the P0, P1, P3, or U0
	; area should be located at least four instructions after the CCR
	; update instruction.  Also, a branch instruction to the P0, P1, P3,
	; or U0 area should be located at least eight instructions after the
	; CCR update instruction."  It doesn't say why this is "should"
	; rather than "must", nor what happens if it is not done, nor whether
	; the distance is measured in address space or in executed
	; instructions (eg, does a three-instruction loop that's executed
	; three times count as nine instructions or three?  does a branch
	; seven instructions forward count?).  We treat it pessimistically,
	; making sure we burn eight instructions by any of these measures.
	;
	; Gotta love incomplete doc.
	;
	; r8 still holds Px_MASKOFF from above.
	mova	cacheon,r0		; #1
	SETS.L	#P1_BITS,r9		; #2
	and	r8,r0			; #3
	or	r9,r0			; #4
	mov.l	9f,r15			; #5  stack pointer = stacktop
	nop				; #6
	nop				; #7
	nop				; #8
	jmp	@r0			; to cacheon, rebased into P1
	nop

	.align	4
9:	.long	stacktop

	.align	2
; Displays progress dot at (r0,2), then spins for PROGRESS_DELAY
; iterations so the dot can be seen.
; Requires VIDBEG in r1 and VIDSTRIDE in r2.
; Destroys r0, r5, r6, r7, r8, r9, r10
progress:
	sts	pr,r10			; putdot is called with bsr; keep our return
	mov	r0,r5
	mov	#2,r6
	bsr	putdot
	mov	#1,r7			; (delay slot) low bit set: all-1s dot
	SETS.L	#PROGRESS_DELAY,r6
1:	dt	r6
	bf	1b
	jmp	@r10
	nop

; dotrow n, d: emits n/4 mov.l stores of r8 at @(d,r0), @(d+4,r0), ...
; by expanding itself recursively until fewer than 4 bytes remain.
; putdot uses it for one line of a dot; n must be a multiple of 4.
	.macro dotrow n, d
	.if $(n) >= 4
	mov.l	r8,@($(d),r0)
	dotrow $(n)-4,$(d)+4
	.endif
	.endm

; Stores dot of all 0s or all 1s, depending on low bit of r7, at (r5,r6).
; Requires VIDBEG in r1 and VIDSTRIDE in r2.
; Destroys r0, r8, r9 (r5 and r6 are bumped by one on entry and put back
; before returning).
putdot:
	add	#1,r5
	add	#1,r6
	mov	#DOTWD,r0
	mul.l	r0,r5
	SETS.L	#DOTHT*VIDSTRIDE,r0
	sts	macl,r8			; X offset in bytes
	mul.l	r0,r6
	mov	r7,r9
	sts	macl,r0			; Y offset in bytes
	shlr	r9			; T = low bit of r7
	add	r8,r0
	bt/s	1f
	add	r1,r0			; (delay slot) r0 = address of the dot
	bra	2f
	mov	#0,r8			; (delay slot) fill value: all 0s
1:	mov	#-1,r8			; fill value: all 1s
2:	mov	#DOTHT,r9		; lines left to store
3:
	; DOTWD/4 mov.l instructions, offsets increasing by 4 each time.
	dotrow DOTWD,0
	dt	r9
	bf/s	3b
	add	r2,r0			; (delay slot) down one line
	add	#-1,r5
	rts
	add	#-1,r6			; (delay slot)

	.align	4
; Entered from enable_cache, running in P1 with the caches configured.
cacheon:
	bsr	progress
	mov	#4,r0

	; Initialize the SCIF.  Mostly follows hardware PDF figure 16.6, but
	; not entirely (eg, 16.6 shows turning on CKE1, but we don't want
	; external clock, so we don't).  All SCIF registers are addressed
	; relative to gbr from here on.
	SETS.L	#SCIF_BASE,r0
	ldc	r0,gbr
	bsr	progress
	mov	#6,r0

	; Clear SCSCR2 (in particular, clear TE and RE).
	mov	#0,r0
	mov.w	r0,@(SCSCR2-SCIF_BASE,gbr)
	bsr	progress
	mov	#8,r0

	; Clear out the FIFOs.
	SETS.W	#SCFCR2_TFRST|SCFCR2_RFRST,r0
	mov.w	r0,@(SCFCR2-SCIF_BASE,gbr)
	bsr	progress
	mov	#10,r0

	; Configure for 8N1.
	SETS.W	#SCSMR2_CHR_8|SCSMR2_PE_DIS|SCSMR2_STOP_1|SCSMR2_CKS_DIV1,r0
	mov.w	r0,@(SCSMR2-SCIF_BASE,gbr)
	bsr	progress
	mov	#12,r0

	; Set the BRG constant.
; Bit-rate register.  The SH7750 hardware manual gives, for asynchronous
; mode with the undivided clock (n = 0), N = Pclk / (32 * B) - 1.  The
; bracketed expression is Pclk / (32 * B) rounded to nearest; the trailing
; -1 is part of the manual's formula (27 - 1 = 26 for 49.9 MHz and 57600).
	mov	#[[[[DC_PCLOCK*2]/[32*BAUDRATE]]+1]/2]-1,r0
	mov.b	r0,@(SCBRR2-SCIF_BASE,gbr)
	bsr	progress
	mov	#14,r0

	; Delay at least one bit time.
	; The value for delaytime assumes this loop takes only one
	; clock per iteration.  This seems implausible, but it's about
	; what I see when testing; perhaps the CPU clock is twice the
	; BRG divisor chain's clock.  Or perhaps this is superscalarity
	; in action.  Whatever - if it takes more than one clock per
	; loop, we delay longer than expected, but that's harmless.
	SETS.L	#[2*DC_PCLOCK]/BAUDRATE,r0	; 2 bit times, in clocks
1:	dt	r0
	bf	1b
	bsr	progress
	mov	#16,r0

	; Set the FIFO interrupt trigger points and clear the reset
	; bits.  We don't actually care about the trigger points,
	; because we don't use interrupts; we might be able to skip
	; this step, but it's easy and harmless.
	SETS.W	#SCFCR2_RXT_8|SCFCR2_TXT_8,r0
	mov.w	r0,@(SCFCR2-SCIF_BASE,gbr)
	bsr	progress
	mov	#18,r0

	; Setup complete.  Enable transmitter and receiver.
	SETS.W	#SCSCR2_TE|SCSCR2_RE,r0
	mov.w	r0,@(SCSCR2-SCIF_BASE,gbr)
	bsr	progress
	mov	#20,r0

	; Flush any lingering statuses: read each status register, then
	; write zero to it.
	mov.w	@(SCFSR2-SCIF_BASE,gbr),r0
	mov	#0,r0
	mov.w	r0,@(SCFSR2-SCIF_BASE,gbr)
	mov.w	@(SCLSR2-SCIF_BASE,gbr),r0
	bsr	progress
	mov	#22,r0
	mov	#0,r0
	mov.w	r0,@(SCLSR2-SCIF_BASE,gbr)
	bsr	progress
	mov	#24,r0

; Monitor proper.  Prints "~" CR LF, then polls the SCIF receive FIFO and
; dispatches each received byte through chartbl.  Handlers get the byte
; (zero-extended) in r1 and return by jumping through r11.
main:
	; r15 = conventional stack pointer (already set up)
	; r14 = SCIF base pointer (in gbr on entry)
	; r13 = shift register, high half
	; r12 = shift register, low half
	; r11 = top-of-loop pointer
	; r10 = current size (1, 2, or 4)
	; r9 through r0 = scratch
	; gbr = scratch
	stc	gbr,r14
	bsr	putchar
	mov	#'~,r1
	bsr	putchar
	mov	#CR,r1
	bsr	putchar
	mov	#LF,r1
	mova	maintop,r0
	mov	r0,r11
	mov	#1,r10			; default access size: 1 byte
	jmp	@r11
	nop

; Idle path, taken while no byte is waiting (or one was dropped): bump the
; counter in r4 and redraw the dot at (20,5) with bit 17 of the counter,
; so the dot blinks while the monitor is polling.
mainloop:
	add	#1,r4
	SETS.L	#VIDBEG,r1
	mov	r4,r7
	SETS.L	#VIDSTRIDE,r2
	SHLR	#17,r7/r6
	mov	#20,r5
	bsr	putdot
	mov	#5,r6			; (delay slot)
	; maintop is the target of a mova above, so it must sit on a
	; 4-byte boundary; pad with a nop if we are only 2-byte aligned.
	.if . & 2
	nop
	.endif
maintop:
	ldc	r14,gbr
	mov.w	@(SCFDR2-SCIF_BASE,gbr),r0
	SHXR	#SCFDR2_RX_SHIFT,r0/r1
	tst	#SCFDR2_RX_MASK,r0
	bt	mainloop		; receive FIFO empty
	mov.w	@(SCFSR2-SCIF_BASE,gbr),r0
	tst	#SCFSR2_FER|SCFSR2_PER,r0	; T=1 iff no framing/parity error
	; Nothing from here to the bf below changes T.
	mov.b	@(SCFRDR2-SCIF_BASE,gbr),r0
	extu.b	r0,r1			; r1 = received byte, 0..255
	mov.w	@(SCLSR2-SCIF_BASE,gbr),r0
	mov	#0,r0
	mov.w	r0,@(SCLSR2-SCIF_BASE,gbr)
	bf	mainloop		; error flagged: drop the byte
	mov	r1,r2
	mova	chartbl,r0
	SHLL	#2,r2			; table index -> byte offset
	mov.l	@(r0,r2),r2
	jmp	@r2			; handler sees r0 = chartbl, r1 = byte
	nop

	SETCONST

	.align	4
; Handler address for each possible received byte.
chartbl:
	.long	char_default	; 0x00
	.long	char_default	; 0x01
	.long	char_default	; 0x02
	.long	char_default	; 0x03
	.long	char_default	; 0x04
	.long	char_default	; 0x05
	.long	char_default	; 0x06
	.long	char_default	; 0x07
	.long	char_default	; 0x08
	.long	char_default	; 0x09
	.long	char_default	; 0x0a
	.long	char_default	; 0x0b
	.long	char_default	; 0x0c
	.long	char_default	; 0x0d
	.long	char_default	; 0x0e
	.long	char_default	; 0x0f
	.long	char_default	; 0x10
	.long	char_default	; 0x11
	.long	char_default	; 0x12
	.long	char_default	; 0x13
	.long	char_default	; 0x14
	.long	char_default	; 0x15
	.long	char_default	; 0x16
	.long	char_default	; 0x17
	.long	char_default	; 0x18
	.long	char_default	; 0x19
	.long	char_default	; 0x1a
	.long	char_default	; 0x1b
	.long	char_default	; 0x1c
	.long	char_default	; 0x1d
	.long	char_default	; 0x1e
	.long	char_default	; 0x1f
	.long	char_default	; 0x20 = space
	.long	char_store	; 0x21 = !
	.long	char_default	; 0x22 = "
	.long	char_default	; 0x23 = #
	.long	char_default	; 0x24 = $
	.long	char_default	; 0x25 = %
	.long	char_default	; 0x26 = &
	.long	char_default	; 0x27 = '
	.long	char_default	; 0x28 = (
	.long	char_default	; 0x29 = )
	.long	char_indir	; 0x2a = *
	.long	char_plus	; 0x2b = +
	.long	char_default	; 0x2c = ,
	.long	char_minus	; 0x2d = -
	.long	char_show_d	; 0x2e = .
	.long	char_default	; 0x2f = /
	.long	char_digit	; 0x30 = 0
	.long	char_digit	; 0x31 = 1
	.long	char_digit	; 0x32 = 2
	.long	char_digit	; 0x33 = 3
	.long	char_digit	; 0x34 = 4
	.long	char_digit	; 0x35 = 5
	.long	char_digit	; 0x36 = 6
	.long	char_digit	; 0x37 = 7
	.long	char_digit	; 0x38 = 8
	.long	char_digit	; 0x39 = 9
	.long	char_default	; 0x3a = :
	.long	char_default	; 0x3b = ;
	.long	char_default	; 0x3c = <
	.long	char_default	; 0x3d = =
	.long	char_default	; 0x3e = >
	.long	char_show	; 0x3f = ?
	.long	char_fetch	; 0x40 = @
	.long	char_default	; 0x41 = A
	.long	char_size_1	; 0x42 = B
	.long	char_default	; 0x43 = C
	.long	char_default	; 0x44 = D
	.long	char_default	; 0x45 = E
	.long	char_default	; 0x46 = F
	.long	char_default	; 0x47 = G
	.long	char_default	; 0x48 = H
	.long	char_default	; 0x49 = I
	.long	char_jmp	; 0x4a = J
	.long	char_default	; 0x4b = K
	.long	char_size_4	; 0x4c = L
	.long	char_default	; 0x4d = M
	.long	char_default	; 0x4e = N
	.long	char_default	; 0x4f = O
	.long	char_default	; 0x50 = P
	.long	char_default	; 0x51 = Q
	.long	char_default	; 0x52 = R
	.long	char_default	; 0x53 = S
	.long	char_tbl	; 0x54 = T
	.long	char_default	; 0x55 = U
	.long	char_vid	; 0x56 = V
	.long	char_size_2	; 0x57 = W
	.long	char_default	; 0x58 = X
	.long	char_default	; 0x59 = Y
	.long	char_default	; 0x5a = Z
	.long	char_default	; 0x5b = [
	.long	char_default	; 0x5c = \
	.long	char_default	; 0x5d = ]
	.long	char_default	; 0x5e = ^
	.long	char_default	; 0x5f = _
	.long	char_default	; 0x60 = `
	.long	char_xdigit	; 0x61 = a
	.long	char_xdigit	; 0x62 = b
	.long	char_xdigit	; 0x63 = c
	.long	char_xdigit	; 0x64 = d
	.long	char_xdigit	; 0x65 = e
	.long	char_xdigit	; 0x66 = f
	.long	char_default	; 0x67 = g
	.long	char_default	; 0x68 = h
	.long	char_default	; 0x69 = i
	.long	char_default	; 0x6a = j
	.long	char_default	; 0x6b = k
	.long	char_default	; 0x6c = l
	.long	char_default	; 0x6d = m
	.long	char_default	; 0x6e = n
	.long	char_default	; 0x6f = o
	.long	char_default	; 0x70 = p
	.long	char_default	; 0x71 = q
	.long	char_default	; 0x72 = r
	.long	char_default	; 0x73 = s
	.long	char_default	; 0x74 = t
	.long	char_upload	; 0x75 = u
	.long	char_default	; 0x76 = v
	.long	char_default	; 0x77 = w
	.long	char_default	; 0x78 = x
	.long	char_default	; 0x79 = y
	.long	char_default	; 0x7a = z
	.long	char_default	; 0x7b = {
	.long	char_default	; 0x7c = |
	.long	char_default	; 0x7d = }
	.long	char_default	; 0x7e = ~
	.long	char_default	; 0x7f = DEL
	.long	char_default	; 0x80
	.long	char_default	; 0x81
	.long	char_default	; 0x82
	.long	char_default	; 0x83
	.long	char_default	; 0x84
	.long	char_default	; 0x85
	.long	char_default	; 0x86
	.long	char_default	; 0x87
	.long	char_default	; 0x88
	.long	char_default	; 0x89
	.long	char_default	; 0x8a
	.long	char_default	; 0x8b
	.long	char_default	; 0x8c
	.long	char_default	; 0x8d
	.long	char_default	; 0x8e
	.long	char_default	; 0x8f
	.long	char_default	; 0x90
	.long	char_default	; 0x91
	.long	char_default	; 0x92
	.long	char_default	; 0x93
	.long	char_default	; 0x94
	.long	char_default	; 0x95
	.long	char_default	; 0x96
	.long	char_default	; 0x97
	.long	char_default	; 0x98
	.long	char_default	; 0x99
	.long	char_default	; 0x9a
	.long	char_default	; 0x9b
	.long	char_default	; 0x9c
	.long	char_default	; 0x9d
	.long	char_default	; 0x9e
	.long	char_default	; 0x9f
	.long	char_default	; 0xa0 = non-break space
	.long	char_default	; 0xa1 = ¡
	.long	char_default	; 0xa2 = ¢
	.long	char_default	; 0xa3 = £
	.long	char_default	; 0xa4 = ¤
	.long	char_default	; 0xa5 = ¥
	.long	char_default	; 0xa6 = ¦
	.long	char_default	; 0xa7 = §
	.long	char_default	; 0xa8 = ¨
	.long	char_default	; 0xa9 = ©
	.long	char_default	; 0xaa = ª
	.long	char_default	; 0xab = «
	.long	char_default	; 0xac = ¬
	.long	char_default	; 0xad = soft hyphen
	.long	char_default	; 0xae = ®
	.long	char_default	; 0xaf = ¯
	.long	char_default	; 0xb0 = °
	.long	char_default	; 0xb1 = ±
	.long	char_default	; 0xb2 = ²
	.long	char_default	; 0xb3 = ³
	.long	char_default	; 0xb4 = ´
	.long	char_default	; 0xb5 = µ
	.long	char_default	; 0xb6 = ¶
	.long	char_default	; 0xb7 = ·
	.long	char_default	; 0xb8 = ¸
	.long	char_default	; 0xb9 = ¹
	.long	char_default	; 0xba = º
	.long	char_default	; 0xbb = »
	.long	char_default	; 0xbc = ¼
	.long	char_default	; 0xbd = ½
	.long	char_default	; 0xbe = ¾
	.long	char_default	; 0xbf = ¿
	.long	char_default	; 0xc0 = À
	.long	char_default	; 0xc1 = Á
	.long	char_default	; 0xc2 = Â
	.long	char_default	; 0xc3 = Ã
	.long	char_default	; 0xc4 = Ä
	.long	char_default	; 0xc5 = Å
	.long	char_default	; 0xc6 = Æ
	.long	char_default	; 0xc7 = Ç
	.long	char_default	; 0xc8 = È
	.long	char_default	; 0xc9 = É
	.long	char_default	; 0xca = Ê
	.long	char_default	; 0xcb = Ë
	.long	char_default	; 0xcc = Ì
	.long	char_default	; 0xcd = Í
	.long	char_default	; 0xce = Î
	.long	char_default	; 0xcf = Ï
	.long	char_default	; 0xd0 = Ð
	.long	char_default	; 0xd1 = Ñ
	.long	char_default	; 0xd2 = Ò
	.long	char_default	; 0xd3 = Ó
	.long	char_default	; 0xd4 = Ô
	.long	char_default	; 0xd5 = Õ
	.long	char_default	; 0xd6 = Ö
	.long	char_default	; 0xd7 = ×
	.long	char_default	; 0xd8 = Ø
	.long	char_default	; 0xd9 = Ù
	.long	char_default	; 0xda = Ú
	.long	char_default	; 0xdb = Û
	.long	char_default	; 0xdc = Ü
	.long	char_default	; 0xdd = Ý
	.long	char_default	; 0xde = Þ
	.long	char_default	; 0xdf = ß
	.long	char_default	; 0xe0 = à
	.long	char_default	; 0xe1 = á
	.long	char_default	; 0xe2 = â
	.long	char_default	; 0xe3 = ã
	.long	char_default	; 0xe4 = ä
	.long	char_default	; 0xe5 = å
	.long	char_default	; 0xe6 = æ
	.long	char_default	; 0xe7 = ç
	.long	char_default	; 0xe8 = è
	.long	char_default	; 0xe9 = é
	.long	char_default	; 0xea = ê
	.long	char_default	; 0xeb = ë
	.long	char_default	; 0xec = ì
	.long	char_default	; 0xed = í
	.long	char_default	; 0xee = î
	.long	char_default	; 0xef = ï
	.long	char_default	; 0xf0 = ð
	.long	char_default	; 0xf1 = ñ
	.long	char_default	; 0xf2 = ò
	.long	char_default	; 0xf3 = ó
	.long	char_default	; 0xf4 = ô
	.long	char_default	; 0xf5 = õ
	.long	char_default	; 0xf6 = ö
	.long	char_default	; 0xf7 = ÷
	.long	char_default	; 0xf8 = ø
	.long	char_default	; 0xf9 = ù
	.long	char_default	; 0xfa = ú
	.long	char_default	; 0xfb = û
	.long	char_default	; 0xfc = ü
	.long	char_default	; 0xfd = ý
	.long	char_default	; 0xfe = þ
	.long	char_default	; 0xff = ÿ

alignment_error:
	.asciz	"Alignment error"(CR,LF)
size_error:
	.asciz	"Internal size error"(CR,LF)
crlf:
	.asciz	(CR,LF)

	.align	2
; Print char from r1.
; Waits while the transmit FIFO count reads 16, then writes the byte.
; Destroys r0, gbr.
putchar:
	ldc	r14,gbr
1:	mov.w	@(SCFDR2-SCIF_BASE,gbr),r0
	SHXR	#SCFDR2_TX_SHIFT,r0
	and	#SCFDR2_TX_MASK,r0
	cmp/eq	#16,r0
	bt	1b
	mov	r1,r0
	rts
	mov.b	r0,@(SCFTDR2-SCIF_BASE,gbr)	; (delay slot)

; Print .asciz string pointed to by r0.
; Destroys r0-r3, gbr, pr.
; This code is a bit of an abuse of the call/return paradigm pr is
; designed for; we stuff a loop top address in pr (with bsr) and then
; have putchar return to it multiple times.  This depends on the
; definitions of bsr as "pr=.+4; bra" and rts as "jmp @pr".
; Note that pr is left pointing at the loop top (label 1) on return.
putstr:
	sts	pr,r3			; real return address
	bsr	1f			; pr = address of 1:, then fall into it
	mov	r0,r2			; (delay slot) r2 = string cursor
1:	mov.b	@r2+,r0
	cmp/eq	#0,r0
	bf/s	putchar			; not NUL: print; putchar's rts lands on 1:
	mov	r0,r1			; (delay slot)
	jmp	@r3
	nop

; Prints the low 1+r2 nibbles of r1 in hex, most significant first
; (r2 = 7 prints all eight).
; Destroys r0-r5, gbr, pr.
puthex:
	sts	pr,r5
	mov.l	9f,r4			; r4 = digit table
	mov	r1,r3			; r3 = value being printed
1:	mov	r2,r1
	shll2	r1
	neg	r1,r1			; r1 = -4*r2: shld shifts right by 4*r2
	mov	r3,r0
	shld	r1,r0
	and	#0xf,r0
	mov.b	@(r0,r4),r1
	bsr	putchar
	add	#-1,r2			; (delay slot)
	cmp/pz	r2
	bt	1b
	jmp	@r5
	nop

	.align	4
9:	.long	xdigits
xdigits:
	.ascii	"0123456789abcdef"

	.align	2
; Fetches r0-byte data through r1.
; r0 must be 1, 2, or 4.
; Fetched value is returned in r1; if r0<4, rest of r1 is unspecified.
; If r1%r0!=0, or r0 is invalid, prints a message and jumps through r11.
; Destroys r0; on error, also destroys as for putstr.
do_ifetch:
	add	#-1,r0			; size-1 doubles as the alignment mask
	tst	r0,r1
	bt	1f
	mov.l	9f,r0
	bra	putstr
	lds	r11,pr			; (delay slot) putstr returns to the loop top
1:	cmp/eq	#0,r0
	bt	1f
	cmp/eq	#1,r0
	bt	2f
	cmp/eq	#3,r0
	bt	4f
	mov.l	8f,r0
	bra	putstr
	lds	r11,pr			; (delay slot)
1:	rts
	mov.b	@r1,r1			; (delay slot)
2:	rts
	mov.w	@r1,r1			; (delay slot)
4:	rts
	mov.l	@r1,r1			; (delay slot)

	.align	4
9:	.long	alignment_error
8:	.long	size_error

	.align	2
; Stores r0-byte data through r1; data comes from r2.
; r0 must be 1, 2, or 4.
; If r1%r0!=0, or r0 is invalid, prints a message and jumps through r11.
; Destroys r0; on error, also destroys as for putstr.
do_istore:
	; r0 = size (1, 2 or 4), r1 = address, r2 = data.  Error paths print
	; a message and leave through r11 instead of returning.
	add	#-1,r0			; size-1 doubles as the alignment mask
	tst	r0,r1
	bt	1f
	mov.l	9f,r0
	bra	putstr
	lds	r11,pr			; (delay slot) putstr returns to the loop top
1:	cmp/eq	#0,r0
	bt	1f
	cmp/eq	#1,r0
	bt	2f
	cmp/eq	#3,r0
	bt	4f
	mov.l	8f,r0
	bra	putstr
	lds	r11,pr			; (delay slot)
1:	rts
	mov.b	r2,@r1			; (delay slot)
2:	rts
	mov.w	r2,@r1			; (delay slot)
4:	rts
	mov.l	r2,@r1			; (delay slot)

	.align	4
9:	.long	alignment_error
8:	.long	size_error

	.align	2
; Character handlers.  Each is entered from the chartbl dispatch with the
; received byte in r1 and the address of chartbl in r0, and finishes by
; jumping through r11 - directly, or by loading pr from r11 and
; tail-branching to a routine that returns through pr.

; Anything unrecognised: answer with BEL (7).
char_default:
	mov	#7,r1
	bra	putchar
	lds	r11,pr			; (delay slot)

; Hex digit: shift r13/r12 left one nibble and insert the digit's value.
char_xdigit:
	add	#9,r1			; 'a'..'f' (0x61..0x66) -> low nibble 0xa..0xf
char_digit:
	mov	#0xf,r2
	and	r2,r1
	SHLL	#4,r13/r2
	mov	r12,r0
	SHLR	#28,r0/r2		; nibble leaving r12 ...
	or	r0,r13			; ... enters the bottom of r13
	SHLL	#4,r12/r2
	jmp	@r11
	or	r1,r12			; (delay slot)

; B / W / L: select the access size kept in r10.
char_size_1:
	jmp	@r11
	mov	#1,r10
char_size_2:
	jmp	@r11
	mov	#2,r10
char_size_4:
	jmp	@r11
	mov	#4,r10

; ?: print "addr = " and all of r12, then " value = " and fall into
; char_show_d.
char_show:
	mova	9f,r0
	bsr	putstr
	nop
	mov	r12,r1
	bsr	puthex
	mov	#8-1,r2			; (delay slot) eight nibbles
	mova	8f,r0
	bsr	putstr
	nop
; .: print the low r10 bytes of r13 in hex, then CR LF.
char_show_d:
	mov	r10,r2
	mov	r13,r1
	shll	r2
	bsr	puthex
	add	#-1,r2			; (delay slot) r2 = 2*size - 1 nibbles
	mov.l	7f,r0
	bra	putstr
	lds	r11,pr			; (delay slot)

	.align	4
9:	.asciz	"addr = "
	.align	4
8:	.asciz	" value = "
	.align	4
7:	.long	crlf

	.align	2
; @: fetch r10 bytes from the address in r12 into r13, print them, CR LF.
char_fetch:
	mov	r12,r1
	bsr	do_ifetch
	mov	r10,r0			; (delay slot)
	mov	r10,r2
	mov	r1,r13
	shll	r2
	bsr	puthex
	add	#-1,r2			; (delay slot)
	mov.l	9f,r0
	bra	putstr
	lds	r11,pr			; (delay slot)

	.align	4
9:	.long	crlf

	.align	2
; *: fetch r10 bytes from the address in r12 into r13, printing nothing.
char_indir:
	mov	r12,r1
	bsr	do_ifetch
	mov	r10,r0			; (delay slot)
	jmp	@r11
	mov	r1,r13			; (delay slot)

; !: store the low r10 bytes of r13 at the address in r12.
char_store:
	mov	r10,r0
	mov	r12,r1
	mov	r13,r2
	bra	do_istore
	lds	r11,pr			; (delay slot)

; + / -: step r12 forward / back by the current size.
char_plus:
	jmp	@r11
	add	r10,r12
char_minus:
	jmp	@r11
	sub	r10,r12

; Prints all eight hex digits of r1, then CR LF, and returns to the pr it
; was entered with.
; Destroys r0-r6, gbr.
printval:
	sts	pr,r6			; puthex destroys pr but not r6
	bsr	puthex
	mov	#8-1,r2			; (delay slot)
	mov.l	9f,r0
	bra	putstr
	lds	r6,pr			; (delay slot) putstr returns to our caller

	.align	4
9:	.long	crlf

; T: print r0 as left by the dispatcher, i.e. the address of chartbl.
; printval returns through pr, and we are entered by jmp, not bsr, so pr
; holds whatever the last call left there - after putstr, that is the
; middle of putstr's own loop.  Aim it at the loop top explicitly, as the
; other handlers do.
char_tbl:
	lds	r11,pr
	bra	printval
	mov	r0,r1			; (delay slot)

; V: print VIDBEG.  pr is set up for the same reason as in char_tbl.
char_vid:
	SETS.L	#VIDBEG,r1
	bra	printval
	lds	r11,pr			; (delay slot)

char_upload:
	; Shifts r13/r12 right by r10 bytes, putting the shifted-out
	; data in r13.  Then does a store, a la char_store; then does
	; an increment, a la char_plus.  The idea is that you can set
	; up the base address and size, then just stream
	;	(data) u (data) u ...
	; to upload bulk data.
	; Echoes a . in response, to allow lockstep operation.
; Body of the upload handler: pick the shift that matches the size in r10.
	mov	r10,r0
	cmp/eq	#4,r0
	bt	4f
	cmp/eq	#2,r0
	bt	2f
	cmp/eq	#1,r0
	bt	1f
	mov.l	9f,r0			; size is none of 1, 2, 4
	bra	putstr
	lds	r11,pr			; (delay slot)

	; Four bytes: the halves simply trade places.
4:	mov	r12,r0
	mov	r13,r12
	bra	3f
	mov	r0,r13			; (delay slot)

	; Two bytes: r12 = low half of r13 : high half of r12.
2:	mov	r12,r0
	xtrct	r13,r12
	bra	3f
	extu.w	r0,r13			; (delay slot) r13 = shifted-out 16 bits

	; One byte.
1:	mov	r12,r0
	SHLR	#8,r12/r2
	SHLL	#24,r13/r3
	or	r13,r12
	extu.b	r0,r13			; r13 = shifted-out byte

	; Now r12 = address and r13 = data.  Echo, store, and advance.
3:	bsr	putchar
	mov	#'.,r1			; (delay slot)
	mov	r12,r1
	mov	r13,r2
	mov	r10,r0
	add	r10,r12			; r1 already holds the pre-increment address
	bra	do_istore
	lds	r11,pr			; (delay slot)

	.align	4
9:	.long	size_error

	.align	2
; J: jump to the address in r12, after making external RAM and the
; instruction cache consistent with what has been stored through the d$.
char_jmp:
	; We'd like to just "jmp @r12" here.  But we've probably just
	; stuffed code into RAM and want to run it, and that won't work
	; unless either there's a lot of it or we push all dirty d$
	; blocks first.  (We could configure the cache write-through
	; instead, which works, but we'd prefer the performance of CB.
	; And we may have to invalidate the i$ even for WT.)
	; Must be running in P2 for all this.
	mova	1f,r0
	SETS.L	#Px_MASKOFF,r1
	SETS.L	#P2_BITS,r2
	and	r1,r0
	or	r2,r0
	jmp	@r0
	nop

	.align	4
1:
	; Now running in P2.
	; First, push all dirty d$ blocks.
	; Read the d$ address-array entries and, for each one that is both
	; valid and dirty, write it back with U cleared.  Per the SH7750
	; manual's description of address-array writes, writing an entry
	; whose U and V are both 1 makes the hardware write the line back
	; first; I don't relish the prospect of working out addresses for
	; ocbp.
	; The d$ contents are frozen during this, because the loop does
	; no data accesses except to the d$ array view (which of course
	; is itself uncached).
	SETS.L	#DCAA_ENTRY_MASK,r5	; loop count
	SETS.L	#~DCAA_U,r4
	SETS.L	#DCAA_BASE,r3
1:	mov.l	@r3,r0
	mov	r0,r1			; whole entry; and #imm below keeps only U/V
	and	#DCAA_U|DCAA_V,r0
	cmp/eq	#DCAA_U|DCAA_V,r0
	bf	2f			; not valid+dirty: leave it alone
	and	r4,r1			; same tag and V, U cleared
	mov.l	r1,@r3			; provokes the push; line stays valid, clean
2:	dt	r5
	bf/s	1b
	add	#32,r3			; (delay slot) next entry

	; Now that we've pushed all d$ blocks, invalidate the i$.
	; We could use the cache array as we did for the d$, but since
	; there are no dirty line pushes to worry about in i$, we can
	; just use the CCR to invalidate the whole i$ at once.  Of
	; course, the i$ will fill up again as we execute code, but (a)
	; that doesn't happen until we leave P2 and (b) that doesn't
	; matter, since it will fill with the current values from
	; external RAM, which are what we want - now that the d$ is
	; flushed, external RAM is synced.
	;
	; This does assume that the d$ flush pushes finish before we
	; try to read them into the i$.  The 7750 does have buffering
	; here; see sections 4.3.4 and 4.3.5 of the programmer's PDF
	; (page 69), but, as far as I can see, there is no way to wait
	; until data has been flushed from them to external RAM.  We
	; just have to assume it will happen fast enough - or that
	; clearing the U bit through the array view ignores these
	; buffers, or, equivalently from our point of view, waits until
	; data has left them and hit main memory.
	SETS.L	#CCR,r3
	SETS.L	#CCR_ICI,r4
	mov.l	@r3,r0
	or	r4,r0
	mov.l	r0,@r3

	; We have the same eight-instruction dance to do here as above
	; (see the code at the "enable_cache" label).  We might not
	; actually need to do this, if r12 points to P2, but it's cheap
	; and easy to do.
	nop				; #1
	nop				; #2
	nop				; #3
	nop				; #4
	nop				; #5
	nop				; #6
	nop				; #7
	nop				; #8
	jmp	@r12
	nop

	SETCONST