home *** CD-ROM | disk | FTP | other *** search
- ; Chunky2Planar algorithm, originally by James McCoull
- ; Modified by Peter McGavin for variable size and depth
- ; and "dirty list" (hope I didn't slow it down too much)
- ;
- ; Cpu only solution
- ; Optimised for 020+fastram
- ; Aim for less than 90ms for 320x200x256 on 14MHz 020
-
- ;void __asm c2p_8 (register __a0 UBYTE *chunky_data,
- ; register __a1 PLANEPTR raster,
- ; register __a2 UBYTE *dirty_list,
- ; register __d1 ULONG plsiz,
- ; register __a5 UBYTE *tmp_buffer);
-
- ; a0 -> width*height chunky pixels in fastmem
- ; a1 -> contiguous bitplanes in chipmem
- ; a2 -> dirty list (1-byte flag for whether each 32 pixel "unit" needs updating)
- ; d1 = width*height/8 (width*height must be a multiple of 32)
- ; a5 -> width*height tmp buffer in fastmem
-
- ifeq depth-8 ; depth = 8: export the 8-bitplane entry point
- xdef _c2p_8_020
- _c2p_8_020: ; entry point (execution continues at start)
- else
- ifeq depth-6 ; depth = 6: export the 6-bitplane entry point
- xdef _c2p_6_020
- _c2p_6_020: ; entry point (execution continues at start)
- else
- ifeq depth-4 ; depth = 4: export the 4-bitplane entry point
- xdef _c2p_4_020
- _c2p_4_020: ; entry point (execution continues at start)
- else
- fail "unsupported depth!" ; only depths 8, 6 and 4 are handled
- endc
- endc
- endc
-
-
- wordmerge macro ; i1 i2 tmp
- ; \1 \2 \3 -- in: \1 = AB, \2 = CD (A-D are 16-bit words); out: \1 = AC, \2 = BD; \3 is scratch
- move.l \2,\3 ;\3 = CD (keep a copy of \2)
- move.w \1,\2 ;\2 = CB
- swap \2 ;\2 = BC
- move.w \2,\1 ;\1 = AC
- move.w \3,\2 ;\2 = BD (low word taken from the saved copy)
- endm
-
-
- merge macro ; io in out tmp msk sft
- ; \1 \2 \3 \4 \5 \6 -- \5 = bit mask, \6 = shift count; \2 and \4 are clobbered
- ; \1 = abqr on entry (a, q = bits outside the mask; b, r = bits under it); aiqy on exit
- ; \2 = ijyz on entry; the second result, bjrz, is left in \3
- move.l \5,\3 ; \3 = 0x0x
- move.l \3,\4 ; \4 = 0x0x
- and.l \1,\3 ; \3 = 0b0r
- and.l \2,\4 ; \4 = 0j0z
- eor.l \3,\1 ; \1 = a0q0
- eor.l \4,\2 ; \2 = i0y0
- lsr.l #\6,\2 ; \2 = 0i0y
- ifeq \6-1 ; shift of 1: double \3 instead of using lsl
- add.l \3,\3 ; \3 = b0r0
- else
- lsl.l #\6,\3 ; \3 = b0r0
- endc
- or.l \2,\1 ; \1 = aiqy
- or.l \4,\3 ; \3 = bjrz
- endm
-
- merge4 macro ; io in out tmp msk
- ; \1 \2 \3 \4 \5 -- like merge with a fixed shift of 4; results are stored through (a5)+; a sixth argument passed by callers is not referenced
- ; \1 = abqr (depth > 4: aiqy is stored first, then bjrz; otherwise only bjrz is stored)
- ; \2 = ijyz
- ifgt depth-4 ; more than 4 planes: both nibble results are produced
- move.l \5,\3 ; \3 = 0x0x
- move.l \3,\4 ; \4 = 0x0x
- and.l \1,\3 ; \3 = 0b0r
- and.l \2,\4 ; \4 = 0j0z
- eor.l \3,\1 ; \1 = a0q0
- eor.l \4,\2 ; \2 = i0y0
- lsr.l #4,\2 ; \2 = 0i0y
- or.l \2,\1 ; \1 = aiqy
- move.l \1,(a5)+ ; write to tmp buffer
- lsl.l #4,\3 ; \3 = b0r0
- or.l \4,\3 ; \3 = bjrz
- move.l \3,(a5)+ ; write to tmp buffer
- else ; 4 planes or fewer: \1 is left unchanged and \4 is unused
- move.l \5,\3 ; this version returns only 1 result
- and.l \3,\2 ; \2 = 0j0z
- and.l \1,\3 ; \3 = 0b0r
- lsl.l #4,\3 ; \3 = b0r0
- or.l \2,\3 ; \3 = bjrz
- move.l \3,(a5)+ ; write to tmp buffer
- endc
- endm
-
- merge1 macro ; io in out tmp msk flg
- ; \1 \2 \3 \4 \5 \6 -- merge with a shift of 1 that stores through (a2); a5 is subtracted from a2 after each store
- ; \1 = abqr (the aiqy result is always stored)
- ; \2 = ijyz (the bjrz result ends up in \3; it is stored only when flg \6 is non-zero)
- move.l \5,\3 ; \3 = 0x0x
- move.l \3,\4 ; \4 = 0x0x
- and.l \1,\3 ; \3 = 0b0r
- and.l \2,\4 ; \4 = 0j0z
- eor.l \3,\1 ; \1 = a0q0
- eor.l \4,\2 ; \2 = i0y0
- lsr.l #1,\2 ; \2 = 0i0y
- or.l \2,\1 ; \1 = aiqy
- move.l \1,(a2) ; write to output plane
- suba.l a5,a2 ; -plsiz
- add.l \3,\3 ; \3 = b0r0
- or.l \4,\3 ; \3 = bjrz
- ifne \6 ; flg set: store the second result as well
- move.l \3,(a2) ; write to output plane
- suba.l a5,a2 ; -plsiz
- endc
- endm
-
-
- start: jmp next ; self-modified code here: the operand is patched below to the relocated c2p
- next: movem.l d1/a0-a1/a6,-(sp) ; preserve the caller's d1/a0/a1 arguments and a6 during relocation
-
- ; Relocate c2p so that firstsweep2 is at a quad-longword-aligned address.
- ; Firstsweep2 loop doesn't fit in '020/'030 cache unless it is exactly aligned.
- ; Speed penalty of misalignment is about 30%.
-
- lea (firstsweep2,pc),a0 ; a0 = current address of firstsweep2
- move.l a0,d0
- and.w #%00001111,d0 ; relocate by -d0.w bytes (d0.w = address mod 16)
-
- lea (c2p,pc),a0 ; a0 = src
- movea.l a0,a1
- sub.w d0,a1 ; a1 = dst
-
- move.l a1,start+2 ; patch jmp so start now jumps to the relocated c2p
-
- move.w #(end-c2p)/2-1,d0 ; d0 = number of words to copy, minus 1 for dbra
- loop: move.w (a0)+,(a1)+ ; relocate code loop
- dbra d0,loop
-
- move.l (4).w,a6 ; flush cache (a6 = longword at absolute address 4, the Exec library base on AmigaOS)
- jsr (_LVOCacheClearU,a6)
-
- movem.l (sp)+,d1/a0-a1/a6 ; restore the caller's arguments
- bra.b start ; restart through the patched jmp
-
- ds.w 8 ; space for relocation of c2p routine (16 bytes; the move is at most 15)
-
- ; the real c2p routine starts here
- c2p:
- movem.l d2-d7/a2-a6,-(sp) ; preserve d2-d7/a2-a6 for the caller
-
- sub.w #24,sp ; space for temporary variables: (0) scratch, (4) output, (8) dirty list, (12) plsiz, (16) top-plane offset, (20) tmp buffer
-
- ; a0 = chunky buffer
- ; a1 = output area
- ; a2 = dirty list
- ; d1 = plsiz
- ; a5 = tmp buffer
-
- move.l a1,(4,sp) ; save output address
- move.l a2,(8,sp) ; save dirty list ptr
- move.l d1,(12,sp) ; save plsiz
- lsl.l #3,d1 ; d1 = 8*plsiz = size of the chunky buffer in bytes
- movea.l a0,a1
- adda.l d1,a1 ; a1 -> end of chunky buffer
- sub.l (12,sp),d1 ; d1 = 7*plsiz
- ifle depth-6 ; depth 6 or less: reduce to 5*plsiz
- sub.l (12,sp),d1
- sub.l (12,sp),d1
- endc
- ifle depth-4 ; depth 4 or less: reduce to 3*plsiz
- sub.l (12,sp),d1
- sub.l (12,sp),d1
- endc
- move.l d1,(16,sp) ; save 7*plsiz (or 5*plsiz) (or 3*plsiz)
- move.l a5,(20,sp) ; save tmp buffer address
-
- ;; Sweep thru the whole chunky data once,
- ;; Performing 3 merge operations on it.
-
- move.l #$00ff00ff,a3 ; load byte merge mask
- move.l #$0f0f0f0f,a4 ; load nibble merge mask
-
- ; pass 1
- firstsweep: tst.b (a2)+ ; does next 32 pixel unit need updating?
- bne.b firstsweep3
-
- adda.w #32,a0 ; skip 32 pixels on input/output
-
- cmpa.l a0,a1 ; reached the end of the chunky buffer?
- bne.b firstsweep
- bra.w exit ; exit if no changes
-
- ; this becomes the first sweep's main loop after the first change is found
- firstsweep2: tst.b (a2)+ ; does next 32 pixel unit need updating?
- bne.b firstsweep3
-
- adda.w #32,a0 ; skip 32 pixels on input
-
- cmpa.l a0,a1 ; reached the end of the chunky buffer?
- bne.b firstsweep2
- bra.w secondsweep ; on to second sweep if changes
-
- firstsweep3:
- movem.l (a0)+,d0-d7 ; get 32 pixels in registers
- ; d0-7 = abcd efgh ijkl mnop qrst uvwx yzAB CDEF
-
- wordmerge d0,d4,a6 ;d0/4 = abqr cdst
- wordmerge d1,d5,a6 ;d1/5 = efuv ghwx
- wordmerge d2,d6,a6 ;d2/6 = ijyz klAB
- wordmerge d3,d7,a6 ;d3/7 = mnCD opEF
-
- ; temporarily save off some registers
- movea.l d7,a6 ; d7 is parked in a6
- move.l d6,(sp) ; d6 is parked in the scratch slot at (sp)
-
- ; pass 2
- merge d0,d2,d6,d7,a3,8 ;d0/d6 = aiqy bjrz
- merge d1,d3,d7,d2,a3,8 ;d1/d7 = emuC fnvD
-
- ; pass 3
- merge4 d0,d1,d2,d3,a4,4 ;d0/d2 = ae74... ae30...
- merge4 d6,d7,d3,d1,a4,4 ;d6/d3 = bf74... bf30...
-
- ; bring them back
- move.l a6,d7
- move.l (sp),d6
-
- ; pass 2
- merge d4,d6,d0,d1,a3,8 ;d4/d0 = cksA dltB
- merge d5,d7,d1,d6,a3,8 ;d5/d1 = gowE hpxF
-
- ; pass 3
- merge4 d4,d5,d6,d7,a4,4 ;d4/d6 = cg74.. cg30..
- merge4 d0,d1,d7,d5,a4,4 ;d0/d7 = dh74.. dh30..
-
- cmpa.l a0,a1 ; reached the end of the chunky buffer?
- bne.w firstsweep2 ; end of firstsweep, 250 bytes
- ; only just fits in instr cache
-
- ; (a0) ae74.. ae30.. bf74.. bf30.. cg74.. cg30.. dh74.. dh30..
-
- secondsweep:
- movea.l a5,a1 ; a1 -> end of tmp buffer (end of the data stored by the first sweep)
- movea.l (4,sp),a2 ; a2 -> plane0
- movea.l (8,sp),a6 ; a6 -> dirty list
- movea.l (12,sp),a5 ; a5 = plsiz
- adda.l (16,sp),a2 ; a2 -> plane7
- movea.l (20,sp),a0 ; a0 -> tmp buffer
-
- movea.l #$33333333,a3 ; mask used by the pass 4 merges
- movea.l #$55555555,a4 ; mask used by the pass 5 merges
-
- bra.b secondsweep2 ; enter the loop at the dirty-flag test
-
- secondsweep3: addq.l #4,a2 ; skip 32 pixels on output
-
- secondsweep2: tst.b (a6)+ ; does next 32 pixel unit need updating?
- beq.b secondsweep3
-
- ifgt depth-4 ; more than 4 planes: handle the upper planes first
-
- movem.l (a0)+,d0-d6 ; read tmp buffer, not d7 yet
-
- ; save d5 temporarily
- move.l d5,(sp)
-
- ;; pass 4
- merge d0,d4,d5,d7,a3,2 ; d0/d5 = aceg76.. aceg54..
- merge d2,d6,d7,d4,a3,2 ; d2/d7 = bdfh76.. bdfh54..
-
- ;; pass 5
- ifgt depth-6 ; more than 6 planes only
- merge1 d0,d2,d4,d6,a4,1 ; d0/d4 = abcd7... abcd6...
- endc
- merge1 d5,d7,d6,d2,a4,1 ; d5/d6 = abcd5... abcd4...
-
-
- ; restore d5 and finally get d7
- move.l (sp),d5
- move.l (a0)+,d7
-
- else
-
- movem.l (a0)+,d1/d3/d5/d7 ; read tmp buf, depth 4 version
-
- endc
-
- ;; pass 4
- merge d1,d5,d4,d6,a3,2 ; d1/d4 = aceg32.. aceg10..
- merge d3,d7,d6,d5,a3,2 ; d3/d6 = bdfh32.. bdfh10..
-
- ;; pass 5
- merge1 d1,d3,d5,d7,a4,1 ; d1/d5 = abcd3... abcd2...
- merge1 d4,d6,d7,d3,a4,0 ; d4/d7 = abcd1... abcd0... (flg 0: d7 is stored below)
-
- move.l d7,(a2)+ ; plane 0 (the post-increment moves on to the next 32 pixel unit)
- adda.l (16,sp),a2 ; +7*plsiz (or 5*plsiz) (or 3*plsiz)
-
- cmp.l a0,a1 ; has all of the tmp buffer data been consumed?
- bne.w secondsweep2 ; end of secondsweep, 216 bytes
-
- exit:
- add.w #24,sp ; release the 24 bytes of temporary variables
- movem.l (sp)+,d2-d7/a2-a6 ; restore the registers saved at c2p
- rts
- end: ; end of the relocatable routine (used as end-c2p for the copy length)
- end
-