home *** CD-ROM | disk | FTP | other *** search
- ; Chunky2Planar algorithm, originally by James McCoull
- ; Modified by Peter McGavin for variable size and depth
- ; and "dirty list" (hope I didn't slow it down too much)
- ;
- ; Cpu only solution VERSION 2
- ; Optimised for 040+fastram
- ; bitplanes are assumed contiguous!
- ; analyse instruction offsets to check performance
-
- ;void __asm c2p_8_040 (register __a0 UBYTE *chunky_data,
- ; register __a1 PLANEPTR raster,
- ; register __a2 UBYTE *dirty_list,
- ; register __d1 ULONG plsiz);
-
- ; a0 -> width*height chunky pixels
- ; a1 -> contiguous bitplanes
- ; a2 -> dirty list (1-byte flag for whether each 32 pixel "unit" needs updating)
- ; d1 = width*height/8 (width*height must be a multiple of 32)
-
- ; Export the entry point that matches the assembly-time symbol `depth`
- ; (8, 6 or 4 bitplanes).  Any other value aborts the assembly.  The label
- ; itself emits no code, so a call to it runs whatever instructions follow.
-         ifeq    depth-8
-         xdef    _c2p_8_040
- _c2p_8_040:
-         endc
-
-         ifeq    depth-6
-         xdef    _c2p_6_040
- _c2p_6_040:
-         endc
-
-         ifeq    depth-4
-         xdef    _c2p_4_040
- _c2p_4_040:
-         endc
-
- ; The product is zero exactly when depth is one of 8, 6 or 4.
-         ifne    (depth-8)*(depth-6)*(depth-4)
-         fail    "unsupported depth!"
-         endc
-
- ; merge in1,in2,tmp3,tmp4,mask,shift
- ;
- ; Exchanges bit groups between two data registers.  Writing each register
- ; as a row of groups, with "mask" selecting the lower group of every pair
- ; and "shift" being the group width in bits:
- ;       before:  \1 = abqr   \2 = ijyz
- ;       after:   \1 = aiqy   \2 = bjrz
- ; \3 and \4 are scratch data registers and are overwritten.
- merge   macro
-         move.l  \2,\4           ; \4 = ijyz (copy of in2)
-         move.l  #\5,\3          ; \3 = mask
-         and.l   \3,\2           ; \2 = 0j0z
-         and.l   \1,\3           ; \3 = 0b0r
-         eor.l   \3,\1           ; \1 = a0q0
-         eor.l   \2,\4           ; \4 = i0y0
-         ifne    \6-1
-         lsl.l   #\6,\3          ; \3 = b0r0
-         else
-         add.l   \3,\3           ; \3 = b0r0 (a shift by one, done as an add)
-         endc
-         lsr.l   #\6,\4          ; \4 = 0i0y
-         or.l    \3,\2           ; \2 = bjrz
-         or.l    \4,\1           ; \1 = aiqy
-         endm
-
- ; merge4 in1,in2,tmp3,tmp4,mask
- ;
- ; The 4-bit step of the conversion.  For depth > 4 it is simply
- ; "merge ...,4".  For depth <= 4 only the second result is produced:
- ; \2 becomes bjrz, \1 is left unchanged and \4 is not touched.
- merge4  macro
-         ifle    depth-4
-         move.l  #\5,\3          ; \3 = mask
-         and.l   \3,\2           ; \2 = 0j0z
-         and.l   \1,\3           ; \3 = 0b0r
-         lsl.l   #4,\3           ; \3 = b0r0
-         or.l    \3,\2           ; \2 = bjrz
-         else
-         merge   \1,\2,\3,\4,\5,4
-         endc
-         endm
-
-
- start: ; same address as c2p; only the disabled relocation code below refers to it
- ; jmp next ; self-modified code here
- ;next:
- ; movem.l d1/a0-a2,-(sp)
- ;; relocate c2p to a 16-aligned address
- ; lea (c2p,pc),a0
- ; move.l a0,d0
- ; and.b #%11110000,d0
- ; move.l d0,a1
- ;
- ;; patch jmp
- ; move.l d0,start+2
- ; move.w #(end-c2p)-1,d0
- ;loop: move.b (a0)+,(a1)+
- ; dbra d0,loop
- ;
- ;; flush cache
- ; move.l (4).w,a6
- ; jsr (_LVOCacheClearU,a6)
- ;
- ;; restore parameters and restart
- ; movem.l (sp)+,d1/a0-a2
- ; bra.b start
- ;
- ; ds.w 8 ; space for relocation
-
- ;-----------------------------------------------------------------------
- ; c2p - the real chunky-to-planar routine starts here
- ;
- ; In:     a0 = chunky buffer, one byte per pixel
- ;         a1 = output area; bitplane n starts at a1 + n*plsiz
- ;         a2 = dirty list, one byte per 32-pixel unit (non-zero = convert)
- ;         d1 = plsiz, the size of one bitplane in bytes (pixels/8)
- ; Saved:  d2-d7/a2-a6.  d0, d1, a0 and a1 are not preserved.
- ;
- ; Registers after set-up:
- ;         a3 = plsiz, a4 = end of chunky data,
- ;         a5/a6 = holding space for two data registers during conversion
- ;
- ; Stack frame (44 bytes).  main_case is entered with bsr, so inside it
- ; every offset below is 4 larger:
- ;         0..28   plane0..plane7 longwords of the last converted unit
- ;         32      output address those longwords still have to go to
- ;         36      current output address, parked while main_case runs
- ;         40      unused
- ;
- ; Planar data is written one unit late.  first_case converts the first
- ; dirty unit and leaves the result in the frame.  main_case then converts
- ; each further dirty unit while storing the previous unit's longwords in
- ; between the merge steps.  final_case stores the last pending unit.
- ;
- ; The dirty list is tested before the first end-of-buffer check, so an
- ; empty buffer (d1 = 0) is not handled.
- ; NOTE(review): the list is tested a longword at a time and, once a dirty
- ; unit has been found, from an address that need not be a multiple of 4.
- ; Up to 3 bytes past the end of the list can therefore still be read;
- ; they presumably have to be zero - confirm with the caller.
- ;-----------------------------------------------------------------------
- c2p:
-         movem.l d2-d7/a2-a6,-(sp)
-
-         sub.w   #44,sp          ; space for temporary variables
-
-         movea.l d1,a3           ; a3 = plsiz
-
-         move.l  a0,a4
-         lsl.l   #3,d1           ; plsiz*8 = number of chunky bytes
-         add.l   d1,a4           ; a4 -> end of chunky data
-
- ; ---- find the first dirty unit -----------------------------------------
- first_loop:
-         tst.l   (a2)+           ; do the next 128 pixels need updating?
-         bne.b   first_patch     ; branch if yes
-
-         adda.w  #128,a0         ; skip 128 pixels on input
-         adda.w  #16,a1          ; skip 128 pixels on output
-
-         cmpa.l  a0,a4
-         bhi.b   first_loop      ; continue only while a0 < a4; at a0 = a4
-                                 ; nothing is left, so do not test the list again
-         bra.w   exit            ; exit if no changes found
-
- first_patch:
-         subq.l  #4,a2           ; back to the first flag of this longword
-         tst.b   (a2)+           ; do the next 32 pixels need updating?
-         bne.b   first_case      ; branch if yes
-         adda.w  #32,a0          ; skip 32 pixels on input
-         addq.l  #4,a1           ; skip 32 pixels on output
-         tst.b   (a2)+           ; do the next 32 pixels need updating?
-         bne.b   first_case      ; branch if yes
-         adda.w  #32,a0          ; skip 32 pixels on input
-         addq.l  #4,a1           ; skip 32 pixels on output
-         tst.b   (a2)+           ; do the next 32 pixels need updating?
-         bne.b   first_case      ; branch if yes
-         adda.w  #32,a0          ; skip 32 pixels on input
-         addq.l  #4,a1           ; skip 32 pixels on output
-         tst.b   (a2)+           ; do the next 32 pixels need updating?
-         bne.b   first_case      ; branch if yes
-         adda.w  #32,a0          ; skip 32 pixels on input
-         addq.l  #4,a1           ; skip 32 pixels on output
-         bra.b   first_loop      ; this should never happen
-
- ; ---- convert the first dirty unit, keep the result in the frame --------
- ; Each data register gets two pixels from the first 16 bytes in its high
- ; word and the two pixels 16 bytes further on in its low word.
- first_case:
-         move.l  (0,a0),d1
-         move.l  (4,a0),d3
-         move.l  (8,a0),d0
-         move.l  (12,a0),d2
-         move.l  (2,a0),d4
-         move.l  (10,a0),d5
-         move.l  (6,a0),d6
-         move.l  (14,a0),d7
-
-         move.w  (16,a0),d1
-         move.w  (24,a0),d0
-         move.w  (20,a0),d3
-         move.w  (28,a0),d2
-         move.w  (18,a0),d4
-         move.w  (26,a0),d5
-         move.w  (22,a0),d6
-         move.w  (30,a0),d7
-
-         adda.w  #32,a0
-
-         move.l  d6,a5           ; d6/d7 are the merge scratch registers,
-         move.l  d7,a6           ; so park their pixel data in a5/a6
-
-         merge   d1,d0,d6,d7,$00ff00ff,8
-         merge   d3,d2,d6,d7,$00ff00ff,8
-
-         merge4  d1,d3,d6,d7,$0f0f0f0f
-         merge4  d0,d2,d6,d7,$0f0f0f0f
-
-         exg     d1,a5           ; fetch the parked data, park d1/d0
-         exg     d0,a6
-
-         merge   d4,d5,d6,d7,$00ff00ff,8
-         merge   d1,d0,d6,d7,$00ff00ff,8
-
-         merge4  d4,d1,d6,d7,$0f0f0f0f
-         merge4  d5,d0,d6,d7,$0f0f0f0f
-
-         merge   d3,d1,d6,d7,$33333333,2
-         merge   d2,d0,d6,d7,$33333333,2
-
-         merge   d3,d2,d6,d7,$55555555,1
-         merge   d1,d0,d6,d7,$55555555,1
-
-         move.l  d0,(0*4,sp)     ;plane0 (movem.l is slower!)
-         move.l  d1,(1*4,sp)     ;plane1
-         move.l  d2,(2*4,sp)     ;plane2
-         move.l  d3,(3*4,sp)     ;plane3
-
-         ifgt    depth-4
-
-         move.l  a5,d3
-         move.l  a6,d2
-
-         merge   d3,d4,d6,d7,$33333333,2
-         merge   d2,d5,d6,d7,$33333333,2
-
-         ifgt    depth-6
-         merge   d3,d2,d6,d7,$55555555,1
-         endc
-         merge   d4,d5,d6,d7,$55555555,1
-
-         move.l  d5,(4*4,sp)     ;plane4
-         move.l  d4,(5*4,sp)     ;plane5
-
-         ifgt    depth-6
-         move.l  d2,(6*4,sp)     ;plane6
-         move.l  d3,(7*4,sp)     ;plane7
-         endc
-
-         endc
-
-         move.l  a1,(32,sp)      ; save output address
-         addq.l  #4,a1           ; skip 32 pixels on output
-
-         cmpa.l  a0,a4
-         beq.w   final_case
-
- ; ---- scan the rest of the dirty list -----------------------------------
- main_loop:
-         tst.l   (a2)+           ; do the next 128 pixels need updating?
-         bne.b   main_patch      ; branch if yes
-
-         adda.w  #128,a0         ; skip 128 pixels on input
-         adda.w  #16,a1          ; skip 128 pixels on output
-
-         cmpa.l  a0,a4
-         bhi.b   main_loop       ; continue only while a0 < a4
-         bra.w   final_case      ; exit if no changes found
-
- main_patch:
-         subq.l  #4,a2           ; back to the first flag of this longword
-         tst.b   (a2)+           ; do the next 32 pixels need updating?
-         beq.b   1$              ; branch if no
-         bsr.b   main_case
- 1$:     adda.w  #32,a0          ; skip 32 pixels on input
-         addq.l  #4,a1           ; skip 32 pixels on output
-         tst.b   (a2)+           ; do the next 32 pixels need updating?
-         beq.b   2$              ; branch if no
-         bsr.b   main_case
- 2$:     adda.w  #32,a0          ; skip 32 pixels on input
-         addq.l  #4,a1           ; skip 32 pixels on output
-         tst.b   (a2)+           ; do the next 32 pixels need updating?
-         beq.b   3$              ; branch if no
-         bsr.b   main_case
- 3$:     adda.w  #32,a0          ; skip 32 pixels on input
-         addq.l  #4,a1           ; skip 32 pixels on output
-         tst.b   (a2)+           ; do the next 32 pixels need updating?
-         beq.b   4$              ; branch if no
-         bsr.b   main_case
- 4$:     adda.w  #32,a0          ; skip 32 pixels on input
-         addq.l  #4,a1           ; skip 32 pixels on output
-         cmpa.l  a0,a4
-         bhi.b   main_loop       ; continue only while a0 < a4
-         bra.w   final_case      ; exit if no changes found
-
- ; ---- convert the unit at a0 and flush the previous unit ----------------
- ; Subroutine of main_patch.  a0 and a1 are the same on return as on entry.
- ; All frame offsets carry +4 for the return address.
- main_case:
-         move.l  a1,(36+4,sp)    ; save current output address
-         move.l  (32+4,sp),a1    ; a1 = previous output address
-
-         move.l  (0,a0),d1
-         move.l  (4,a0),d3
-         move.l  (8,a0),d0
-         move.l  (12,a0),d2
-         move.l  (2,a0),d4
-         move.l  (10,a0),d5
-         move.l  (6,a0),d6
-         move.l  (14,a0),d7
-
-         move.w  (16,a0),d1
-         move.w  (24,a0),d0
-         move.w  (20,a0),d3
-         move.w  (28,a0),d2
-         move.w  (18,a0),d4
-         move.w  (26,a0),d5
-         move.w  (22,a0),d6
-         move.w  (30,a0),d7
-
-         move.l  d6,a5
-         move.l  d7,a6
-
-         move.l  (0*4+4,sp),(a1) ;plane0
-         adda.l  a3,a1           ;a1+=plsiz
-
-         merge   d1,d0,d6,d7,$00ff00ff,8
-         merge   d3,d2,d6,d7,$00ff00ff,8
-
-         move.l  (1*4+4,sp),(a1) ;plane1
-         adda.l  a3,a1           ;a1+=plsiz
-
-         merge4  d1,d3,d6,d7,$0f0f0f0f   ; merge4 as in first_case: at
-         merge4  d0,d2,d6,d7,$0f0f0f0f   ; depth 4 only \2 is needed
-
-         exg     d1,a5
-         exg     d0,a6
-
-         move.l  (2*4+4,sp),(a1) ;plane2
-         adda.l  a3,a1           ;a1+=plsiz
-
-         merge   d4,d5,d6,d7,$00ff00ff,8
-         merge   d1,d0,d6,d7,$00ff00ff,8
-
-         move.l  (3*4+4,sp),(a1) ;plane3
-         adda.l  a3,a1           ;a1+=plsiz
-
-         merge4  d4,d1,d6,d7,$0f0f0f0f
-         merge4  d5,d0,d6,d7,$0f0f0f0f
-
-         ifgt    depth-4
-         move.l  (4*4+4,sp),(a1) ;plane4
-         adda.l  a3,a1           ;a1+=plsiz
-         endc
-
-         merge   d3,d1,d6,d7,$33333333,2
-         merge   d2,d0,d6,d7,$33333333,2
-
-         ifgt    depth-4
-         move.l  (5*4+4,sp),(a1) ;plane5
-         adda.l  a3,a1           ;a1+=plsiz
-         endc
-
-         merge   d3,d2,d6,d7,$55555555,1
-         merge   d1,d0,d6,d7,$55555555,1
-
-         move.l  d0,(0*4+4,sp)   ;plane0 (movem.l is slower!)
-         move.l  d1,(1*4+4,sp)   ;plane1
-         move.l  d2,(2*4+4,sp)   ;plane2
-         move.l  d3,(3*4+4,sp)   ;plane3
-
-         ifgt    depth-4
-         move.l  a5,d3
-         move.l  a6,d2
-
-         ifgt    depth-6
-         move.l  (6*4+4,sp),(a1) ;plane6
-         adda.l  a3,a1           ;a1+=plsiz
-         endc
-
-         merge   d3,d4,d6,d7,$33333333,2
-         merge   d2,d5,d6,d7,$33333333,2
-
-         ifgt    depth-6
-         move.l  (7*4+4,sp),(a1) ;plane7
-         adda.l  a3,a1           ;a1+=plsiz
-         endc
-
-         ifgt    depth-6
-         merge   d3,d2,d6,d7,$55555555,1
-         endc
-         merge   d4,d5,d6,d7,$55555555,1
-
-         move.l  d5,(4*4+4,sp)   ;plane4
-         move.l  d4,(5*4+4,sp)   ;plane5
-
-         ifgt    depth-6
-         move.l  d2,(6*4+4,sp)   ;plane6
-         move.l  d3,(7*4+4,sp)   ;plane7
-         endc
-
-         endc
-
-         movea.l (36+4,sp),a1    ; restore current output address
-         move.l  a1,(32+4,sp)    ; it is where the unit just converted goes
-
-         rts
-
- ; ---- store the last converted unit, which is still in the frame --------
- final_case:
-         move.l  (32,sp),a1      ; a1 = previous output address
-
-         move.l  (0*4,sp),(a1)   ;plane0
-         adda.l  a3,a1           ;a1+=plsiz
-         move.l  (1*4,sp),(a1)   ;plane1
-         adda.l  a3,a1           ;a1+=plsiz
-         move.l  (2*4,sp),(a1)   ;plane2
-         adda.l  a3,a1           ;a1+=plsiz
-         move.l  (3*4,sp),(a1)   ;plane3
-         ifgt    depth-4
-         adda.l  a3,a1           ;a1+=plsiz
-         move.l  (4*4,sp),(a1)   ;plane4
-         adda.l  a3,a1           ;a1+=plsiz
-         move.l  (5*4,sp),(a1)   ;plane5
-         ifgt    depth-6
-         adda.l  a3,a1           ;a1+=plsiz
-         move.l  (6*4,sp),(a1)   ;plane6
-         adda.l  a3,a1           ;a1+=plsiz
-         move.l  (7*4,sp),(a1)   ;plane7
-         endc
-         endc
-
- exit:
-         add.w   #44,sp          ; release the temporary variables
-         movem.l (sp)+,d2-d7/a2-a6
-         rts
-
- cnop 0,4 ; pad to a longword boundary
- end: ; end-of-routine label; only the disabled relocation code uses it (end-c2p)
-
- end ; end of source
-