home *** CD-ROM | disk | FTP | other *** search
- Path: sparky!uunet!ogicse!cs.uoregon.edu!mystix.cs.uoregon.edu!mkelly
- From: mkelly@mystix.cs.uoregon.edu (Michael A. Kelly)
- Newsgroups: comp.sys.mac.programmer
- Subject: Re: Help! making an assembly routine faster
- Message-ID: <1992Nov16.014850.28678@cs.uoregon.edu>
- Date: 16 Nov 92 01:48:50 GMT
- Article-I.D.: cs.1992Nov16.014850.28678
- References: <1992Nov14.091905.29520@cs.uoregon.edu> <1992Nov14.200831.20477@nntp.hut.fi>
- Sender: news@cs.uoregon.edu (Netnews Owner)
- Organization: University of Oregon Computer and Information Sciences Dept.
- Lines: 394
-
- In article <1992Nov14.200831.20477@nntp.hut.fi> jmunkki@vipunen.hut.fi (Juri Munkki) writes:
- >In article <1992Nov14.091905.29520@cs.uoregon.edu> mkelly@mystix.cs.uoregon.edu (Michael A. Kelly) writes:
- >>Hey, all you assembly hackers! How can I make this routine faster? As it
- >>is it's only about 40% faster than CopyMask. (I'm using Think C 5.)
- >
- >This sounds like something I might be able to help with... let's see...
- >
- >> src = GetPixBaseAddr( srcMap ) + (long) ((*srcMap)->rowBytes & 0x3FFF) * srcPt.v + srcPt.h;
- >
- >Shouldn't you cast to long before the multiply? It looks to me like you are
- >casting the result of a short multiply, but I could be wrong, since I don't
- >want to check this from a C book right now.
-
- Yep, but if you look closely at the parens, I think you'll find that that's
- what I'm doing.
-
- >Another possibility is to grab just a few mask bits (like 4, as I
- >suggested) at a time and write special code for all the 16 possible
- >cases. Use a jump table to select the code to use.
-
- OK, I did that, and managed to almost triple the speed of my original routine,
- making the new routine about four times as fast as CopyMask. And yet, I'd
- like to make it even faster. So suggestions are welcome.
-
- Someone else suggested that I just make the mask the same depth as the pixmaps,
- so that I could use the mask directly instead of having to extract bits from
- it. This turned out to be slower than the jump table approach, only about
- three times as fast as CopyMask. Of course, the problem could be with my
- assembly skills rather than with the theory.
-
- So, here are the resulting routines. The first uses the jump table approach,
- the second uses the wide mask approach. Can they be made even faster??
-
-
- /*
-  * Quick8CopyMask  (1-bit mask, jump-table version)
-  *
-  * The QuickXCopyMask family are much faster versions of CopyMask
-  * that don't do clipping, dithering, etc.  The source and destination
-  * PixMaps are expected to have the same bit depth.  The X in the name
-  * represents the expected bit depth of the source and destination PixMaps;
-  * this routine moves one byte per pixel.
-  *
-  * srcMap, dstMap   pixmaps to copy from / to
-  * mask             1 bit per pixel, most significant bit = leftmost pixel;
-  *                  a set bit copies the pixel, a clear bit leaves the
-  *                  destination untouched.  The mask covers exactly the
-  *                  copied rectangle; every mask row starts on a byte
-  *                  boundary, i.e. occupies (width + 7) / 8 bytes.
-  * srcPt, dstPt     top-left corner of the rectangle in each pixmap, used
-  *                  directly as byte/row offsets from the base address
-  *                  (the pixmap bounds origin is not taken into account)
-  * width, height    size of the rectangle in pixels; nothing is copied
-  *                  when either is zero or negative
-  *
-  * NOTE(review): the word and long moves can land on odd addresses when
-  * srcPt.h / dstPt.h are odd, which a plain 68000 does not allow -- confirm
-  * the target CPUs are 68020 or later.
-  * NOTE(review): the asm block uses D0-D3 and A0-A2 without saving them
-  * explicitly -- confirm the compiler preserves the ones it needs.
-  */
-
- void Quick8CopyMask(
-     PixMapHandle srcMap,
-     PixMapHandle dstMap,
-     Ptr mask,
-     Point srcPt,
-     Point dstPt,
-     short width,
-     short height )
- {
-
-     register char *src;
-     register char *dst;
-     register long srcNewline;
-     register long dstNewline;
-     char mode32 = QD32COMPATIBLE;
-     short w = (width >> 3) - 1;     // full mask bytes per row, minus 1 for DBF (-1 = none)
-     short e = (width & 0x07) - 1;   // leftover pixels per row, minus 1 for DBF (-1 = none)
-     short h = height - 1;           // rows, minus 1 for DBF
-
-     // DBF treats a counter of -1 as 65535 more passes, so an empty
-     // rectangle must never reach the loops below
-
-     if ( width <= 0 || height <= 0 ) {
-         return;
-     }
-
-     // Set up pointers to the beginning of the memory to copy
-     // and calculate the newline value for the source and destination
-
-     src = GetPixBaseAddr( srcMap ) + (long) ((*srcMap)->rowBytes & 0x3fff) * srcPt.v + srcPt.h;
-     srcNewline = ((*srcMap)->rowBytes & 0x3fff) - width;
-
-     dst = GetPixBaseAddr( dstMap ) + (long) ((*dstMap)->rowBytes & 0x3fff) * dstPt.v + dstPt.h;
-     dstNewline = ((*dstMap)->rowBytes & 0x3fff) - width;
-
-     // Switch into 32 bit addressing mode
-
-     SwapMMUMode( &mode32 );
-
-     // Copy the rect from the source to the destination
-
-     asm {
-
-         MOVE.W      h, D0               ; put height loop variable in D0
-         MOVEA.L     src, A0             ; put the source pixmap address in A0
-         MOVEA.L     dst, A1             ; put the destination address in A1
-         MOVEA.L     mask, A2            ; put the mask address in A2
-
-     @1:                                 ; copy the next row
-         MOVE.W      w, D1
-         BLT         @extra              ; row is narrower than 8 pixels: no full mask byte
-
-     @2:                                 ; copy the next eight bytes in the row
-
-         MOVE.B      (A2), D2            ; fetch the next mask byte (MOVE sets Z for the test)
-         BEQ         @nocopy             ; if zero, don't copy anything
-
-         CMPI.B      #0xFF, D2
-         BNE         @hardway            ; don't copy everything
-
-         MOVE.L      (A0)+, (A1)+        ; copy all bytes
-         MOVE.L      (A0)+, (A1)+
-         ADDQ.L      #1, A2
-         JMP         @endloop
-
-     @nocopy:                            ; copy no bytes
-         ADDQ.L      #8, A0
-         ADDQ.L      #8, A1
-         ADDQ.L      #1, A2
-         JMP         @endloop
-
-     @hardway:                           ; mixed byte: handle it as two 4-pixel nibbles
-         ANDI.L      #0xF0, D2           ; mask off the low four bits
-         LSR.W       #4, D2              ; shift bits 4-7 into bits 0-3
-         ADD.W       D2, D2              ; double the index
-         ADD.W       @table(D2.W), D2    ; calculate the address
-         JSR         @table(D2.W)        ; plot four pixels
-
-         CLR.L       D2                  ; clear the mask register
-         MOVE.B      (A2)+, D2           ; copy the next mask byte
-         ANDI.B      #0xF, D2            ; mask off the high four bits
-         ADD.W       D2, D2              ; double the index
-         ADD.W       @table(D2.W), D2    ; calculate the address
-         JSR         @table(D2.W)        ; plot four pixels
-
-     @endloop:
-         DBF         D1, @2
-
-     @extra:
-         MOVE.W      e, D1               ; leftover pixel count - 1 (MOVE sets N for the test)
-         BLT         @4                  ; continue if e is less than 0
-
-         MOVEQ.L     #7, D3              ; bit counter starts at the MSB = leftmost pixel
-
-     @3:                                 ; copy the next byte
-
-         BTST        D3, (A2)            ; test the next bit in the mask
-         BEQ         @skip               ; if zero, continue
-         MOVE.B      (A0)+, (A1)+        ; else copy the pixel
-         JMP         @incb
-     @skip:
-         ADDQ.L      #1, A0
-         ADDQ.L      #1, A1
-     @incb:
-         SUBQ.L      #1, D3              ; move to the next lower mask bit
-
-         DBF         D1, @3
-
-         ADDQ.L      #1, A2              ; step past the partly used mask byte
-
-     @4:
-         ADDA.L      srcNewline, A0      ; bring the src pointer to the start of the next row
-         ADDA.L      dstNewline, A1      ; bring the dst pointer to the start of the next row
-
-         DBF         D0, @1
-
-         JMP         @end                ; skip to the end
-
-         ; Dispatch table indexed by a 4-bit mask nibble.  Every routine
-         ; below must advance A0 and A1 by exactly four bytes.
-
-     @table:
-         DC.W        @sub0
-         DC.W        @sub1
-         DC.W        @sub2
-         DC.W        @sub3
-         DC.W        @sub4
-         DC.W        @sub5
-         DC.W        @sub6
-         DC.W        @sub7
-         DC.W        @sub8
-         DC.W        @sub9
-         DC.W        @sub10
-         DC.W        @sub11
-         DC.W        @sub12
-         DC.W        @sub13
-         DC.W        @sub14
-         DC.W        @sub15
-
-     @sub0:                              ; mask = 0000, draw nothing
-         ADDQ.L      #4, A0
-         ADDQ.L      #4, A1
-         RTS
-
-     @sub1:                              ; mask = 0001
-         ADDQ.L      #3, A0
-         ADDQ.L      #3, A1
-         MOVE.B      (A0)+, (A1)+
-         RTS
-
-     @sub2:                              ; mask = 0010
-         ADDQ.L      #2, A0
-         ADDQ.L      #2, A1
-         MOVE.B      (A0)+, (A1)+
-         ADDQ.L      #1, A0
-         ADDQ.L      #1, A1
-         RTS
-
-     @sub3:                              ; mask = 0011
-         ADDQ.L      #2, A0
-         ADDQ.L      #2, A1
-         MOVE.W      (A0)+, (A1)+
-         RTS
-
-     @sub4:                              ; mask = 0100
-         ADDQ.L      #1, A0
-         ADDQ.L      #1, A1
-         MOVE.B      (A0)+, (A1)+
-         ADDQ.L      #2, A0
-         ADDQ.L      #2, A1
-         RTS
-
-     @sub5:                              ; mask = 0101
-         ADDQ.L      #1, A0
-         ADDQ.L      #1, A1
-         MOVE.B      (A0)+, (A1)+
-         ADDQ.L      #1, A0
-         ADDQ.L      #1, A1
-         MOVE.B      (A0)+, (A1)+
-         RTS
-
-     @sub6:                              ; mask = 0110
-         ADDQ.L      #1, A0
-         ADDQ.L      #1, A1
-         MOVE.B      (A0)+, (A1)+        ; both middle pixels are copied
-         MOVE.B      (A0)+, (A1)+
-         ADDQ.L      #1, A0
-         ADDQ.L      #1, A1
-         RTS
-
-     @sub7:                              ; mask = 0111
-         ADDQ.L      #1, A0
-         ADDQ.L      #1, A1
-         MOVE.B      (A0)+, (A1)+
-         MOVE.W      (A0)+, (A1)+
-         RTS
-
-     @sub8:                              ; mask = 1000
-         MOVE.B      (A0)+, (A1)+
-         ADDQ.L      #3, A0
-         ADDQ.L      #3, A1
-         RTS
-
-     @sub9:                              ; mask = 1001
-         MOVE.B      (A0)+, (A1)+
-         ADDQ.L      #2, A0
-         ADDQ.L      #2, A1
-         MOVE.B      (A0)+, (A1)+
-         RTS
-
-     @sub10:                             ; mask = 1010
-         MOVE.B      (A0)+, (A1)+
-         ADDQ.L      #1, A0
-         ADDQ.L      #1, A1
-         MOVE.B      (A0)+, (A1)+
-         ADDQ.L      #1, A0
-         ADDQ.L      #1, A1
-         RTS
-
-     @sub11:                             ; mask = 1011
-         MOVE.B      (A0)+, (A1)+
-         ADDQ.L      #1, A0
-         ADDQ.L      #1, A1
-         MOVE.W      (A0)+, (A1)+
-         RTS
-
-     @sub12:                             ; mask = 1100
-         MOVE.W      (A0)+, (A1)+
-         ADDQ.L      #2, A0
-         ADDQ.L      #2, A1
-         RTS
-
-     @sub13:                             ; mask = 1101
-         MOVE.W      (A0)+, (A1)+
-         ADDQ.L      #1, A0
-         ADDQ.L      #1, A1
-         MOVE.B      (A0)+, (A1)+
-         RTS
-
-     @sub14:                             ; mask = 1110
-         MOVE.W      (A0)+, (A1)+
-         MOVE.B      (A0)+, (A1)+
-         ADDQ.L      #1, A0
-         ADDQ.L      #1, A1
-         RTS
-
-     @sub15:                             ; mask = 1111
-         MOVE.L      (A0)+, (A1)+
-         RTS
-
-     @end:
-
-     }
-
-     // Switch back to the previous addressing mode
-
-     SwapMMUMode( &mode32 );
-
- }
-
-
-
- And the wide mask approach:
-
-
- /*
-  * Quick8CopyMask  (wide-mask version)
-  *
-  * Copies a width x height rectangle of one-byte pixels from srcMap to
-  * dstMap through a mask that has the same depth as the pixmaps.  No
-  * clipping, dithering, etc. is done.
-  *
-  * srcMap, dstMap   pixmaps to copy from / to
-  * mask             one byte per pixel, rows stored back to back with
-  *                  exactly `width` bytes each.  The result is computed
-  *                  bitwise as (src AND mask) OR (dst AND NOT mask), so a
-  *                  0xFF byte copies the pixel and a 0x00 byte keeps the
-  *                  destination pixel.
-  * srcPt, dstPt     top-left corner of the rectangle in each pixmap, used
-  *                  directly as byte/row offsets from the base address
-  *                  (the pixmap bounds origin is not taken into account)
-  * width, height    size of the rectangle in pixels; nothing is copied
-  *                  when either is zero or negative
-  *
-  * NOTE(review): the long-word accesses can land on odd addresses when
-  * srcPt.h / dstPt.h are odd, which a plain 68000 does not allow -- confirm
-  * the target CPUs are 68020 or later.
-  * NOTE(review): the asm block uses D0-D2, D4 and A0-A2 without saving
-  * them explicitly -- confirm the compiler preserves the ones it needs.
-  */
-
- void Quick8CopyMask(
-     PixMapHandle srcMap,
-     PixMapHandle dstMap,
-     Ptr mask,
-     Point srcPt,
-     Point dstPt,
-     short width,
-     short height )
- {
-
-     register char *src;
-     register char *dst;
-     register long srcNewline;
-     register long dstNewline;
-     char mode32 = QD32COMPATIBLE;
-     short w = (width >> 2) - 1;     // long words per row, minus 1 for DBF (-1 = none)
-     short e = (width & 0x3) - 1;    // leftover bytes per row, minus 1 for DBF (-1 = none)
-     short h = height - 1;           // rows, minus 1 for DBF
-
-     // DBF treats a counter of -1 as 65535 more passes, so an empty
-     // rectangle must never reach the loops below
-
-     if ( width <= 0 || height <= 0 ) {
-         return;
-     }
-
-     // Set up pointers to the beginning of the memory to copy
-     // and calculate the newline value for the source and destination
-
-     src = GetPixBaseAddr( srcMap ) + (long) ((*srcMap)->rowBytes & 0x3fff) * srcPt.v + srcPt.h;
-     srcNewline = ((*srcMap)->rowBytes & 0x3fff) - width;
-
-     dst = GetPixBaseAddr( dstMap ) + (long) ((*dstMap)->rowBytes & 0x3fff) * dstPt.v + dstPt.h;
-     dstNewline = ((*dstMap)->rowBytes & 0x3fff) - width;
-
-     // Switch into 32 bit addressing mode
-
-     SwapMMUMode( &mode32 );
-
-     // Copy the rect from the source to the destination
-
-     asm {
-
-         MOVE.W      h, D0               ; put height loop variable in D0
-         MOVEA.L     src, A0             ; put the source pixmap address in A0
-         MOVEA.L     dst, A1             ; put the destination address in A1
-         MOVEA.L     mask, A2            ; put the mask address in A2
-
-     @1:                                 ; copy the next row
-         MOVE.W      w, D1
-         BLT         @tail               ; row is narrower than 4 pixels: no full long word
-
-     @2:                                 ; copy the next four bytes in the row
-
-         MOVE.L      (A2)+, D2           ; copy the mask to D2
-         MOVE.L      D2, D4              ; save the mask
-         NOT.L       D4                  ; invert the mask
-         AND.L       (A0)+, D2           ; compute the pixels to be copied
-         AND.L       (A1), D4            ; compute the pixels to be saved
-         OR.L        D2, D4              ; combine the copied and saved pixels
-         MOVE.L      D4, (A1)+           ; copy the pixels
-
-         DBF         D1, @2
-
-     @tail:
-         MOVE.W      e, D1               ; leftover byte count - 1 (MOVE sets N for the test)
-         BLT         @4                  ; continue if e is less than 0
-
-     @3:                                 ; copy the next byte
-
-         MOVE.B      (A2)+, D2           ; copy the mask to D2
-         MOVE.B      D2, D4              ; save the mask
-         NOT.B       D4                  ; invert the mask
-         AND.B       (A0)+, D2           ; compute the pixels to be copied
-         AND.B       (A1), D4            ; compute the pixels to be saved
-         OR.B        D2, D4              ; combine the copied and saved pixels
-         MOVE.B      D4, (A1)+           ; copy the pixels
-
-         DBF         D1, @3
-
-     @4:
-         ADDA.L      srcNewline, A0      ; bring the src pointer to the start of the next row
-         ADDA.L      dstNewline, A1      ; bring the dst pointer to the start of the next row
-
-         DBF         D0, @1
-
-     }
-
-     // Switch back to the previous addressing mode
-
-     SwapMMUMode( &mode32 );
-
- }
-
-
-
- --
- _____________________________________________________________________________
- Michael A. Kelly Senior Partner
- mkelly@cs.uoregon.edu High Risk Ventures
- _____________________________________________________________________________
-