home *** CD-ROM | disk | FTP | other *** search
- ;
- ; This code implements the basic IDCT on an 8x8 pixel block.
- ; Basically, it's the same as in the JPEG engine, with the sole difference
- ; that it's inlined and register-wise a little bit more optimized there.
- ;
- ; This is a complete rewrite in assembler. Heavy stuff. Lotsa work.
- ;
- ; Michael Rausch 14-4-94 1:14:00
- ;
-
- ;
- ; The code as a whole does not handle D-frames very well, but I'll fix that one day.
- ;
-
-
- DCTSIZE EQU 8 ; a block is DCTSIZE x DCTSIZE 16-bit coefficients
- ; Fixed-point scaling used by the two passes:
- PASS1_BITS EQU 2 ; row results are stored scaled up by 2^PASS1_BITS
- CONST_BITS EQU 13 ; fraction bits of the FIX_* multipliers below
- ; FIX_x_y = round(x.y * 2^CONST_BITS). A leading underscore marks a constant
- ; that is stored already negated. Trailing notes: running number, sign, hex magnitude.
- FIX_0_298631336 EQU 2446 ;1 + $98e 100110001110
- _FIX_0_390180644 EQU -3196 ;2 - $c7c
- FIX_0_541196100 EQU 4433 ;3 + $1151 u
- FIX_0_765366865 EQU 6270 ;4 + $187e u
- _FIX_0_899976223 EQU -7373 ;5 - $1ccd
- FIX_1_175875602 EQU 9633 ;6 + $25a1
- FIX_1_501321110 EQU 12299 ;7 + $300b
- _FIX_1_847759065 EQU -15137 ;8 - $3b21 u
- _FIX_1_961570560 EQU -16069 ;9 - $3ec5
- FIX_2_053119869 EQU 16819 ;10 + $41b3
- _FIX_2_562915447 EQU -20995 ;11 - $5203
- FIX_3_072711026 EQU 25172 ;12 + $6254
-
- ; FIX_1_847759065-FIX_0_765366865 = 2* FIX_0_541196100
-
-
- ; **************************************************************************
-
- ; jrevdct: 8x8 inverse DCT, done in place. This part holds pass 1 (the
- ; row loop) and the odd-part kernels, which pass 2 re-uses.
- ;
- ; In: a0 = block of DCTSIZE*DCTSIZE words, stored row by row.
- ; Entered with bra from @j_rev_dct, so the top of the stack holds
- ; the registers pushed there, not a return address.
- ; Changes d0-d7, a0, a1, a5 in this part.
- ;
- ; Stack inside the row loop:
- ; 0(sp) row counter (d7)
- ; 4(sp) block pointer, kept for pass 2
- ; 8(sp) tmp10, tmp11, tmp12, tmp13 = the 16 bytes reserved on entry
- ; The scratch longs have to start at 8(sp). Starting them at 12(sp) puts
- ; tmp13 at 24(sp), which is past the reserved bytes and on top of the
- ; data the caller pushed last.
- ;
- ; Every odd-part kernel leaves its four results in d1, d2, d5, d6 and
- ; exits with jmp (a5): to compose1 while rows are processed, to compose2
- ; while columns are processed. Kernel names read odd1_<7><5><3><1>, where
- ; a 1 means that odd coefficient is non-zero. Labels starting with o2 are
- ; the pass-2 dispatch stubs; they load column entries (DCTSIZE*2 apart).
- jrevdct:
-
- sub.w #16,sp ; room for tmp10..tmp13 of the current row
-
- move.l a0,-(sp) ; block start, popped again before pass 2
- lea compose1(pc),a5 ; kernels finish rows through compose1
- moveq #DCTSIZE-1,d7 ; row counter for dbra
- idct1: move.l d7,-(sp) ; d7 is scratch inside the loop body
-
- lea 2(a0),a1
- move.l (a1)+,d2 ; d2 = coeff 1 (high word) : coeff 2 (low word)
- move.l d2,d0
- move.l (a1)+,d4 ; d4 = coeff 3 : coeff 4
- move.l (a1)+,d3 ; d3 = coeff 5 : coeff 6
- or.l d4,d0
- or.w (a1)+,d0 ; coeff 7
- or.l d3,d0 ; zero only if all seven AC terms are zero
- bne.s idct1_no_ac0
- ; DC-only row: every output is the DC term scaled by 2^PASS1_BITS
- move.w (a0),d0
- lsl.w #PASS1_BITS,d0
- move.w d0,d1
- swap d0
- move.w d1,d0 ; same word in both halves
- REPT 4
- move.l d0,(a0)+ ; 4 longs = the 8 words of the row
- ENDR
- bra idct1_next
- idct1_no_ac0:
-
- ; even part, coefficients 2 and 6 (muls only reads the low words)
- move.w d2,d1 ; 2
- add.w d3,d1 ; 6
- muls #FIX_0_541196100,d1 ; d1 = (c2 + c6) * 0.541
- muls #_FIX_1_847759065,d3
- add.l d1,d3 ; d3 = c6 * -1.848 + d1
- muls #FIX_0_765366865,d2
- add.l d1,d2 ; d2 = c2 * 0.765 + d1
-
-
- ; even part, coefficients 0 and 4
- move.w (a0),d0
- ext.l d0 ; 0
- ext.l d4 ; 4
- move.l d0,d5
- sub.l d4,d5 ; d5 = c0 - c4
- add.l d0,d4 ; d4 = c0 + c4
-
- ; scale by 2^CONST_BITS (5 + 8 bit shift). The 4 added in between
- ; ends up as 1 << (CONST_BITS-PASS1_BITS-1), the rounding term for
- ; the later shift right by CONST_BITS-PASS1_BITS.
- lsl.l #5,d4
- lsl.l #5,d5
- addq.l #1<<2,d4
- addq.l #1<<2,d5
- lsl.l #8,d4
- lsl.l #8,d5
-
- lea 8(sp),a1 ; scratch area: skip the two pushed longs (d7, a0)
- move.l d4,d0
- add.l d2,d4
- move.l d4,(a1)+ ; tmp10
- sub.l d2,d0
- move.l d5,d1
- add.l d3,d5
- move.l d5,(a1)+ ; tmp11
- sub.l d3,d1
- move.l d1,(a1)+ ; tmp12
- move.l d0,(a1)+ ; tmp13
-
- ; odd part: pick the kernel by which of c7, c5, c3, c1 are non-zero
- odd_part1:
- move.w 7*2(a0),d1 ;7
- beq o0xxx
- o1xxx: move.w 5*2(a0),d2 ;5
- beq o10xx
- o11xx: move.w 3*2(a0),d3 ;3
- beq o110x
- o111x: move.w 1*2(a0),d4 ;1
- bne.s odd1_1111
-
-
- ; 7531
- odd1_1110: ; c1 = 0: set up d6/d0/d4 as odd1_1111 would with c1 = 0
- move.w d2,d6
- move.w d1,d0
- moveq #0,d4
- bra.s abk_2
-
- ; 7531
- odd1_1111: ; general case, all four odd coefficients present
- move.w d2,d6
- add.w d4,d6 ; c5 + c1
- move.w d1,d0
- add.w d4,d0 ; c7 + c1
- muls #FIX_1_501321110,d4
- abk_2: move.w d1,d5
- add.w d3,d5 ; c7 + c3
- move.w d5,d7
- add.w d6,d7 ; sum of all four terms
- muls #FIX_1_175875602,d7
- muls #_FIX_1_961570560,d5
- muls #_FIX_0_390180644,d6
- add.l d7,d5
- add.l d7,d6
- move.w d2,d7
- add.w d3,d7 ; c5 + c3
- muls #FIX_0_298631336,d1
- muls #FIX_2_053119869,d2
- muls #FIX_3_072711026,d3
- muls #_FIX_0_899976223,d0
- muls #_FIX_2_562915447,d7
- add.l d0,d1
- add.l d7,d2
- add.l d5,d1
- add.l d6,d2
- add.l d3,d5
- add.l d4,d6
- add.l d7,d5
- add.l d0,d6
- jmp (a5)
-
- ; pass-2 stub: c7, c5 present, c3 = 0; test c1 of the column
- o2110x: move.w 1*DCTSIZE*2(a0),d4 ;1
- bne.s odd1_1101
-
- ; 7531
- odd1_1100: ; only c7 and c5
- move.w d2,d6
- move.w d1,d3
- moveq #0,d4
- bra.s abk_3
-
- o110x: move.w 1*2(a0),d4 ;1
- beq.s odd1_1100
-
- ; 7531
- odd1_1101: ; c7, c5, c1 (c3 = 0); d3 is re-used for c7 + c1
- move.w d2,d6
- move.w d1,d3
- add.w d4,d6
- add.w d4,d3
- muls #FIX_1_501321110,d4
- abk_3:
- move.w d1,d5
- move.w d5,d7
- add.w d6,d7
- muls #FIX_1_175875602,d7
- muls #_FIX_1_961570560,d5
- muls #_FIX_0_390180644,d6
- add.l d7,d5
- add.l d7,d6
- move.w d2,d0
- muls #FIX_0_298631336,d1
- muls #FIX_2_053119869,d2
- muls #_FIX_0_899976223,d3
- muls #_FIX_2_562915447,d0
- add.l d3,d1
- add.l d0,d2
- add.l d5,d1
- add.l d6,d2
- add.l d4,d6
- add.l d0,d5
- add.l d3,d6
- jmp (a5)
-
- o10xx: move.w 3*2(a0),d3 ;3
- beq o100x
- o101x: move.w 1*2(a0),d4 ;1
- beq.s odd1_1010
-
- ; 7531
- odd1_1011: ; c7, c3, c1 (c5 = 0)
- move.w d1,d5
- add.w d3,d5
- move.w d1,d0
- move.w d4,d6
- add.w d4,d0
- muls #FIX_1_501321110,d4
- move.w d5,d7
- add.w d6,d7
- muls #_FIX_0_390180644,d6
- abk_4: muls #FIX_1_175875602,d7
- muls #_FIX_1_961570560,d5
- add.l d7,d6
- add.l d7,d5
- move.w d3,d7
- muls #FIX_0_298631336,d1
- muls #FIX_3_072711026,d3
- muls #_FIX_0_899976223,d0
- muls #_FIX_2_562915447,d7
- add.l d0,d1
- move.l d6,d2
- add.l d5,d1
- add.l d7,d2
- add.l d3,d5
- add.l d4,d6
- add.l d7,d5
- add.l d0,d6
- jmp (a5)
-
- ; pass-2 stub: c7 present, c5 = 0; test c3 and c1 of the column
- o210xx: move.w 3*DCTSIZE*2(a0),d3 ;3
- beq o2100x
- o2101x: move.w 1*DCTSIZE*2(a0),d4 ;1
- bne.s odd1_1011
-
- ; 7531
- odd1_1010: ; only c7 and c3
- move.w d1,d5
- add.w d3,d5
- move.w d1,d0
- moveq #0,d4
- move.w d5,d7
- moveq #0,d6
- bra.s abk_4
-
- o100x: move.w 1*2(a0),d4 ;1
- beq.s odd1_1000
-
- ; 7531
- odd1_1001: ; only c7 and c1
- move.w d1,d0
- add.w d4,d0
- move.w d1,d5
- move.w d4,d6
- move.w d0,d7
- muls #FIX_1_175875602,d7
- muls #_FIX_1_961570560,d5
- muls #_FIX_0_390180644,d6
- add.l d7,d5
- add.l d7,d6
- muls #FIX_0_298631336,d1
- muls #FIX_1_501321110,d4
- muls #_FIX_0_899976223,d0
- add.l d0,d1
- add.l d5,d1
- move.l d6,d2
- add.l d4,d6
- add.l d0,d6
- jmp (a5)
-
- ; pass-2 stub: c7 present, c5 = c3 = 0; test c1 of the column
- o2100x: move.w 1*DCTSIZE*2(a0),d4 ;1
- bne.s odd1_1001
-
- ; 7531
- odd1_1000: ; only c7: each result is c7 times a constant folded at assembly time
- move.w d1,d2
- move.w d1,d5
- move.w d1,d6
- muls #FIX_1_175875602,d2
- muls #FIX_1_175875602+_FIX_0_899976223,d6
- muls #FIX_1_175875602+_FIX_1_961570560,d5
- muls #FIX_1_175875602+_FIX_0_899976223+_FIX_1_961570560+FIX_0_298631336,d1
- jmp (a5)
-
-
- o0xxx: move.w 5*2(a0),d2 ;5
- beq o00xx
- o01xx: move.w 3*2(a0),d3 ;3
- beq o010x
- o011x: move.w 1*2(a0),d4 ;1
- beq.s odd1_0110
-
- ; 7531
- odd1_0111: ; opt8
- move.w d2,d6
- add.w d4,d6
- move.w d4,d1
- muls #FIX_1_501321110,d4
- muls #_FIX_0_899976223,d1
- abk_1: move.w d2,d0
- add.w d3,d0
- move.w d3,d5
-
- move.w d5,d7
- add.w d6,d7
- muls #FIX_1_175875602,d7
- muls #_FIX_1_961570560,d5
- muls #_FIX_0_390180644,d6 ; ???? 2
- add.l d7,d5
- add.l d7,d6
-
- muls #FIX_2_053119869,d2
- muls #FIX_3_072711026,d3
- muls #_FIX_2_562915447,d0
- add.l d0,d2
- add.l d6,d2
- add.l d4,d6
- add.l d1,d6
- add.l d5,d1
- add.l d3,d5
- add.l d0,d5
- jmp (a5)
-
- ; pass-2 stub: c7 = 0; test c5, c3 and c1 of the column
- o20xxx: move.w 5*DCTSIZE*2(a0),d2 ;5
- beq o200xx
- o201xx: move.w 3*DCTSIZE*2(a0),d3 ;3
- beq.s o2010x
- o2011x: move.w 1*DCTSIZE*2(a0),d4 ;1
- bne.s odd1_0111
-
- ; 7531
- odd1_0110: ; only c5 and c3
- move.w d2,d6
- moveq.l #0,d1
- moveq.l #0,d4
- bra.s abk_1
-
- o010x: move.w 1*2(a0),d4 ;1
- beq.s odd1_0100
-
- ; 7531
- odd1_0101: ; only c5 and c1
- move.w d2,d6
- move.w d2,d7
- add.w d4,d6
- move.w d4,d1
- move.w d6,d5
- muls #FIX_1_175875602,d5
- muls #_FIX_0_390180644+FIX_1_175875602,d6
- muls #FIX_2_053119869+_FIX_2_562915447,d2
- muls #FIX_1_501321110,d4
- muls #_FIX_0_899976223,d1
- muls #_FIX_2_562915447,d7
- add.l d6,d2
- add.l d1,d6
- add.l d5,d1
- add.l d7,d5
- add.l d4,d6
- jmp (a5)
-
- ; pass-2 stub: c5 present, c7 = c3 = 0; test c1 of the column
- o2010x: move.w 1*DCTSIZE*2(a0),d4 ;1
- bne.s odd1_0101
-
- ; 7531
- odd1_0100: ; only c5: constants folded at assembly time
- move.w d2,d6
- move.w d2,d1
- move.w d2,d5
- muls #FIX_1_175875602,d1
- muls #FIX_1_175875602+_FIX_2_562915447,d5
- muls #FIX_1_175875602+_FIX_0_390180644,d6
- muls #FIX_1_175875602+_FIX_2_562915447+_FIX_0_390180644+FIX_2_053119869,d2
- jmp (a5)
-
- o00xx: move.w 3*2(a0),d5 ;3 (loaded into d5 here, not d3)
- beq.s o000x
- o001x: move.w 1*2(a0),d4 ;1
- beq.s odd1_0010
-
- ; 7531
- odd1_0011: ; opt12
- move.w d5,d2
- move.w d5,d3
- move.w d4,d1
- move.w d4,d6
- move.w d3,d7
- add.w d4,d7
- muls #FIX_1_175875602,d7
- muls #_FIX_1_961570560,d5
- muls #_FIX_0_390180644,d6
- add.l d7,d5
- add.l d7,d6
- muls #_FIX_2_562915447+FIX_3_072711026,d3
- muls #_FIX_0_899976223+FIX_1_501321110,d4
- muls #_FIX_0_899976223,d1
- muls #_FIX_2_562915447,d2
- add.l d5,d1
- add.l d6,d2
- add.l d3,d5
- add.l d4,d6
- jmp (a5)
-
- ; pass-2 stub: c7 = c5 = 0; c3 goes to d5 as in o00xx
- o200xx: move.w 3*DCTSIZE*2(a0),d5 ;3
- beq o2000x
- o2001x: move.w 1*DCTSIZE*2(a0),d4 ;1
- bne.s odd1_0011
-
- ; 7531
- odd1_0010: ; only c3 (in d5): constants folded at assembly time
- move.w d5,d6
- move.w d5,d2
- move.w d5,d1
- muls #FIX_1_175875602,d6
- muls #FIX_1_175875602+_FIX_2_562915447,d2
- muls #FIX_1_175875602+_FIX_1_961570560,d1
- muls #FIX_1_175875602+_FIX_2_562915447+_FIX_1_961570560+FIX_3_072711026,d5
- jmp (a5)
-
- o000x: move.w 1*2(a0),d6 ;1 (loaded into d6 here, not d4)
- beq.s odd1_0000
-
- ; 7531
- odd1_0001: ; opt 14
- move.w d6,d5
- move.w d6,d1
- move.w d6,d2
- muls #FIX_1_175875602,d5
- muls #FIX_1_175875602+_FIX_0_899976223,d1
- muls #FIX_1_175875602+_FIX_0_390180644,d2
- muls #FIX_1_175875602+_FIX_0_899976223+_FIX_0_390180644+FIX_1_501321110,d6
- jmp (a5)
-
- ; priority: 14 12 8 0
-
- ; 7531
- ; Row without odd coefficients (pass 1 only): the outputs are just
- ; tmp10..tmp13 mirrored, so compose1 is skipped. Each long packs two
- ; results: the value shifted left lands in the high word, the value
- ; shifted right by d7 supplies the low word.
- odd1_0000:
- moveq #CONST_BITS-PASS1_BITS,d7 ; optimized compose !
-
- lea 8(sp),a1 ; scratch area written by the even part
- move.l (a1)+,d0 ; tmp10
- lsl.l #16-(CONST_BITS-PASS1_BITS),d0
- move.l (a1)+,d1 ; tmp11
- lsr.l d7,d1
- move.w d1,d0 ; d0 = tmp10 : tmp11, both descaled
-
- move.l (a1)+,d2 ; tmp12
- lsl.l #16-(CONST_BITS-PASS1_BITS),d2
- move.l (a1)+,d3 ; tmp13
- lsr.l d7,d3
- move.w d3,d2 ; d2 = tmp12 : tmp13, both descaled
-
- move.l d0,(a0)+ ; outputs 0, 1
- swap d0
- move.l d2,(a0)+ ; outputs 2, 3
- swap d2
- move.l d2,(a0)+ ; outputs 4, 5 = tmp13, tmp12
- move.l d0,(a0)+ ; outputs 6, 7 = tmp11, tmp10
-
- move.l (sp)+,d7
- dbra d7,idct1
- bra.s idct1_ready
-
- ; keep 1 2 5 6
-
- ; Pass-1 finish: combine tmp10..tmp13 with the kernel results.
- ; tmp10 +/- d6 -> outputs 0 / 7 tmp11 +/- d5 -> outputs 1 / 6
- ; tmp12 +/- d2 -> outputs 2 / 5 tmp13 +/- d1 -> outputs 3 / 4
- ; The sum is formed as (tmp - x) + 2*x so the difference stays available.
- compose1: moveq #CONST_BITS-PASS1_BITS,d7
-
- lea 8(sp),a1 ; scratch area written by the even part
- move.l (a1)+,d4 ; tmp10
- sub.l d6,d4 ; d4 = output 7 (not yet descaled)
- add.l d6,d6
- add.l d4,d6 ; d6 = output 0
- lsl.l #16-(CONST_BITS-PASS1_BITS),d6
- move.l (a1)+,d3 ; tmp11
- sub.l d5,d3 ; d3 = output 6
- add.l d5,d5
- add.l d3,d5 ; d5 = output 1
- lsr.l d7,d5
- move.w d5,d6
- move.l d6,(a0)+ ; outputs 0, 1
- move.l (a1)+,d6 ; tmp12
- sub.l d2,d6 ; d6 = output 5
- add.l d2,d2
- add.l d6,d2 ; d2 = output 2
- lsl.l #16-(CONST_BITS-PASS1_BITS),d2
- move.l (a1)+,d5 ; tmp13
- sub.l d1,d5 ; d5 = output 4
- add.l d1,d1
- add.l d5,d1 ; d1 = output 3
- lsr.l d7,d1
- move.w d1,d2
- move.l d2,(a0)+ ; outputs 2, 3
- lsl.l #16-(CONST_BITS-PASS1_BITS),d5
- lsr.l d7,d6
- move.w d6,d5
- move.l d5,(a0)+ ; outputs 4, 5
- lsl.l #16-(CONST_BITS-PASS1_BITS),d3
- lsr.l d7,d4
- move.w d4,d3
- move.l d3,(a0)+ ; outputs 6, 7; a0 now points at the next row
-
- idct1_next:
- move.l (sp)+,d7 ; row counter back
- dbra d7,idct1
-
- idct1_ready: ; pass 1 is finished, every row holds its 1-D result
- ; ******** pass 2: the same transform down each of the 8 columns ********
- ; a0 advances one word (one column) per iteration; rows are DCTSIZE*2 bytes apart
- move.l (sp)+,a0 ; block start that was pushed before pass 1
- lea compose2(pc),a5 ; the shared odd-part kernels now exit to compose2
- moveq #DCTSIZE-1,d7 ; column counter for dbra
- idct2: move.l d7,-(sp) ; d7 is scratch inside the loop body
- ; Pick the kernel by which of the odd rows 7/5/3/1 of this column are non-zero.
- ; The o2... stubs referenced below sit next to the pass-1 kernels.
- odd_part2:
- move.w 7*DCTSIZE*2(a0),d1 ;7
- beq o20xxx
- o21xxx: move.w 5*DCTSIZE*2(a0),d2 ;5
- beq o210xx
- o211xx: move.w 3*DCTSIZE*2(a0),d3 ;3
- beq o2110x
- o2111x: move.w 1*DCTSIZE*2(a0),d4 ;1
- beq odd1_1110
- bra odd1_1111
- ; reached from o200xx when rows 7, 5 and 3 are zero; row 1 is loaded into d6
- o2000x: move.w 1*DCTSIZE*2(a0),d6 ;1
- bne odd1_0001 ; falls through to odd0_0000 when row 1 is zero as well
-
- odd0_0000: ; pass-2 column with no odd terms: only the even part, each result stored to two mirrored rows
- move.w 2*DCTSIZE*2(a0),d2 ; row 2
- move.w 4*DCTSIZE*2(a0),d4 ; row 4
- move.w 6*DCTSIZE*2(a0),d3 ; row 6
- move.w d2,d0
- add.w d3,d0
- muls #FIX_0_541196100/4,d0 ; constants are divided by 4 at assembly time here,
- muls #_FIX_1_847759065/4,d3 ; where compose2 shifts the products right by 2 instead
- add.l d0,d3
- muls #FIX_0_765366865/4,d2
- add.l d0,d2
- ; rows 0 and 4; the rounding term for the final >>16 is added to row 0 first
- move.w (a0),d0
- add.w #1<<(PASS1_BITS+3-1),d0 ; precalc from the descaling part below
- ext.l d4
- ext.l d0
- move.l d0,d5
- sub.l d4,d5 ; d5 = c0 - c4
- add.l d0,d4 ; d4 = c0 + c4
- ; scale by 2^(CONST_BITS-2) so that taking the high word (swap) finishes the descale
- moveq #CONST_BITS-2,d0
- lsl.l d0,d4
- lsl.l d0,d5
- ; a0 advances by one word on the first store, hence the -2 in the other offsets
- move.l d4,d0
- add.l d2,d4
- swap d4
- move.w d4,(a0)+ ; row 0
- sub.l d2,d0
- move.w d4,7*DCTSIZE*2-2(a0) ; row 7 = row 0
- swap d0
- move.w d0,3*DCTSIZE*2-2(a0) ; row 3
- move.l d5,d4
- move.w d0,4*DCTSIZE*2-2(a0) ; row 4 = row 3
- add.l d3,d5
- swap d5
- sub.l d3,d4
- move.w d5,1*DCTSIZE*2-2(a0) ; row 1
- swap d4
- move.w d5,6*DCTSIZE*2-2(a0) ; row 6 = row 1
- move.w d4,2*DCTSIZE*2-2(a0) ; row 2
- move.w d4,5*DCTSIZE*2-2(a0) ; row 5 = row 2
- ; next column
- move.l (sp)+,d7
- dbra d7,idct2
- bra idct2_ready
-
-
- compose2: ; pass-2 finish: even part of the column, combined with the kernel results in d1/d2/d5/d6
- move.w 2*DCTSIZE*2(a0),d3 ; row 2
- move.w 4*DCTSIZE*2(a0),d4 ; row 4
- move.w 6*DCTSIZE*2(a0),d7 ; row 6
- ; d0 = (c2 + c6) * 0.541, shared by the two products that follow
- move.w d3,d0
- add.w d7,d0
- muls #FIX_0_541196100,d0
- muls #_FIX_1_847759065,d7
- add.l d0,d7
- muls #FIX_0_765366865,d3
- add.l d0,d3
- ; every term is taken down by 2 bits so that the final descale is a plain swap (>>16)
- asr.l #2,d7
- asr.l #2,d3
- move.l d7,a3 ; park the c6-based term in a3, d7 is needed again
- ; rows 0 and 4; the rounding term for the final >>16 is added to row 0 first
- move.w (a0),d0
- add.w #1<<(PASS1_BITS+3-1),d0 ; precalc from the descaling part below
- ext.l d4
- ext.l d0
- move.l d0,d7
- sub.l d4,d7 ; d7 = c0 - c4
- add.l d0,d4 ; d4 = c0 + c4
- ; scale by 2^(CONST_BITS-2), matching the asr #2 applied to the products
- moveq #CONST_BITS-2,d0
- lsl.l d0,d4
- lsl.l d0,d7
- ; bring the four odd-part results from the kernel to the same scale
-
- asr.l #2,d6
- asr.l #2,d5
- asr.l #2,d2
- asr.l #2,d1
- ; d4 = tmp10, d0 = tmp13, d7 = tmp11, d3 = tmp12 after the next six instructions
-
- move.l d4,d0
- add.l d3,d4
- sub.l d3,d0
- ; same for the c0 - c4 pair, using the term parked in a3
- move.l d7,d3
- add.l a3,d7
- sub.l a3,d3
- ; rows 0 and 7 from tmp10 and d6; the sum is built as (tmp - x) + 2*x
-
- sub.l d6,d4
- add.l d6,d6
- add.l d4,d6
- ; a0 advances by one word on the first store, hence the -2 in the other offsets
- swap d6 ; moveq #CONST_BITS+PASS1_BITS+3 -2 ,d6 ; asr.l d6,d3
- move.w d6,(a0)+ ; row 0
- swap d4
- move.w d4,7*DCTSIZE*2-2(a0) ; row 7
- ; rows 3 and 4 from tmp13 and d1
- sub.l d1,d0
- add.l d1,d1
- add.l d0,d1
- ; high word = value >> 16
- swap d1
- move.w d1,3*DCTSIZE*2-2(a0) ; row 3
- swap d0
- move.w d0,4*DCTSIZE*2-2(a0) ; row 4
- ; rows 1 and 6 from tmp11 and d5
- sub.l d5,d7
- add.l d5,d5
- add.l d7,d5
- ; high word = value >> 16
- swap d5
- move.w d5,1*DCTSIZE*2-2(a0) ; row 1
- swap d7
- move.w d7,6*DCTSIZE*2-2(a0) ; row 6
- ; rows 2 and 5 from tmp12 and d2
- sub.l d2,d3
- add.l d2,d2
- add.l d3,d2
- ; high word = value >> 16
- swap d2
- move.w d2,2*DCTSIZE*2-2(a0) ; row 2
- swap d3
- move.w d3,5*DCTSIZE*2-2(a0) ; row 5
- ; next column
- idct2_next:
- move.l (sp)+,d7 ; column counter back
- dbra d7,idct2
-
- ; Common exit of jrevdct: drop the 16 scratch bytes reserved on entry,
- ; restore the registers that @j_rev_dct pushed and return to its caller.
- ; The label now ends in a colon like every other label in this file; it
- ; used to end in ';', which only works where a colon-less label is accepted.
- idct2_ready:
- add.w #16,sp ; undo the sub.w #16,sp at jrevdct
- ; movem.l (sp)+,JREVDCTREGS
-
- movem.l (sp)+,ri_regs ; ri_regs is defined outside this file section
- rts
-
-
- XDEF @j_rev_dct ; exported entry point; a0 = block pointer (jrevdct reads the block through a0)
- @j_rev_dct:
- movem.l ri_regs,-(sp) ; restored by the movem at the end of jrevdct
- bra jrevdct ; branch, not bsr: the rts in jrevdct returns straight to our caller
-
-
- ifeq 1 ; NOTE(review): if IFEQ means "assemble when the value is 0", everything up to endc is disabled - confirm for the assembler in use
- ; **************************************************************************
- ; NOTE(review): the sizes in this routine do not agree with PreIDCT (ds.w 64*64), see the notes below
- ; Pre compute singleton coefficient IDCT values.
- ;
- ; void init_pre_idct(void)
- ; For each of 64 positions: store 2048 into the table entry, then run @j_rev_dct on that entry.
- ; XDEF @init_pre_idct
- @init_pre_idct:
- movem.l d2/a2,-(sp)
- ; clear the table
- lea PreIDCT,a2
- move.w #64*64/4/4-1,d2 ; 256 passes of 16 bytes = 4096 bytes. NOTE(review): the table is 64*64 words = 8192 bytes
- preidctclr:
- clr.l (a2)+
- clr.l (a2)+
- clr.l (a2)+
- clr.l (a2)+
- dbra d2,preidctclr
- ; walk the entries from the last one down to the first
- lea PreIDCT+63*64*2,a2 ; entry 63, with entries taken as 64 words each
- moveq #63,d2
- preidctloop:
- move.w #2048,(a2,d2.w) ; NOTE(review): d2 is used as a byte offset, so odd positions give an odd address - was d2*2 intended?
- move.l a2,a0
- bsr @j_rev_dct ; transform the entry in place
- sub.w #64,a2 ; NOTE(review): steps 64 bytes although the lea above assumes 128-byte entries
- dbra d2,preidctloop
- ; done
- movem.l (sp)+,d2/a2
- rts
-
- ; ************************************************************************************
-
- ; Perform the inverse DCT on one block of coefficients.
- ;
- ; void j_rev_dct_sparse (DCTBLOCK data, int pos)
-
- ; XDEF @j_rev_dct_sparse
- @j_rev_dct_sparse: ; a0 = block, d0 = pos (as used below); fills all 64 words of the block
- ; pos = 0 is handled here without the table
- tst.l d0
- bne itsnotthedc
-
- ; the single element to cope with is the dc coefficient
- ; result = (dc + 4) >> 3 for dc >= 0, (dc - 3) >> 3 for dc < 0
- move.w (a0),d1
- bpl.s scale_dc
- subq.w #3+4,d1 ; "implement" the rounding error
- scale_dc:addq.w #4,d1
- asr.w #3,d1
- ; same word in both halves of d0
- move.w d1,d0 ; extend to longword
- swap d0
- move.w d1,d0
- ; 8 passes of 4 longs = 64 words
- moveq #7,d1
- set_dc: move.l d0,(a0)+
- move.l d0,(a0)+
- move.l d0,(a0)+
- move.l d0,(a0)+
- dbra d1,set_dc
- ; early return; d2/d3 have not been pushed on this path
- rts ; not that pretty
- ; bra exit_jrds
- itsnotthedc:
- movem.l d2/d3,-(sp)
-
- ; Some other coefficient.
- ; NOTE(review): d0 is an unscaled byte offset in the load below but is scaled by 128 (64 words) for the table - confirm what pos means
- move.w (a0,d0.w),d1 ; get coeff
- ; a1 = PreIDCT + pos * 128
- lea PreIDCT,a1 ; get precalculated DCT
- lsl.l #7,d0
- add.l d0,a1
- ; shift count = CONST_BITS-PASS1_BITS-8 = 3
- moveq #CONST_BITS-PASS1_BITS-8,d3 ; scale down
- ; 32 passes of 2 words: block[i] = low word of ((coeff * table[i]) >> 3)
- moveq.l #31,d0
- set_ac: move.w d1,d2
- muls (a1)+,d2
- lsr.l d3,d2 ; logical shift; only the low word is stored, which bits 3..18 supply
- move.w d2,(a0)+
- move.w d1,d2
- muls (a1)+,d2
- lsr.l d3,d2
- move.w d2,(a0)+
- dbra d0,set_ac
- ; restore and return
- movem.l (sp)+,d2/d3
- exit_jrds: rts
-
- ; ************************************************************************************
-
- section bss,BSS
- ; 64*64 words = 8192 bytes of uninitialised storage
- ;
- ; Precomputed idct value arrays
- ;
- PreIDCT: ds.w 64*64 ; @j_rev_dct_sparse indexes this in steps of 128 bytes (lsl.l #7)
- ; closes the conditional block opened by ifeq 1
- endc
-
-
- ; END
-