git.sesse.net Git - x264/commitdiff
dequant_mmx made incorrect assumptions about extreme inputs. now uses 32bit in more...
author: Loren Merritt <pengvado@videolan.org>
Fri, 10 Feb 2006 20:52:48 +0000 (20:52 +0000)
committer: Loren Merritt <pengvado@videolan.org>
Fri, 10 Feb 2006 20:52:48 +0000 (20:52 +0000)
patch by Christian Heine.

git-svn-id: svn://svn.videolan.org/x264/trunk@428 df754926-b1dd-0310-bc7b-ec298dee348c

common/amd64/quant-a.asm
common/i386/quant-a.asm
common/quant.c
tools/checkasm.c

index 45d14946782f6544273a6999e7c89385a7257f3e..fcb5db1ac0ce6e7605a0a88ccefeebb246e36741 100644 (file)
@@ -36,7 +36,6 @@ BITS 64
 %include "amd64inc.asm"
 
 SECTION .rodata
-pw_1:  times 4 dw 1
 pd_1:  times 2 dd 1
 
 SECTION .text
@@ -398,22 +397,6 @@ x264_quant_8x8_core32_mmxext:
     movq     %1,  mm0
 %endmacro
 
-%macro DEQUANT16_R_1x4 3
-;;; %1      dct[y][x]
-;;; %2,%3   dequant_mf[i_mf][y][x]
-;;; mm5     -i_qbits
-;;; mm6     f as words
-
-    movq     mm1, %2
-    movq     mm2, %3
-    movq     mm0, %1
-    packssdw mm1, mm2
-    pmullw   mm0, mm1
-    paddw    mm0, mm6
-    psraw    mm0, mm5
-    movq     %1,  mm0
-%endmacro
-
 %macro DEQUANT32_R_1x4 3
 ;;; %1      dct[y][x]
 ;;; %2,%3   dequant_mf[i_mf][y][x]
@@ -464,10 +447,7 @@ ALIGN 16
     add  rsi, rdx   ; dequant_mf[i_mf]
 
     sub  eax, %3
-    cmp  eax, -2
-    jle  .rshift32  ; dct * dequant overflows 16bit
-    cmp  eax, -1
-    jle  .rshift16  ; negative qbits => rightshift
+    jl   .rshift32  ; negative qbits => rightshift
 
 .lshift:
     movd mm5, eax
@@ -480,22 +460,6 @@ ALIGN 16
 
     ret
 
-.rshift16:
-    neg   eax
-    movd  mm5, eax
-    movq  mm6, [pw_1 GLOBAL]
-    pxor  mm7, mm7
-    psllw mm6, mm5
-    psrlw mm6, 1
-
-%rep %2
-    DEQUANT16_R_1x4 [rdi], [rsi], [rsi+8]
-    add  rsi, byte 16
-    add  rdi, byte 8
-%endrep
-
-    ret
-
 .rshift32:
     neg   eax
     movd  mm5, eax
index d5634d82275d8ba80fdf0ea86a84cb0934019314..d92f53032c2d3f694a11303d114021741aa17374 100644 (file)
@@ -36,7 +36,6 @@ BITS 32
 %include "i386inc.asm"
 
 SECTION .rodata
-pw_1:  times 4 dw 1
 pd_1:  times 2 dd 1
 
 SECTION .text
@@ -461,10 +460,7 @@ ALIGN 16
     mov  ecx, [esp+4] ; dct
 
     sub  eax, %3
-    jge  .lshift
-    cmp  eax, byte -1
-    je   .rshift16    ; negative qbits => rightshift
-    jmp  .rshift32    ; dct * dequant overflows 16bit
+    jl   .rshift32    ; negative qbits => rightshift
 
 .lshift:
     movd mm5, eax
@@ -480,28 +476,6 @@ ALIGN 16
     nop
     ret
 
-.rshift16:
-    neg   eax
-    picpush ebx
-    picgetgot ebx
-    movq  mm6, [pw_1 GLOBAL]
-    picpop ebx
-    movd  mm5, eax
-    pxor  mm7, mm7
-    psllw mm6, mm5
-    psrlw mm6, 1
-
-    mov  eax, 8*(%2-1)
-.loopr16
-%rep 2
-    DEQUANT16_R_1x4 [ecx+eax], [edx+eax*2], [edx+eax*2+8]
-    sub  eax, byte 8
-%endrep
-    jge  .loopr16
-
-    nop
-    ret
-
 .rshift32:
     neg   eax
     picpush ebx
index 72c18cd9401685c9d458e5f0f990bb254765207e..e06c8a4af608e08c94b850cc2c24ad99c227081e 100644 (file)
@@ -262,7 +262,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->quant_2x2_dc_core = x264_quant_2x2_dc_core32_mmxext;
     }
 
-    if( cpu&X264_CPU_MMXEXT )
+    if( cpu&X264_CPU_MMX )
     {
         /* dequant is not subject to the above CQM-dependent overflow issues,
          * as long as the inputs are in the range generable by dct+quant.
index 5ac08704221619a26007b9fea838f0212422fbe5..0148c2fab3992c568b0d82bb029e57f086b792e7 100644 (file)
@@ -398,6 +398,30 @@ static int check_quant( int cpu_ref, int cpu_new )
         x264_quant_init( h, cpu_ref, &qf_ref );
         x264_quant_init( h, cpu_new, &qf_a );
 
+#define INIT_QUANT8() \
+        { \
+            static const int scale1d[8] = {32,31,24,31,32,31,24,31}; \
+            int x, y; \
+            for( y = 0; y < 8; y++ ) \
+                for( x = 0; x < 8; x++ ) \
+                { \
+                    unsigned int scale = (255*scale1d[y]*scale1d[x])/16; \
+                    dct1[y*8+x] = dct2[y*8+x] = (rand()%(2*scale+1))-scale; \
+                } \
+        }
+
+#define INIT_QUANT4() \
+        { \
+            static const int scale1d[4] = {4,6,4,6}; \
+            int x, y; \
+            for( y = 0; y < 4; y++ ) \
+                for( x = 0; x < 4; x++ ) \
+                { \
+                    unsigned int scale = 255*scale1d[y]*scale1d[x]; \
+                    dct1[y*4+x] = dct2[y*4+x] = (rand()%(2*scale+1))-scale; \
+                } \
+        }
+
 #define TEST_QUANT( name, cqm ) \
         if( qf_a.name != qf_ref.name ) \
         { \
@@ -413,37 +437,97 @@ static int check_quant( int cpu_ref, int cpu_new )
             } \
         }
 
-        TEST_QUANT( quant_8x8_core, *h->quant8_mf[CQM_8IY] );
-        TEST_QUANT( quant_8x8_core, *h->quant8_mf[CQM_8PY] );
-        TEST_QUANT( quant_4x4_core, *h->quant4_mf[CQM_4IY] );
-        TEST_QUANT( quant_4x4_core, *h->quant4_mf[CQM_4PY] );
+#define TEST_QUANT8( qname, cqm, shift, divider ) \
+        if( qf_a.qname != qf_ref.qname ) \
+        { \
+            int qp; \
+            used_asms[0] = 1; \
+            for( qp = 51; qp > 0; qp-- ) \
+            { \
+                INIT_QUANT8() \
+                qf_c.qname( (void*)dct1, cqm[qp%6], shift+qp/6, (1<<(shift+qp/6))/divider ); \
+                qf_a.qname( (void*)dct2, cqm[qp%6], shift+qp/6, (1<<(shift+qp/6))/divider ); \
+                if( memcmp( dct1, dct2, 64*2 ) ) \
+                { \
+                    oks[0] = 0; \
+                    fprintf( stderr, #qname "(qp=%d, cqm=%d, intra=%d): [FAILED]\n", qp, i_cqm, divider==3 ); \
+                    break; \
+                } \
+            } \
+        }
+
+#define TEST_QUANT4( qname, cqm, shift, divider ) \
+        if( qf_a.qname != qf_ref.qname ) \
+        { \
+            int qp; \
+            used_asms[0] = 1; \
+            for( qp = 51; qp > 0; qp-- ) \
+            { \
+                INIT_QUANT4() \
+                qf_c.qname( (void*)dct1, cqm[qp%6], shift+qp/6, (1<<(shift+qp/6))/divider ); \
+                qf_a.qname( (void*)dct2, cqm[qp%6], shift+qp/6, (1<<(shift+qp/6))/divider ); \
+                if( memcmp( dct1, dct2, 16*2 ) ) \
+                { \
+                    oks[0] = 0; \
+                    fprintf( stderr, #qname "(qp=%d, cqm=%d, intra=%d): [FAILED]\n", qp, i_cqm, divider==3 ); \
+                    break; \
+                } \
+            } \
+        }
+
+        TEST_QUANT8( quant_8x8_core, h->quant8_mf[CQM_8IY], 16, 3 );
+        TEST_QUANT8( quant_8x8_core, h->quant8_mf[CQM_8PY], 16, 6 );
+        TEST_QUANT4( quant_4x4_core, h->quant4_mf[CQM_4IY], 15, 3 );
+        TEST_QUANT4( quant_4x4_core, h->quant4_mf[CQM_4PY], 15, 6 );
         TEST_QUANT( quant_4x4_dc_core, ***h->quant4_mf[CQM_4IY] );
         TEST_QUANT( quant_2x2_dc_core, ***h->quant4_mf[CQM_4IC] );
 
-#define TEST_DEQUANT( name, quant, dqm, cqm, shift ) \
-        if( qf_a.name != qf_ref.name ) \
+#define TEST_DEQUANT8( qname, dqname, cqm, dqm, shift, divider ) \
+        if( qf_a.dqname != qf_ref.dqname ) \
         { \
             int qp; \
             used_asms[1] = 1; \
             for( qp = 51; qp > 0; qp-- ) \
             { \
-                for( i = 0; i < 64; i++ ) \
-                    dct1[i] = dct2[i] = (rand() & 0x1fff) - 0xfff; \
-                qf_c.quant( (void*)dct1, cqm[qp%6], shift+qp/6, 0 ); \
-                memcpy( dct2, dct1, sizeof(dct2) ); \
-                qf_c.name( (void*)dct1, dqm, qp ); \
-                qf_a.name( (void*)dct2, dqm, qp ); \
+                INIT_QUANT8() \
+                qf_c.qname( (void*)dct1, cqm[qp%6], shift+qp/6, (1<<(shift+qp/6))/divider ); \
+                memcpy( dct2, dct1, 64*2 ); \
+                qf_c.dqname( (void*)dct1, dqm, qp ); \
+                qf_a.dqname( (void*)dct2, dqm, qp ); \
                 if( memcmp( dct1, dct2, 64*2 ) ) \
                 { \
                     oks[1] = 0; \
-                    fprintf( stderr, #name "(qp=%d, cqm=%d): [FAILED]\n", qp, i_cqm ); \
+                    fprintf( stderr, #dqname "(qp=%d, cqm=%d, intra=%d): [FAILED]\n", qp, i_cqm, divider==3 ); \
+                    break; \
+                } \
+            } \
+        }
+
+#define TEST_DEQUANT4( qname, dqname, cqm, dqm, shift, divider ) \
+        if( qf_a.dqname != qf_ref.dqname ) \
+        { \
+            int qp; \
+            used_asms[1] = 1; \
+            for( qp = 51; qp > 0; qp-- ) \
+            { \
+                INIT_QUANT4() \
+                qf_c.qname( (void*)dct1, cqm[qp%6], shift+qp/6, (1<<(shift+qp/6))/divider ); \
+                memcpy( dct2, dct1, 16*2 ); \
+                qf_c.dqname( (void*)dct1, dqm, qp ); \
+                qf_a.dqname( (void*)dct2, dqm, qp ); \
+                if( memcmp( dct1, dct2, 16*2 ) ) \
+                { \
+                    oks[1] = 0; \
+                    fprintf( stderr, #dqname "(qp=%d, cqm=%d, intra=%d): [FAILED]\n", qp, i_cqm, divider==3 ); \
                     break; \
                 } \
             } \
         }
 
-        TEST_DEQUANT( dequant_8x8, quant_8x8_core, h->dequant8_mf[CQM_8PY], h->quant8_mf[CQM_8PY], 16 );
-        TEST_DEQUANT( dequant_4x4, quant_4x4_core, h->dequant4_mf[CQM_4PY], h->quant4_mf[CQM_4PY], 15 );
+        TEST_DEQUANT8( quant_8x8_core, dequant_8x8, h->quant8_mf[CQM_8IY], h->dequant8_mf[CQM_8IY], 16, 3 );
+        TEST_DEQUANT8( quant_8x8_core, dequant_8x8, h->quant8_mf[CQM_8PY], h->dequant8_mf[CQM_8PY], 16, 6 );
+        TEST_DEQUANT4( quant_4x4_core, dequant_4x4, h->quant4_mf[CQM_4IY], h->dequant4_mf[CQM_4IY], 15, 3 );
+        TEST_DEQUANT4( quant_4x4_core, dequant_4x4, h->quant4_mf[CQM_4PY], h->dequant4_mf[CQM_4PY], 15, 6 );
     }
 
     ok = oks[0]; used_asm = used_asms[0];