git.sesse.net Git - ffmpeg/blobdiff - libavcodec/cabac.h
reading 8bit mem into an 8bit register needs 2 uops on P4, 8bit->32bit with zero extension...
[ffmpeg] / libavcodec / cabac.h
index 3008792b2108ddbdac32ae23686700c7a75bd1b8..01163915a7cdde0d54f2e06f3601319fbfa54641 100644 (file)
@@ -31,6 +31,7 @@
 
 #define CABAC_BITS 16
 #define CABAC_MASK ((1<<CABAC_BITS)-1)
+#define BRANCHLESS_CABAD 1
 
 typedef struct CABACContext{
     int low;
@@ -39,9 +40,9 @@ typedef struct CABACContext{
 #ifdef STRICT_LIMITS
     int symCount;
 #endif
-    uint8_t lps_range[2*66][4];   ///< rangeTabLPS
-    uint8_t lps_state[2*65];      ///< transIdxLPS
-    uint8_t mps_state[2*65];      ///< transIdxMPS
+    uint8_t lps_range[2*65][4];   ///< rangeTabLPS
+    uint8_t lps_state[2*64];      ///< transIdxLPS
+    uint8_t mps_state[2*64];      ///< transIdxMPS
     const uint8_t *bytestream_start;
     const uint8_t *bytestream;
     const uint8_t *bytestream_end;
@@ -51,7 +52,7 @@ typedef struct CABACContext{
 extern const uint8_t ff_h264_lps_range[64][4];
 extern const uint8_t ff_h264_mps_state[64];
 extern const uint8_t ff_h264_lps_state[64];
-extern const uint8_t ff_h264_norm_shift[256];
+extern const uint8_t ff_h264_norm_shift[128];
 
 
 void ff_init_cabac_encoder(CABACContext *c, uint8_t *buf, int buf_size);
@@ -256,7 +257,6 @@ static void put_cabac_ueg(CABACContext *c, uint8_t * state, int v, int max, int
 }
 
 static void refill(CABACContext *c){
-    if(c->bytestream <= c->bytestream_end)
 #if CABAC_BITS == 16
         c->low+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1);
 #else
@@ -266,16 +266,14 @@ static void refill(CABACContext *c){
     c->bytestream+= CABAC_BITS/8;
 }
 
-#if 1 /* all use commented */
 static void refill2(CABACContext *c){
     int i, x;
 
     x= c->low ^ (c->low-1);
-    i= 9 - ff_h264_norm_shift[x>>(CABAC_BITS+1)];
+    i= 7 - ff_h264_norm_shift[x>>(CABAC_BITS+1)];
 
     x= -CABAC_MASK;
 
-    if(c->bytestream <= c->bytestream_end)
 #if CABAC_BITS == 16
         x+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1);
 #else
@@ -285,7 +283,6 @@ static void refill2(CABACContext *c){
     c->low += x<<i;
     c->bytestream+= CABAC_BITS/8;
 }
-#endif
 
 static inline void renorm_cabac_decoder(CABACContext *c){
     while(c->range < (0x200 << CABAC_BITS)){
@@ -367,39 +364,166 @@ static inline void renorm_cabac_decoder_once(CABACContext *c){
 
 static int get_cabac(CABACContext *c, uint8_t * const state){
     //FIXME gcc generates duplicate load/stores for c->low and c->range
-//START_TIMER
+#ifdef ARCH_X86
+    int bit;
+
+#define LOW          "0"
+#define RANGE        "4"
+#define LPS_RANGE   "12"
+#define LPS_STATE   "12+2*65*4"
+#define MPS_STATE   "12+2*65*4+2*64"
+#define BYTESTART   "12+2*65*4+4*64"
+#define BYTE        "16+2*65*4+4*64"
+#define BYTEEND     "20+2*65*4+4*64"
+#ifndef BRANCHLESS_CABAD
+    asm volatile(
+        "movzbl (%1), %%eax                     \n\t"
+        "movl "RANGE    "(%2), %%ebx            \n\t"
+        "movl "RANGE    "(%2), %%edx            \n\t"
+        "shrl $23, %%ebx                        \n\t"
+        "leal "LPS_RANGE"(%2, %%eax, 4), %%esi  \n\t"
+        "movzbl (%%ebx, %%esi), %%esi           \n\t"
+        "shll $17, %%esi                        \n\t"
+        "movl "LOW      "(%2), %%ebx            \n\t"
+//eax:state ebx:low, edx:range, esi:RangeLPS
+        "subl %%esi, %%edx                      \n\t"
+        "cmpl %%edx, %%ebx                      \n\t"
+        " ja 1f                                 \n\t"
+        "cmp $0x2000000, %%edx                  \n\t" //FIXME avoidable
+        "setb %%cl                              \n\t"
+        "shl %%cl, %%edx                        \n\t"
+        "shl %%cl, %%ebx                        \n\t"
+        "movzbl "MPS_STATE"(%2, %%eax), %%ecx   \n\t"
+        "movb %%cl, (%1)                        \n\t"
+//eax:state ebx:low, edx:range, esi:RangeLPS
+        "test %%bx, %%bx                        \n\t"
+        " jnz 2f                                \n\t"
+        "movl "BYTE     "(%2), %%esi            \n\t"
+        "subl $0xFFFF, %%ebx                    \n\t"
+        "movzwl (%%esi), %%ecx                  \n\t"
+        "bswap %%ecx                            \n\t"
+        "shrl $15, %%ecx                        \n\t"
+        "addl $2, %%esi                         \n\t"
+        "addl %%ecx, %%ebx                      \n\t"
+        "movl %%esi, "BYTE    "(%2)             \n\t"
+        "jmp 2f                                 \n\t"
+        "1:                                     \n\t"
+//eax:state ebx:low, edx:range, esi:RangeLPS
+        "subl %%edx, %%ebx                      \n\t"
+        "movl %%esi, %%edx                      \n\t"
+        "shr $19, %%esi                         \n\t"
+        "movzbl " MANGLE(ff_h264_norm_shift) "(%%esi), %%ecx   \n\t"
+        "shll %%cl, %%ebx                       \n\t"
+        "shll %%cl, %%edx                       \n\t"
+        "movzbl "LPS_STATE"(%2, %%eax), %%ecx   \n\t"
+        "movb %%cl, (%1)                        \n\t"
+        "addl $1, %%eax                         \n\t"
+        "test %%bx, %%bx                        \n\t"
+        " jnz 2f                                \n\t"
+
+        "movl "BYTE     "(%2), %%ecx            \n\t"
+        "movzwl (%%ecx), %%esi                  \n\t"
+        "bswap %%esi                            \n\t"
+        "shrl $15, %%esi                        \n\t"
+        "subl $0xFFFF, %%esi                    \n\t"
+        "addl $2, %%ecx                         \n\t"
+        "movl %%ecx, "BYTE    "(%2)             \n\t"
+
+        "leal -1(%%ebx), %%ecx                  \n\t"
+        "xorl %%ebx, %%ecx                      \n\t"
+        "shrl $17, %%ecx                        \n\t"
+        "movzbl " MANGLE(ff_h264_norm_shift) "(%%ecx), %%ecx   \n\t"
+        "neg %%cl                               \n\t"
+        "add $7, %%cl                           \n\t"
+
+        "shll %%cl , %%esi                      \n\t"
+        "addl %%esi, %%ebx                      \n\t"
+        "2:                                     \n\t"
+        "movl %%edx, "RANGE    "(%2)            \n\t"
+        "movl %%ebx, "LOW      "(%2)            \n\t"
+        "andl $1, %%eax                         \n\t"
+
+        :"=&a"(bit) //FIXME this is fragile: gcc either runs out of registers or miscompiles it (for example if "+a"(bit) or "+m"(*state) is used)
+        :"r"(state), "r"(c)
+        : "%ecx", "%ebx", "%edx", "%esi"
+    );
+#else
+    asm volatile(
+        "movzbl (%1), %%eax                     \n\t"
+        "movl "RANGE    "(%2), %%ebx            \n\t"
+        "movl "RANGE    "(%2), %%edx            \n\t"
+        "shrl $23, %%ebx                        \n\t"
+        "leal "LPS_RANGE"(%2, %%eax, 4), %%esi  \n\t"
+        "movzbl (%%ebx, %%esi), %%esi           \n\t"
+        "shll $17, %%esi                        \n\t"
+        "movl "LOW      "(%2), %%ebx            \n\t"
+//eax:state ebx:low, edx:range, esi:RangeLPS
+        "subl %%esi, %%edx                      \n\t"
+        "movl %%edx, %%ecx                      \n\t"
+        "subl %%ebx, %%edx                      \n\t"
+        "sarl $31, %%edx                        \n\t" //lps_mask
+        "subl %%ecx, %%esi                      \n\t" //RangeLPS - range
+        "andl %%edx, %%esi                      \n\t" //(RangeLPS - range)&lps_mask
+        "addl %%ecx, %%esi                      \n\t" //new range
+        "andl %%edx, %%ecx                      \n\t"
+        "subl %%ecx, %%ebx                      \n\t"
+
+//eax:state ebx:low edx:mask esi:range
+        "xorl %%edx, %%eax                      \n\t"
+        "movzbl "MPS_STATE"(%2, %%eax), %%ecx   \n\t"
+        "movb %%cl, (%1)                        \n\t"
+
+        "movl %%esi, %%edx                      \n\t"
+//eax:bit ebx:low edx:range esi:range
+
+        "shr $19, %%esi                         \n\t"
+        "movzbl " MANGLE(ff_h264_norm_shift) "(%%esi), %%ecx   \n\t"
+        "shll %%cl, %%ebx                       \n\t"
+        "shll %%cl, %%edx                       \n\t"
+        "test %%bx, %%bx                        \n\t"
+        " jnz 1f                                \n\t"
+
+        "movl "BYTE     "(%2), %%ecx            \n\t"
+        "movzwl (%%ecx), %%esi                  \n\t"
+        "bswap %%esi                            \n\t"
+        "shrl $15, %%esi                        \n\t"
+        "subl $0xFFFF, %%esi                    \n\t"
+        "addl $2, %%ecx                         \n\t"
+        "movl %%ecx, "BYTE    "(%2)             \n\t"
+
+        "leal -1(%%ebx), %%ecx                  \n\t"
+        "xorl %%ebx, %%ecx                      \n\t"
+        "shrl $17, %%ecx                        \n\t"
+        "movzbl " MANGLE(ff_h264_norm_shift) "(%%ecx), %%ecx   \n\t"
+        "neg %%cl                               \n\t"
+        "add $7, %%cl                           \n\t"
+
+        "shll %%cl , %%esi                      \n\t"
+        "addl %%esi, %%ebx                      \n\t"
+        "1:                                     \n\t"
+        "movl %%edx, "RANGE    "(%2)            \n\t"
+        "movl %%ebx, "LOW      "(%2)            \n\t"
+        "andl $1, %%eax                         \n\t"
+        :"=&a"(bit)
+        :"r"(state), "r"(c)
+        : "%ecx", "%ebx", "%edx", "%esi"
+    );
+#endif
+#else
     int s = *state;
     int RangeLPS= c->lps_range[s][c->range>>(CABAC_BITS+7)]<<(CABAC_BITS+1);
     int bit, lps_mask attribute_unused;
 
     c->range -= RangeLPS;
-#if 1
+#ifndef BRANCHLESS_CABAD
     if(c->low < c->range){
         bit= s&1;
-#ifdef ARCH_X86
-    //P3:627
-asm(
-        "addb $2, %b0       \n\t"
-        " js 1f             \n\t"
-        "movb %b0, %1       \n\t"
-        "1:                 \n\t"
-        : "+q"(s), "=m"(*state)
-);
-#else
-        *state= c->mps_state[s]; //P3:655
-/*        if(s<126) //P3:657
-            *state= s+2;*/
-        s+=2; //P3:631
-        if(s<128)
-            *state= s;
-#endif
+        *state= c->mps_state[s];
         renorm_cabac_decoder_once(c);
     }else{
-        bit= ff_h264_norm_shift[RangeLPS>>17];
+        bit= ff_h264_norm_shift[RangeLPS>>19];
         c->low -= c->range;
         *state= c->lps_state[s];
-//        c->range = RangeLPS;
-//        renorm_cabac_decoder(c);
         c->range = RangeLPS<<bit;
         c->low <<= bit;
         bit= (s&1)^1;
@@ -414,16 +538,17 @@ asm(
     c->low -= c->range & lps_mask;
     c->range += (RangeLPS - c->range) & lps_mask;
 
-    bit= (s^lps_mask)&1;
-    *state= c->mps_state[s - (128&lps_mask)];
+    s^=lps_mask;
+    *state= c->mps_state[s];
+    bit= s&1;
 
-    lps_mask= ff_h264_norm_shift[c->range>>(CABAC_BITS+2)];
+    lps_mask= ff_h264_norm_shift[c->range>>(CABAC_BITS+3)];
     c->range<<= lps_mask;
     c->low  <<= lps_mask;
     if(!(c->low & CABAC_MASK))
         refill2(c);
 #endif
-//STOP_TIMER("get_cabac")
+#endif
     return bit;
 }