git.sesse.net Git - ffmpeg/blob - libpostproc/postprocess.c

   1 /*
   2  * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
   3  *
   4  * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
   5  *
   6  * This file is part of FFmpeg.
   7  *
   8  * FFmpeg is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * FFmpeg is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License
  19  * along with FFmpeg; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21  */
  22
  23 /**
  24  * @file
  25  * postprocessing.
  26  */
  27
  28 /*
  29                         C       MMX     MMX2    3DNow   AltiVec
  30 isVertDC                Ec      Ec                      Ec
  31 isVertMinMaxOk          Ec      Ec                      Ec
  32 doVertLowPass           E               e       e       Ec
  33 doVertDefFilter         Ec      Ec      e       e       Ec
  34 isHorizDC               Ec      Ec                      Ec
  35 isHorizMinMaxOk         a       E                       Ec
  36 doHorizLowPass          E               e       e       Ec
  37 doHorizDefFilter        Ec      Ec      e       e       Ec
  38 do_a_deblock            Ec      E       Ec      E
  39 deRing                  E               e       e*      Ecp
  40 Vertical RKAlgo1        E               a       a
  41 Horizontal RKAlgo1                      a       a
  42 Vertical X1#            a               E       E
  43 Horizontal X1#          a               E       E
  44 LinIpolDeinterlace      e               E       E*
  45 CubicIpolDeinterlace    a               e       e*
  46 LinBlendDeinterlace     e               E       E*
  47 MedianDeinterlace#      E       Ec      Ec
  48 TempDeNoiser#           E               e       e       Ec
  49
  50 * I do not have a 3DNow! CPU -> it is untested, but no one said it does not work so it seems to work
  51 # more or less selfinvented filters so the exactness is not too meaningful
  52 E = Exact implementation
  53 e = almost exact implementation (slightly different rounding,...)
  54 a = alternative / approximate impl
  55 c = checked against the other implementations (-vo md5)
  56 p = partially optimized, still some work to do
  57 */
  58
  59 /*
  60 TODO:
  61 reduce the time wasted on the mem transfer
  62 unroll stuff if instructions depend too much on the prior one
  63 move YScale thing to the end instead of fixing QP
  64 write a faster and higher quality deblocking filter :)
make the mainloop more flexible (variable number of blocks at once)
        (the if/else stuff per block is slowing things down)
  67 compare the quality & speed of all filters
  68 split this huge file
  69 optimize c versions
  70 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
  71 ...
  72 */
  73
  74 //Changelog: use git log
  75
  76 #include "config.h"
  77 #include "libavutil/avutil.h"
  78 #include "libavutil/avassert.h"
  79 #include <inttypes.h>
  80 #include <stdio.h>
  81 #include <stdlib.h>
  82 #include <string.h>
  83 //#undef HAVE_MMXEXT_INLINE
  84 //#define HAVE_AMD3DNOW_INLINE
  85 //#undef HAVE_MMX_INLINE
  86 //#undef ARCH_X86
  87 //#define DEBUG_BRIGHTNESS
  88 #include "postprocess.h"
  89 #include "postprocess_internal.h"
  90 #include "libavutil/avstring.h"
  91
  92 unsigned postproc_version(void)
  93 {
  94     av_assert0(LIBPOSTPROC_VERSION_MICRO >= 100);
  95     return LIBPOSTPROC_VERSION_INT;
  96 }
  97
  98 const char *postproc_configuration(void)
  99 {
 100     return FFMPEG_CONFIGURATION;
 101 }
 102
 103 const char *postproc_license(void)
 104 {
 105 #define LICENSE_PREFIX "libpostproc license: "
 106     return LICENSE_PREFIX FFMPEG_LICENSE + sizeof(LICENSE_PREFIX) - 1;
 107 }
 108
 109 #if HAVE_ALTIVEC_H
 110 #include <altivec.h>
 111 #endif
 112
/* Size of the scratch buffer used by pp_get_mode_by_name_and_quality()
 * to hold (and expand in place) the filter string. */
#define GET_MODE_BUFFER_SIZE 500
/* Capacity of the per-filter options[] pointer array in the mode parser. */
#define OPTIONS_ARRAY_SIZE 10
/* Edge length, in pixels, of the square blocks the filters iterate over. */
#define BLOCK_SIZE 8
/* NOTE(review): not referenced in this part of the file; presumably used
 * by the included templates -- confirm. */
#define TEMP_STRIDE 8
//#define NUM_BLOCKS_AT_ONCE 16 //not used yet

#if ARCH_X86 && HAVE_INLINE_ASM
/* 64-bit constants for the x86 inline-asm code.
 * wXX: the 16-bit value 0x00XX repeated four times.
 * bXX: the byte value 0xXX repeated eight times. */
DECLARE_ASM_CONST(8, uint64_t, w05)= 0x0005000500050005LL;
DECLARE_ASM_CONST(8, uint64_t, w04)= 0x0004000400040004LL;
DECLARE_ASM_CONST(8, uint64_t, w20)= 0x0020002000200020LL;
DECLARE_ASM_CONST(8, uint64_t, b00)= 0x0000000000000000LL;
DECLARE_ASM_CONST(8, uint64_t, b01)= 0x0101010101010101LL;
DECLARE_ASM_CONST(8, uint64_t, b02)= 0x0202020202020202LL;
DECLARE_ASM_CONST(8, uint64_t, b08)= 0x0808080808080808LL;
DECLARE_ASM_CONST(8, uint64_t, b80)= 0x8080808080808080LL;
#endif

/* NOTE(review): not referenced in this part of the file; presumably the
 * threshold used by the dering filter in the templates -- confirm. */
DECLARE_ASM_CONST(8, int, deringThreshold)= 20;
 131
 132
/**
 * Table of all known filters, terminated by an entry whose shortName is NULL.
 *
 * The mode parser matches a user-supplied filter name against both the
 * short and the long name with strcmp(), then uses the quality thresholds,
 * the chroma default and the mask of the matching entry.
 * NOTE(review): initialisers are positional; presumably the field order is
 * shortName, longName, chromDefault, minLumQuality, minChromQuality, mask --
 * confirm against the struct PPFilter declaration.
 */
static const struct PPFilter filters[]=
{
    {"hb", "hdeblock",              1, 1, 3, H_DEBLOCK},
    {"vb", "vdeblock",              1, 2, 4, V_DEBLOCK},
/*  {"hr", "rkhdeblock",            1, 1, 3, H_RK1_FILTER},
    {"vr", "rkvdeblock",            1, 2, 4, V_RK1_FILTER},*/
    {"h1", "x1hdeblock",            1, 1, 3, H_X1_FILTER},
    {"v1", "x1vdeblock",            1, 2, 4, V_X1_FILTER},
    {"ha", "ahdeblock",             1, 1, 3, H_A_DEBLOCK},
    {"va", "avdeblock",             1, 2, 4, V_A_DEBLOCK},
    {"dr", "dering",                1, 5, 6, DERING},
    {"al", "autolevels",            0, 1, 2, LEVEL_FIX},
    {"lb", "linblenddeint",         1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
    {"li", "linipoldeint",          1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
    {"ci", "cubicipoldeint",        1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
    {"md", "mediandeint",           1, 1, 4, MEDIAN_DEINT_FILTER},
    {"fd", "ffmpegdeint",           1, 1, 4, FFMPEG_DEINT_FILTER},
    {"l5", "lowpass5",              1, 1, 4, LOWPASS5_DEINT_FILTER},
    {"tn", "tmpnoise",              1, 7, 8, TEMP_NOISE_FILTER},
    {"fq", "forcequant",            1, 0, 0, FORCE_QUANT},
    {"be", "bitexact",              1, 0, 0, BITEXACT},
    {NULL, NULL,0,0,0,0} //End Marker
};
 156
/**
 * Filter-name aliases, stored as consecutive (alias, expansion) pairs.
 *
 * The mode parser walks the table two entries at a time; when a filter
 * name equals the alias at index 2*i, the expansion at index 2*i+1 is
 * spliced into the filter string. A single NULL in an alias slot ends
 * the table.
 */
static const char *replaceTable[]=
{
    "default",      "hb:a,vb:a,dr:a",
    "de",           "hb:a,vb:a,dr:a",
    "fast",         "h1:a,v1:a,dr:a",
    "fa",           "h1:a,v1:a,dr:a",
    "ac",           "ha:a:128:7,va:a,dr:a",
    NULL //End Marker
};
 166
 167
 168 #if ARCH_X86 && HAVE_INLINE_ASM
/**
 * Issue an x86 `prefetchnta` instruction for the address held in p.
 * p is passed to the asm as a register input; there are no outputs.
 */
static inline void prefetchnta(void *p)
{
    __asm__ volatile(   "prefetchnta (%0)\n\t"
        : : "r" (p)
    );
}
 175
/**
 * Issue an x86 `prefetcht0` instruction for the address held in p.
 * p is passed to the asm as a register input; there are no outputs.
 */
static inline void prefetcht0(void *p)
{
    __asm__ volatile(   "prefetcht0 (%0)\n\t"
        : : "r" (p)
    );
}
 182
/**
 * Issue an x86 `prefetcht1` instruction for the address held in p.
 * p is passed to the asm as a register input; there are no outputs.
 */
static inline void prefetcht1(void *p)
{
    __asm__ volatile(   "prefetcht1 (%0)\n\t"
        : : "r" (p)
    );
}
 189
/**
 * Issue an x86 `prefetcht2` instruction for the address held in p.
 * p is passed to the asm as a register input; there are no outputs.
 */
static inline void prefetcht2(void *p)
{
    __asm__ volatile(   "prefetcht2 (%0)\n\t"
        : : "r" (p)
    );
}
 196 #endif
 197
 198 /* The horizontal functions exist only in C because the MMX
 199  * code is faster with vertical filters and transposing. */
 200
 201 /**
 202  * Check if the given 8x8 Block is mostly "flat"
 203  */
 204 static inline int isHorizDC_C(const uint8_t src[], int stride, const PPContext *c)
 205 {
 206     int numEq= 0;
 207     int y;
 208     const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
 209     const int dcThreshold= dcOffset*2 + 1;
 210
 211     for(y=0; y<BLOCK_SIZE; y++){
 212         if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++;
 213         if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++;
 214         if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++;
 215         if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++;
 216         if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++;
 217         if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++;
 218         if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++;
 219         src+= stride;
 220     }
 221     return numEq > c->ppMode.flatnessThreshold;
 222 }
 223
 224 /**
 225  * Check if the middle 8x8 Block in the given 8x16 block is flat
 226  */
 227 static inline int isVertDC_C(const uint8_t src[], int stride, const PPContext *c)
 228 {
 229     int numEq= 0;
 230     int y;
 231     const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
 232     const int dcThreshold= dcOffset*2 + 1;
 233
 234     src+= stride*4; // src points to begin of the 8x8 Block
 235     for(y=0; y<BLOCK_SIZE-1; y++){
 236         if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++;
 237         if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++;
 238         if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++;
 239         if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++;
 240         if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++;
 241         if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++;
 242         if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++;
 243         if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++;
 244         src+= stride;
 245     }
 246     return numEq > c->ppMode.flatnessThreshold;
 247 }
 248
 249 static inline int isHorizMinMaxOk_C(const uint8_t src[], int stride, int QP)
 250 {
 251     int i;
 252     for(i=0; i<2; i++){
 253         if((unsigned)(src[0] - src[5] + 2*QP) > 4*QP) return 0;
 254         src += stride;
 255         if((unsigned)(src[2] - src[7] + 2*QP) > 4*QP) return 0;
 256         src += stride;
 257         if((unsigned)(src[4] - src[1] + 2*QP) > 4*QP) return 0;
 258         src += stride;
 259         if((unsigned)(src[6] - src[3] + 2*QP) > 4*QP) return 0;
 260         src += stride;
 261     }
 262     return 1;
 263 }
 264
 265 static inline int isVertMinMaxOk_C(const uint8_t src[], int stride, int QP)
 266 {
 267     int x;
 268     src+= stride*4;
 269     for(x=0; x<BLOCK_SIZE; x+=4){
 270         if((unsigned)(src[  x + 0*stride] - src[  x + 5*stride] + 2*QP) > 4*QP) return 0;
 271         if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
 272         if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
 273         if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
 274     }
 275     return 1;
 276 }
 277
 278 static inline int horizClassify_C(const uint8_t src[], int stride, const PPContext *c)
 279 {
 280     if( isHorizDC_C(src, stride, c) ){
 281         if( isHorizMinMaxOk_C(src, stride, c->QP) )
 282             return 1;
 283         else
 284             return 0;
 285     }else{
 286         return 2;
 287     }
 288 }
 289
 290 static inline int vertClassify_C(const uint8_t src[], int stride, const PPContext *c)
 291 {
 292     if( isVertDC_C(src, stride, c) ){
 293         if( isVertMinMaxOk_C(src, stride, c->QP) )
 294             return 1;
 295         else
 296             return 0;
 297     }else{
 298         return 2;
 299     }
 300 }
 301
 302 static inline void doHorizDefFilter_C(uint8_t dst[], int stride, const PPContext *c)
 303 {
 304     int y;
 305     for(y=0; y<BLOCK_SIZE; y++){
 306         const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
 307
 308         if(FFABS(middleEnergy) < 8*c->QP){
 309             const int q=(dst[3] - dst[4])/2;
 310             const int leftEnergy=  5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
 311             const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
 312
 313             int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
 314             d= FFMAX(d, 0);
 315
 316             d= (5*d + 32) >> 6;
 317             d*= FFSIGN(-middleEnergy);
 318
 319             if(q>0)
 320             {
 321                 d= d<0 ? 0 : d;
 322                 d= d>q ? q : d;
 323             }
 324             else
 325             {
 326                 d= d>0 ? 0 : d;
 327                 d= d<q ? q : d;
 328             }
 329
 330             dst[3]-= d;
 331             dst[4]+= d;
 332         }
 333         dst+= stride;
 334     }
 335 }
 336
 337 /**
 338  * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
 339  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
 340  */
 341 static inline void doHorizLowPass_C(uint8_t dst[], int stride, const PPContext *c)
 342 {
 343     int y;
 344     for(y=0; y<BLOCK_SIZE; y++){
 345         const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
 346         const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
 347
 348         int sums[10];
 349         sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
 350         sums[1] = sums[0] - first  + dst[3];
 351         sums[2] = sums[1] - first  + dst[4];
 352         sums[3] = sums[2] - first  + dst[5];
 353         sums[4] = sums[3] - first  + dst[6];
 354         sums[5] = sums[4] - dst[0] + dst[7];
 355         sums[6] = sums[5] - dst[1] + last;
 356         sums[7] = sums[6] - dst[2] + last;
 357         sums[8] = sums[7] - dst[3] + last;
 358         sums[9] = sums[8] - dst[4] + last;
 359
 360         dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
 361         dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
 362         dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
 363         dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
 364         dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
 365         dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
 366         dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
 367         dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
 368
 369         dst+= stride;
 370     }
 371 }
 372
 373 /**
 374  * Experimental Filter 1 (Horizontal)
 375  * will not damage linear gradients
 376  * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
 377  * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
 378  * MMX2 version does correct clipping C version does not
 379  * not identical with the vertical one
 380  */
 381 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
 382 {
 383     int y;
 384     static uint64_t *lut= NULL;
 385     if(lut==NULL)
 386     {
 387         int i;
 388         lut = av_malloc(256*8);
 389         for(i=0; i<256; i++)
 390         {
 391             int v= i < 128 ? 2*i : 2*(i-256);
 392 /*
 393 //Simulate 112242211 9-Tap filter
 394             uint64_t a= (v/16)  & 0xFF;
 395             uint64_t b= (v/8)   & 0xFF;
 396             uint64_t c= (v/4)   & 0xFF;
 397             uint64_t d= (3*v/8) & 0xFF;
 398 */
 399 //Simulate piecewise linear interpolation
 400             uint64_t a= (v/16)   & 0xFF;
 401             uint64_t b= (v*3/16) & 0xFF;
 402             uint64_t c= (v*5/16) & 0xFF;
 403             uint64_t d= (7*v/16) & 0xFF;
 404             uint64_t A= (0x100 - a)&0xFF;
 405             uint64_t B= (0x100 - b)&0xFF;
 406             uint64_t C= (0x100 - c)&0xFF;
 407             uint64_t D= (0x100 - c)&0xFF;
 408
 409             lut[i]   = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
 410                        (D<<24) | (C<<16) | (B<<8)  | (A);
 411             //lut[i] = (v<<32) | (v<<24);
 412         }
 413     }
 414
 415     for(y=0; y<BLOCK_SIZE; y++){
 416         int a= src[1] - src[2];
 417         int b= src[3] - src[4];
 418         int c= src[5] - src[6];
 419
 420         int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0);
 421
 422         if(d < QP){
 423             int v = d * FFSIGN(-b);
 424
 425             src[1] +=v/8;
 426             src[2] +=v/4;
 427             src[3] +=3*v/8;
 428             src[4] -=3*v/8;
 429             src[5] -=v/4;
 430             src[6] -=v/8;
 431         }
 432         src+=stride;
 433     }
 434 }
 435
 436 /**
 437  * accurate deblock filter
 438  */
 439 static av_always_inline void do_a_deblock_C(uint8_t *src, int step,
 440                                             int stride, const PPContext *c)
 441 {
 442     int y;
 443     const int QP= c->QP;
 444     const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
 445     const int dcThreshold= dcOffset*2 + 1;
 446 //START_TIMER
 447     src+= step*4; // src points to begin of the 8x8 Block
 448     for(y=0; y<8; y++){
 449         int numEq= 0;
 450
 451         if(((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold) numEq++;
 452         if(((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold) numEq++;
 453         if(((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold) numEq++;
 454         if(((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold) numEq++;
 455         if(((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold) numEq++;
 456         if(((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold) numEq++;
 457         if(((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold) numEq++;
 458         if(((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold) numEq++;
 459         if(((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold) numEq++;
 460         if(numEq > c->ppMode.flatnessThreshold){
 461             int min, max, x;
 462
 463             if(src[0] > src[step]){
 464                 max= src[0];
 465                 min= src[step];
 466             }else{
 467                 max= src[step];
 468                 min= src[0];
 469             }
 470             for(x=2; x<8; x+=2){
 471                 if(src[x*step] > src[(x+1)*step]){
 472                         if(src[x    *step] > max) max= src[ x   *step];
 473                         if(src[(x+1)*step] < min) min= src[(x+1)*step];
 474                 }else{
 475                         if(src[(x+1)*step] > max) max= src[(x+1)*step];
 476                         if(src[ x   *step] < min) min= src[ x   *step];
 477                 }
 478             }
 479             if(max-min < 2*QP){
 480                 const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
 481                 const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
 482
 483                 int sums[10];
 484                 sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
 485                 sums[1] = sums[0] - first       + src[3*step];
 486                 sums[2] = sums[1] - first       + src[4*step];
 487                 sums[3] = sums[2] - first       + src[5*step];
 488                 sums[4] = sums[3] - first       + src[6*step];
 489                 sums[5] = sums[4] - src[0*step] + src[7*step];
 490                 sums[6] = sums[5] - src[1*step] + last;
 491                 sums[7] = sums[6] - src[2*step] + last;
 492                 sums[8] = sums[7] - src[3*step] + last;
 493                 sums[9] = sums[8] - src[4*step] + last;
 494
 495                 src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
 496                 src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
 497                 src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
 498                 src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
 499                 src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
 500                 src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
 501                 src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
 502                 src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
 503             }
 504         }else{
 505             const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
 506
 507             if(FFABS(middleEnergy) < 8*QP){
 508                 const int q=(src[3*step] - src[4*step])/2;
 509                 const int leftEnergy=  5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
 510                 const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
 511
 512                 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
 513                 d= FFMAX(d, 0);
 514
 515                 d= (5*d + 32) >> 6;
 516                 d*= FFSIGN(-middleEnergy);
 517
 518                 if(q>0){
 519                     d= d<0 ? 0 : d;
 520                     d= d>q ? q : d;
 521                 }else{
 522                     d= d>0 ? 0 : d;
 523                     d= d<q ? q : d;
 524                 }
 525
 526                 src[3*step]-= d;
 527                 src[4*step]+= d;
 528             }
 529         }
 530
 531         src += stride;
 532     }
 533 /*if(step==16){
 534     STOP_TIMER("step16")
 535 }else{
 536     STOP_TIMER("stepX")
 537 }*/
 538 }
 539
/*
 * Instantiate postprocess_template.c once per CPU flavour. Before each
 * include exactly one TEMPLATE_PP_* macro is defined to select the flavour.
 * NOTE(review): the template is presumably what defines the
 * postProcess_C / _MMX / _MMX2 / _3DNow / _SSE2 / _altivec functions
 * referenced later in this file, and presumably undefines the TEMPLATE_PP_*
 * macro again at its end -- confirm in postprocess_template.c.
 */
//Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one
//Plain C versions
//we always compile C for testing which needs bitexactness
#define TEMPLATE_PP_C 1
#include "postprocess_template.c"

#if HAVE_ALTIVEC
#   define TEMPLATE_PP_ALTIVEC 1
#   include "postprocess_altivec_template.c"
#   include "postprocess_template.c"
#endif

#if ARCH_X86 && HAVE_INLINE_ASM
/* With runtime CPU detection every x86 flavour is compiled in; otherwise
 * only the first flavour enabled at build time, tried in the order
 * SSE2, MMXEXT, 3DNow, MMX. */
#    if CONFIG_RUNTIME_CPUDETECT
#        define TEMPLATE_PP_MMX 1
#        include "postprocess_template.c"
#        define TEMPLATE_PP_MMXEXT 1
#        include "postprocess_template.c"
#        define TEMPLATE_PP_3DNOW 1
#        include "postprocess_template.c"
#        define TEMPLATE_PP_SSE2 1
#        include "postprocess_template.c"
#    else
#        if HAVE_SSE2_INLINE
#            define TEMPLATE_PP_SSE2 1
#            include "postprocess_template.c"
#        elif HAVE_MMXEXT_INLINE
#            define TEMPLATE_PP_MMXEXT 1
#            include "postprocess_template.c"
#        elif HAVE_AMD3DNOW_INLINE
#            define TEMPLATE_PP_3DNOW 1
#            include "postprocess_template.c"
#        elif HAVE_MMX_INLINE
#            define TEMPLATE_PP_MMX 1
#            include "postprocess_template.c"
#        endif
#    endif
#endif
 578
/**
 * Pointer to one CPU-specific implementation of the post-processing pass.
 * postProcess() picks one such function and forwards its arguments to it,
 * with the context passed as a PPContext pointer.
 */
typedef void (*pp_fn)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
                      const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2);
 581
 582 static inline void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
 583         const QP_STORE_T QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc)
 584 {
 585     pp_fn pp = postProcess_C;
 586     PPContext *c= (PPContext *)vc;
 587     PPMode *ppMode= (PPMode *)vm;
 588     c->ppMode= *ppMode; //FIXME
 589
 590     if (!(ppMode->lumMode & BITEXACT)) {
 591 #if CONFIG_RUNTIME_CPUDETECT
 592 #if ARCH_X86 && HAVE_INLINE_ASM
 593         // ordered per speed fastest first
 594         if      (c->cpuCaps & AV_CPU_FLAG_SSE2)     pp = postProcess_SSE2;
 595         else if (c->cpuCaps & AV_CPU_FLAG_MMXEXT)   pp = postProcess_MMX2;
 596         else if (c->cpuCaps & AV_CPU_FLAG_3DNOW)    pp = postProcess_3DNow;
 597         else if (c->cpuCaps & AV_CPU_FLAG_MMX)      pp = postProcess_MMX;
 598 #elif HAVE_ALTIVEC
 599         if      (c->cpuCaps & AV_CPU_FLAG_ALTIVEC)  pp = postProcess_altivec;
 600 #endif
 601 #else /* CONFIG_RUNTIME_CPUDETECT */
 602 #if     HAVE_SSE2_INLINE
 603         pp = postProcess_SSE2;
 604 #elif   HAVE_MMXEXT_INLINE
 605         pp = postProcess_MMX2;
 606 #elif HAVE_AMD3DNOW_INLINE
 607         pp = postProcess_3DNow;
 608 #elif HAVE_MMX_INLINE
 609         pp = postProcess_MMX;
 610 #elif HAVE_ALTIVEC
 611         pp = postProcess_altivec;
 612 #endif
 613 #endif /* !CONFIG_RUNTIME_CPUDETECT */
 614     }
 615
 616     pp(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
 617 }
 618
 619 /* -pp Command line Help
 620 */
 621 #if LIBPOSTPROC_VERSION_INT < (52<<16)
 622 const char *const pp_help=
 623 #else
 624 const char pp_help[] =
 625 #endif
 626 "Available postprocessing filters:\n"
 627 "Filters                        Options\n"
 628 "short  long name       short   long option     Description\n"
 629 "*      *               a       autoq           CPU power dependent enabler\n"
 630 "                       c       chrom           chrominance filtering enabled\n"
 631 "                       y       nochrom         chrominance filtering disabled\n"
 632 "                       n       noluma          luma filtering disabled\n"
 633 "hb     hdeblock        (2 threshold)           horizontal deblocking filter\n"
 634 "       1. difference factor: default=32, higher -> more deblocking\n"
 635 "       2. flatness threshold: default=39, lower -> more deblocking\n"
 636 "                       the h & v deblocking filters share these\n"
 637 "                       so you can't set different thresholds for h / v\n"
 638 "vb     vdeblock        (2 threshold)           vertical deblocking filter\n"
 639 "ha     hadeblock       (2 threshold)           horizontal deblocking filter\n"
 640 "va     vadeblock       (2 threshold)           vertical deblocking filter\n"
 641 "h1     x1hdeblock                              experimental h deblock filter 1\n"
 642 "v1     x1vdeblock                              experimental v deblock filter 1\n"
 643 "dr     dering                                  deringing filter\n"
 644 "al     autolevels                              automatic brightness / contrast\n"
 645 "                       f        fullyrange     stretch luminance to (0..255)\n"
 646 "lb     linblenddeint                           linear blend deinterlacer\n"
 647 "li     linipoldeint                            linear interpolating deinterlace\n"
 648 "ci     cubicipoldeint                          cubic interpolating deinterlacer\n"
 649 "md     mediandeint                             median deinterlacer\n"
 650 "fd     ffmpegdeint                             ffmpeg deinterlacer\n"
 651 "l5     lowpass5                                FIR lowpass deinterlacer\n"
 652 "de     default                                 hb:a,vb:a,dr:a\n"
 653 "fa     fast                                    h1:a,v1:a,dr:a\n"
 654 "ac                                             ha:a:128:7,va:a,dr:a\n"
 655 "tn     tmpnoise        (3 threshold)           temporal noise reducer\n"
 656 "                     1. <= 2. <= 3.            larger -> stronger filtering\n"
 657 "fq     forceQuant      <quantizer>             force quantizer\n"
 658 "Usage:\n"
 659 "<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
 660 "long form example:\n"
 661 "vdeblock:autoq/hdeblock:autoq/linblenddeint    default,-vdeblock\n"
 662 "short form example:\n"
 663 "vb:a/hb:a/lb                                   de,-vb\n"
 664 "more examples:\n"
 665 "tn:64:128:256\n"
 666 "\n"
 667 ;
 668
 669 pp_mode *pp_get_mode_by_name_and_quality(const char *name, int quality)
 670 {
 671     char temp[GET_MODE_BUFFER_SIZE];
 672     char *p= temp;
 673     static const char filterDelimiters[] = ",/";
 674     static const char optionDelimiters[] = ":";
 675     struct PPMode *ppMode;
 676     char *filterToken;
 677
 678     if (!name)  {
 679         av_log(NULL, AV_LOG_ERROR, "pp: Missing argument\n");
 680         return NULL;
 681     }
 682
 683     if (!strcmp(name, "help")) {
 684         const char *p;
 685         for (p = pp_help; strchr(p, '\n'); p = strchr(p, '\n') + 1) {
 686             av_strlcpy(temp, p, FFMIN(sizeof(temp), strchr(p, '\n') - p + 2));
 687             av_log(NULL, AV_LOG_INFO, "%s", temp);
 688         }
 689         return NULL;
 690     }
 691
 692     ppMode= av_malloc(sizeof(PPMode));
 693
 694     ppMode->lumMode= 0;
 695     ppMode->chromMode= 0;
 696     ppMode->maxTmpNoise[0]= 700;
 697     ppMode->maxTmpNoise[1]= 1500;
 698     ppMode->maxTmpNoise[2]= 3000;
 699     ppMode->maxAllowedY= 234;
 700     ppMode->minAllowedY= 16;
 701     ppMode->baseDcDiff= 256/8;
 702     ppMode->flatnessThreshold= 56-16-1;
 703     ppMode->maxClippedThreshold= 0.01;
 704     ppMode->error=0;
 705
 706     memset(temp, 0, GET_MODE_BUFFER_SIZE);
 707     av_strlcpy(temp, name, GET_MODE_BUFFER_SIZE - 1);
 708
 709     av_log(NULL, AV_LOG_DEBUG, "pp: %s\n", name);
 710
 711     for(;;){
 712         char *filterName;
 713         int q= 1000000; //PP_QUALITY_MAX;
 714         int chrom=-1;
 715         int luma=-1;
 716         char *option;
 717         char *options[OPTIONS_ARRAY_SIZE];
 718         int i;
 719         int filterNameOk=0;
 720         int numOfUnknownOptions=0;
 721         int enable=1; //does the user want us to enabled or disabled the filter
 722
 723         filterToken= strtok(p, filterDelimiters);
 724         if(filterToken == NULL) break;
 725         p+= strlen(filterToken) + 1; // p points to next filterToken
 726         filterName= strtok(filterToken, optionDelimiters);
 727         av_log(NULL, AV_LOG_DEBUG, "pp: %s::%s\n", filterToken, filterName);
 728
 729         if(*filterName == '-'){
 730             enable=0;
 731             filterName++;
 732         }
 733
 734         for(;;){ //for all options
 735             option= strtok(NULL, optionDelimiters);
 736             if(option == NULL) break;
 737
 738             av_log(NULL, AV_LOG_DEBUG, "pp: option: %s\n", option);
 739             if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
 740             else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
 741             else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
 742             else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
 743             else{
 744                 options[numOfUnknownOptions] = option;
 745                 numOfUnknownOptions++;
 746             }
 747             if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
 748         }
 749         options[numOfUnknownOptions] = NULL;
 750
 751         /* replace stuff from the replace Table */
 752         for(i=0; replaceTable[2*i]!=NULL; i++){
 753             if(!strcmp(replaceTable[2*i], filterName)){
 754                 int newlen= strlen(replaceTable[2*i + 1]);
 755                 int plen;
 756                 int spaceLeft;
 757
 758                 p--, *p=',';
 759
 760                 plen= strlen(p);
 761                 spaceLeft= p - temp + plen;
 762                 if(spaceLeft + newlen  >= GET_MODE_BUFFER_SIZE - 1){
 763                     ppMode->error++;
 764                     break;
 765                 }
 766                 memmove(p + newlen, p, plen+1);
 767                 memcpy(p, replaceTable[2*i + 1], newlen);
 768                 filterNameOk=1;
 769             }
 770         }
 771
 772         for(i=0; filters[i].shortName!=NULL; i++){
 773             if(   !strcmp(filters[i].longName, filterName)
 774                || !strcmp(filters[i].shortName, filterName)){
 775                 ppMode->lumMode &= ~filters[i].mask;
 776                 ppMode->chromMode &= ~filters[i].mask;
 777
 778                 filterNameOk=1;
 779                 if(!enable) break; // user wants to disable it
 780
 781                 if(q >= filters[i].minLumQuality && luma)
 782                     ppMode->lumMode|= filters[i].mask;
 783                 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
 784                     if(q >= filters[i].minChromQuality)
 785                             ppMode->chromMode|= filters[i].mask;
 786
 787                 if(filters[i].mask == LEVEL_FIX){
 788                     int o;
 789                     ppMode->minAllowedY= 16;
 790                     ppMode->maxAllowedY= 234;
 791                     for(o=0; options[o]!=NULL; o++){
 792                         if(  !strcmp(options[o],"fullyrange")
 793                            ||!strcmp(options[o],"f")){
 794                             ppMode->minAllowedY= 0;
 795                             ppMode->maxAllowedY= 255;
 796                             numOfUnknownOptions--;
 797                         }
 798                     }
 799                 }
 800                 else if(filters[i].mask == TEMP_NOISE_FILTER)
 801                 {
 802                     int o;
 803                     int numOfNoises=0;
 804
 805                     for(o=0; options[o]!=NULL; o++){
 806                         char *tail;
 807                         ppMode->maxTmpNoise[numOfNoises]=
 808                             strtol(options[o], &tail, 0);
 809                         if(tail!=options[o]){
 810                             numOfNoises++;
 811                             numOfUnknownOptions--;
 812                             if(numOfNoises >= 3) break;
 813                         }
 814                     }
 815                 }
 816                 else if(filters[i].mask == V_DEBLOCK   || filters[i].mask == H_DEBLOCK
 817                      || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK){
 818                     int o;
 819
 820                     for(o=0; options[o]!=NULL && o<2; o++){
 821                         char *tail;
 822                         int val= strtol(options[o], &tail, 0);
 823                         if(tail==options[o]) break;
 824
 825                         numOfUnknownOptions--;
 826                         if(o==0) ppMode->baseDcDiff= val;
 827                         else ppMode->flatnessThreshold= val;
 828                     }
 829                 }
 830                 else if(filters[i].mask == FORCE_QUANT){
 831                     int o;
 832                     ppMode->forcedQuant= 15;
 833
 834                     for(o=0; options[o]!=NULL && o<1; o++){
 835                         char *tail;
 836                         int val= strtol(options[o], &tail, 0);
 837                         if(tail==options[o]) break;
 838
 839                         numOfUnknownOptions--;
 840                         ppMode->forcedQuant= val;
 841                     }
 842                 }
 843             }
 844         }
 845         if(!filterNameOk) ppMode->error++;
 846         ppMode->error += numOfUnknownOptions;
 847     }
 848
 849     av_log(NULL, AV_LOG_DEBUG, "pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
 850     if(ppMode->error){
 851         av_log(NULL, AV_LOG_ERROR, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
 852         av_free(ppMode);
 853         return NULL;
 854     }
 855     return ppMode;
 856 }
 857
 858 void pp_free_mode(pp_mode *mode){
 859     av_free(mode);
 860 }
 861
 862 static void reallocAlign(void **p, int alignment, int size){
 863     av_free(*p);
 864     *p= av_mallocz(size);
 865 }
 866
 867 static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
 868     int mbWidth = (width+15)>>4;
 869     int mbHeight= (height+15)>>4;
 870     int i;
 871
 872     c->stride= stride;
 873     c->qpStride= qpStride;
 874
 875     reallocAlign((void **)&c->tempDst, 8, stride*24);
 876     reallocAlign((void **)&c->tempSrc, 8, stride*24);
 877     reallocAlign((void **)&c->tempBlocks, 8, 2*16*8);
 878     reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t));
 879     for(i=0; i<256; i++)
 880             c->yHistogram[i]= width*height/64*15/256;
 881
 882     for(i=0; i<3; i++){
 883         //Note: The +17*1024 is just there so I do not have to worry about r/w over the end.
 884         reallocAlign((void **)&c->tempBlurred[i], 8, stride*mbHeight*16 + 17*1024);
 885         reallocAlign((void **)&c->tempBlurredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
 886     }
 887
 888     reallocAlign((void **)&c->deintTemp, 8, 2*width+32);
 889     reallocAlign((void **)&c->nonBQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
 890     reallocAlign((void **)&c->stdQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
 891     reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T));
 892 }
 893
 894 static const char * context_to_name(void * ptr) {
 895     return "postproc";
 896 }
 897
 898 static const AVClass av_codec_context_class = { "Postproc", context_to_name, NULL };
 899
 900 pp_context *pp_get_context(int width, int height, int cpuCaps){
 901     PPContext *c= av_malloc(sizeof(PPContext));
 902     int stride= FFALIGN(width, 16);  //assumed / will realloc if needed
 903     int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
 904
 905     memset(c, 0, sizeof(PPContext));
 906     c->av_class = &av_codec_context_class;
 907     if(cpuCaps&PP_FORMAT){
 908         c->hChromaSubSample= cpuCaps&0x3;
 909         c->vChromaSubSample= (cpuCaps>>4)&0x3;
 910     }else{
 911         c->hChromaSubSample= 1;
 912         c->vChromaSubSample= 1;
 913     }
 914     if (cpuCaps & PP_CPU_CAPS_AUTO) {
 915         c->cpuCaps = av_get_cpu_flags();
 916     } else {
 917         c->cpuCaps = 0;
 918         if (cpuCaps & PP_CPU_CAPS_MMX)      c->cpuCaps |= AV_CPU_FLAG_MMX;
 919         if (cpuCaps & PP_CPU_CAPS_MMX2)     c->cpuCaps |= AV_CPU_FLAG_MMXEXT;
 920         if (cpuCaps & PP_CPU_CAPS_3DNOW)    c->cpuCaps |= AV_CPU_FLAG_3DNOW;
 921         if (cpuCaps & PP_CPU_CAPS_ALTIVEC)  c->cpuCaps |= AV_CPU_FLAG_ALTIVEC;
 922     }
 923
 924     reallocBuffers(c, width, height, stride, qpStride);
 925
 926     c->frameNum=-1;
 927
 928     return c;
 929 }
 930
 931 void pp_free_context(void *vc){
 932     PPContext *c = (PPContext*)vc;
 933     int i;
 934
 935     for(i=0; i<3; i++) av_free(c->tempBlurred[i]);
 936     for(i=0; i<3; i++) av_free(c->tempBlurredPast[i]);
 937
 938     av_free(c->tempBlocks);
 939     av_free(c->yHistogram);
 940     av_free(c->tempDst);
 941     av_free(c->tempSrc);
 942     av_free(c->deintTemp);
 943     av_free(c->stdQPTable);
 944     av_free(c->nonBQPTable);
 945     av_free(c->forcedQPTable);
 946
 947     memset(c, 0, sizeof(PPContext));
 948
 949     av_free(c);
 950 }
 951
 952 void  pp_postprocess(const uint8_t * src[3], const int srcStride[3],
 953                      uint8_t * dst[3], const int dstStride[3],
 954                      int width, int height,
 955                      const QP_STORE_T *QP_store,  int QPStride,
 956                      pp_mode *vm,  void *vc, int pict_type)
 957 {
 958     int mbWidth = (width+15)>>4;
 959     int mbHeight= (height+15)>>4;
 960     PPMode *mode = (PPMode*)vm;
 961     PPContext *c = (PPContext*)vc;
 962     int minStride= FFMAX(FFABS(srcStride[0]), FFABS(dstStride[0]));
 963     int absQPStride = FFABS(QPStride);
 964
 965     // c->stride and c->QPStride are always positive
 966     if(c->stride < minStride || c->qpStride < absQPStride)
 967         reallocBuffers(c, width, height,
 968                        FFMAX(minStride, c->stride),
 969                        FFMAX(c->qpStride, absQPStride));
 970
 971     if(QP_store==NULL || (mode->lumMode & FORCE_QUANT)){
 972         int i;
 973         QP_store= c->forcedQPTable;
 974         absQPStride = QPStride = 0;
 975         if(mode->lumMode & FORCE_QUANT)
 976             for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= mode->forcedQuant;
 977         else
 978             for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= 1;
 979     }
 980
 981     if(pict_type & PP_PICT_TYPE_QP2){
 982         int i;
 983         const int count= mbHeight * absQPStride;
 984         for(i=0; i<(count>>2); i++){
 985             ((uint32_t*)c->stdQPTable)[i] = (((const uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F;
 986         }
 987         for(i<<=2; i<count; i++){
 988             c->stdQPTable[i] = QP_store[i]>>1;
 989         }
 990         QP_store= c->stdQPTable;
 991         QPStride= absQPStride;
 992     }
 993
 994     if(0){
 995         int x,y;
 996         for(y=0; y<mbHeight; y++){
 997             for(x=0; x<mbWidth; x++){
 998                 av_log(c, AV_LOG_INFO, "%2d ", QP_store[x + y*QPStride]);
 999             }
1000             av_log(c, AV_LOG_INFO, "\n");
1001         }
1002         av_log(c, AV_LOG_INFO, "\n");
1003     }
1004
1005     if((pict_type&7)!=3){
1006         if (QPStride >= 0){
1007             int i;
1008             const int count= mbHeight * QPStride;
1009             for(i=0; i<(count>>2); i++){
1010                 ((uint32_t*)c->nonBQPTable)[i] = ((const uint32_t*)QP_store)[i] & 0x3F3F3F3F;
1011             }
1012             for(i<<=2; i<count; i++){
1013                 c->nonBQPTable[i] = QP_store[i] & 0x3F;
1014             }
1015         } else {
1016             int i,j;
1017             for(i=0; i<mbHeight; i++) {
1018                 for(j=0; j<absQPStride; j++) {
1019                     c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F;
1020                 }
1021             }
1022         }
1023     }
1024
1025     av_log(c, AV_LOG_DEBUG, "using npp filters 0x%X/0x%X\n",
1026            mode->lumMode, mode->chromMode);
1027
1028     postProcess(src[0], srcStride[0], dst[0], dstStride[0],
1029                 width, height, QP_store, QPStride, 0, mode, c);
1030
1031     width  = (width )>>c->hChromaSubSample;
1032     height = (height)>>c->vChromaSubSample;
1033
1034     if(mode->chromMode){
1035         postProcess(src[1], srcStride[1], dst[1], dstStride[1],
1036                     width, height, QP_store, QPStride, 1, mode, c);
1037         postProcess(src[2], srcStride[2], dst[2], dstStride[2],
1038                     width, height, QP_store, QPStride, 2, mode, c);
1039     }
1040     else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2]){
1041         linecpy(dst[1], src[1], height, srcStride[1]);
1042         linecpy(dst[2], src[2], height, srcStride[2]);
1043     }else{
1044         int y;
1045         for(y=0; y<height; y++){
1046             memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
1047             memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
1048         }
1049     }
1050 }