git.sesse.net Git - ffmpeg/blob - libpostproc/postprocess.c

   1 /*
   2  * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
   3  *
   4  * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
   5  *
   6  * This file is part of FFmpeg.
   7  *
   8  * FFmpeg is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * FFmpeg is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License
  19  * along with FFmpeg; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21  */
  22
  23 /**
  24  * @file
  25  * postprocessing.
  26  */
  27
  28 /*
  29                         C       MMX     MMX2    3DNow   AltiVec
  30 isVertDC                Ec      Ec                      Ec
  31 isVertMinMaxOk          Ec      Ec                      Ec
  32 doVertLowPass           E               e       e       Ec
  33 doVertDefFilter         Ec      Ec      e       e       Ec
  34 isHorizDC               Ec      Ec                      Ec
  35 isHorizMinMaxOk         a       E                       Ec
  36 doHorizLowPass          E               e       e       Ec
  37 doHorizDefFilter        Ec      Ec      e       e       Ec
  38 do_a_deblock            Ec      E       Ec      E
  39 deRing                  E               e       e*      Ecp
  40 Vertical RKAlgo1        E               a       a
  41 Horizontal RKAlgo1                      a       a
  42 Vertical X1#            a               E       E
  43 Horizontal X1#          a               E       E
  44 LinIpolDeinterlace      e               E       E*
  45 CubicIpolDeinterlace    a               e       e*
  46 LinBlendDeinterlace     e               E       E*
  47 MedianDeinterlace#      E       Ec      Ec
  48 TempDeNoiser#           E               e       e       Ec
  49
  50 * I do not have a 3DNow! CPU -> it is untested, but no one said it does not work so it seems to work
  51 # more or less self-invented filters, so the exactness is not too meaningful
  52 E = Exact implementation
  53 e = almost exact implementation (slightly different rounding,...)
  54 a = alternative / approximate impl
  55 c = checked against the other implementations (-vo md5)
  56 p = partially optimized, still some work to do
  57 */
  58
  59 /*
  60 TODO:
  61 reduce the time wasted on the mem transfer
  62 unroll stuff if instructions depend too much on the prior one
  63 move YScale thing to the end instead of fixing QP
  64 write a faster and higher quality deblocking filter :)
  65 make the main loop more flexible (variable number of blocks at once;
  66         the if/else stuff per block is slowing things down)
  67 compare the quality & speed of all filters
  68 split this huge file
  69 optimize c versions
  70 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
  71 ...
  72 */
  73
  74 //Changelog: use git log
  75
  76 #include "config.h"
  77 #include "libavutil/avutil.h"
  78 #include "libavutil/avassert.h"
  79 #include "libavutil/intreadwrite.h"
  80 #include <inttypes.h>
  81 #include <stdio.h>
  82 #include <stdlib.h>
  83 #include <string.h>
  84 //#undef HAVE_MMXEXT_INLINE
  85 //#define HAVE_AMD3DNOW_INLINE
  86 //#undef HAVE_MMX_INLINE
  87 //#undef ARCH_X86
  88 //#define DEBUG_BRIGHTNESS
  89 #include "postprocess.h"
  90 #include "postprocess_internal.h"
  91 #include "libavutil/avstring.h"
  92 #include "libavutil/ppc/util_altivec.h"
  93
  94 #include "libavutil/ffversion.h"
  95 const char postproc_ffversion[] = "FFmpeg version " FFMPEG_VERSION;
  96
  97 unsigned postproc_version(void)
  98 {
  99     av_assert0(LIBPOSTPROC_VERSION_MICRO >= 100);
 100     return LIBPOSTPROC_VERSION_INT;
 101 }
 102
/**
 * Return the FFMPEG_CONFIGURATION string this file was compiled with.
 */
const char *postproc_configuration(void)
{
    return FFMPEG_CONFIGURATION;
}
 107
/**
 * Return the license string (FFMPEG_LICENSE).
 *
 * The literal that is compiled in is LICENSE_PREFIX followed by
 * FFMPEG_LICENSE; the returned pointer is offset by the length of the prefix,
 * so callers see only the FFMPEG_LICENSE part.
 */
const char *postproc_license(void)
{
#define LICENSE_PREFIX "libpostproc license: "
    /* sizeof(LICENSE_PREFIX) - 1 is the prefix length without its NUL */
    return &LICENSE_PREFIX FFMPEG_LICENSE[sizeof(LICENSE_PREFIX) - 1];
}
 113
 114 #define GET_MODE_BUFFER_SIZE 500
 115 #define OPTIONS_ARRAY_SIZE 10
 116 #define BLOCK_SIZE 8
 117 #define TEMP_STRIDE 8
 118 //#define NUM_BLOCKS_AT_ONCE 16 //not used yet
 119
#if ARCH_X86 && HAVE_INLINE_ASM
/* 64-bit constants, only declared for x86 builds with inline asm.
 * wNN repeats the value 0x00NN in each of the four 16-bit words,
 * bNN repeats the value 0xNN in each of the eight bytes. */
DECLARE_ASM_CONST(8, uint64_t, w05)= 0x0005000500050005LL;
DECLARE_ASM_CONST(8, uint64_t, w04)= 0x0004000400040004LL;
DECLARE_ASM_CONST(8, uint64_t, w20)= 0x0020002000200020LL;
DECLARE_ASM_CONST(8, uint64_t, b00)= 0x0000000000000000LL;
DECLARE_ASM_CONST(8, uint64_t, b01)= 0x0101010101010101LL;
DECLARE_ASM_CONST(8, uint64_t, b02)= 0x0202020202020202LL;
DECLARE_ASM_CONST(8, uint64_t, b08)= 0x0808080808080808LL;
DECLARE_ASM_CONST(8, uint64_t, b80)= 0x8080808080808080LL;
#endif

/* Declared unconditionally (outside the x86 guard above). */
DECLARE_ASM_CONST(8, int, deringThreshold)= 20;

 132
 133
/**
 * Table of the filters that can be named in a mode string.
 *
 * pp_get_mode_by_name_and_quality() walks this table until it reaches the
 * entry whose shortName is NULL and matches a filter name against both
 * shortName and longName with strcmp().
 *
 * The initializers are positional; struct PPFilter is declared elsewhere, so
 * the field order is presumably shortName, longName, chromDefault,
 * minLumQuality, minChromQuality, mask -- confirm against the declaration.
 */
static const struct PPFilter filters[]=
{
    {"hb", "hdeblock",              1, 1, 3, H_DEBLOCK},
    {"vb", "vdeblock",              1, 2, 4, V_DEBLOCK},
/*  {"hr", "rkhdeblock",            1, 1, 3, H_RK1_FILTER},
    {"vr", "rkvdeblock",            1, 2, 4, V_RK1_FILTER},*/
    {"h1", "x1hdeblock",            1, 1, 3, H_X1_FILTER},
    {"v1", "x1vdeblock",            1, 2, 4, V_X1_FILTER},
    {"ha", "ahdeblock",             1, 1, 3, H_A_DEBLOCK},
    {"va", "avdeblock",             1, 2, 4, V_A_DEBLOCK},
    {"dr", "dering",                1, 5, 6, DERING},
    {"al", "autolevels",            0, 1, 2, LEVEL_FIX},
    {"lb", "linblenddeint",         1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
    {"li", "linipoldeint",          1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
    {"ci", "cubicipoldeint",        1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
    {"md", "mediandeint",           1, 1, 4, MEDIAN_DEINT_FILTER},
    {"fd", "ffmpegdeint",           1, 1, 4, FFMPEG_DEINT_FILTER},
    {"l5", "lowpass5",              1, 1, 4, LOWPASS5_DEINT_FILTER},
    {"tn", "tmpnoise",              1, 7, 8, TEMP_NOISE_FILTER},
    {"fq", "forcequant",            1, 0, 0, FORCE_QUANT},
    {"be", "bitexact",              1, 0, 0, BITEXACT},
    {"vi", "visualize",             1, 0, 0, VISUALIZE},
    {NULL, NULL,0,0,0,0} //End Marker
};
 158
/**
 * Alias table, stored as consecutive (name, expansion) string pairs and
 * terminated by a single NULL.
 *
 * When a filter name in the mode string equals an alias name,
 * pp_get_mode_by_name_and_quality() splices the expansion into the string
 * being parsed.
 */
static const char * const replaceTable[]=
{
    "default",      "hb:a,vb:a,dr:a",
    "de",           "hb:a,vb:a,dr:a",
    "fast",         "h1:a,v1:a,dr:a",
    "fa",           "h1:a,v1:a,dr:a",
    "ac",           "ha:a:128:7,va:a,dr:a",
    NULL //End Marker
};
 168
 169 /* The horizontal functions exist only in C because the MMX
 170  * code is faster with vertical filters and transposing. */
 171
 172 /**
 173  * Check if the given 8x8 Block is mostly "flat"
 174  */
 175 static inline int isHorizDC_C(const uint8_t src[], int stride, const PPContext *c)
 176 {
 177     int numEq= 0;
 178     int y;
 179     const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
 180     const int dcThreshold= dcOffset*2 + 1;
 181
 182     for(y=0; y<BLOCK_SIZE; y++){
 183         numEq += ((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold;
 184         numEq += ((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold;
 185         numEq += ((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold;
 186         numEq += ((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold;
 187         numEq += ((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold;
 188         numEq += ((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold;
 189         numEq += ((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold;
 190         src+= stride;
 191     }
 192     return numEq > c->ppMode.flatnessThreshold;
 193 }
 194
 195 /**
 196  * Check if the middle 8x8 Block in the given 8x16 block is flat
 197  */
 198 static inline int isVertDC_C(const uint8_t src[], int stride, const PPContext *c)
 199 {
 200     int numEq= 0;
 201     int y;
 202     const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
 203     const int dcThreshold= dcOffset*2 + 1;
 204
 205     src+= stride*4; // src points to begin of the 8x8 Block
 206     for(y=0; y<BLOCK_SIZE-1; y++){
 207         numEq += ((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold;
 208         numEq += ((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold;
 209         numEq += ((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold;
 210         numEq += ((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold;
 211         numEq += ((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold;
 212         numEq += ((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold;
 213         numEq += ((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold;
 214         numEq += ((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold;
 215         src+= stride;
 216     }
 217     return numEq > c->ppMode.flatnessThreshold;
 218 }
 219
 220 static inline int isHorizMinMaxOk_C(const uint8_t src[], int stride, int QP)
 221 {
 222     int i;
 223     for(i=0; i<2; i++){
 224         if((unsigned)(src[0] - src[5] + 2*QP) > 4*QP) return 0;
 225         src += stride;
 226         if((unsigned)(src[2] - src[7] + 2*QP) > 4*QP) return 0;
 227         src += stride;
 228         if((unsigned)(src[4] - src[1] + 2*QP) > 4*QP) return 0;
 229         src += stride;
 230         if((unsigned)(src[6] - src[3] + 2*QP) > 4*QP) return 0;
 231         src += stride;
 232     }
 233     return 1;
 234 }
 235
 236 static inline int isVertMinMaxOk_C(const uint8_t src[], int stride, int QP)
 237 {
 238     int x;
 239     src+= stride*4;
 240     for(x=0; x<BLOCK_SIZE; x+=4){
 241         if((unsigned)(src[  x + 0*stride] - src[  x + 5*stride] + 2*QP) > 4*QP) return 0;
 242         if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
 243         if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
 244         if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
 245     }
 246     return 1;
 247 }
 248
 249 static inline int horizClassify_C(const uint8_t src[], int stride, const PPContext *c)
 250 {
 251     if( isHorizDC_C(src, stride, c) ){
 252         return isHorizMinMaxOk_C(src, stride, c->QP);
 253     }else{
 254         return 2;
 255     }
 256 }
 257
 258 static inline int vertClassify_C(const uint8_t src[], int stride, const PPContext *c)
 259 {
 260     if( isVertDC_C(src, stride, c) ){
 261         return isVertMinMaxOk_C(src, stride, c->QP);
 262     }else{
 263         return 2;
 264     }
 265 }
 266
 267 static inline void doHorizDefFilter_C(uint8_t dst[], int stride, const PPContext *c)
 268 {
 269     int y;
 270     for(y=0; y<BLOCK_SIZE; y++){
 271         const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
 272
 273         if(FFABS(middleEnergy) < 8*c->QP){
 274             const int q=(dst[3] - dst[4])/2;
 275             const int leftEnergy=  5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
 276             const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
 277
 278             int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
 279             d= FFMAX(d, 0);
 280
 281             d= (5*d + 32) >> 6;
 282             d*= FFSIGN(-middleEnergy);
 283
 284             if(q>0)
 285             {
 286                 d = FFMAX(d, 0);
 287                 d = FFMIN(d, q);
 288             }
 289             else
 290             {
 291                 d = FFMIN(d, 0);
 292                 d = FFMAX(d, q);
 293             }
 294
 295             dst[3]-= d;
 296             dst[4]+= d;
 297         }
 298         dst+= stride;
 299     }
 300 }
 301
 302 /**
 303  * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
 304  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
 305  */
 306 static inline void doHorizLowPass_C(uint8_t dst[], int stride, const PPContext *c)
 307 {
 308     int y;
 309     for(y=0; y<BLOCK_SIZE; y++){
 310         const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
 311         const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
 312
 313         int sums[10];
 314         sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
 315         sums[1] = sums[0] - first  + dst[3];
 316         sums[2] = sums[1] - first  + dst[4];
 317         sums[3] = sums[2] - first  + dst[5];
 318         sums[4] = sums[3] - first  + dst[6];
 319         sums[5] = sums[4] - dst[0] + dst[7];
 320         sums[6] = sums[5] - dst[1] + last;
 321         sums[7] = sums[6] - dst[2] + last;
 322         sums[8] = sums[7] - dst[3] + last;
 323         sums[9] = sums[8] - dst[4] + last;
 324
 325         dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
 326         dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
 327         dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
 328         dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
 329         dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
 330         dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
 331         dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
 332         dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
 333
 334         dst+= stride;
 335     }
 336 }
 337
 338 /**
 339  * Experimental Filter 1 (Horizontal)
 340  * will not damage linear gradients
 341  * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
 342  * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
 343  * MMX2 version does correct clipping C version does not
 344  * not identical with the vertical one
 345  */
 346 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
 347 {
 348     int y;
 349     static uint64_t lut[256];
 350     if(!lut[255])
 351     {
 352         int i;
 353         for(i=0; i<256; i++)
 354         {
 355             int v= i < 128 ? 2*i : 2*(i-256);
 356 /*
 357 //Simulate 112242211 9-Tap filter
 358             uint64_t a= (v/16)  & 0xFF;
 359             uint64_t b= (v/8)   & 0xFF;
 360             uint64_t c= (v/4)   & 0xFF;
 361             uint64_t d= (3*v/8) & 0xFF;
 362 */
 363 //Simulate piecewise linear interpolation
 364             uint64_t a= (v/16)   & 0xFF;
 365             uint64_t b= (v*3/16) & 0xFF;
 366             uint64_t c= (v*5/16) & 0xFF;
 367             uint64_t d= (7*v/16) & 0xFF;
 368             uint64_t A= (0x100 - a)&0xFF;
 369             uint64_t B= (0x100 - b)&0xFF;
 370             uint64_t C= (0x100 - c)&0xFF;
 371             uint64_t D= (0x100 - c)&0xFF;
 372
 373             lut[i]   = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
 374                        (D<<24) | (C<<16) | (B<<8)  | (A);
 375             //lut[i] = (v<<32) | (v<<24);
 376         }
 377     }
 378
 379     for(y=0; y<BLOCK_SIZE; y++){
 380         int a= src[1] - src[2];
 381         int b= src[3] - src[4];
 382         int c= src[5] - src[6];
 383
 384         int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0);
 385
 386         if(d < QP){
 387             int v = d * FFSIGN(-b);
 388
 389             src[1] +=v/8;
 390             src[2] +=v/4;
 391             src[3] +=3*v/8;
 392             src[4] -=3*v/8;
 393             src[5] -=v/4;
 394             src[6] -=v/8;
 395         }
 396         src+=stride;
 397     }
 398 }
 399
 400 /**
 401  * accurate deblock filter
 402  */
 403 static av_always_inline void do_a_deblock_C(uint8_t *src, int step,
 404                                             int stride, const PPContext *c, int mode)
 405 {
 406     int y;
 407     const int QP= c->QP;
 408     const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
 409     const int dcThreshold= dcOffset*2 + 1;
 410
 411     src+= step*4; // src points to begin of the 8x8 Block
 412     for(y=0; y<8; y++){
 413         int numEq= 0;
 414
 415         numEq += ((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold;
 416         numEq += ((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold;
 417         numEq += ((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold;
 418         numEq += ((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold;
 419         numEq += ((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold;
 420         numEq += ((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold;
 421         numEq += ((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold;
 422         numEq += ((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold;
 423         numEq += ((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold;
 424         if(numEq > c->ppMode.flatnessThreshold){
 425             int min, max, x;
 426
 427             if(src[0] > src[step]){
 428                 max= src[0];
 429                 min= src[step];
 430             }else{
 431                 max= src[step];
 432                 min= src[0];
 433             }
 434             for(x=2; x<8; x+=2){
 435                 if(src[x*step] > src[(x+1)*step]){
 436                         if(src[x    *step] > max) max= src[ x   *step];
 437                         if(src[(x+1)*step] < min) min= src[(x+1)*step];
 438                 }else{
 439                         if(src[(x+1)*step] > max) max= src[(x+1)*step];
 440                         if(src[ x   *step] < min) min= src[ x   *step];
 441                 }
 442             }
 443             if(max-min < 2*QP){
 444                 const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
 445                 const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
 446
 447                 int sums[10];
 448                 sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
 449                 sums[1] = sums[0] - first       + src[3*step];
 450                 sums[2] = sums[1] - first       + src[4*step];
 451                 sums[3] = sums[2] - first       + src[5*step];
 452                 sums[4] = sums[3] - first       + src[6*step];
 453                 sums[5] = sums[4] - src[0*step] + src[7*step];
 454                 sums[6] = sums[5] - src[1*step] + last;
 455                 sums[7] = sums[6] - src[2*step] + last;
 456                 sums[8] = sums[7] - src[3*step] + last;
 457                 sums[9] = sums[8] - src[4*step] + last;
 458
 459                 if (mode & VISUALIZE) {
 460                     src[0*step] =
 461                     src[1*step] =
 462                     src[2*step] =
 463                     src[3*step] =
 464                     src[4*step] =
 465                     src[5*step] =
 466                     src[6*step] =
 467                     src[7*step] = 128;
 468                 }
 469                 src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
 470                 src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
 471                 src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
 472                 src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
 473                 src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
 474                 src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
 475                 src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
 476                 src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
 477             }
 478         }else{
 479             const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
 480
 481             if(FFABS(middleEnergy) < 8*QP){
 482                 const int q=(src[3*step] - src[4*step])/2;
 483                 const int leftEnergy=  5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
 484                 const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
 485
 486                 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
 487                 d= FFMAX(d, 0);
 488
 489                 d= (5*d + 32) >> 6;
 490                 d*= FFSIGN(-middleEnergy);
 491
 492                 if(q>0){
 493                     d = FFMAX(d, 0);
 494                     d = FFMIN(d, q);
 495                 }else{
 496                     d = FFMIN(d, 0);
 497                     d = FFMAX(d, q);
 498                 }
 499
 500                 if ((mode & VISUALIZE) && d) {
 501                     d= (d < 0) ? 32 : -32;
 502                     src[3*step]= av_clip_uint8(src[3*step] - d);
 503                     src[4*step]= av_clip_uint8(src[4*step] + d);
 504                     d = 0;
 505                 }
 506
 507                 src[3*step]-= d;
 508                 src[4*step]+= d;
 509             }
 510         }
 511
 512         src += stride;
 513     }
 514 }
 515
 516 //Note: we have C, MMX, MMX2 and 3DNOW versions; there is no 3DNOW+MMX2 one
 517 //Plain C versions
 518 //we always compile C for testing which needs bitexactness
 519 #define TEMPLATE_PP_C 1
 520 #include "postprocess_template.c"
 521
 522 #if HAVE_ALTIVEC
 523 #   define TEMPLATE_PP_ALTIVEC 1
 524 #   include "postprocess_altivec_template.c"
 525 #   include "postprocess_template.c"
 526 #endif
 527
 528 #if ARCH_X86 && HAVE_INLINE_ASM
 529 #    if CONFIG_RUNTIME_CPUDETECT
 530 #        define TEMPLATE_PP_MMX 1
 531 #        include "postprocess_template.c"
 532 #        define TEMPLATE_PP_MMXEXT 1
 533 #        include "postprocess_template.c"
 534 #        define TEMPLATE_PP_3DNOW 1
 535 #        include "postprocess_template.c"
 536 #        define TEMPLATE_PP_SSE2 1
 537 #        include "postprocess_template.c"
 538 #    else
 539 #        if HAVE_SSE2_INLINE
 540 #            define TEMPLATE_PP_SSE2 1
 541 #            include "postprocess_template.c"
 542 #        elif HAVE_MMXEXT_INLINE
 543 #            define TEMPLATE_PP_MMXEXT 1
 544 #            include "postprocess_template.c"
 545 #        elif HAVE_AMD3DNOW_INLINE
 546 #            define TEMPLATE_PP_3DNOW 1
 547 #            include "postprocess_template.c"
 548 #        elif HAVE_MMX_INLINE
 549 #            define TEMPLATE_PP_MMX 1
 550 #            include "postprocess_template.c"
 551 #        endif
 552 #    endif
 553 #endif
 554
 555 typedef void (*pp_fn)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
 556                       const int8_t QPs[], int QPStride, int isColor, PPContext *c2);
 557
 558 static inline void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
 559         const int8_t QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc)
 560 {
 561     pp_fn pp = postProcess_C;
 562     PPContext *c= (PPContext *)vc;
 563     PPMode *ppMode= (PPMode *)vm;
 564     c->ppMode= *ppMode; //FIXME
 565
 566     if (!(ppMode->lumMode & BITEXACT)) {
 567 #if CONFIG_RUNTIME_CPUDETECT
 568 #if ARCH_X86 && HAVE_INLINE_ASM
 569         // ordered per speed fastest first
 570         if      (c->cpuCaps & AV_CPU_FLAG_SSE2)     pp = postProcess_SSE2;
 571         else if (c->cpuCaps & AV_CPU_FLAG_MMXEXT)   pp = postProcess_MMX2;
 572         else if (c->cpuCaps & AV_CPU_FLAG_3DNOW)    pp = postProcess_3DNow;
 573         else if (c->cpuCaps & AV_CPU_FLAG_MMX)      pp = postProcess_MMX;
 574 #elif HAVE_ALTIVEC
 575         if      (c->cpuCaps & AV_CPU_FLAG_ALTIVEC)  pp = postProcess_altivec;
 576 #endif
 577 #else /* CONFIG_RUNTIME_CPUDETECT */
 578 #if     HAVE_SSE2_INLINE
 579         pp = postProcess_SSE2;
 580 #elif   HAVE_MMXEXT_INLINE
 581         pp = postProcess_MMX2;
 582 #elif HAVE_AMD3DNOW_INLINE
 583         pp = postProcess_3DNow;
 584 #elif HAVE_MMX_INLINE
 585         pp = postProcess_MMX;
 586 #elif HAVE_ALTIVEC
 587         pp = postProcess_altivec;
 588 #endif
 589 #endif /* !CONFIG_RUNTIME_CPUDETECT */
 590     }
 591
 592     pp(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
 593 }
 594
 595 /* -pp Command line Help
 596 */
 597 const char pp_help[] =
 598 "Available postprocessing filters:\n"
 599 "Filters                        Options\n"
 600 "short  long name       short   long option     Description\n"
 601 "*      *               a       autoq           CPU power dependent enabler\n"
 602 "                       c       chrom           chrominance filtering enabled\n"
 603 "                       y       nochrom         chrominance filtering disabled\n"
 604 "                       n       noluma          luma filtering disabled\n"
 605 "hb     hdeblock        (2 threshold)           horizontal deblocking filter\n"
 606 "       1. difference factor: default=32, higher -> more deblocking\n"
 607 "       2. flatness threshold: default=39, lower -> more deblocking\n"
 608 "                       the h & v deblocking filters share these\n"
 609 "                       so you can't set different thresholds for h / v\n"
 610 "vb     vdeblock        (2 threshold)           vertical deblocking filter\n"
 611 "ha     hadeblock       (2 threshold)           horizontal deblocking filter\n"
 612 "va     vadeblock       (2 threshold)           vertical deblocking filter\n"
 613 "h1     x1hdeblock                              experimental h deblock filter 1\n"
 614 "v1     x1vdeblock                              experimental v deblock filter 1\n"
 615 "dr     dering                                  deringing filter\n"
 616 "al     autolevels                              automatic brightness / contrast\n"
 617 "                       f        fullyrange     stretch luminance to (0..255)\n"
 618 "lb     linblenddeint                           linear blend deinterlacer\n"
 619 "li     linipoldeint                            linear interpolating deinterlace\n"
 620 "ci     cubicipoldeint                          cubic interpolating deinterlacer\n"
 621 "md     mediandeint                             median deinterlacer\n"
 622 "fd     ffmpegdeint                             ffmpeg deinterlacer\n"
 623 "l5     lowpass5                                FIR lowpass deinterlacer\n"
 624 "de     default                                 hb:a,vb:a,dr:a\n"
 625 "fa     fast                                    h1:a,v1:a,dr:a\n"
 626 "ac                                             ha:a:128:7,va:a,dr:a\n"
 627 "tn     tmpnoise        (3 threshold)           temporal noise reducer\n"
 628 "                     1. <= 2. <= 3.            larger -> stronger filtering\n"
 629 "fq     forceQuant      <quantizer>             force quantizer\n"
 630 "Usage:\n"
 631 "<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
 632 "long form example:\n"
 633 "vdeblock:autoq/hdeblock:autoq/linblenddeint    default,-vdeblock\n"
 634 "short form example:\n"
 635 "vb:a/hb:a/lb                                   de,-vb\n"
 636 "more examples:\n"
 637 "tn:64:128:256\n"
 638 "\n"
 639 ;
 640
 641 pp_mode *pp_get_mode_by_name_and_quality(const char *name, int quality)
 642 {
 643     char temp[GET_MODE_BUFFER_SIZE];
 644     char *p= temp;
 645     static const char filterDelimiters[] = ",/";
 646     static const char optionDelimiters[] = ":|";
 647     struct PPMode *ppMode;
 648     char *filterToken;
 649
 650     if (!name)  {
 651         av_log(NULL, AV_LOG_ERROR, "pp: Missing argument\n");
 652         return NULL;
 653     }
 654
 655     if (!strcmp(name, "help")) {
 656         const char *p;
 657         for (p = pp_help; strchr(p, '\n'); p = strchr(p, '\n') + 1) {
 658             av_strlcpy(temp, p, FFMIN(sizeof(temp), strchr(p, '\n') - p + 2));
 659             av_log(NULL, AV_LOG_INFO, "%s", temp);
 660         }
 661         return NULL;
 662     }
 663
 664     ppMode= av_malloc(sizeof(PPMode));
 665     if (!ppMode)
 666         return NULL;
 667
 668     ppMode->lumMode= 0;
 669     ppMode->chromMode= 0;
 670     ppMode->maxTmpNoise[0]= 700;
 671     ppMode->maxTmpNoise[1]= 1500;
 672     ppMode->maxTmpNoise[2]= 3000;
 673     ppMode->maxAllowedY= 234;
 674     ppMode->minAllowedY= 16;
 675     ppMode->baseDcDiff= 256/8;
 676     ppMode->flatnessThreshold= 56-16-1;
 677     ppMode->maxClippedThreshold= (AVRational){1,100};
 678     ppMode->error=0;
 679
 680     memset(temp, 0, GET_MODE_BUFFER_SIZE);
 681     av_strlcpy(temp, name, GET_MODE_BUFFER_SIZE - 1);
 682
 683     av_log(NULL, AV_LOG_DEBUG, "pp: %s\n", name);
 684
 685     for(;;){
 686         const char *filterName;
 687         int q= 1000000; //PP_QUALITY_MAX;
 688         int chrom=-1;
 689         int luma=-1;
 690         const char *option;
 691         const char *options[OPTIONS_ARRAY_SIZE];
 692         int i;
 693         int filterNameOk=0;
 694         int numOfUnknownOptions=0;
 695         int enable=1; //does the user want us to enabled or disabled the filter
 696         char *tokstate;
 697
 698         filterToken= av_strtok(p, filterDelimiters, &tokstate);
 699         if(!filterToken) break;
 700         p+= strlen(filterToken) + 1; // p points to next filterToken
 701         filterName= av_strtok(filterToken, optionDelimiters, &tokstate);
 702         if (!filterName) {
 703             ppMode->error++;
 704             break;
 705         }
 706         av_log(NULL, AV_LOG_DEBUG, "pp: %s::%s\n", filterToken, filterName);
 707
 708         if(*filterName == '-'){
 709             enable=0;
 710             filterName++;
 711         }
 712
 713         for(;;){ //for all options
 714             option= av_strtok(NULL, optionDelimiters, &tokstate);
 715             if(!option) break;
 716
 717             av_log(NULL, AV_LOG_DEBUG, "pp: option: %s\n", option);
 718             if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
 719             else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
 720             else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
 721             else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
 722             else{
 723                 options[numOfUnknownOptions] = option;
 724                 numOfUnknownOptions++;
 725             }
 726             if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
 727         }
 728         options[numOfUnknownOptions] = NULL;
 729
 730         /* replace stuff from the replace Table */
 731         for(i=0; replaceTable[2*i]; i++){
 732             if(!strcmp(replaceTable[2*i], filterName)){
 733                 size_t newlen = strlen(replaceTable[2*i + 1]);
 734                 int plen;
 735                 int spaceLeft;
 736
 737                 p--, *p=',';
 738
 739                 plen= strlen(p);
 740                 spaceLeft= p - temp + plen;
 741                 if(spaceLeft + newlen  >= GET_MODE_BUFFER_SIZE - 1){
 742                     ppMode->error++;
 743                     break;
 744                 }
 745                 memmove(p + newlen, p, plen+1);
 746                 memcpy(p, replaceTable[2*i + 1], newlen);
 747                 filterNameOk=1;
 748             }
 749         }
 750
 751         for(i=0; filters[i].shortName; i++){
 752             if(   !strcmp(filters[i].longName, filterName)
 753                || !strcmp(filters[i].shortName, filterName)){
 754                 ppMode->lumMode &= ~filters[i].mask;
 755                 ppMode->chromMode &= ~filters[i].mask;
 756
 757                 filterNameOk=1;
 758                 if(!enable) break; // user wants to disable it
 759
 760                 if(q >= filters[i].minLumQuality && luma)
 761                     ppMode->lumMode|= filters[i].mask;
 762                 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
 763                     if(q >= filters[i].minChromQuality)
 764                             ppMode->chromMode|= filters[i].mask;
 765
 766                 if(filters[i].mask == LEVEL_FIX){
 767                     int o;
 768                     ppMode->minAllowedY= 16;
 769                     ppMode->maxAllowedY= 234;
 770                     for(o=0; options[o]; o++){
 771                         if(  !strcmp(options[o],"fullyrange")
 772                            ||!strcmp(options[o],"f")){
 773                             ppMode->minAllowedY= 0;
 774                             ppMode->maxAllowedY= 255;
 775                             numOfUnknownOptions--;
 776                         }
 777                     }
 778                 }
 779                 else if(filters[i].mask == TEMP_NOISE_FILTER)
 780                 {
 781                     int o;
 782                     int numOfNoises=0;
 783
 784                     for(o=0; options[o]; o++){
 785                         char *tail;
 786                         ppMode->maxTmpNoise[numOfNoises]=
 787                             strtol(options[o], &tail, 0);
 788                         if(tail!=options[o]){
 789                             numOfNoises++;
 790                             numOfUnknownOptions--;
 791                             if(numOfNoises >= 3) break;
 792                         }
 793                     }
 794                 }
 795                 else if(filters[i].mask == V_DEBLOCK   || filters[i].mask == H_DEBLOCK
 796                      || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK){
 797                     int o;
 798
 799                     for(o=0; options[o] && o<2; o++){
 800                         char *tail;
 801                         int val= strtol(options[o], &tail, 0);
 802                         if(tail==options[o]) break;
 803
 804                         numOfUnknownOptions--;
 805                         if(o==0) ppMode->baseDcDiff= val;
 806                         else ppMode->flatnessThreshold= val;
 807                     }
 808                 }
 809                 else if(filters[i].mask == FORCE_QUANT){
 810                     int o;
 811                     ppMode->forcedQuant= 15;
 812
 813                     for(o=0; options[o] && o<1; o++){
 814                         char *tail;
 815                         int val= strtol(options[o], &tail, 0);
 816                         if(tail==options[o]) break;
 817
 818                         numOfUnknownOptions--;
 819                         ppMode->forcedQuant= val;
 820                     }
 821                 }
 822             }
 823         }
 824         if(!filterNameOk) ppMode->error++;
 825         ppMode->error += numOfUnknownOptions;
 826     }
 827
 828     av_log(NULL, AV_LOG_DEBUG, "pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
 829     if(ppMode->error){
 830         av_log(NULL, AV_LOG_ERROR, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
 831         av_free(ppMode);
 832         return NULL;
 833     }
 834     return ppMode;
 835 }
 836
 837 void pp_free_mode(pp_mode *mode){
 838     av_free(mode);
 839 }
 840
 841 static void reallocAlign(void **p, int size){
 842     av_free(*p);
 843     *p= av_mallocz(size);
 844 }
 845
 846 static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
 847     int mbWidth = (width+15)>>4;
 848     int mbHeight= (height+15)>>4;
 849     int i;
 850
 851     c->stride= stride;
 852     c->qpStride= qpStride;
 853
 854     reallocAlign((void **)&c->tempDst, stride*24+32);
 855     reallocAlign((void **)&c->tempSrc, stride*24);
 856     reallocAlign((void **)&c->tempBlocks, 2*16*8);
 857     reallocAlign((void **)&c->yHistogram, 256*sizeof(uint64_t));
 858     for(i=0; i<256; i++)
 859             c->yHistogram[i]= width*height/64*15/256;
 860
 861     for(i=0; i<3; i++){
 862         //Note: The +17*1024 is just there so I do not have to worry about r/w over the end.
 863         reallocAlign((void **)&c->tempBlurred[i], stride*mbHeight*16 + 17*1024);
 864         reallocAlign((void **)&c->tempBlurredPast[i], 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
 865     }
 866
 867     reallocAlign((void **)&c->deintTemp, 2*width+32);
 868     reallocAlign((void **)&c->nonBQPTable, qpStride*mbHeight*sizeof(int8_t));
 869     reallocAlign((void **)&c->stdQPTable, qpStride*mbHeight*sizeof(int8_t));
 870     reallocAlign((void **)&c->forcedQPTable, mbWidth*sizeof(int8_t));
 871 }
 872
 873 static const char * context_to_name(void * ptr) {
 874     return "postproc";
 875 }
 876
 877 static const AVClass av_codec_context_class = { "Postproc", context_to_name, NULL };
 878
 879 av_cold pp_context *pp_get_context(int width, int height, int cpuCaps){
 880     PPContext *c= av_mallocz(sizeof(PPContext));
 881     int stride= FFALIGN(width, 16);  //assumed / will realloc if needed
 882     int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
 883
 884     if (!c)
 885         return NULL;
 886
 887     c->av_class = &av_codec_context_class;
 888     if(cpuCaps&PP_FORMAT){
 889         c->hChromaSubSample= cpuCaps&0x3;
 890         c->vChromaSubSample= (cpuCaps>>4)&0x3;
 891     }else{
 892         c->hChromaSubSample= 1;
 893         c->vChromaSubSample= 1;
 894     }
 895     if (cpuCaps & PP_CPU_CAPS_AUTO) {
 896         c->cpuCaps = av_get_cpu_flags();
 897     } else {
 898         c->cpuCaps = 0;
 899         if (cpuCaps & PP_CPU_CAPS_MMX)      c->cpuCaps |= AV_CPU_FLAG_MMX;
 900         if (cpuCaps & PP_CPU_CAPS_MMX2)     c->cpuCaps |= AV_CPU_FLAG_MMXEXT;
 901         if (cpuCaps & PP_CPU_CAPS_3DNOW)    c->cpuCaps |= AV_CPU_FLAG_3DNOW;
 902         if (cpuCaps & PP_CPU_CAPS_ALTIVEC)  c->cpuCaps |= AV_CPU_FLAG_ALTIVEC;
 903     }
 904
 905     reallocBuffers(c, width, height, stride, qpStride);
 906
 907     c->frameNum=-1;
 908
 909     return c;
 910 }
 911
 912 av_cold void pp_free_context(void *vc){
 913     PPContext *c = (PPContext*)vc;
 914     int i;
 915
 916     for(i=0; i<FF_ARRAY_ELEMS(c->tempBlurred); i++)
 917         av_free(c->tempBlurred[i]);
 918     for(i=0; i<FF_ARRAY_ELEMS(c->tempBlurredPast); i++)
 919         av_free(c->tempBlurredPast[i]);
 920
 921     av_free(c->tempBlocks);
 922     av_free(c->yHistogram);
 923     av_free(c->tempDst);
 924     av_free(c->tempSrc);
 925     av_free(c->deintTemp);
 926     av_free(c->stdQPTable);
 927     av_free(c->nonBQPTable);
 928     av_free(c->forcedQPTable);
 929
 930     memset(c, 0, sizeof(PPContext));
 931
 932     av_free(c);
 933 }
 934
 935 void  pp_postprocess(const uint8_t * src[3], const int srcStride[3],
 936                      uint8_t * dst[3], const int dstStride[3],
 937                      int width, int height,
 938                      const int8_t *QP_store,  int QPStride,
 939                      pp_mode *vm,  void *vc, int pict_type)
 940 {
 941     int mbWidth = (width+15)>>4;
 942     int mbHeight= (height+15)>>4;
 943     PPMode *mode = vm;
 944     PPContext *c = vc;
 945     int minStride= FFMAX(FFABS(srcStride[0]), FFABS(dstStride[0]));
 946     int absQPStride = FFABS(QPStride);
 947
 948     // c->stride and c->QPStride are always positive
 949     if(c->stride < minStride || c->qpStride < absQPStride)
 950         reallocBuffers(c, width, height,
 951                        FFMAX(minStride, c->stride),
 952                        FFMAX(c->qpStride, absQPStride));
 953
 954     if(!QP_store || (mode->lumMode & FORCE_QUANT)){
 955         int i;
 956         QP_store= c->forcedQPTable;
 957         absQPStride = QPStride = 0;
 958         if(mode->lumMode & FORCE_QUANT)
 959             for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= mode->forcedQuant;
 960         else
 961             for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= 1;
 962     }
 963
 964     if(pict_type & PP_PICT_TYPE_QP2){
 965         int i;
 966         const int count= FFMAX(mbHeight * absQPStride, mbWidth);
 967         for(i=0; i<(count>>2); i++){
 968             AV_WN32(c->stdQPTable + (i<<2), AV_RN32(QP_store + (i<<2)) >> 1 & 0x7F7F7F7F);
 969         }
 970         for(i<<=2; i<count; i++){
 971             c->stdQPTable[i] = QP_store[i]>>1;
 972         }
 973         QP_store= c->stdQPTable;
 974         QPStride= absQPStride;
 975     }
 976
 977     if(0){
 978         int x,y;
 979         for(y=0; y<mbHeight; y++){
 980             for(x=0; x<mbWidth; x++){
 981                 av_log(c, AV_LOG_INFO, "%2d ", QP_store[x + y*QPStride]);
 982             }
 983             av_log(c, AV_LOG_INFO, "\n");
 984         }
 985         av_log(c, AV_LOG_INFO, "\n");
 986     }
 987
 988     if((pict_type&7)!=3){
 989         if (QPStride >= 0){
 990             int i;
 991             const int count= FFMAX(mbHeight * QPStride, mbWidth);
 992             for(i=0; i<(count>>2); i++){
 993                 AV_WN32(c->nonBQPTable + (i<<2), AV_RN32(QP_store + (i<<2)) & 0x3F3F3F3F);
 994             }
 995             for(i<<=2; i<count; i++){
 996                 c->nonBQPTable[i] = QP_store[i] & 0x3F;
 997             }
 998         } else {
 999             int i,j;
1000             for(i=0; i<mbHeight; i++) {
1001                 for(j=0; j<absQPStride; j++) {
1002                     c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F;
1003                 }
1004             }
1005         }
1006     }
1007
1008     av_log(c, AV_LOG_DEBUG, "using npp filters 0x%X/0x%X\n",
1009            mode->lumMode, mode->chromMode);
1010
1011     postProcess(src[0], srcStride[0], dst[0], dstStride[0],
1012                 width, height, QP_store, QPStride, 0, mode, c);
1013
1014     if (!(src[1] && src[2] && dst[1] && dst[2]))
1015         return;
1016
1017     width  = (width )>>c->hChromaSubSample;
1018     height = (height)>>c->vChromaSubSample;
1019
1020     if(mode->chromMode){
1021         postProcess(src[1], srcStride[1], dst[1], dstStride[1],
1022                     width, height, QP_store, QPStride, 1, mode, c);
1023         postProcess(src[2], srcStride[2], dst[2], dstStride[2],
1024                     width, height, QP_store, QPStride, 2, mode, c);
1025     }
1026     else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2]){
1027         linecpy(dst[1], src[1], height, srcStride[1]);
1028         linecpy(dst[2], src[2], height, srcStride[2]);
1029     }else{
1030         int y;
1031         for(y=0; y<height; y++){
1032             memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
1033             memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
1034         }
1035     }
1036 }