git.sesse.net Git - ffmpeg/blob - libpostproc/postprocess.c

   1 /*
   2  * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
   3  *
   4  * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
   5  *
   6  * This file is part of FFmpeg.
   7  *
   8  * FFmpeg is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * FFmpeg is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License
  19  * along with FFmpeg; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21  */
  22
  23 /**
  24  * @file
  25  * postprocessing.
  26  */
  27
  28 /*
  29                         C       MMX     MMX2    3DNow   AltiVec
  30 isVertDC                Ec      Ec                      Ec
  31 isVertMinMaxOk          Ec      Ec                      Ec
  32 doVertLowPass           E               e       e       Ec
  33 doVertDefFilter         Ec      Ec      e       e       Ec
  34 isHorizDC               Ec      Ec                      Ec
  35 isHorizMinMaxOk         a       E                       Ec
  36 doHorizLowPass          E               e       e       Ec
  37 doHorizDefFilter        Ec      Ec      e       e       Ec
  38 do_a_deblock            Ec      E       Ec      E
  39 deRing                  E               e       e*      Ecp
  40 Vertical RKAlgo1        E               a       a
  41 Horizontal RKAlgo1                      a       a
  42 Vertical X1#            a               E       E
  43 Horizontal X1#          a               E       E
  44 LinIpolDeinterlace      e               E       E*
  45 CubicIpolDeinterlace    a               e       e*
  46 LinBlendDeinterlace     e               E       E*
  47 MedianDeinterlace#      E       Ec      Ec
  48 TempDeNoiser#           E               e       e       Ec
  49
  50 * I do not have a 3DNow! CPU -> it is untested, but no one said it does not work so it seems to work
  51 # more or less selfinvented filters so the exactness is not too meaningful
  52 E = Exact implementation
  53 e = almost exact implementation (slightly different rounding,...)
  54 a = alternative / approximate impl
  55 c = checked against the other implementations (-vo md5)
  56 p = partially optimized, still some work to do
  57 */
  58
  59 /*
  60 TODO:
  61 reduce the time wasted on the mem transfer
  62 unroll stuff if instructions depend too much on the prior one
  63 move YScale thing to the end instead of fixing QP
  64 write a faster and higher quality deblocking filter :)
  65 make the mainloop more flexible (variable number of blocks at once
  66         (the if/else stuff per block is slowing things down)
  67 compare the quality & speed of all filters
  68 split this huge file
  69 optimize c versions
  70 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
  71 ...
  72 */
  73
  74 //Changelog: use git log
  75
  76 #include "config.h"
  77 #include "libavutil/avutil.h"
  78 #include "libavutil/avassert.h"
  79 #include <inttypes.h>
  80 #include <stdio.h>
  81 #include <stdlib.h>
  82 #include <string.h>
  83 //#undef HAVE_MMXEXT_INLINE
  84 //#define HAVE_AMD3DNOW_INLINE
  85 //#undef HAVE_MMX_INLINE
  86 //#undef ARCH_X86
  87 //#define DEBUG_BRIGHTNESS
  88 #include "postprocess.h"
  89 #include "postprocess_internal.h"
  90 #include "libavutil/avstring.h"
  91
  92 #include "libavutil/ffversion.h"
  93 const char postproc_ffversion[] = "FFmpeg version " FFMPEG_VERSION;
  94
  95 unsigned postproc_version(void)
  96 {
  97     av_assert0(LIBPOSTPROC_VERSION_MICRO >= 100);
  98     return LIBPOSTPROC_VERSION_INT;
  99 }
 100
 101 const char *postproc_configuration(void)
 102 {
 103     return FFMPEG_CONFIGURATION;
 104 }
 105
 106 const char *postproc_license(void)
 107 {
 108 #define LICENSE_PREFIX "libpostproc license: "
 109     return LICENSE_PREFIX FFMPEG_LICENSE + sizeof(LICENSE_PREFIX) - 1;
 110 }
 111
 112 #if HAVE_ALTIVEC_H
 113 #include <altivec.h>
 114 #endif
 115
 116 #define GET_MODE_BUFFER_SIZE 500
 117 #define OPTIONS_ARRAY_SIZE 10
 118 #define BLOCK_SIZE 8
 119 #define TEMP_STRIDE 8
 120 //#define NUM_BLOCKS_AT_ONCE 16 //not used yet
 121
 122 #if ARCH_X86 && HAVE_INLINE_ASM
 123 DECLARE_ASM_CONST(8, uint64_t, w05)= 0x0005000500050005LL;
 124 DECLARE_ASM_CONST(8, uint64_t, w04)= 0x0004000400040004LL;
 125 DECLARE_ASM_CONST(8, uint64_t, w20)= 0x0020002000200020LL;
 126 DECLARE_ASM_CONST(8, uint64_t, b00)= 0x0000000000000000LL;
 127 DECLARE_ASM_CONST(8, uint64_t, b01)= 0x0101010101010101LL;
 128 DECLARE_ASM_CONST(8, uint64_t, b02)= 0x0202020202020202LL;
 129 DECLARE_ASM_CONST(8, uint64_t, b08)= 0x0808080808080808LL;
 130 DECLARE_ASM_CONST(8, uint64_t, b80)= 0x8080808080808080LL;
 131 #endif
 132
 133 DECLARE_ASM_CONST(8, int, deringThreshold)= 20;
 134
 135
/*
 * Table of the filters known to the mode-string parser, terminated by an
 * entry whose names are NULL.
 *
 * The parser matches a user supplied name against both the short and the
 * long name (case-sensitive strcmp) and then uses the entry's mask,
 * chromDefault, minLumQuality and minChromQuality fields.
 * NOTE(review): struct PPFilter is declared outside this file; the column
 * order is presumably
 *   shortName, longName, chromDefault, minLumQuality, minChromQuality, mask
 * -- confirm against the struct declaration.
 */
static const struct PPFilter filters[]=
{
    {"hb", "hdeblock",              1, 1, 3, H_DEBLOCK},
    {"vb", "vdeblock",              1, 2, 4, V_DEBLOCK},
/*  {"hr", "rkhdeblock",            1, 1, 3, H_RK1_FILTER},
    {"vr", "rkvdeblock",            1, 2, 4, V_RK1_FILTER},*/
    {"h1", "x1hdeblock",            1, 1, 3, H_X1_FILTER},
    {"v1", "x1vdeblock",            1, 2, 4, V_X1_FILTER},
    {"ha", "ahdeblock",             1, 1, 3, H_A_DEBLOCK},
    {"va", "avdeblock",             1, 2, 4, V_A_DEBLOCK},
    {"dr", "dering",                1, 5, 6, DERING},
    {"al", "autolevels",            0, 1, 2, LEVEL_FIX},
    {"lb", "linblenddeint",         1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
    {"li", "linipoldeint",          1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
    {"ci", "cubicipoldeint",        1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
    {"md", "mediandeint",           1, 1, 4, MEDIAN_DEINT_FILTER},
    {"fd", "ffmpegdeint",           1, 1, 4, FFMPEG_DEINT_FILTER},
    {"l5", "lowpass5",              1, 1, 4, LOWPASS5_DEINT_FILTER},
    {"tn", "tmpnoise",              1, 7, 8, TEMP_NOISE_FILTER},
    {"fq", "forcequant",            1, 0, 0, FORCE_QUANT},
    {"be", "bitexact",              1, 0, 0, BITEXACT},
    {"vi", "visualize",             1, 0, 0, VISUALIZE},
    {NULL, NULL,0,0,0,0} //End Marker
};
 160
/*
 * Alias table for the mode-string parser: consecutive pairs of
 * (alias name, filter string the alias expands to), terminated by a single
 * NULL in an alias slot. The parser compares a filter name against the even
 * entries and, on a match, splices the following odd entry into the string
 * that is still to be parsed.
 */
static const char * const replaceTable[]=
{
    "default",      "hb:a,vb:a,dr:a",
    "de",           "hb:a,vb:a,dr:a",
    "fast",         "h1:a,v1:a,dr:a",
    "fa",           "h1:a,v1:a,dr:a",
    "ac",           "ha:a:128:7,va:a,dr:a",
    NULL //End Marker
};
 170
 171 /* The horizontal functions exist only in C because the MMX
 172  * code is faster with vertical filters and transposing. */
 173
 174 /**
 175  * Check if the given 8x8 Block is mostly "flat"
 176  */
 177 static inline int isHorizDC_C(const uint8_t src[], int stride, const PPContext *c)
 178 {
 179     int numEq= 0;
 180     int y;
 181     const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
 182     const int dcThreshold= dcOffset*2 + 1;
 183
 184     for(y=0; y<BLOCK_SIZE; y++){
 185         numEq += ((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold;
 186         numEq += ((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold;
 187         numEq += ((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold;
 188         numEq += ((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold;
 189         numEq += ((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold;
 190         numEq += ((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold;
 191         numEq += ((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold;
 192         src+= stride;
 193     }
 194     return numEq > c->ppMode.flatnessThreshold;
 195 }
 196
 197 /**
 198  * Check if the middle 8x8 Block in the given 8x16 block is flat
 199  */
 200 static inline int isVertDC_C(const uint8_t src[], int stride, const PPContext *c)
 201 {
 202     int numEq= 0;
 203     int y;
 204     const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
 205     const int dcThreshold= dcOffset*2 + 1;
 206
 207     src+= stride*4; // src points to begin of the 8x8 Block
 208     for(y=0; y<BLOCK_SIZE-1; y++){
 209         numEq += ((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold;
 210         numEq += ((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold;
 211         numEq += ((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold;
 212         numEq += ((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold;
 213         numEq += ((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold;
 214         numEq += ((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold;
 215         numEq += ((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold;
 216         numEq += ((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold;
 217         src+= stride;
 218     }
 219     return numEq > c->ppMode.flatnessThreshold;
 220 }
 221
 222 static inline int isHorizMinMaxOk_C(const uint8_t src[], int stride, int QP)
 223 {
 224     int i;
 225     for(i=0; i<2; i++){
 226         if((unsigned)(src[0] - src[5] + 2*QP) > 4*QP) return 0;
 227         src += stride;
 228         if((unsigned)(src[2] - src[7] + 2*QP) > 4*QP) return 0;
 229         src += stride;
 230         if((unsigned)(src[4] - src[1] + 2*QP) > 4*QP) return 0;
 231         src += stride;
 232         if((unsigned)(src[6] - src[3] + 2*QP) > 4*QP) return 0;
 233         src += stride;
 234     }
 235     return 1;
 236 }
 237
 238 static inline int isVertMinMaxOk_C(const uint8_t src[], int stride, int QP)
 239 {
 240     int x;
 241     src+= stride*4;
 242     for(x=0; x<BLOCK_SIZE; x+=4){
 243         if((unsigned)(src[  x + 0*stride] - src[  x + 5*stride] + 2*QP) > 4*QP) return 0;
 244         if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
 245         if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
 246         if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
 247     }
 248     return 1;
 249 }
 250
 251 static inline int horizClassify_C(const uint8_t src[], int stride, const PPContext *c)
 252 {
 253     if( isHorizDC_C(src, stride, c) ){
 254         return isHorizMinMaxOk_C(src, stride, c->QP);
 255     }else{
 256         return 2;
 257     }
 258 }
 259
 260 static inline int vertClassify_C(const uint8_t src[], int stride, const PPContext *c)
 261 {
 262     if( isVertDC_C(src, stride, c) ){
 263         return isVertMinMaxOk_C(src, stride, c->QP);
 264     }else{
 265         return 2;
 266     }
 267 }
 268
 269 static inline void doHorizDefFilter_C(uint8_t dst[], int stride, const PPContext *c)
 270 {
 271     int y;
 272     for(y=0; y<BLOCK_SIZE; y++){
 273         const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
 274
 275         if(FFABS(middleEnergy) < 8*c->QP){
 276             const int q=(dst[3] - dst[4])/2;
 277             const int leftEnergy=  5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
 278             const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
 279
 280             int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
 281             d= FFMAX(d, 0);
 282
 283             d= (5*d + 32) >> 6;
 284             d*= FFSIGN(-middleEnergy);
 285
 286             if(q>0)
 287             {
 288                 d = FFMAX(d, 0);
 289                 d = FFMIN(d, q);
 290             }
 291             else
 292             {
 293                 d = FFMIN(d, 0);
 294                 d = FFMAX(d, q);
 295             }
 296
 297             dst[3]-= d;
 298             dst[4]+= d;
 299         }
 300         dst+= stride;
 301     }
 302 }
 303
 304 /**
 305  * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
 306  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
 307  */
 308 static inline void doHorizLowPass_C(uint8_t dst[], int stride, const PPContext *c)
 309 {
 310     int y;
 311     for(y=0; y<BLOCK_SIZE; y++){
 312         const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
 313         const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
 314
 315         int sums[10];
 316         sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
 317         sums[1] = sums[0] - first  + dst[3];
 318         sums[2] = sums[1] - first  + dst[4];
 319         sums[3] = sums[2] - first  + dst[5];
 320         sums[4] = sums[3] - first  + dst[6];
 321         sums[5] = sums[4] - dst[0] + dst[7];
 322         sums[6] = sums[5] - dst[1] + last;
 323         sums[7] = sums[6] - dst[2] + last;
 324         sums[8] = sums[7] - dst[3] + last;
 325         sums[9] = sums[8] - dst[4] + last;
 326
 327         dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
 328         dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
 329         dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
 330         dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
 331         dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
 332         dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
 333         dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
 334         dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
 335
 336         dst+= stride;
 337     }
 338 }
 339
 340 /**
 341  * Experimental Filter 1 (Horizontal)
 342  * will not damage linear gradients
 343  * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
 344  * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
 345  * MMX2 version does correct clipping C version does not
 346  * not identical with the vertical one
 347  */
 348 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
 349 {
 350     int y;
 351     static uint64_t lut[256];
 352     if(!lut[255])
 353     {
 354         int i;
 355         for(i=0; i<256; i++)
 356         {
 357             int v= i < 128 ? 2*i : 2*(i-256);
 358 /*
 359 //Simulate 112242211 9-Tap filter
 360             uint64_t a= (v/16)  & 0xFF;
 361             uint64_t b= (v/8)   & 0xFF;
 362             uint64_t c= (v/4)   & 0xFF;
 363             uint64_t d= (3*v/8) & 0xFF;
 364 */
 365 //Simulate piecewise linear interpolation
 366             uint64_t a= (v/16)   & 0xFF;
 367             uint64_t b= (v*3/16) & 0xFF;
 368             uint64_t c= (v*5/16) & 0xFF;
 369             uint64_t d= (7*v/16) & 0xFF;
 370             uint64_t A= (0x100 - a)&0xFF;
 371             uint64_t B= (0x100 - b)&0xFF;
 372             uint64_t C= (0x100 - c)&0xFF;
 373             uint64_t D= (0x100 - c)&0xFF;
 374
 375             lut[i]   = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
 376                        (D<<24) | (C<<16) | (B<<8)  | (A);
 377             //lut[i] = (v<<32) | (v<<24);
 378         }
 379     }
 380
 381     for(y=0; y<BLOCK_SIZE; y++){
 382         int a= src[1] - src[2];
 383         int b= src[3] - src[4];
 384         int c= src[5] - src[6];
 385
 386         int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0);
 387
 388         if(d < QP){
 389             int v = d * FFSIGN(-b);
 390
 391             src[1] +=v/8;
 392             src[2] +=v/4;
 393             src[3] +=3*v/8;
 394             src[4] -=3*v/8;
 395             src[5] -=v/4;
 396             src[6] -=v/8;
 397         }
 398         src+=stride;
 399     }
 400 }
 401
 402 /**
 403  * accurate deblock filter
 404  */
 405 static av_always_inline void do_a_deblock_C(uint8_t *src, int step,
 406                                             int stride, const PPContext *c, int mode)
 407 {
 408     int y;
 409     const int QP= c->QP;
 410     const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
 411     const int dcThreshold= dcOffset*2 + 1;
 412 //START_TIMER
 413     src+= step*4; // src points to begin of the 8x8 Block
 414     for(y=0; y<8; y++){
 415         int numEq= 0;
 416
 417         numEq += ((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold;
 418         numEq += ((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold;
 419         numEq += ((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold;
 420         numEq += ((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold;
 421         numEq += ((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold;
 422         numEq += ((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold;
 423         numEq += ((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold;
 424         numEq += ((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold;
 425         numEq += ((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold;
 426         if(numEq > c->ppMode.flatnessThreshold){
 427             int min, max, x;
 428
 429             if(src[0] > src[step]){
 430                 max= src[0];
 431                 min= src[step];
 432             }else{
 433                 max= src[step];
 434                 min= src[0];
 435             }
 436             for(x=2; x<8; x+=2){
 437                 if(src[x*step] > src[(x+1)*step]){
 438                         if(src[x    *step] > max) max= src[ x   *step];
 439                         if(src[(x+1)*step] < min) min= src[(x+1)*step];
 440                 }else{
 441                         if(src[(x+1)*step] > max) max= src[(x+1)*step];
 442                         if(src[ x   *step] < min) min= src[ x   *step];
 443                 }
 444             }
 445             if(max-min < 2*QP){
 446                 const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
 447                 const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
 448
 449                 int sums[10];
 450                 sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
 451                 sums[1] = sums[0] - first       + src[3*step];
 452                 sums[2] = sums[1] - first       + src[4*step];
 453                 sums[3] = sums[2] - first       + src[5*step];
 454                 sums[4] = sums[3] - first       + src[6*step];
 455                 sums[5] = sums[4] - src[0*step] + src[7*step];
 456                 sums[6] = sums[5] - src[1*step] + last;
 457                 sums[7] = sums[6] - src[2*step] + last;
 458                 sums[8] = sums[7] - src[3*step] + last;
 459                 sums[9] = sums[8] - src[4*step] + last;
 460
 461                 if (mode & VISUALIZE) {
 462                     src[0*step] =
 463                     src[1*step] =
 464                     src[2*step] =
 465                     src[3*step] =
 466                     src[4*step] =
 467                     src[5*step] =
 468                     src[6*step] =
 469                     src[7*step] = 128;
 470                 }
 471                 src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
 472                 src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
 473                 src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
 474                 src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
 475                 src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
 476                 src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
 477                 src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
 478                 src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
 479             }
 480         }else{
 481             const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
 482
 483             if(FFABS(middleEnergy) < 8*QP){
 484                 const int q=(src[3*step] - src[4*step])/2;
 485                 const int leftEnergy=  5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
 486                 const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
 487
 488                 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
 489                 d= FFMAX(d, 0);
 490
 491                 d= (5*d + 32) >> 6;
 492                 d*= FFSIGN(-middleEnergy);
 493
 494                 if(q>0){
 495                     d = FFMAX(d, 0);
 496                     d = FFMIN(d, q);
 497                 }else{
 498                     d = FFMIN(d, 0);
 499                     d = FFMAX(d, q);
 500                 }
 501
 502                 if ((mode & VISUALIZE) && d) {
 503                     d= (d < 0) ? 32 : -32;
 504                     src[3*step]= av_clip_uint8(src[3*step] - d);
 505                     src[4*step]= av_clip_uint8(src[4*step] + d);
 506                     d = 0;
 507                 }
 508
 509                 src[3*step]-= d;
 510                 src[4*step]+= d;
 511             }
 512         }
 513
 514         src += stride;
 515     }
 516 /*if(step==16){
 517     STOP_TIMER("step16")
 518 }else{
 519     STOP_TIMER("stepX")
 520 }*/
 521 }
 522
// Note: the template is instantiated once per instruction set (C, MMX, MMX2,
// 3DNow, SSE2 and AltiVec below); there is no combined 3DNow+MMX2 version.
// Plain C versions:
// the C version is always compiled because testing needs bit-exactness.
 526 #define TEMPLATE_PP_C 1
 527 #include "postprocess_template.c"
 528
 529 #if HAVE_ALTIVEC
 530 #   define TEMPLATE_PP_ALTIVEC 1
 531 #   include "postprocess_altivec_template.c"
 532 #   include "postprocess_template.c"
 533 #endif
 534
 535 #if ARCH_X86 && HAVE_INLINE_ASM
 536 #    if CONFIG_RUNTIME_CPUDETECT
 537 #        define TEMPLATE_PP_MMX 1
 538 #        include "postprocess_template.c"
 539 #        define TEMPLATE_PP_MMXEXT 1
 540 #        include "postprocess_template.c"
 541 #        define TEMPLATE_PP_3DNOW 1
 542 #        include "postprocess_template.c"
 543 #        define TEMPLATE_PP_SSE2 1
 544 #        include "postprocess_template.c"
 545 #    else
 546 #        if HAVE_SSE2_INLINE
 547 #            define TEMPLATE_PP_SSE2 1
 548 #            include "postprocess_template.c"
 549 #        elif HAVE_MMXEXT_INLINE
 550 #            define TEMPLATE_PP_MMXEXT 1
 551 #            include "postprocess_template.c"
 552 #        elif HAVE_AMD3DNOW_INLINE
 553 #            define TEMPLATE_PP_3DNOW 1
 554 #            include "postprocess_template.c"
 555 #        elif HAVE_MMX_INLINE
 556 #            define TEMPLATE_PP_MMX 1
 557 #            include "postprocess_template.c"
 558 #        endif
 559 #    endif
 560 #endif
 561
 562 typedef void (*pp_fn)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
 563                       const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2);
 564
 565 static inline void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
 566         const QP_STORE_T QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc)
 567 {
 568     pp_fn pp = postProcess_C;
 569     PPContext *c= (PPContext *)vc;
 570     PPMode *ppMode= (PPMode *)vm;
 571     c->ppMode= *ppMode; //FIXME
 572
 573     if (!(ppMode->lumMode & BITEXACT)) {
 574 #if CONFIG_RUNTIME_CPUDETECT
 575 #if ARCH_X86 && HAVE_INLINE_ASM
 576         // ordered per speed fastest first
 577         if      (c->cpuCaps & AV_CPU_FLAG_SSE2)     pp = postProcess_SSE2;
 578         else if (c->cpuCaps & AV_CPU_FLAG_MMXEXT)   pp = postProcess_MMX2;
 579         else if (c->cpuCaps & AV_CPU_FLAG_3DNOW)    pp = postProcess_3DNow;
 580         else if (c->cpuCaps & AV_CPU_FLAG_MMX)      pp = postProcess_MMX;
 581 #elif HAVE_ALTIVEC
 582         if      (c->cpuCaps & AV_CPU_FLAG_ALTIVEC)  pp = postProcess_altivec;
 583 #endif
 584 #else /* CONFIG_RUNTIME_CPUDETECT */
 585 #if     HAVE_SSE2_INLINE
 586         pp = postProcess_SSE2;
 587 #elif   HAVE_MMXEXT_INLINE
 588         pp = postProcess_MMX2;
 589 #elif HAVE_AMD3DNOW_INLINE
 590         pp = postProcess_3DNow;
 591 #elif HAVE_MMX_INLINE
 592         pp = postProcess_MMX;
 593 #elif HAVE_ALTIVEC
 594         pp = postProcess_altivec;
 595 #endif
 596 #endif /* !CONFIG_RUNTIME_CPUDETECT */
 597     }
 598
 599     pp(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
 600 }
 601
 602 /* -pp Command line Help
 603 */
 604 const char pp_help[] =
 605 "Available postprocessing filters:\n"
 606 "Filters                        Options\n"
 607 "short  long name       short   long option     Description\n"
 608 "*      *               a       autoq           CPU power dependent enabler\n"
 609 "                       c       chrom           chrominance filtering enabled\n"
 610 "                       y       nochrom         chrominance filtering disabled\n"
 611 "                       n       noluma          luma filtering disabled\n"
 612 "hb     hdeblock        (2 threshold)           horizontal deblocking filter\n"
 613 "       1. difference factor: default=32, higher -> more deblocking\n"
 614 "       2. flatness threshold: default=39, lower -> more deblocking\n"
 615 "                       the h & v deblocking filters share these\n"
 616 "                       so you can't set different thresholds for h / v\n"
 617 "vb     vdeblock        (2 threshold)           vertical deblocking filter\n"
 618 "ha     hadeblock       (2 threshold)           horizontal deblocking filter\n"
 619 "va     vadeblock       (2 threshold)           vertical deblocking filter\n"
 620 "h1     x1hdeblock                              experimental h deblock filter 1\n"
 621 "v1     x1vdeblock                              experimental v deblock filter 1\n"
 622 "dr     dering                                  deringing filter\n"
 623 "al     autolevels                              automatic brightness / contrast\n"
 624 "                       f        fullyrange     stretch luminance to (0..255)\n"
 625 "lb     linblenddeint                           linear blend deinterlacer\n"
 626 "li     linipoldeint                            linear interpolating deinterlace\n"
 627 "ci     cubicipoldeint                          cubic interpolating deinterlacer\n"
 628 "md     mediandeint                             median deinterlacer\n"
 629 "fd     ffmpegdeint                             ffmpeg deinterlacer\n"
 630 "l5     lowpass5                                FIR lowpass deinterlacer\n"
 631 "de     default                                 hb:a,vb:a,dr:a\n"
 632 "fa     fast                                    h1:a,v1:a,dr:a\n"
 633 "ac                                             ha:a:128:7,va:a,dr:a\n"
 634 "tn     tmpnoise        (3 threshold)           temporal noise reducer\n"
 635 "                     1. <= 2. <= 3.            larger -> stronger filtering\n"
 636 "fq     forceQuant      <quantizer>             force quantizer\n"
 637 "Usage:\n"
 638 "<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
 639 "long form example:\n"
 640 "vdeblock:autoq/hdeblock:autoq/linblenddeint    default,-vdeblock\n"
 641 "short form example:\n"
 642 "vb:a/hb:a/lb                                   de,-vb\n"
 643 "more examples:\n"
 644 "tn:64:128:256\n"
 645 "\n"
 646 ;
 647
 648 pp_mode *pp_get_mode_by_name_and_quality(const char *name, int quality)
 649 {
 650     char temp[GET_MODE_BUFFER_SIZE];
 651     char *p= temp;
 652     static const char filterDelimiters[] = ",/";
 653     static const char optionDelimiters[] = ":|";
 654     struct PPMode *ppMode;
 655     char *filterToken;
 656
 657     if (!name)  {
 658         av_log(NULL, AV_LOG_ERROR, "pp: Missing argument\n");
 659         return NULL;
 660     }
 661
 662     if (!strcmp(name, "help")) {
 663         const char *p;
 664         for (p = pp_help; strchr(p, '\n'); p = strchr(p, '\n') + 1) {
 665             av_strlcpy(temp, p, FFMIN(sizeof(temp), strchr(p, '\n') - p + 2));
 666             av_log(NULL, AV_LOG_INFO, "%s", temp);
 667         }
 668         return NULL;
 669     }
 670
 671     ppMode= av_malloc(sizeof(PPMode));
 672     if (!ppMode)
 673         return NULL;
 674
 675     ppMode->lumMode= 0;
 676     ppMode->chromMode= 0;
 677     ppMode->maxTmpNoise[0]= 700;
 678     ppMode->maxTmpNoise[1]= 1500;
 679     ppMode->maxTmpNoise[2]= 3000;
 680     ppMode->maxAllowedY= 234;
 681     ppMode->minAllowedY= 16;
 682     ppMode->baseDcDiff= 256/8;
 683     ppMode->flatnessThreshold= 56-16-1;
 684     ppMode->maxClippedThreshold= (AVRational){1,100};
 685     ppMode->error=0;
 686
 687     memset(temp, 0, GET_MODE_BUFFER_SIZE);
 688     av_strlcpy(temp, name, GET_MODE_BUFFER_SIZE - 1);
 689
 690     av_log(NULL, AV_LOG_DEBUG, "pp: %s\n", name);
 691
 692     for(;;){
 693         const char *filterName;
 694         int q= 1000000; //PP_QUALITY_MAX;
 695         int chrom=-1;
 696         int luma=-1;
 697         const char *option;
 698         const char *options[OPTIONS_ARRAY_SIZE];
 699         int i;
 700         int filterNameOk=0;
 701         int numOfUnknownOptions=0;
 702         int enable=1; //does the user want us to enabled or disabled the filter
 703         char *tokstate;
 704
 705         filterToken= av_strtok(p, filterDelimiters, &tokstate);
 706         if(!filterToken) break;
 707         p+= strlen(filterToken) + 1; // p points to next filterToken
 708         filterName= av_strtok(filterToken, optionDelimiters, &tokstate);
 709         if (!filterName) {
 710             ppMode->error++;
 711             break;
 712         }
 713         av_log(NULL, AV_LOG_DEBUG, "pp: %s::%s\n", filterToken, filterName);
 714
 715         if(*filterName == '-'){
 716             enable=0;
 717             filterName++;
 718         }
 719
 720         for(;;){ //for all options
 721             option= av_strtok(NULL, optionDelimiters, &tokstate);
 722             if(!option) break;
 723
 724             av_log(NULL, AV_LOG_DEBUG, "pp: option: %s\n", option);
 725             if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
 726             else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
 727             else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
 728             else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
 729             else{
 730                 options[numOfUnknownOptions] = option;
 731                 numOfUnknownOptions++;
 732             }
 733             if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
 734         }
 735         options[numOfUnknownOptions] = NULL;
 736
 737         /* replace stuff from the replace Table */
 738         for(i=0; replaceTable[2*i]; i++){
 739             if(!strcmp(replaceTable[2*i], filterName)){
 740                 size_t newlen = strlen(replaceTable[2*i + 1]);
 741                 int plen;
 742                 int spaceLeft;
 743
 744                 p--, *p=',';
 745
 746                 plen= strlen(p);
 747                 spaceLeft= p - temp + plen;
 748                 if(spaceLeft + newlen  >= GET_MODE_BUFFER_SIZE - 1){
 749                     ppMode->error++;
 750                     break;
 751                 }
 752                 memmove(p + newlen, p, plen+1);
 753                 memcpy(p, replaceTable[2*i + 1], newlen);
 754                 filterNameOk=1;
 755             }
 756         }
 757
 758         for(i=0; filters[i].shortName; i++){
 759             if(   !strcmp(filters[i].longName, filterName)
 760                || !strcmp(filters[i].shortName, filterName)){
 761                 ppMode->lumMode &= ~filters[i].mask;
 762                 ppMode->chromMode &= ~filters[i].mask;
 763
 764                 filterNameOk=1;
 765                 if(!enable) break; // user wants to disable it
 766
 767                 if(q >= filters[i].minLumQuality && luma)
 768                     ppMode->lumMode|= filters[i].mask;
 769                 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
 770                     if(q >= filters[i].minChromQuality)
 771                             ppMode->chromMode|= filters[i].mask;
 772
 773                 if(filters[i].mask == LEVEL_FIX){
 774                     int o;
 775                     ppMode->minAllowedY= 16;
 776                     ppMode->maxAllowedY= 234;
 777                     for(o=0; options[o]; o++){
 778                         if(  !strcmp(options[o],"fullyrange")
 779                            ||!strcmp(options[o],"f")){
 780                             ppMode->minAllowedY= 0;
 781                             ppMode->maxAllowedY= 255;
 782                             numOfUnknownOptions--;
 783                         }
 784                     }
 785                 }
 786                 else if(filters[i].mask == TEMP_NOISE_FILTER)
 787                 {
 788                     int o;
 789                     int numOfNoises=0;
 790
 791                     for(o=0; options[o]; o++){
 792                         char *tail;
 793                         ppMode->maxTmpNoise[numOfNoises]=
 794                             strtol(options[o], &tail, 0);
 795                         if(tail!=options[o]){
 796                             numOfNoises++;
 797                             numOfUnknownOptions--;
 798                             if(numOfNoises >= 3) break;
 799                         }
 800                     }
 801                 }
 802                 else if(filters[i].mask == V_DEBLOCK   || filters[i].mask == H_DEBLOCK
 803                      || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK){
 804                     int o;
 805
 806                     for(o=0; options[o] && o<2; o++){
 807                         char *tail;
 808                         int val= strtol(options[o], &tail, 0);
 809                         if(tail==options[o]) break;
 810
 811                         numOfUnknownOptions--;
 812                         if(o==0) ppMode->baseDcDiff= val;
 813                         else ppMode->flatnessThreshold= val;
 814                     }
 815                 }
 816                 else if(filters[i].mask == FORCE_QUANT){
 817                     int o;
 818                     ppMode->forcedQuant= 15;
 819
 820                     for(o=0; options[o] && o<1; o++){
 821                         char *tail;
 822                         int val= strtol(options[o], &tail, 0);
 823                         if(tail==options[o]) break;
 824
 825                         numOfUnknownOptions--;
 826                         ppMode->forcedQuant= val;
 827                     }
 828                 }
 829             }
 830         }
 831         if(!filterNameOk) ppMode->error++;
 832         ppMode->error += numOfUnknownOptions;
 833     }
 834
 835     av_log(NULL, AV_LOG_DEBUG, "pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
 836     if(ppMode->error){
 837         av_log(NULL, AV_LOG_ERROR, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
 838         av_free(ppMode);
 839         return NULL;
 840     }
 841     return ppMode;
 842 }
 843
 844 void pp_free_mode(pp_mode *mode){
 845     av_free(mode);
 846 }
 847
 848 static void reallocAlign(void **p, int size){
 849     av_free(*p);
 850     *p= av_mallocz(size);
 851 }
 852
 853 static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
 854     int mbWidth = (width+15)>>4;
 855     int mbHeight= (height+15)>>4;
 856     int i;
 857
 858     c->stride= stride;
 859     c->qpStride= qpStride;
 860
 861     reallocAlign((void **)&c->tempDst, stride*24+32);
 862     reallocAlign((void **)&c->tempSrc, stride*24);
 863     reallocAlign((void **)&c->tempBlocks, 2*16*8);
 864     reallocAlign((void **)&c->yHistogram, 256*sizeof(uint64_t));
 865     for(i=0; i<256; i++)
 866             c->yHistogram[i]= width*height/64*15/256;
 867
 868     for(i=0; i<3; i++){
 869         //Note: The +17*1024 is just there so I do not have to worry about r/w over the end.
 870         reallocAlign((void **)&c->tempBlurred[i], stride*mbHeight*16 + 17*1024);
 871         reallocAlign((void **)&c->tempBlurredPast[i], 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
 872     }
 873
 874     reallocAlign((void **)&c->deintTemp, 2*width+32);
 875     reallocAlign((void **)&c->nonBQPTable, qpStride*mbHeight*sizeof(QP_STORE_T));
 876     reallocAlign((void **)&c->stdQPTable, qpStride*mbHeight*sizeof(QP_STORE_T));
 877     reallocAlign((void **)&c->forcedQPTable, mbWidth*sizeof(QP_STORE_T));
 878 }
 879
 880 static const char * context_to_name(void * ptr) {
 881     return "postproc";
 882 }
 883
 884 static const AVClass av_codec_context_class = { "Postproc", context_to_name, NULL };
 885
 886 av_cold pp_context *pp_get_context(int width, int height, int cpuCaps){
 887     PPContext *c= av_mallocz(sizeof(PPContext));
 888     int stride= FFALIGN(width, 16);  //assumed / will realloc if needed
 889     int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
 890
 891     if (!c)
 892         return NULL;
 893
 894     c->av_class = &av_codec_context_class;
 895     if(cpuCaps&PP_FORMAT){
 896         c->hChromaSubSample= cpuCaps&0x3;
 897         c->vChromaSubSample= (cpuCaps>>4)&0x3;
 898     }else{
 899         c->hChromaSubSample= 1;
 900         c->vChromaSubSample= 1;
 901     }
 902     if (cpuCaps & PP_CPU_CAPS_AUTO) {
 903         c->cpuCaps = av_get_cpu_flags();
 904     } else {
 905         c->cpuCaps = 0;
 906         if (cpuCaps & PP_CPU_CAPS_MMX)      c->cpuCaps |= AV_CPU_FLAG_MMX;
 907         if (cpuCaps & PP_CPU_CAPS_MMX2)     c->cpuCaps |= AV_CPU_FLAG_MMXEXT;
 908         if (cpuCaps & PP_CPU_CAPS_3DNOW)    c->cpuCaps |= AV_CPU_FLAG_3DNOW;
 909         if (cpuCaps & PP_CPU_CAPS_ALTIVEC)  c->cpuCaps |= AV_CPU_FLAG_ALTIVEC;
 910     }
 911
 912     reallocBuffers(c, width, height, stride, qpStride);
 913
 914     c->frameNum=-1;
 915
 916     return c;
 917 }
 918
 919 av_cold void pp_free_context(void *vc){
 920     PPContext *c = (PPContext*)vc;
 921     int i;
 922
 923     for(i=0; i<FF_ARRAY_ELEMS(c->tempBlurred); i++)
 924         av_free(c->tempBlurred[i]);
 925     for(i=0; i<FF_ARRAY_ELEMS(c->tempBlurredPast); i++)
 926         av_free(c->tempBlurredPast[i]);
 927
 928     av_free(c->tempBlocks);
 929     av_free(c->yHistogram);
 930     av_free(c->tempDst);
 931     av_free(c->tempSrc);
 932     av_free(c->deintTemp);
 933     av_free(c->stdQPTable);
 934     av_free(c->nonBQPTable);
 935     av_free(c->forcedQPTable);
 936
 937     memset(c, 0, sizeof(PPContext));
 938
 939     av_free(c);
 940 }
 941
 942 void  pp_postprocess(const uint8_t * src[3], const int srcStride[3],
 943                      uint8_t * dst[3], const int dstStride[3],
 944                      int width, int height,
 945                      const QP_STORE_T *QP_store,  int QPStride,
 946                      pp_mode *vm,  void *vc, int pict_type)
 947 {
 948     int mbWidth = (width+15)>>4;
 949     int mbHeight= (height+15)>>4;
 950     PPMode *mode = vm;
 951     PPContext *c = vc;
 952     int minStride= FFMAX(FFABS(srcStride[0]), FFABS(dstStride[0]));
 953     int absQPStride = FFABS(QPStride);
 954
 955     // c->stride and c->QPStride are always positive
 956     if(c->stride < minStride || c->qpStride < absQPStride)
 957         reallocBuffers(c, width, height,
 958                        FFMAX(minStride, c->stride),
 959                        FFMAX(c->qpStride, absQPStride));
 960
 961     if(!QP_store || (mode->lumMode & FORCE_QUANT)){
 962         int i;
 963         QP_store= c->forcedQPTable;
 964         absQPStride = QPStride = 0;
 965         if(mode->lumMode & FORCE_QUANT)
 966             for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= mode->forcedQuant;
 967         else
 968             for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= 1;
 969     }
 970
 971     if(pict_type & PP_PICT_TYPE_QP2){
 972         int i;
 973         const int count= FFMAX(mbHeight * absQPStride, mbWidth);
 974         for(i=0; i<(count>>2); i++){
 975             ((uint32_t*)c->stdQPTable)[i] = (((const uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F;
 976         }
 977         for(i<<=2; i<count; i++){
 978             c->stdQPTable[i] = QP_store[i]>>1;
 979         }
 980         QP_store= c->stdQPTable;
 981         QPStride= absQPStride;
 982     }
 983
 984     if(0){
 985         int x,y;
 986         for(y=0; y<mbHeight; y++){
 987             for(x=0; x<mbWidth; x++){
 988                 av_log(c, AV_LOG_INFO, "%2d ", QP_store[x + y*QPStride]);
 989             }
 990             av_log(c, AV_LOG_INFO, "\n");
 991         }
 992         av_log(c, AV_LOG_INFO, "\n");
 993     }
 994
 995     if((pict_type&7)!=3){
 996         if (QPStride >= 0){
 997             int i;
 998             const int count= FFMAX(mbHeight * QPStride, mbWidth);
 999             for(i=0; i<(count>>2); i++){
1000                 ((uint32_t*)c->nonBQPTable)[i] = ((const uint32_t*)QP_store)[i] & 0x3F3F3F3F;
1001             }
1002             for(i<<=2; i<count; i++){
1003                 c->nonBQPTable[i] = QP_store[i] & 0x3F;
1004             }
1005         } else {
1006             int i,j;
1007             for(i=0; i<mbHeight; i++) {
1008                 for(j=0; j<absQPStride; j++) {
1009                     c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F;
1010                 }
1011             }
1012         }
1013     }
1014
1015     av_log(c, AV_LOG_DEBUG, "using npp filters 0x%X/0x%X\n",
1016            mode->lumMode, mode->chromMode);
1017
1018     postProcess(src[0], srcStride[0], dst[0], dstStride[0],
1019                 width, height, QP_store, QPStride, 0, mode, c);
1020
1021     if (!(src[1] && src[2] && dst[1] && dst[2]))
1022         return;
1023
1024     width  = (width )>>c->hChromaSubSample;
1025     height = (height)>>c->vChromaSubSample;
1026
1027     if(mode->chromMode){
1028         postProcess(src[1], srcStride[1], dst[1], dstStride[1],
1029                     width, height, QP_store, QPStride, 1, mode, c);
1030         postProcess(src[2], srcStride[2], dst[2], dstStride[2],
1031                     width, height, QP_store, QPStride, 2, mode, c);
1032     }
1033     else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2]){
1034         linecpy(dst[1], src[1], height, srcStride[1]);
1035         linecpy(dst[2], src[2], height, srcStride[2]);
1036     }else{
1037         int y;
1038         for(y=0; y<height; y++){
1039             memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
1040             memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
1041         }
1042     }
1043 }