git.sesse.net Git - x264/blob - common/pixel.c

   1 /*****************************************************************************
   2  * pixel.c: h264 encoder
   3  *****************************************************************************
   4  * Copyright (C) 2003 Laurent Aimar
   5  * $Id: pixel.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
   6  *
   7  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
  22  *****************************************************************************/
  23
  24 #ifdef HAVE_STDINT_H
  25 #include <stdint.h>
  26 #else
  27 #include <inttypes.h>
  28 #endif
  29 #include <stdlib.h>
  30 #include <string.h>
  31 #include <stdarg.h>
  32
  33 #include "x264.h"
  34 #include "pixel.h"
  35 #include "clip1.h"
  36
  37 #ifdef HAVE_MMXEXT
  38 #   include "i386/pixel.h"
  39 #endif
  40 #ifdef ARCH_PPC
  41 #   include "ppc/pixel.h"
  42 #endif
  43 #ifdef ARCH_UltraSparc
  44 #   include "sparc/pixel.h"
  45 #endif
  46
  47
  48 /****************************************************************************
  49  * pixel_sad_WxH
  50  ****************************************************************************/
  51 #define PIXEL_SAD_C( name, lx, ly ) \
  52 static int name( uint8_t *pix1, int i_stride_pix1,  \
  53                  uint8_t *pix2, int i_stride_pix2 ) \
  54 {                                                   \
  55     int i_sum = 0;                                  \
  56     int x, y;                                       \
  57     for( y = 0; y < ly; y++ )                       \
  58     {                                               \
  59         for( x = 0; x < lx; x++ )                   \
  60         {                                           \
  61             i_sum += abs( pix1[x] - pix2[x] );      \
  62         }                                           \
  63         pix1 += i_stride_pix1;                      \
  64         pix2 += i_stride_pix2;                      \
  65     }                                               \
  66     return i_sum;                                   \
  67 }
  68
  69
  70 PIXEL_SAD_C( pixel_sad_16x16, 16, 16 )
  71 PIXEL_SAD_C( pixel_sad_16x8,  16,  8 )
  72 PIXEL_SAD_C( pixel_sad_8x16,   8, 16 )
  73 PIXEL_SAD_C( pixel_sad_8x8,    8,  8 )
  74 PIXEL_SAD_C( pixel_sad_8x4,    8,  4 )
  75 PIXEL_SAD_C( pixel_sad_4x8,    4,  8 )
  76 PIXEL_SAD_C( pixel_sad_4x4,    4,  4 )
  77
  78
  79 /****************************************************************************
  80  * pixel_ssd_WxH
  81  ****************************************************************************/
  82 #define PIXEL_SSD_C( name, lx, ly ) \
  83 static int name( uint8_t *pix1, int i_stride_pix1,  \
  84                  uint8_t *pix2, int i_stride_pix2 ) \
  85 {                                                   \
  86     int i_sum = 0;                                  \
  87     int x, y;                                       \
  88     for( y = 0; y < ly; y++ )                       \
  89     {                                               \
  90         for( x = 0; x < lx; x++ )                   \
  91         {                                           \
  92             int d = pix1[x] - pix2[x];              \
  93             i_sum += d*d;                           \
  94         }                                           \
  95         pix1 += i_stride_pix1;                      \
  96         pix2 += i_stride_pix2;                      \
  97     }                                               \
  98     return i_sum;                                   \
  99 }
 100
 101 PIXEL_SSD_C( pixel_ssd_16x16, 16, 16 )
 102 PIXEL_SSD_C( pixel_ssd_16x8,  16,  8 )
 103 PIXEL_SSD_C( pixel_ssd_8x16,   8, 16 )
 104 PIXEL_SSD_C( pixel_ssd_8x8,    8,  8 )
 105 PIXEL_SSD_C( pixel_ssd_8x4,    8,  4 )
 106 PIXEL_SSD_C( pixel_ssd_4x8,    4,  8 )
 107 PIXEL_SSD_C( pixel_ssd_4x4,    4,  4 )
 108
 109 int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height )
 110 {
 111     int64_t i_ssd = 0;
 112     int x, y;
 113
 114 #define SSD(size) i_ssd += pf->ssd[size]( pix1 + y*i_pix1 + x, i_pix1, \
 115                                           pix2 + y*i_pix2 + x, i_pix2 );
 116     for( y = 0; y < i_height-15; y += 16 )
 117     {
 118         for( x = 0; x < i_width-15; x += 16 )
 119             SSD(PIXEL_16x16);
 120         if( x < i_width-7 )
 121             SSD(PIXEL_8x16);
 122     }
 123     if( y < i_height-7 )
 124         for( x = 0; x < i_width-7; x += 8 )
 125             SSD(PIXEL_8x8);
 126 #undef SSD
 127
 128 #define SSD1 { int d = pix1[y*i_pix1+x] - pix2[y*i_pix2+x]; i_ssd += d*d; }
 129     if( i_width % 8 != 0 )
 130     {
 131         for( y = 0; y < (i_height & ~7); y++ )
 132             for( x = i_width & ~7; x < i_width; x++ )
 133                 SSD1;
 134     }
 135     if( i_height % 8 != 0 )
 136     {
 137         for( y = i_height & ~7; y < i_height; y++ )
 138             for( x = 0; x < i_width; x++ )
 139                 SSD1;
 140     }
 141 #undef SSD1
 142
 143     return i_ssd;
 144 }
 145
 146
 147 static inline void pixel_sub_wxh( int16_t *diff, int i_size,
 148                                   uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
 149 {
 150     int y, x;
 151     for( y = 0; y < i_size; y++ )
 152     {
 153         for( x = 0; x < i_size; x++ )
 154         {
 155             diff[x + y*i_size] = pix1[x] - pix2[x];
 156         }
 157         pix1 += i_pix1;
 158         pix2 += i_pix2;
 159     }
 160 }
 161
 162
 163 /****************************************************************************
 164  * pixel_satd_WxH: sum of 4x4 Hadamard transformed differences
 165  ****************************************************************************/
 166 static int pixel_satd_wxh( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height )
 167 {
 168     int16_t tmp[4][4];
 169     int16_t diff[4][4];
 170     int x, y;
 171     int i_satd = 0;
 172
 173     for( y = 0; y < i_height; y += 4 )
 174     {
 175         for( x = 0; x < i_width; x += 4 )
 176         {
 177             int d;
 178
 179             pixel_sub_wxh( (int16_t*)diff, 4, &pix1[x], i_pix1, &pix2[x], i_pix2 );
 180
 181             for( d = 0; d < 4; d++ )
 182             {
 183                 int s01, s23;
 184                 int d01, d23;
 185
 186                 s01 = diff[d][0] + diff[d][1]; s23 = diff[d][2] + diff[d][3];
 187                 d01 = diff[d][0] - diff[d][1]; d23 = diff[d][2] - diff[d][3];
 188
 189                 tmp[d][0] = s01 + s23;
 190                 tmp[d][1] = s01 - s23;
 191                 tmp[d][2] = d01 - d23;
 192                 tmp[d][3] = d01 + d23;
 193             }
 194             for( d = 0; d < 4; d++ )
 195             {
 196                 int s01, s23;
 197                 int d01, d23;
 198
 199                 s01 = tmp[0][d] + tmp[1][d]; s23 = tmp[2][d] + tmp[3][d];
 200                 d01 = tmp[0][d] - tmp[1][d]; d23 = tmp[2][d] - tmp[3][d];
 201
 202                 i_satd += abs( s01 + s23 ) + abs( s01 - s23 ) + abs( d01 - d23 ) + abs( d01 + d23 );
 203             }
 204
 205         }
 206         pix1 += 4 * i_pix1;
 207         pix2 += 4 * i_pix2;
 208     }
 209
 210     return i_satd / 2;
 211 }
 212 #define PIXEL_SATD_C( name, width, height ) \
 213 static int name( uint8_t *pix1, int i_stride_pix1, \
 214                  uint8_t *pix2, int i_stride_pix2 ) \
 215 { \
 216     return pixel_satd_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, width, height ); \
 217 }
 218 PIXEL_SATD_C( pixel_satd_16x16, 16, 16 )
 219 PIXEL_SATD_C( pixel_satd_16x8,  16, 8 )
 220 PIXEL_SATD_C( pixel_satd_8x16,  8, 16 )
 221 PIXEL_SATD_C( pixel_satd_8x8,   8, 8 )
 222 PIXEL_SATD_C( pixel_satd_8x4,   8, 4 )
 223 PIXEL_SATD_C( pixel_satd_4x8,   4, 8 )
 224 PIXEL_SATD_C( pixel_satd_4x4,   4, 4 )
 225
 226
 227 /****************************************************************************
 228  * pixel_sa8d_WxH: sum of 8x8 Hadamard transformed differences
 229  ****************************************************************************/
 230 #define SA8D_1D {\
 231     const int a0 = SRC(0) + SRC(4);\
 232     const int a4 = SRC(0) - SRC(4);\
 233     const int a1 = SRC(1) + SRC(5);\
 234     const int a5 = SRC(1) - SRC(5);\
 235     const int a2 = SRC(2) + SRC(6);\
 236     const int a6 = SRC(2) - SRC(6);\
 237     const int a3 = SRC(3) + SRC(7);\
 238     const int a7 = SRC(3) - SRC(7);\
 239     const int b0 = a0 + a2;\
 240     const int b2 = a0 - a2;\
 241     const int b1 = a1 + a3;\
 242     const int b3 = a1 - a3;\
 243     const int b4 = a4 + a6;\
 244     const int b6 = a4 - a6;\
 245     const int b5 = a5 + a7;\
 246     const int b7 = a5 - a7;\
 247     DST(0, b0 + b1);\
 248     DST(1, b0 - b1);\
 249     DST(2, b2 + b3);\
 250     DST(3, b2 - b3);\
 251     DST(4, b4 + b5);\
 252     DST(5, b4 - b5);\
 253     DST(6, b6 + b7);\
 254     DST(7, b6 - b7);\
 255 }
 256
 257 static inline int pixel_sa8d_wxh( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2,
 258                                   int i_width, int i_height )
 259 {
 260     int16_t diff[8][8];
 261     int i_satd = 0;
 262     int x, y;
 263
 264     for( y = 0; y < i_height; y += 8 )
 265     {
 266         for( x = 0; x < i_width; x += 8 )
 267         {
 268             int i;
 269             pixel_sub_wxh( (int16_t*)diff, 8, pix1+x, i_pix1, pix2+x, i_pix2 );
 270
 271 #define SRC(x)     diff[i][x]
 272 #define DST(x,rhs) diff[i][x] = (rhs)
 273             for( i = 0; i < 8; i++ )
 274                 SA8D_1D
 275 #undef SRC
 276 #undef DST
 277
 278 #define SRC(x)     diff[x][i]
 279 #define DST(x,rhs) i_satd += abs(rhs)
 280             for( i = 0; i < 8; i++ )
 281                 SA8D_1D
 282 #undef SRC
 283 #undef DST
 284         }
 285         pix1 += 8 * i_pix1;
 286         pix2 += 8 * i_pix2;
 287     }
 288
 289     return i_satd;
 290 }
 291
 292 #define PIXEL_SA8D_C( width, height ) \
 293 static int pixel_sa8d_##width##x##height( uint8_t *pix1, int i_stride_pix1, \
 294                  uint8_t *pix2, int i_stride_pix2 ) \
 295 { \
 296     return ( pixel_sa8d_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, width, height ) + 2 ) >> 2; \
 297 }
 298 PIXEL_SA8D_C( 16, 16 )
 299 PIXEL_SA8D_C( 16, 8 )
 300 PIXEL_SA8D_C( 8, 16 )
 301 PIXEL_SA8D_C( 8, 8 )
 302
 303
 304 /****************************************************************************
 305  * x264_pixel_init:
 306  ****************************************************************************/
 307 void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
 308 {
 309     pixf->sad[PIXEL_16x16] = pixel_sad_16x16;
 310     pixf->sad[PIXEL_16x8]  = pixel_sad_16x8;
 311     pixf->sad[PIXEL_8x16]  = pixel_sad_8x16;
 312     pixf->sad[PIXEL_8x8]   = pixel_sad_8x8;
 313     pixf->sad[PIXEL_8x4]   = pixel_sad_8x4;
 314     pixf->sad[PIXEL_4x8]   = pixel_sad_4x8;
 315     pixf->sad[PIXEL_4x4]   = pixel_sad_4x4;
 316
 317     pixf->ssd[PIXEL_16x16] = pixel_ssd_16x16;
 318     pixf->ssd[PIXEL_16x8]  = pixel_ssd_16x8;
 319     pixf->ssd[PIXEL_8x16]  = pixel_ssd_8x16;
 320     pixf->ssd[PIXEL_8x8]   = pixel_ssd_8x8;
 321     pixf->ssd[PIXEL_8x4]   = pixel_ssd_8x4;
 322     pixf->ssd[PIXEL_4x8]   = pixel_ssd_4x8;
 323     pixf->ssd[PIXEL_4x4]   = pixel_ssd_4x4;
 324
 325     pixf->satd[PIXEL_16x16]= pixel_satd_16x16;
 326     pixf->satd[PIXEL_16x8] = pixel_satd_16x8;
 327     pixf->satd[PIXEL_8x16] = pixel_satd_8x16;
 328     pixf->satd[PIXEL_8x8]  = pixel_satd_8x8;
 329     pixf->satd[PIXEL_8x4]  = pixel_satd_8x4;
 330     pixf->satd[PIXEL_4x8]  = pixel_satd_4x8;
 331     pixf->satd[PIXEL_4x4]  = pixel_satd_4x4;
 332
 333     pixf->sa8d[PIXEL_16x16]= pixel_sa8d_16x16;
 334     pixf->sa8d[PIXEL_16x8] = pixel_sa8d_16x8;
 335     pixf->sa8d[PIXEL_8x16] = pixel_sa8d_8x16;
 336     pixf->sa8d[PIXEL_8x8]  = pixel_sa8d_8x8;
 337
 338 #ifdef HAVE_MMXEXT
 339     if( cpu&X264_CPU_MMXEXT )
 340     {
 341         pixf->sad[PIXEL_16x16] = x264_pixel_sad_16x16_mmxext;
 342         pixf->sad[PIXEL_16x8 ] = x264_pixel_sad_16x8_mmxext;
 343         pixf->sad[PIXEL_8x16 ] = x264_pixel_sad_8x16_mmxext;
 344         pixf->sad[PIXEL_8x8  ] = x264_pixel_sad_8x8_mmxext;
 345         pixf->sad[PIXEL_8x4  ] = x264_pixel_sad_8x4_mmxext;
 346         pixf->sad[PIXEL_4x8  ] = x264_pixel_sad_4x8_mmxext;
 347         pixf->sad[PIXEL_4x4]   = x264_pixel_sad_4x4_mmxext;
 348
 349         pixf->ssd[PIXEL_16x16] = x264_pixel_ssd_16x16_mmxext;
 350         pixf->ssd[PIXEL_16x8]  = x264_pixel_ssd_16x8_mmxext;
 351         pixf->ssd[PIXEL_8x16]  = x264_pixel_ssd_8x16_mmxext;
 352         pixf->ssd[PIXEL_8x8]   = x264_pixel_ssd_8x8_mmxext;
 353         pixf->ssd[PIXEL_8x4]   = x264_pixel_ssd_8x4_mmxext;
 354         pixf->ssd[PIXEL_4x8]   = x264_pixel_ssd_4x8_mmxext;
 355         pixf->ssd[PIXEL_4x4]   = x264_pixel_ssd_4x4_mmxext;
 356
 357         pixf->satd[PIXEL_16x16]= x264_pixel_satd_16x16_mmxext;
 358         pixf->satd[PIXEL_16x8] = x264_pixel_satd_16x8_mmxext;
 359         pixf->satd[PIXEL_8x16] = x264_pixel_satd_8x16_mmxext;
 360         pixf->satd[PIXEL_8x8]  = x264_pixel_satd_8x8_mmxext;
 361         pixf->satd[PIXEL_8x4]  = x264_pixel_satd_8x4_mmxext;
 362         pixf->satd[PIXEL_4x8]  = x264_pixel_satd_4x8_mmxext;
 363         pixf->satd[PIXEL_4x4]  = x264_pixel_satd_4x4_mmxext;
 364     }
 365 #endif
 366
 367 #ifdef HAVE_SSE2
 368     // disable on AMD processors since it is slower
 369     if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_3DNOW) )
 370     {
 371         pixf->sad[PIXEL_16x16] = x264_pixel_sad_16x16_sse2;
 372         pixf->sad[PIXEL_16x8 ] = x264_pixel_sad_16x8_sse2;
 373
 374         pixf->satd[PIXEL_16x16]= x264_pixel_satd_16x16_sse2;
 375         pixf->satd[PIXEL_16x8] = x264_pixel_satd_16x8_sse2;
 376         pixf->satd[PIXEL_8x16] = x264_pixel_satd_8x16_sse2;
 377         pixf->satd[PIXEL_8x8]  = x264_pixel_satd_8x8_sse2;
 378         pixf->satd[PIXEL_8x4]  = x264_pixel_satd_8x4_sse2;
 379     }
 380     // these are faster on both Intel and AMD
 381     if( cpu&X264_CPU_SSE2 )
 382     {
 383         pixf->ssd[PIXEL_16x16] = x264_pixel_ssd_16x16_sse2;
 384         pixf->ssd[PIXEL_16x8]  = x264_pixel_ssd_16x8_sse2;
 385     }
 386 #endif
 387
 388 #ifdef ARCH_PPC
 389     if( cpu&X264_CPU_ALTIVEC )
 390     {
 391         x264_pixel_altivec_init( pixf );
 392     }
 393 #endif
 394 #ifdef ARCH_UltraSparc
 395       pixf->sad[PIXEL_8x8]   = x264_pixel_sad_8x8_vis;
 396       pixf->sad[PIXEL_8x16]  = x264_pixel_sad_8x16_vis;
 397       pixf->sad[PIXEL_16x8]  = x264_pixel_sad_16x8_vis;
 398       pixf->sad[PIXEL_16x16] = x264_pixel_sad_16x16_vis;
 399 #endif
 400 }
 401