]> git.sesse.net Git - x264/blob - common/quant.c
CABAC trellis opts part 4: x86_64 asm
[x264] / common / quant.c
1 /*****************************************************************************
2  * quant.c: quantization and level-run
3  *****************************************************************************
4  * Copyright (C) 2005-2011 x264 project
5  *
6  * Authors: Loren Merritt <lorenm@u.washington.edu>
7  *          Fiona Glaser <fiona@x264.com>
8  *          Christian Heine <sennindemokrit@gmx.net>
9  *          Henrik Gramner <hengar-6@student.ltu.se>
10  *
11  * This program is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU General Public License as published by
13  * the Free Software Foundation; either version 2 of the License, or
14  * (at your option) any later version.
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public License
22  * along with this program; if not, write to the Free Software
23  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
24  *
25  * This program is also available under a commercial proprietary license.
26  * For more information, contact us at licensing@x264.com.
27  *****************************************************************************/
28
29 #include "common.h"
30
31 #if HAVE_MMX
32 #include "x86/quant.h"
33 #endif
34 #if ARCH_PPC
35 #   include "ppc/quant.h"
36 #endif
37 #if ARCH_ARM
38 #   include "arm/quant.h"
39 #endif
40
/* QUANT_ONE( coef, mf, f ): quantize one coefficient in place.
 *
 *   coef  lvalue holding the coefficient; overwritten with the quantized level
 *   mf    multiplier applied before the >>16 scaling
 *   f     rounding bias added to the magnitude before multiplying
 *
 * The magnitude is processed as a non-negative value and the sign is restored
 * afterwards, so positive and negative inputs round symmetrically.
 * The result is OR-ed into a variable named nz that must be in scope at the
 * expansion site (callers use it to report "any nonzero level").
 *
 * Every argument is parenthesized so expression arguments keep their meaning,
 * and the body is wrapped in do/while(0) so the macro behaves like a single
 * statement (safe inside an unbraced if/else).  coef is evaluated several
 * times: do not pass an expression with side effects. */
#define QUANT_ONE( coef, mf, f ) \
do { \
    if( (coef) > 0 ) \
        (coef) = ((f) + (coef)) * (mf) >> 16; \
    else \
        (coef) = - (((f) - (coef)) * (mf) >> 16); \
    nz |= (coef); \
} while( 0 )
49
50 static int quant_8x8( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] )
51 {
52     int nz = 0;
53     for( int i = 0; i < 64; i++ )
54         QUANT_ONE( dct[i], mf[i], bias[i] );
55     return !!nz;
56 }
57
58 static int quant_4x4( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] )
59 {
60     int nz = 0;
61     for( int i = 0; i < 16; i++ )
62         QUANT_ONE( dct[i], mf[i], bias[i] );
63     return !!nz;
64 }
65
66 static int quant_4x4_dc( dctcoef dct[16], int mf, int bias )
67 {
68     int nz = 0;
69     for( int i = 0; i < 16; i++ )
70         QUANT_ONE( dct[i], mf, bias );
71     return !!nz;
72 }
73
74 static int quant_2x2_dc( dctcoef dct[4], int mf, int bias )
75 {
76     int nz = 0;
77     QUANT_ONE( dct[0], mf, bias );
78     QUANT_ONE( dct[1], mf, bias );
79     QUANT_ONE( dct[2], mf, bias );
80     QUANT_ONE( dct[3], mf, bias );
81     return !!nz;
82 }
83
/* Dequantize coefficient x in place.  Both macros expect these names in the
 * expanding scope: dct (coefficients), dequant_mf (scaling tables), i_mf
 * (table row) and i_qbits (signed shift amount).  DEQUANT_SHR additionally
 * expects f, the rounding term added before the right shift.
 *
 * DEQUANT_SHL is used when i_qbits >= 0.  The scaling is written as a
 * multiplication by (1 << i_qbits) because the product can be negative and
 * left-shifting a negative value is undefined behaviour in C; the numeric
 * result is the same. */
#define DEQUANT_SHL( x ) \
    dct[(x)] = ( dct[(x)] * dequant_mf[i_mf][(x)] ) * ( 1 << i_qbits )

/* DEQUANT_SHR is used when i_qbits < 0: scale, add the rounding term f, then
 * shift right by -i_qbits. */
#define DEQUANT_SHR( x ) \
    dct[(x)] = ( dct[(x)] * dequant_mf[i_mf][(x)] + f ) >> (-i_qbits)
89
90 static void dequant_4x4( dctcoef dct[16], int dequant_mf[6][16], int i_qp )
91 {
92     const int i_mf = i_qp%6;
93     const int i_qbits = i_qp/6 - 4;
94
95     if( i_qbits >= 0 )
96     {
97         for( int i = 0; i < 16; i++ )
98             DEQUANT_SHL( i );
99     }
100     else
101     {
102         const int f = 1 << (-i_qbits-1);
103         for( int i = 0; i < 16; i++ )
104             DEQUANT_SHR( i );
105     }
106 }
107
108 static void dequant_8x8( dctcoef dct[64], int dequant_mf[6][64], int i_qp )
109 {
110     const int i_mf = i_qp%6;
111     const int i_qbits = i_qp/6 - 6;
112
113     if( i_qbits >= 0 )
114     {
115         for( int i = 0; i < 64; i++ )
116             DEQUANT_SHL( i );
117     }
118     else
119     {
120         const int f = 1 << (-i_qbits-1);
121         for( int i = 0; i < 64; i++ )
122             DEQUANT_SHR( i );
123     }
124 }
125
126 static void dequant_4x4_dc( dctcoef dct[16], int dequant_mf[6][16], int i_qp )
127 {
128     const int i_qbits = i_qp/6 - 6;
129
130     if( i_qbits >= 0 )
131     {
132         const int i_dmf = dequant_mf[i_qp%6][0] << i_qbits;
133         for( int i = 0; i < 16; i++ )
134             dct[i] *= i_dmf;
135     }
136     else
137     {
138         const int i_dmf = dequant_mf[i_qp%6][0];
139         const int f = 1 << (-i_qbits-1);
140         for( int i = 0; i < 16; i++ )
141             dct[i] = ( dct[i] * i_dmf + f ) >> (-i_qbits);
142     }
143 }
144
/* Shared prologue of the 2x4 DC routines that follow.
 * Expands to local declarations, so it must be the first thing in a function
 * body that has an array named dct in scope.  It reads dct[0..7] once and
 * builds two stages of pairwise sums and differences:
 *   a0..a3 = sums of adjacent pairs,  a4..a7 = their differences;
 *   b0..b3 = sums of adjacent a-pairs, b4..b7 = their differences.
 * The expanding function combines b0..b7 into its eight outputs.  Because all
 * inputs are captured here, that function may overwrite dct[] afterwards. */
#define IDCT_DEQUANT_2X4_START \
    int a0 = dct[0] + dct[1]; \
    int a1 = dct[2] + dct[3]; \
    int a2 = dct[4] + dct[5]; \
    int a3 = dct[6] + dct[7]; \
    int a4 = dct[0] - dct[1]; \
    int a5 = dct[2] - dct[3]; \
    int a6 = dct[4] - dct[5]; \
    int a7 = dct[6] - dct[7]; \
    int b0 = a0 + a1; \
    int b1 = a2 + a3; \
    int b2 = a4 + a5; \
    int b3 = a6 + a7; \
    int b4 = a0 - a1; \
    int b5 = a2 - a3; \
    int b6 = a4 - a5; \
    int b7 = a6 - a7;
162
163 static void idct_dequant_2x4_dc( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp )
164 {
165     IDCT_DEQUANT_2X4_START
166     int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
167     dct4x4[0][0] = ((b0 + b1) * dmf + 32) >> 6;
168     dct4x4[1][0] = ((b2 + b3) * dmf + 32) >> 6;
169     dct4x4[2][0] = ((b0 - b1) * dmf + 32) >> 6;
170     dct4x4[3][0] = ((b2 - b3) * dmf + 32) >> 6;
171     dct4x4[4][0] = ((b4 - b5) * dmf + 32) >> 6;
172     dct4x4[5][0] = ((b6 - b7) * dmf + 32) >> 6;
173     dct4x4[6][0] = ((b4 + b5) * dmf + 32) >> 6;
174     dct4x4[7][0] = ((b6 + b7) * dmf + 32) >> 6;
175 }
176
177 static void idct_dequant_2x4_dconly( dctcoef dct[8], int dequant_mf[6][16], int i_qp )
178 {
179     IDCT_DEQUANT_2X4_START
180     int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
181     dct[0] = ((b0 + b1) * dmf + 32) >> 6;
182     dct[1] = ((b2 + b3) * dmf + 32) >> 6;
183     dct[2] = ((b0 - b1) * dmf + 32) >> 6;
184     dct[3] = ((b2 - b3) * dmf + 32) >> 6;
185     dct[4] = ((b4 - b5) * dmf + 32) >> 6;
186     dct[5] = ((b6 - b7) * dmf + 32) >> 6;
187     dct[6] = ((b4 + b5) * dmf + 32) >> 6;
188     dct[7] = ((b6 + b7) * dmf + 32) >> 6;
189 }
190
191 static ALWAYS_INLINE void optimize_chroma_idct_dequant_2x4( dctcoef out[8], dctcoef dct[8], int dmf )
192 {
193     IDCT_DEQUANT_2X4_START
194     out[0] = ((b0 + b1) * dmf + 2080) >> 6; /* 2080 = 32 + (32<<6) */
195     out[1] = ((b2 + b3) * dmf + 2080) >> 6;
196     out[2] = ((b0 - b1) * dmf + 2080) >> 6;
197     out[3] = ((b2 - b3) * dmf + 2080) >> 6;
198     out[4] = ((b4 - b5) * dmf + 2080) >> 6;
199     out[5] = ((b6 - b7) * dmf + 2080) >> 6;
200     out[6] = ((b4 + b5) * dmf + 2080) >> 6;
201     out[7] = ((b6 + b7) * dmf + 2080) >> 6;
202 }
203 #undef IDCT_DEQUANT_2X4_START
204
205 static ALWAYS_INLINE void optimize_chroma_idct_dequant_2x2( dctcoef out[4], dctcoef dct[4], int dmf )
206 {
207     int d0 = dct[0] + dct[1];
208     int d1 = dct[2] + dct[3];
209     int d2 = dct[0] - dct[1];
210     int d3 = dct[2] - dct[3];
211     out[0] = ((d0 + d1) * dmf >> 5) + 32;
212     out[1] = ((d0 - d1) * dmf >> 5) + 32;
213     out[2] = ((d2 + d3) * dmf >> 5) + 32;
214     out[3] = ((d2 - d3) * dmf >> 5) + 32;
215 }
216
217 static ALWAYS_INLINE int optimize_chroma_round( dctcoef *ref, dctcoef *dct, int dequant_mf, int chroma422 )
218 {
219     dctcoef out[8];
220
221     if( chroma422 )
222         optimize_chroma_idct_dequant_2x4( out, dct, dequant_mf );
223     else
224         optimize_chroma_idct_dequant_2x2( out, dct, dequant_mf );
225
226     int sum = 0;
227     for( int i = 0; i < (chroma422?8:4); i++ )
228         sum |= ref[i] ^ out[i];
229     return sum >> 6;
230 }
231
232 static ALWAYS_INLINE int optimize_chroma_dc_internal( dctcoef *dct, int dequant_mf, int chroma422 )
233 {
234     /* dequant_mf = h->dequant4_mf[CQM_4IC + b_inter][i_qp%6][0] << i_qp/6, max 32*64 */
235     dctcoef dct_orig[8];
236     int coeff, nz;
237
238     if( chroma422 )
239         optimize_chroma_idct_dequant_2x4( dct_orig, dct, dequant_mf );
240     else
241         optimize_chroma_idct_dequant_2x2( dct_orig, dct, dequant_mf );
242
243     /* If the DC coefficients already round to zero, terminate early. */
244     int sum = 0;
245     for( int i = 0; i < (chroma422?8:4); i++ )
246         sum |= dct_orig[i];
247     if( !(sum >> 6) )
248         return 0;
249
250     /* Start with the highest frequency coefficient... is this the best option? */
251     for( nz = 0, coeff = (chroma422?7:3); coeff >= 0; coeff-- )
252     {
253         int level = dct[coeff];
254         int sign = level>>31 | 1; /* dct[coeff] < 0 ? -1 : 1 */
255
256         while( level )
257         {
258             dct[coeff] = level - sign;
259             if( optimize_chroma_round( dct_orig, dct, dequant_mf, chroma422 ) )
260             {
261                 nz = 1;
262                 dct[coeff] = level;
263                 break;
264             }
265             level -= sign;
266         }
267     }
268
269     return nz;
270 }
271
272 static int optimize_chroma_2x2_dc( dctcoef dct[4], int dequant_mf )
273 {
274     return optimize_chroma_dc_internal( dct, dequant_mf, 0 );
275 }
276
277 static int optimize_chroma_2x4_dc( dctcoef dct[8], int dequant_mf )
278 {
279     return optimize_chroma_dc_internal( dct, dequant_mf, 1 );
280 }
281
282 static void x264_denoise_dct( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size )
283 {
284     for( int i = 0; i < size; i++ )
285     {
286         int level = dct[i];
287         int sign = level>>31;
288         level = (level+sign)^sign;
289         sum[i] += level;
290         level -= offset[i];
291         dct[i] = level<0 ? 0 : (level^sign)-sign;
292     }
293 }
294
/* (ref: JVT-B118)
 * x264_mb_decimate_score: given a block of DCT coefficients, returns a score
 * used to decide whether the whole block can be zeroed out (a low score means
 * the block should be set to zero).
 * Used for inter macroblocks (luma and chroma):
 *  luma: for an 8x8 block: if score < 4 -> zero it
 *        for the complete mb: if score < 6 -> zero it
 *  chroma: for the complete mb: if score < 7 -> zero it
 */
303
/* Score added per nonzero coefficient by x264_decimate_score_internal() when
 * i_max is not 64.  The index is the number of consecutive zero coefficients
 * counted directly below that coefficient; longer runs add less. */
const uint8_t x264_decimate_table4[16] =
{
    3,2,2,1,1,1,0,0,0,0,0,0,0,0,0,0
};
/* Score added per nonzero coefficient by x264_decimate_score_internal() when
 * i_max is 64.  The index is the number of consecutive zero coefficients
 * counted directly below that coefficient; longer runs add less. */
const uint8_t x264_decimate_table8[64] =
{
    3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,
    1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
};
315
316 static int ALWAYS_INLINE x264_decimate_score_internal( dctcoef *dct, int i_max )
317 {
318     const uint8_t *ds_table = (i_max == 64) ? x264_decimate_table8 : x264_decimate_table4;
319     int i_score = 0;
320     int idx = i_max - 1;
321
322     while( idx >= 0 && dct[idx] == 0 )
323         idx--;
324     while( idx >= 0 )
325     {
326         int i_run;
327
328         if( (unsigned)(dct[idx--] + 1) > 2 )
329             return 9;
330
331         i_run = 0;
332         while( idx >= 0 && dct[idx] == 0 )
333         {
334             idx--;
335             i_run++;
336         }
337         i_score += ds_table[i_run];
338     }
339
340     return i_score;
341 }
342
343 static int x264_decimate_score15( dctcoef *dct )
344 {
345     return x264_decimate_score_internal( dct+1, 15 );
346 }
347 static int x264_decimate_score16( dctcoef *dct )
348 {
349     return x264_decimate_score_internal( dct, 16 );
350 }
351 static int x264_decimate_score64( dctcoef *dct )
352 {
353     return x264_decimate_score_internal( dct, 64 );
354 }
355
356 #define last(num)\
357 static int x264_coeff_last##num( dctcoef *l )\
358 {\
359     int i_last = num-1;\
360     while( i_last >= 0 && l[i_last] == 0 )\
361         i_last--;\
362     return i_last;\
363 }
364
365 last(4)
366 last(8)
367 last(15)
368 last(16)
369 last(64)
370
371 #define level_run(num)\
372 static int x264_coeff_level_run##num( dctcoef *dct, x264_run_level_t *runlevel )\
373 {\
374     int i_last = runlevel->last = x264_coeff_last##num(dct);\
375     int i_total = 0;\
376     int mask = 0;\
377     do\
378     {\
379         int r = 0;\
380         runlevel->level[i_total] = dct[i_last];\
381         mask |= 1 << (i_last);\
382         while( --i_last >= 0 && dct[i_last] == 0 )\
383             r++;\
384         runlevel->run[i_total++] = r;\
385     } while( i_last >= 0 );\
386     runlevel->mask = mask;\
387     return i_total;\
388 }
389
390 level_run(4)
391 level_run(8)
392 level_run(15)
393 level_run(16)
394
/* INIT_TRELLIS(cpu): when ARCH_X86_64 is set, assigns the six trellis_cabac_*
 * members of pf to the x264_trellis_cabac_*_<cpu> symbols; otherwise it
 * expands to nothing.  Requires a variable named pf at the expansion site. */
#if ARCH_X86_64
#define INIT_TRELLIS(cpu)\
    pf->trellis_cabac_4x4 = x264_trellis_cabac_4x4_##cpu;\
    pf->trellis_cabac_8x8 = x264_trellis_cabac_8x8_##cpu;\
    pf->trellis_cabac_4x4_psy = x264_trellis_cabac_4x4_psy_##cpu;\
    pf->trellis_cabac_8x8_psy = x264_trellis_cabac_8x8_psy_##cpu;\
    pf->trellis_cabac_dc = x264_trellis_cabac_dc_##cpu;\
    pf->trellis_cabac_chroma_422_dc = x264_trellis_cabac_chroma_422_dc_##cpu;
#else
#define INIT_TRELLIS(...)
#endif
406
/* Fill the function table pf.
 *
 *   h    encoder handle; only h->param.i_cqm_preset is read here, to choose
 *        between the flat16 and the generic dequant routines
 *   cpu  bit mask of X264_CPU_* flags describing the running CPU
 *   pf   table to initialize
 *
 * Every entry is first set to the C implementation from this file.  The
 * blocks that follow, selected at compile time (HIGH_BIT_DEPTH, HAVE_MMX,
 * ARCH_X86, HAVE_ALTIVEC, HAVE_ARMV6) and at run time (cpu flags), overwrite
 * entries with other implementations.  Later assignments replace earlier
 * ones, so the order of the cpu-flag blocks is significant. */
void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
{
    /* C implementations defined in this file */
    pf->quant_8x8 = quant_8x8;
    pf->quant_4x4 = quant_4x4;
    pf->quant_4x4_dc = quant_4x4_dc;
    pf->quant_2x2_dc = quant_2x2_dc;

    pf->dequant_4x4 = dequant_4x4;
    pf->dequant_4x4_dc = dequant_4x4_dc;
    pf->dequant_8x8 = dequant_8x8;

    pf->idct_dequant_2x4_dc = idct_dequant_2x4_dc;
    pf->idct_dequant_2x4_dconly = idct_dequant_2x4_dconly;

    pf->optimize_chroma_2x2_dc = optimize_chroma_2x2_dc;
    pf->optimize_chroma_2x4_dc = optimize_chroma_2x4_dc;

    pf->denoise_dct = x264_denoise_dct;
    pf->decimate_score15 = x264_decimate_score15;
    pf->decimate_score16 = x264_decimate_score16;
    pf->decimate_score64 = x264_decimate_score64;

    pf->coeff_last4 = x264_coeff_last4;
    pf->coeff_last8 = x264_coeff_last8;
    pf->coeff_last[  DCT_LUMA_AC] = x264_coeff_last15;
    pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16;
    pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64;
    pf->coeff_level_run4 = x264_coeff_level_run4;
    pf->coeff_level_run8 = x264_coeff_level_run8;
    pf->coeff_level_run[  DCT_LUMA_AC] = x264_coeff_level_run15;
    pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16;

    /* High-bit-depth builds: only the HAVE_MMX overrides below apply. */
#if HIGH_BIT_DEPTH
#if HAVE_MMX
    INIT_TRELLIS( sse2 );
    if( cpu&X264_CPU_MMX2 )
    {
#if ARCH_X86
        pf->denoise_dct = x264_denoise_dct_mmx;
        pf->decimate_score15 = x264_decimate_score15_mmx2;
        pf->decimate_score16 = x264_decimate_score16_mmx2;
        if( cpu&X264_CPU_SLOW_CTZ )
        {
            pf->decimate_score15 = x264_decimate_score15_mmx2_slowctz;
            pf->decimate_score16 = x264_decimate_score16_mmx2_slowctz;
        }
        pf->decimate_score64 = x264_decimate_score64_mmx2;
        pf->coeff_last8 = x264_coeff_last8_mmx2;
        pf->coeff_last[  DCT_LUMA_AC] = x264_coeff_last15_mmx2;
        pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2;
        pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmx2;
        pf->coeff_level_run8 = x264_coeff_level_run8_mmx2;
        pf->coeff_level_run[  DCT_LUMA_AC] = x264_coeff_level_run15_mmx2;
        pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2;
#endif
        pf->coeff_last4 = x264_coeff_last4_mmx2;
        pf->coeff_level_run4 = x264_coeff_level_run4_mmx2;
        if( cpu&X264_CPU_LZCNT )
            pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt;
    }
    if( cpu&X264_CPU_SSE2 )
    {
        pf->quant_4x4 = x264_quant_4x4_sse2;
        pf->quant_8x8 = x264_quant_8x8_sse2;
        pf->quant_2x2_dc = x264_quant_2x2_dc_sse2;
        pf->quant_4x4_dc = x264_quant_4x4_dc_sse2;
        pf->dequant_4x4 = x264_dequant_4x4_sse2;
        pf->dequant_8x8 = x264_dequant_8x8_sse2;
        pf->dequant_4x4_dc = x264_dequant_4x4dc_sse2;
        pf->denoise_dct = x264_denoise_dct_sse2;
        pf->decimate_score15 = x264_decimate_score15_sse2;
        pf->decimate_score16 = x264_decimate_score16_sse2;
        pf->decimate_score64 = x264_decimate_score64_sse2;
        if( cpu&X264_CPU_SLOW_CTZ )
        {
            pf->decimate_score15 = x264_decimate_score15_sse2_slowctz;
            pf->decimate_score16 = x264_decimate_score16_sse2_slowctz;
        }
        pf->coeff_last8 = x264_coeff_last8_sse2;
        pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
        pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
        pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
        pf->coeff_level_run8 = x264_coeff_level_run8_sse2;
        pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2;
        pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
        if( cpu&X264_CPU_LZCNT )
        {
            pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt;
            pf->coeff_last8 = x264_coeff_last8_sse2_lzcnt;
            pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt;
            pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt;
            pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt;
            pf->coeff_level_run8 = x264_coeff_level_run8_sse2_lzcnt;
            pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt;
            pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt;
        }
    }
    if( cpu&X264_CPU_SSSE3 )
    {
        pf->quant_4x4 = x264_quant_4x4_ssse3;
        pf->quant_8x8 = x264_quant_8x8_ssse3;
        pf->quant_2x2_dc = x264_quant_2x2_dc_ssse3;
        pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3;
        pf->denoise_dct = x264_denoise_dct_ssse3;
        pf->decimate_score15 = x264_decimate_score15_ssse3;
        pf->decimate_score16 = x264_decimate_score16_ssse3;
        if( cpu&X264_CPU_SLOW_CTZ )
        {
            pf->decimate_score15 = x264_decimate_score15_ssse3_slowctz;
            pf->decimate_score16 = x264_decimate_score16_ssse3_slowctz;
        }
        pf->decimate_score64 = x264_decimate_score64_ssse3;
        INIT_TRELLIS( ssse3 );
    }
    if( cpu&X264_CPU_SSE4 )
    {
        pf->quant_2x2_dc = x264_quant_2x2_dc_sse4;
        pf->quant_4x4_dc = x264_quant_4x4_dc_sse4;
        pf->quant_4x4 = x264_quant_4x4_sse4;
        pf->quant_8x8 = x264_quant_8x8_sse4;
    }
    if( cpu&X264_CPU_AVX )
    {
        pf->denoise_dct = x264_denoise_dct_avx;
    }
    if( cpu&X264_CPU_XOP )
    {
        pf->dequant_4x4_dc = x264_dequant_4x4dc_xop;
        if( h->param.i_cqm_preset != X264_CQM_FLAT )
        {
            pf->dequant_4x4 = x264_dequant_4x4_xop;
            pf->dequant_8x8 = x264_dequant_8x8_xop;
        }
    }
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
    INIT_TRELLIS( sse2 );
    if( cpu&X264_CPU_MMX )
    {
#if ARCH_X86
        pf->quant_4x4 = x264_quant_4x4_mmx;
        pf->quant_8x8 = x264_quant_8x8_mmx;
        pf->dequant_4x4 = x264_dequant_4x4_mmx;
        pf->dequant_4x4_dc = x264_dequant_4x4dc_mmx2;
        pf->dequant_8x8 = x264_dequant_8x8_mmx;
        if( h->param.i_cqm_preset == X264_CQM_FLAT )
        {
            pf->dequant_4x4 = x264_dequant_4x4_flat16_mmx;
            pf->dequant_8x8 = x264_dequant_8x8_flat16_mmx;
        }
        pf->denoise_dct = x264_denoise_dct_mmx;
#endif
    }

    if( cpu&X264_CPU_MMX2 )
    {
        pf->quant_2x2_dc = x264_quant_2x2_dc_mmx2;
#if ARCH_X86
        pf->quant_4x4_dc = x264_quant_4x4_dc_mmx2;
        pf->decimate_score15 = x264_decimate_score15_mmx2;
        pf->decimate_score16 = x264_decimate_score16_mmx2;
        if( cpu&X264_CPU_SLOW_CTZ )
        {
            pf->decimate_score15 = x264_decimate_score15_mmx2_slowctz;
            pf->decimate_score16 = x264_decimate_score16_mmx2_slowctz;
        }
        pf->decimate_score64 = x264_decimate_score64_mmx2;
        pf->coeff_last[  DCT_LUMA_AC] = x264_coeff_last15_mmx2;
        pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2;
        pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmx2;
        pf->coeff_level_run[  DCT_LUMA_AC] = x264_coeff_level_run15_mmx2;
        pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2;
#endif
        pf->coeff_last4 = x264_coeff_last4_mmx2;
        pf->coeff_last8 = x264_coeff_last8_mmx2;
        pf->coeff_level_run4 = x264_coeff_level_run4_mmx2;
        pf->coeff_level_run8 = x264_coeff_level_run8_mmx2;
        if( cpu&X264_CPU_LZCNT )
        {
            pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt;
            pf->coeff_last8 = x264_coeff_last8_mmx2_lzcnt;
            pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt;
            pf->coeff_level_run8 = x264_coeff_level_run8_mmx2_lzcnt;
        }
    }

    if( cpu&X264_CPU_SSE2 )
    {
        pf->quant_4x4_dc = x264_quant_4x4_dc_sse2;
        pf->quant_4x4 = x264_quant_4x4_sse2;
        pf->quant_8x8 = x264_quant_8x8_sse2;
        pf->dequant_4x4 = x264_dequant_4x4_sse2;
        pf->dequant_4x4_dc = x264_dequant_4x4dc_sse2;
        pf->dequant_8x8 = x264_dequant_8x8_sse2;
        if( h->param.i_cqm_preset == X264_CQM_FLAT )
        {
            pf->dequant_4x4 = x264_dequant_4x4_flat16_sse2;
            pf->dequant_8x8 = x264_dequant_8x8_flat16_sse2;
        }
        pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_sse2;
        pf->denoise_dct = x264_denoise_dct_sse2;
        pf->decimate_score15 = x264_decimate_score15_sse2;
        pf->decimate_score16 = x264_decimate_score16_sse2;
        pf->decimate_score64 = x264_decimate_score64_sse2;
        if( cpu&X264_CPU_SLOW_CTZ )
        {
            pf->decimate_score15 = x264_decimate_score15_sse2_slowctz;
            pf->decimate_score16 = x264_decimate_score16_sse2_slowctz;
        }
        pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
        pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
        pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
        pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2;
        pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
        if( cpu&X264_CPU_LZCNT )
        {
            pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt;
            pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt;
            pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt;
            pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt;
            pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt;
        }
    }

    if( cpu&X264_CPU_SSSE3 )
    {
        pf->quant_2x2_dc = x264_quant_2x2_dc_ssse3;
        pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3;
        pf->quant_4x4 = x264_quant_4x4_ssse3;
        pf->quant_8x8 = x264_quant_8x8_ssse3;
        pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_ssse3;
        pf->denoise_dct = x264_denoise_dct_ssse3;
        pf->decimate_score15 = x264_decimate_score15_ssse3;
        pf->decimate_score16 = x264_decimate_score16_ssse3;
        if( cpu&X264_CPU_SLOW_CTZ )
        {
            pf->decimate_score15 = x264_decimate_score15_ssse3_slowctz;
            pf->decimate_score16 = x264_decimate_score16_ssse3_slowctz;
        }
        pf->decimate_score64 = x264_decimate_score64_ssse3;
        INIT_TRELLIS( ssse3 );
    }

    if( cpu&X264_CPU_SSE4 )
    {
        pf->quant_4x4_dc = x264_quant_4x4_dc_sse4;
        pf->quant_4x4 = x264_quant_4x4_sse4;
        pf->quant_8x8 = x264_quant_8x8_sse4;
        pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_sse4;
    }

    if( cpu&X264_CPU_AVX )
    {
        pf->dequant_4x4_dc = x264_dequant_4x4dc_avx;
        if( h->param.i_cqm_preset != X264_CQM_FLAT )
        {
            pf->dequant_4x4 = x264_dequant_4x4_avx;
            pf->dequant_8x8 = x264_dequant_8x8_avx;
        }
        pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_avx;
        pf->denoise_dct = x264_denoise_dct_avx;
    }

    if( cpu&X264_CPU_XOP )
    {
        if( h->param.i_cqm_preset != X264_CQM_FLAT )
        {
            pf->dequant_4x4 = x264_dequant_4x4_xop;
            pf->dequant_8x8 = x264_dequant_8x8_xop;
        }
    }
#endif // HAVE_MMX

#if HAVE_ALTIVEC
    if( cpu&X264_CPU_ALTIVEC ) {
        pf->quant_2x2_dc = x264_quant_2x2_dc_altivec;
        pf->quant_4x4_dc = x264_quant_4x4_dc_altivec;
        pf->quant_4x4 = x264_quant_4x4_altivec;
        pf->quant_8x8 = x264_quant_8x8_altivec;

        pf->dequant_4x4 = x264_dequant_4x4_altivec;
        pf->dequant_8x8 = x264_dequant_8x8_altivec;
    }
#endif

#if HAVE_ARMV6
    if( cpu&X264_CPU_ARMV6 )
        pf->coeff_last4 = x264_coeff_last4_arm;

    if( cpu&X264_CPU_NEON )
    {
        pf->quant_2x2_dc   = x264_quant_2x2_dc_neon;
        pf->quant_4x4      = x264_quant_4x4_neon;
        pf->quant_4x4_dc   = x264_quant_4x4_dc_neon;
        pf->quant_8x8      = x264_quant_8x8_neon;
        pf->dequant_4x4    = x264_dequant_4x4_neon;
        pf->dequant_4x4_dc = x264_dequant_4x4_dc_neon;
        pf->dequant_8x8    = x264_dequant_8x8_neon;
        pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_neon;
        pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_neon;
        pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_neon;
    }
#endif
#endif // HIGH_BIT_DEPTH
    /* The remaining coeff_last / coeff_level_run slots reuse whichever luma
     * routine was selected above: the DC and chroma 4x4 slots take the
     * LUMA_4x4 routine, the chroma AC slots take LUMA_AC, and the chroma 8x8
     * coeff_last slots take LUMA_8x8. */
    pf->coeff_last[DCT_LUMA_DC]     = pf->coeff_last[DCT_CHROMAU_DC]  = pf->coeff_last[DCT_CHROMAV_DC] =
    pf->coeff_last[DCT_CHROMAU_4x4] = pf->coeff_last[DCT_CHROMAV_4x4] = pf->coeff_last[DCT_LUMA_4x4];
    pf->coeff_last[DCT_CHROMA_AC]   = pf->coeff_last[DCT_CHROMAU_AC]  =
    pf->coeff_last[DCT_CHROMAV_AC]  = pf->coeff_last[DCT_LUMA_AC];
    pf->coeff_last[DCT_CHROMAU_8x8] = pf->coeff_last[DCT_CHROMAV_8x8] = pf->coeff_last[DCT_LUMA_8x8];

    pf->coeff_level_run[DCT_LUMA_DC]     = pf->coeff_level_run[DCT_CHROMAU_DC]  = pf->coeff_level_run[DCT_CHROMAV_DC] =
    pf->coeff_level_run[DCT_CHROMAU_4x4] = pf->coeff_level_run[DCT_CHROMAV_4x4] = pf->coeff_level_run[DCT_LUMA_4x4];
    pf->coeff_level_run[DCT_CHROMA_AC]   = pf->coeff_level_run[DCT_CHROMAU_AC]  =
    pf->coeff_level_run[DCT_CHROMAV_AC]  = pf->coeff_level_run[DCT_LUMA_AC];
}