git.sesse.net Git - x264/blob - common/quant.c
Initial XOP and FMA4 support on AMD Bulldozer
[x264] / common / quant.c
1 /*****************************************************************************
2  * quant.c: quantization and level-run
3  *****************************************************************************
4  * Copyright (C) 2005-2011 x264 project
5  *
6  * Authors: Loren Merritt <lorenm@u.washington.edu>
7  *          Fiona Glaser <fiona@x264.com>
8  *          Christian Heine <sennindemokrit@gmx.net>
9  *          Henrik Gramner <hengar-6@student.ltu.se>
10  *
11  * This program is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU General Public License as published by
13  * the Free Software Foundation; either version 2 of the License, or
14  * (at your option) any later version.
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public License
22  * along with this program; if not, write to the Free Software
23  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
24  *
25  * This program is also available under a commercial proprietary license.
26  * For more information, contact us at licensing@x264.com.
27  *****************************************************************************/
28
29 #include "common.h"
30
31 #if HAVE_MMX
32 #include "x86/quant.h"
33 #endif
34 #if ARCH_PPC
35 #   include "ppc/quant.h"
36 #endif
37 #if ARCH_ARM
38 #   include "arm/quant.h"
39 #endif
40
/* Quantize one coefficient in place.
 *   coef: lvalue holding the coefficient. It is read several times and then
 *         overwritten, so the argument must not have side effects.
 *   mf:   multiplier applied to the biased magnitude before the 16-bit downshift.
 *   f:    rounding bias added to the magnitude of coef.
 * The sign of coef is kept and its magnitude becomes ((f + |coef|) * mf) >> 16.
 * The result is OR-ed into a variable named nz, which must be in scope where
 * the macro is used.
 * Every argument is parenthesized and the body is wrapped in do/while(0) so
 * the macro behaves as a single statement (it is safe in an unbraced if/else). */
#define QUANT_ONE( coef, mf, f ) \
do \
{ \
    if( (coef) > 0 ) \
        (coef) = ((f) + (coef)) * (mf) >> 16; \
    else \
        (coef) = - (((f) - (coef)) * (mf) >> 16); \
    nz |= (coef); \
} while( 0 )
49
50 static int quant_8x8( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] )
51 {
52     int nz = 0;
53     for( int i = 0; i < 64; i++ )
54         QUANT_ONE( dct[i], mf[i], bias[i] );
55     return !!nz;
56 }
57
58 static int quant_4x4( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] )
59 {
60     int nz = 0;
61     for( int i = 0; i < 16; i++ )
62         QUANT_ONE( dct[i], mf[i], bias[i] );
63     return !!nz;
64 }
65
66 static int quant_4x4_dc( dctcoef dct[16], int mf, int bias )
67 {
68     int nz = 0;
69     for( int i = 0; i < 16; i++ )
70         QUANT_ONE( dct[i], mf, bias );
71     return !!nz;
72 }
73
74 static int quant_2x2_dc( dctcoef dct[4], int mf, int bias )
75 {
76     int nz = 0;
77     QUANT_ONE( dct[0], mf, bias );
78     QUANT_ONE( dct[1], mf, bias );
79     QUANT_ONE( dct[2], mf, bias );
80     QUANT_ONE( dct[3], mf, bias );
81     return !!nz;
82 }
83
/* Dequantization helpers for coefficient x.
 * Both expect these names to be in scope at the point of use:
 *   dct, dequant_mf, i_mf, i_qbits, and (DEQUANT_SHR only) f.
 *
 * DEQUANT_SHL is used when i_qbits >= 0. The scaled coefficient can be
 * negative, and left-shifting a negative value is undefined behaviour in C,
 * so the scaling is written as a multiplication by (1 << i_qbits). */
#define DEQUANT_SHL( x ) \
    dct[x] = ( dct[x] * dequant_mf[i_mf][x] ) * ( 1 << i_qbits )

/* DEQUANT_SHR is used when i_qbits < 0: add the rounding term f, then shift
 * right by -i_qbits. */
#define DEQUANT_SHR( x ) \
    dct[x] = ( dct[x] * dequant_mf[i_mf][x] + f ) >> (-i_qbits)
89
90 static void dequant_4x4( dctcoef dct[16], int dequant_mf[6][16], int i_qp )
91 {
92     const int i_mf = i_qp%6;
93     const int i_qbits = i_qp/6 - 4;
94
95     if( i_qbits >= 0 )
96     {
97         for( int i = 0; i < 16; i++ )
98             DEQUANT_SHL( i );
99     }
100     else
101     {
102         const int f = 1 << (-i_qbits-1);
103         for( int i = 0; i < 16; i++ )
104             DEQUANT_SHR( i );
105     }
106 }
107
108 static void dequant_8x8( dctcoef dct[64], int dequant_mf[6][64], int i_qp )
109 {
110     const int i_mf = i_qp%6;
111     const int i_qbits = i_qp/6 - 6;
112
113     if( i_qbits >= 0 )
114     {
115         for( int i = 0; i < 64; i++ )
116             DEQUANT_SHL( i );
117     }
118     else
119     {
120         const int f = 1 << (-i_qbits-1);
121         for( int i = 0; i < 64; i++ )
122             DEQUANT_SHR( i );
123     }
124 }
125
126 static void dequant_4x4_dc( dctcoef dct[16], int dequant_mf[6][16], int i_qp )
127 {
128     const int i_qbits = i_qp/6 - 6;
129
130     if( i_qbits >= 0 )
131     {
132         const int i_dmf = dequant_mf[i_qp%6][0] << i_qbits;
133         for( int i = 0; i < 16; i++ )
134             dct[i] *= i_dmf;
135     }
136     else
137     {
138         const int i_dmf = dequant_mf[i_qp%6][0];
139         const int f = 1 << (-i_qbits-1);
140         for( int i = 0; i < 16; i++ )
141             dct[i] = ( dct[i] * i_dmf + f ) >> (-i_qbits);
142     }
143 }
144
/* Declares and computes the butterfly terms b0..b7 from the eight values
 * dct[0..7]; an array or pointer named dct must be in scope.
 * First stage: sums and differences of adjacent pairs.
 * Second stage: b0..b3 are sums, b4..b7 differences, of first-stage terms. */
#define IDCT_DEQUANT_2X4_START \
    int sum01 = dct[0] + dct[1], dif01 = dct[0] - dct[1]; \
    int sum23 = dct[2] + dct[3], dif23 = dct[2] - dct[3]; \
    int sum45 = dct[4] + dct[5], dif45 = dct[4] - dct[5]; \
    int sum67 = dct[6] + dct[7], dif67 = dct[6] - dct[7]; \
    int b0 = sum01 + sum23, b4 = sum01 - sum23; \
    int b1 = sum45 + sum67, b5 = sum45 - sum67; \
    int b2 = dif01 + dif23, b6 = dif01 - dif23; \
    int b3 = dif45 + dif67, b7 = dif45 - dif67;
162
163 static void idct_dequant_2x4_dc( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp )
164 {
165     IDCT_DEQUANT_2X4_START
166     int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
167     dct4x4[0][0] = ((b0 + b1) * dmf + 32) >> 6;
168     dct4x4[1][0] = ((b2 + b3) * dmf + 32) >> 6;
169     dct4x4[2][0] = ((b0 - b1) * dmf + 32) >> 6;
170     dct4x4[3][0] = ((b2 - b3) * dmf + 32) >> 6;
171     dct4x4[4][0] = ((b4 - b5) * dmf + 32) >> 6;
172     dct4x4[5][0] = ((b6 - b7) * dmf + 32) >> 6;
173     dct4x4[6][0] = ((b4 + b5) * dmf + 32) >> 6;
174     dct4x4[7][0] = ((b6 + b7) * dmf + 32) >> 6;
175 }
176
177 static void idct_dequant_2x4_dconly( dctcoef dct[8], int dequant_mf[6][16], int i_qp )
178 {
179     IDCT_DEQUANT_2X4_START
180     int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
181     dct[0] = ((b0 + b1) * dmf + 32) >> 6;
182     dct[1] = ((b2 + b3) * dmf + 32) >> 6;
183     dct[2] = ((b0 - b1) * dmf + 32) >> 6;
184     dct[3] = ((b2 - b3) * dmf + 32) >> 6;
185     dct[4] = ((b4 - b5) * dmf + 32) >> 6;
186     dct[5] = ((b6 - b7) * dmf + 32) >> 6;
187     dct[6] = ((b4 + b5) * dmf + 32) >> 6;
188     dct[7] = ((b6 + b7) * dmf + 32) >> 6;
189 }
190
191 static ALWAYS_INLINE void optimize_chroma_idct_dequant_2x4( dctcoef out[8], dctcoef dct[8], int dmf )
192 {
193     IDCT_DEQUANT_2X4_START
194     out[0] = ((b0 + b1) * dmf + 2080) >> 6; /* 2080 = 32 + (32<<6) */
195     out[1] = ((b2 + b3) * dmf + 2080) >> 6;
196     out[2] = ((b0 - b1) * dmf + 2080) >> 6;
197     out[3] = ((b2 - b3) * dmf + 2080) >> 6;
198     out[4] = ((b4 - b5) * dmf + 2080) >> 6;
199     out[5] = ((b6 - b7) * dmf + 2080) >> 6;
200     out[6] = ((b4 + b5) * dmf + 2080) >> 6;
201     out[7] = ((b6 + b7) * dmf + 2080) >> 6;
202 }
203 #undef IDCT_DEQUANT_2X4_START
204
205 static ALWAYS_INLINE void optimize_chroma_idct_dequant_2x2( dctcoef out[4], dctcoef dct[4], int dmf )
206 {
207     int d0 = dct[0] + dct[1];
208     int d1 = dct[2] + dct[3];
209     int d2 = dct[0] - dct[1];
210     int d3 = dct[2] - dct[3];
211     out[0] = ((d0 + d1) * dmf >> 5) + 32;
212     out[1] = ((d0 - d1) * dmf >> 5) + 32;
213     out[2] = ((d2 + d3) * dmf >> 5) + 32;
214     out[3] = ((d2 - d3) * dmf >> 5) + 32;
215 }
216
217 static ALWAYS_INLINE int optimize_chroma_round( dctcoef *ref, dctcoef *dct, int dequant_mf, int chroma422 )
218 {
219     dctcoef out[8];
220
221     if( chroma422 )
222         optimize_chroma_idct_dequant_2x4( out, dct, dequant_mf );
223     else
224         optimize_chroma_idct_dequant_2x2( out, dct, dequant_mf );
225
226     int sum = 0;
227     for( int i = 0; i < (chroma422?8:4); i++ )
228         sum |= ref[i] ^ out[i];
229     return sum >> 6;
230 }
231
232 static ALWAYS_INLINE int optimize_chroma_dc_internal( dctcoef *dct, int dequant_mf, int chroma422 )
233 {
234     /* dequant_mf = h->dequant4_mf[CQM_4IC + b_inter][i_qp%6][0] << i_qp/6, max 32*64 */
235     dctcoef dct_orig[8];
236     int coeff, nz;
237
238     if( chroma422 )
239         optimize_chroma_idct_dequant_2x4( dct_orig, dct, dequant_mf );
240     else
241         optimize_chroma_idct_dequant_2x2( dct_orig, dct, dequant_mf );
242
243     /* If the DC coefficients already round to zero, terminate early. */
244     int sum = 0;
245     for( int i = 0; i < (chroma422?8:4); i++ )
246         sum |= dct_orig[i];
247     if( !(sum >> 6) )
248         return 0;
249
250     /* Start with the highest frequency coefficient... is this the best option? */
251     for( nz = 0, coeff = (chroma422?7:3); coeff >= 0; coeff-- )
252     {
253         int level = dct[coeff];
254         int sign = level>>31 | 1; /* dct[coeff] < 0 ? -1 : 1 */
255
256         while( level )
257         {
258             dct[coeff] = level - sign;
259             if( optimize_chroma_round( dct_orig, dct, dequant_mf, chroma422 ) )
260             {
261                 nz = 1;
262                 dct[coeff] = level;
263                 break;
264             }
265             level -= sign;
266         }
267     }
268
269     return nz;
270 }
271
272 static int optimize_chroma_2x2_dc( dctcoef dct[4], int dequant_mf )
273 {
274     return optimize_chroma_dc_internal( dct, dequant_mf, 0 );
275 }
276
277 static int optimize_chroma_2x4_dc( dctcoef dct[8], int dequant_mf )
278 {
279     return optimize_chroma_dc_internal( dct, dequant_mf, 1 );
280 }
281
282 static void x264_denoise_dct( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size )
283 {
284     for( int i = 0; i < size; i++ )
285     {
286         int level = dct[i];
287         int sign = level>>31;
288         level = (level+sign)^sign;
289         sum[i] += level;
290         level -= offset[i];
291         dct[i] = level<0 ? 0 : (level^sign)-sign;
292     }
293 }
294
/* (ref: JVT-B118)
 * x264_mb_decimate_score: given a block of dct coefficients, returns a score
 * used to decide whether the whole block can be zeroed out
 * (a low score means the block should be set to zero).
 * Used for inter macroblocks (luma and chroma):
 *  luma: for an 8x8 block: if score < 4 -> zero it
 *        for the complete mb: if score < 6 -> zero it
 *  chroma: for the complete mb: if score < 7 -> zero it
 */
303
/* Decimation score contribution, indexed by zero-run length, for blocks of up
 * to 16 coefficients. */
const uint8_t x264_decimate_table4[16] =
{
    3, 2, 2, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};

/* Decimation score contribution, indexed by zero-run length, for
 * 64-coefficient blocks. */
const uint8_t x264_decimate_table8[64] =
{
    3, 3, 3, 3, 2, 2, 2, 2,
    2, 2, 2, 2, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
315
316 static int ALWAYS_INLINE x264_decimate_score_internal( dctcoef *dct, int i_max )
317 {
318     const uint8_t *ds_table = (i_max == 64) ? x264_decimate_table8 : x264_decimate_table4;
319     int i_score = 0;
320     int idx = i_max - 1;
321
322     while( idx >= 0 && dct[idx] == 0 )
323         idx--;
324     while( idx >= 0 )
325     {
326         int i_run;
327
328         if( (unsigned)(dct[idx--] + 1) > 2 )
329             return 9;
330
331         i_run = 0;
332         while( idx >= 0 && dct[idx] == 0 )
333         {
334             idx--;
335             i_run++;
336         }
337         i_score += ds_table[i_run];
338     }
339
340     return i_score;
341 }
342
343 static int x264_decimate_score15( dctcoef *dct )
344 {
345     return x264_decimate_score_internal( dct+1, 15 );
346 }
347 static int x264_decimate_score16( dctcoef *dct )
348 {
349     return x264_decimate_score_internal( dct, 16 );
350 }
351 static int x264_decimate_score64( dctcoef *dct )
352 {
353     return x264_decimate_score_internal( dct, 64 );
354 }
355
356 #define last(num)\
357 static int x264_coeff_last##num( dctcoef *l )\
358 {\
359     int i_last = num-1;\
360     while( i_last >= 0 && l[i_last] == 0 )\
361         i_last--;\
362     return i_last;\
363 }
364
365 last(4)
366 last(8)
367 last(15)
368 last(16)
369 last(64)
370
371 #define level_run(num)\
372 static int x264_coeff_level_run##num( dctcoef *dct, x264_run_level_t *runlevel )\
373 {\
374     int i_last = runlevel->last = x264_coeff_last##num(dct);\
375     int i_total = 0;\
376     do\
377     {\
378         int r = 0;\
379         runlevel->level[i_total] = dct[i_last];\
380         while( --i_last >= 0 && dct[i_last] == 0 )\
381             r++;\
382         runlevel->run[i_total++] = r;\
383     } while( i_last >= 0 );\
384     return i_total;\
385 }
386
387 level_run(4)
388 level_run(8)
389 level_run(15)
390 level_run(16)
391
/* Fill the quantization function table *pf.
 *   h:   encoder context; the code below only reads h->param.i_cqm_preset
 *   cpu: bitmask tested against the X264_CPU_* flags
 *   pf:  table to populate
 * The C implementations from this file are installed first. Each block guarded
 * by a cpu flag (and by the HIGH_BIT_DEPTH / HAVE_* / ARCH_* build macros) then
 * overwrites individual entries. The flag checks are independent ifs, so a
 * later block replaces whatever an earlier block installed: their order is
 * significant. Finally the DC and chroma coeff_last / coeff_level_run slots
 * are copied from the luma slots. */
392 void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
393 {
    /* C implementations (defined above in this file) */
394     pf->quant_8x8 = quant_8x8;
395     pf->quant_4x4 = quant_4x4;
396     pf->quant_4x4_dc = quant_4x4_dc;
397     pf->quant_2x2_dc = quant_2x2_dc;
398
399     pf->dequant_4x4 = dequant_4x4;
400     pf->dequant_4x4_dc = dequant_4x4_dc;
401     pf->dequant_8x8 = dequant_8x8;
402
403     pf->idct_dequant_2x4_dc = idct_dequant_2x4_dc;
404     pf->idct_dequant_2x4_dconly = idct_dequant_2x4_dconly;
405
406     pf->optimize_chroma_2x2_dc = optimize_chroma_2x2_dc;
407     pf->optimize_chroma_2x4_dc = optimize_chroma_2x4_dc;
408
409     pf->denoise_dct = x264_denoise_dct;
410     pf->decimate_score15 = x264_decimate_score15;
411     pf->decimate_score16 = x264_decimate_score16;
412     pf->decimate_score64 = x264_decimate_score64;
413
414     pf->coeff_last4 = x264_coeff_last4;
415     pf->coeff_last8 = x264_coeff_last8;
416     pf->coeff_last[  DCT_LUMA_AC] = x264_coeff_last15;
417     pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16;
418     pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64;
419     pf->coeff_level_run4 = x264_coeff_level_run4;
420     pf->coeff_level_run8 = x264_coeff_level_run8;
421     pf->coeff_level_run[  DCT_LUMA_AC] = x264_coeff_level_run15;
422     pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16;
423
    /* ---- Overrides used when HIGH_BIT_DEPTH is set ---- */
424 #if HIGH_BIT_DEPTH
425 #if HAVE_MMX
426     if( cpu&X264_CPU_MMX2 )
427     {
428 #if ARCH_X86
429         pf->denoise_dct = x264_denoise_dct_mmx;
430         pf->decimate_score15 = x264_decimate_score15_mmx2;
431         pf->decimate_score16 = x264_decimate_score16_mmx2;
        /* With the SLOW_CTZ flag set, the dedicated slowctz variants replace score15/16 */
432         if( cpu&X264_CPU_SLOW_CTZ )
433         {
434             pf->decimate_score15 = x264_decimate_score15_mmx2_slowctz;
435             pf->decimate_score16 = x264_decimate_score16_mmx2_slowctz;
436         }
437         pf->decimate_score64 = x264_decimate_score64_mmx2;
438         pf->coeff_last4 = x264_coeff_last4_mmx2;
439         pf->coeff_last[  DCT_LUMA_AC] = x264_coeff_last15_mmx2;
440         pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2;
441         pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmx2;
442         pf->coeff_level_run[  DCT_LUMA_AC] = x264_coeff_level_run15_mmx2;
443         pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2;
444 #endif
445         pf->coeff_level_run4 = x264_coeff_level_run4_mmx2;
446         if( cpu&X264_CPU_LZCNT )
447             pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt;
448     }
449     if( cpu&X264_CPU_SSE2 )
450     {
451         pf->quant_4x4 = x264_quant_4x4_sse2;
452         pf->quant_8x8 = x264_quant_8x8_sse2;
453         pf->quant_2x2_dc = x264_quant_2x2_dc_sse2;
454         pf->quant_4x4_dc = x264_quant_4x4_dc_sse2;
455         pf->dequant_4x4 = x264_dequant_4x4_sse2;
456         pf->dequant_8x8 = x264_dequant_8x8_sse2;
457         pf->dequant_4x4_dc = x264_dequant_4x4dc_sse2;
458         pf->denoise_dct = x264_denoise_dct_sse2;
459         pf->decimate_score15 = x264_decimate_score15_sse2;
460         pf->decimate_score16 = x264_decimate_score16_sse2;
461         pf->decimate_score64 = x264_decimate_score64_sse2;
462         if( cpu&X264_CPU_SLOW_CTZ )
463         {
464             pf->decimate_score15 = x264_decimate_score15_sse2_slowctz;
465             pf->decimate_score16 = x264_decimate_score16_sse2_slowctz;
466         }
467         pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
468         pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
469         pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
470         pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2;
471         pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
        /* With the LZCNT flag set, the lzcnt variants replace the entries assigned just above */
472         if( cpu&X264_CPU_LZCNT )
473         {
474             pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt;
475             pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt;
476             pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt;
477             pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt;
478             pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt;
479             pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt;
480         }
481     }
482     if( cpu&X264_CPU_SSSE3 )
483     {
484         pf->quant_4x4 = x264_quant_4x4_ssse3;
485         pf->quant_8x8 = x264_quant_8x8_ssse3;
486         pf->quant_2x2_dc = x264_quant_2x2_dc_ssse3;
487         pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3;
488         pf->denoise_dct = x264_denoise_dct_ssse3;
489         pf->decimate_score15 = x264_decimate_score15_ssse3;
490         pf->decimate_score16 = x264_decimate_score16_ssse3;
491         if( cpu&X264_CPU_SLOW_CTZ )
492         {
493             pf->decimate_score15 = x264_decimate_score15_ssse3_slowctz;
494             pf->decimate_score16 = x264_decimate_score16_ssse3_slowctz;
495         }
496         pf->decimate_score64 = x264_decimate_score64_ssse3;
497     }
498     if( cpu&X264_CPU_SSE4 )
499     {
500         pf->quant_2x2_dc = x264_quant_2x2_dc_sse4;
501         pf->quant_4x4_dc = x264_quant_4x4_dc_sse4;
502         pf->quant_4x4 = x264_quant_4x4_sse4;
503         pf->quant_8x8 = x264_quant_8x8_sse4;
504     }
505     if( cpu&X264_CPU_AVX )
506     {
507         pf->denoise_dct = x264_denoise_dct_avx;
508     }
509     if( cpu&X264_CPU_XOP )
510     {
511         pf->dequant_4x4_dc = x264_dequant_4x4dc_xop;
        /* The XOP dequant_4x4/8x8 are installed only when the cqm preset is not flat */
512         if( h->param.i_cqm_preset != X264_CQM_FLAT )
513         {
514             pf->dequant_4x4 = x264_dequant_4x4_xop;
515             pf->dequant_8x8 = x264_dequant_8x8_xop;
516         }
517     }
518 #endif // HAVE_MMX
    /* ---- Overrides used when HIGH_BIT_DEPTH is not set ---- */
519 #else // !HIGH_BIT_DEPTH
520 #if HAVE_MMX
521     if( cpu&X264_CPU_MMX )
522     {
523 #if ARCH_X86
524         pf->quant_4x4 = x264_quant_4x4_mmx;
525         pf->quant_8x8 = x264_quant_8x8_mmx;
526         pf->dequant_4x4 = x264_dequant_4x4_mmx;
527         pf->dequant_4x4_dc = x264_dequant_4x4dc_mmx2;
528         pf->dequant_8x8 = x264_dequant_8x8_mmx;
        /* Flat cqm preset: the flat16 dequant variants replace the ones assigned above */
529         if( h->param.i_cqm_preset == X264_CQM_FLAT )
530         {
531             pf->dequant_4x4 = x264_dequant_4x4_flat16_mmx;
532             pf->dequant_8x8 = x264_dequant_8x8_flat16_mmx;
533         }
534         pf->denoise_dct = x264_denoise_dct_mmx;
535 #endif
536     }
537
538     if( cpu&X264_CPU_MMX2 )
539     {
540         pf->quant_2x2_dc = x264_quant_2x2_dc_mmx2;
541 #if ARCH_X86
542         pf->quant_4x4_dc = x264_quant_4x4_dc_mmx2;
543         pf->decimate_score15 = x264_decimate_score15_mmx2;
544         pf->decimate_score16 = x264_decimate_score16_mmx2;
545         if( cpu&X264_CPU_SLOW_CTZ )
546         {
547             pf->decimate_score15 = x264_decimate_score15_mmx2_slowctz;
548             pf->decimate_score16 = x264_decimate_score16_mmx2_slowctz;
549         }
550         pf->decimate_score64 = x264_decimate_score64_mmx2;
551         pf->coeff_last[  DCT_LUMA_AC] = x264_coeff_last15_mmx2;
552         pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2;
553         pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmx2;
554         pf->coeff_level_run[  DCT_LUMA_AC] = x264_coeff_level_run15_mmx2;
555         pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2;
556 #endif
557         pf->coeff_last4 = x264_coeff_last4_mmx2;
558         pf->coeff_level_run4 = x264_coeff_level_run4_mmx2;
559         if( cpu&X264_CPU_LZCNT )
560         {
561             pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt;
562             pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt;
563         }
564     }
565
566     if( cpu&X264_CPU_SSE2 )
567     {
568         pf->quant_4x4_dc = x264_quant_4x4_dc_sse2;
569         pf->quant_4x4 = x264_quant_4x4_sse2;
570         pf->quant_8x8 = x264_quant_8x8_sse2;
571         pf->dequant_4x4 = x264_dequant_4x4_sse2;
572         pf->dequant_4x4_dc = x264_dequant_4x4dc_sse2;
573         pf->dequant_8x8 = x264_dequant_8x8_sse2;
        /* Flat cqm preset: the flat16 dequant variants replace the ones assigned above */
574         if( h->param.i_cqm_preset == X264_CQM_FLAT )
575         {
576             pf->dequant_4x4 = x264_dequant_4x4_flat16_sse2;
577             pf->dequant_8x8 = x264_dequant_8x8_flat16_sse2;
578         }
579         pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_sse2;
580         pf->denoise_dct = x264_denoise_dct_sse2;
581         pf->decimate_score15 = x264_decimate_score15_sse2;
582         pf->decimate_score16 = x264_decimate_score16_sse2;
583         pf->decimate_score64 = x264_decimate_score64_sse2;
584         if( cpu&X264_CPU_SLOW_CTZ )
585         {
586             pf->decimate_score15 = x264_decimate_score15_sse2_slowctz;
587             pf->decimate_score16 = x264_decimate_score16_sse2_slowctz;
588         }
589         pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
590         pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
591         pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
592         pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2;
593         pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
594         if( cpu&X264_CPU_LZCNT )
595         {
596             pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt;
597             pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt;
598             pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt;
599             pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt;
600             pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt;
601         }
602     }
603
604     if( cpu&X264_CPU_SSSE3 )
605     {
606         pf->quant_2x2_dc = x264_quant_2x2_dc_ssse3;
607         pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3;
608         pf->quant_4x4 = x264_quant_4x4_ssse3;
609         pf->quant_8x8 = x264_quant_8x8_ssse3;
610         pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_ssse3;
611         pf->denoise_dct = x264_denoise_dct_ssse3;
612         pf->decimate_score15 = x264_decimate_score15_ssse3;
613         pf->decimate_score16 = x264_decimate_score16_ssse3;
614         if( cpu&X264_CPU_SLOW_CTZ )
615         {
616             pf->decimate_score15 = x264_decimate_score15_ssse3_slowctz;
617             pf->decimate_score16 = x264_decimate_score16_ssse3_slowctz;
618         }
619         pf->decimate_score64 = x264_decimate_score64_ssse3;
620     }
621
622     if( cpu&X264_CPU_SSE4 )
623     {
624         pf->quant_4x4_dc = x264_quant_4x4_dc_sse4;
625         pf->quant_4x4 = x264_quant_4x4_sse4;
626         pf->quant_8x8 = x264_quant_8x8_sse4;
627         pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_sse4;
628     }
629
630     if( cpu&X264_CPU_AVX )
631     {
632         pf->dequant_4x4_dc = x264_dequant_4x4dc_avx;
        /* With a flat preset the dequant_4x4/8x8 entries set by earlier blocks are kept */
633         if( h->param.i_cqm_preset != X264_CQM_FLAT )
634         {
635             pf->dequant_4x4 = x264_dequant_4x4_avx;
636             pf->dequant_8x8 = x264_dequant_8x8_avx;
637         }
638         pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_avx;
639         pf->denoise_dct = x264_denoise_dct_avx;
640     }
641
642     if( cpu&X264_CPU_XOP )
643     {
        /* With a flat preset the dequant_4x4/8x8 entries set by earlier blocks are kept */
644         if( h->param.i_cqm_preset != X264_CQM_FLAT )
645         {
646             pf->dequant_4x4 = x264_dequant_4x4_xop;
647             pf->dequant_8x8 = x264_dequant_8x8_xop;
648         }
649     }
650 #endif // HAVE_MMX
651
652 #if HAVE_ALTIVEC
653     if( cpu&X264_CPU_ALTIVEC ) {
654         pf->quant_2x2_dc = x264_quant_2x2_dc_altivec;
655         pf->quant_4x4_dc = x264_quant_4x4_dc_altivec;
656         pf->quant_4x4 = x264_quant_4x4_altivec;
657         pf->quant_8x8 = x264_quant_8x8_altivec;
658
659         pf->dequant_4x4 = x264_dequant_4x4_altivec;
660         pf->dequant_8x8 = x264_dequant_8x8_altivec;
661     }
662 #endif
663
664 #if HAVE_ARMV6
665     if( cpu&X264_CPU_ARMV6 )
666         pf->coeff_last4 = x264_coeff_last4_arm;
667
668     if( cpu&X264_CPU_NEON )
669     {
670         pf->quant_2x2_dc   = x264_quant_2x2_dc_neon;
671         pf->quant_4x4      = x264_quant_4x4_neon;
672         pf->quant_4x4_dc   = x264_quant_4x4_dc_neon;
673         pf->quant_8x8      = x264_quant_8x8_neon;
674         pf->dequant_4x4    = x264_dequant_4x4_neon;
675         pf->dequant_4x4_dc = x264_dequant_4x4_dc_neon;
676         pf->dequant_8x8    = x264_dequant_8x8_neon;
677         pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_neon;
678         pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_neon;
679         pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_neon;
680     }
681 #endif
682 #endif // HIGH_BIT_DEPTH
    /* The remaining coeff_last slots reuse the luma function pointers chosen above:
     * DC and 4x4 slots take DCT_LUMA_4x4, AC slots take DCT_LUMA_AC,
     * 8x8 slots take DCT_LUMA_8x8. */
683     pf->coeff_last[DCT_LUMA_DC]     = pf->coeff_last[DCT_CHROMAU_DC]  = pf->coeff_last[DCT_CHROMAV_DC] =
684     pf->coeff_last[DCT_CHROMAU_4x4] = pf->coeff_last[DCT_CHROMAV_4x4] = pf->coeff_last[DCT_LUMA_4x4];
685     pf->coeff_last[DCT_CHROMA_AC]   = pf->coeff_last[DCT_CHROMAU_AC]  =
686     pf->coeff_last[DCT_CHROMAV_AC]  = pf->coeff_last[DCT_LUMA_AC];
687     pf->coeff_last[DCT_CHROMAU_8x8] = pf->coeff_last[DCT_CHROMAV_8x8] = pf->coeff_last[DCT_LUMA_8x8];
688
    /* Same aliasing for coeff_level_run; no 8x8 slots are assigned here. */
689     pf->coeff_level_run[DCT_LUMA_DC]     = pf->coeff_level_run[DCT_CHROMAU_DC]  = pf->coeff_level_run[DCT_CHROMAV_DC] =
690     pf->coeff_level_run[DCT_CHROMAU_4x4] = pf->coeff_level_run[DCT_CHROMAV_4x4] = pf->coeff_level_run[DCT_LUMA_4x4];
691     pf->coeff_level_run[DCT_CHROMA_AC]   = pf->coeff_level_run[DCT_CHROMAU_AC]  =
692     pf->coeff_level_run[DCT_CHROMAV_AC]  = pf->coeff_level_run[DCT_LUMA_AC];
693 }