git.sesse.net Git - x264/blob - common/quant.c
Remove explicit run calculation from coeff_level_run
[x264] / common / quant.c
1 /*****************************************************************************
2  * quant.c: quantization and level-run
3  *****************************************************************************
4  * Copyright (C) 2005-2012 x264 project
5  *
6  * Authors: Loren Merritt <lorenm@u.washington.edu>
7  *          Fiona Glaser <fiona@x264.com>
8  *          Christian Heine <sennindemokrit@gmx.net>
9  *          Henrik Gramner <hengar-6@student.ltu.se>
10  *
11  * This program is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU General Public License as published by
13  * the Free Software Foundation; either version 2 of the License, or
14  * (at your option) any later version.
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public License
22  * along with this program; if not, write to the Free Software
23  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
24  *
25  * This program is also available under a commercial proprietary license.
26  * For more information, contact us at licensing@x264.com.
27  *****************************************************************************/
28
29 #include "common.h"
30
31 #if HAVE_MMX
32 #include "x86/quant.h"
33 #endif
34 #if ARCH_PPC
35 #   include "ppc/quant.h"
36 #endif
37 #if ARCH_ARM
38 #   include "arm/quant.h"
39 #endif
40
/* Quantize one coefficient in place.
 *
 * The rounding bias `f` is added to the magnitude of `coef`, the sum is
 * scaled by `mf`, the low 16 bits are dropped, and the original sign is
 * restored.  The result is also OR-ed into a variable named `nz`, which the
 * caller must declare; `nz` is nonzero afterwards iff some quantized value
 * was nonzero.
 *
 * `coef` must be a modifiable lvalue without side effects (it is evaluated
 * several times).  All arguments are parenthesized so that expression
 * arguments keep their meaning, and the body is wrapped in do/while(0) so
 * the macro behaves as a single statement. */
#define QUANT_ONE( coef, mf, f ) \
do { \
    if( (coef) > 0 ) \
        (coef) = ((f) + (coef)) * (mf) >> 16; \
    else \
        (coef) = - (((f) - (coef)) * (mf) >> 16); \
    nz |= (coef); \
} while( 0 )
49
50 static int quant_8x8( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] )
51 {
52     int nz = 0;
53     for( int i = 0; i < 64; i++ )
54         QUANT_ONE( dct[i], mf[i], bias[i] );
55     return !!nz;
56 }
57
58 static int quant_4x4( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] )
59 {
60     int nz = 0;
61     for( int i = 0; i < 16; i++ )
62         QUANT_ONE( dct[i], mf[i], bias[i] );
63     return !!nz;
64 }
65
66 static int quant_4x4_dc( dctcoef dct[16], int mf, int bias )
67 {
68     int nz = 0;
69     for( int i = 0; i < 16; i++ )
70         QUANT_ONE( dct[i], mf, bias );
71     return !!nz;
72 }
73
74 static int quant_2x2_dc( dctcoef dct[4], int mf, int bias )
75 {
76     int nz = 0;
77     QUANT_ONE( dct[0], mf, bias );
78     QUANT_ONE( dct[1], mf, bias );
79     QUANT_ONE( dct[2], mf, bias );
80     QUANT_ONE( dct[3], mf, bias );
81     return !!nz;
82 }
83
/* Dequantize dct[x] when i_qbits >= 0: multiply by the dequant table entry
 * and by 2^i_qbits.  The power of two is applied as a multiplication rather
 * than a left shift because the product can be negative, and left-shifting
 * a negative value is undefined behaviour in C.
 * Expects `dct`, `dequant_mf`, `i_mf` and `i_qbits` in the caller's scope. */
#define DEQUANT_SHL( x ) \
    dct[x] = ( dct[x] * dequant_mf[i_mf][x] ) * ( 1 << i_qbits )

/* Dequantize dct[x] when i_qbits < 0: multiply by the dequant table entry,
 * add the rounding term `f`, and shift right by -i_qbits.
 * Expects `dct`, `dequant_mf`, `i_mf`, `i_qbits` and `f` in the caller's scope. */
#define DEQUANT_SHR( x ) \
    dct[x] = ( dct[x] * dequant_mf[i_mf][x] + f ) >> (-i_qbits)
89
90 static void dequant_4x4( dctcoef dct[16], int dequant_mf[6][16], int i_qp )
91 {
92     const int i_mf = i_qp%6;
93     const int i_qbits = i_qp/6 - 4;
94
95     if( i_qbits >= 0 )
96     {
97         for( int i = 0; i < 16; i++ )
98             DEQUANT_SHL( i );
99     }
100     else
101     {
102         const int f = 1 << (-i_qbits-1);
103         for( int i = 0; i < 16; i++ )
104             DEQUANT_SHR( i );
105     }
106 }
107
108 static void dequant_8x8( dctcoef dct[64], int dequant_mf[6][64], int i_qp )
109 {
110     const int i_mf = i_qp%6;
111     const int i_qbits = i_qp/6 - 6;
112
113     if( i_qbits >= 0 )
114     {
115         for( int i = 0; i < 64; i++ )
116             DEQUANT_SHL( i );
117     }
118     else
119     {
120         const int f = 1 << (-i_qbits-1);
121         for( int i = 0; i < 64; i++ )
122             DEQUANT_SHR( i );
123     }
124 }
125
126 static void dequant_4x4_dc( dctcoef dct[16], int dequant_mf[6][16], int i_qp )
127 {
128     const int i_qbits = i_qp/6 - 6;
129
130     if( i_qbits >= 0 )
131     {
132         const int i_dmf = dequant_mf[i_qp%6][0] << i_qbits;
133         for( int i = 0; i < 16; i++ )
134             dct[i] *= i_dmf;
135     }
136     else
137     {
138         const int i_dmf = dequant_mf[i_qp%6][0];
139         const int f = 1 << (-i_qbits-1);
140         for( int i = 0; i < 16; i++ )
141             dct[i] = ( dct[i] * i_dmf + f ) >> (-i_qbits);
142     }
143 }
144
/* Shared first stage of the 2x4 DC transforms below.
 * Reads dct[0..7] from the caller's scope and declares sixteen ints:
 *   a0..a3 = sums of adjacent pairs (dct[0]+dct[1], dct[2]+dct[3], ...),
 *   a4..a7 = differences of the same pairs,
 *   b0..b7 = sums (b0..b3) and differences (b4..b7) of neighbouring a values.
 * It expands to declarations only, so it must appear where declarations are
 * allowed, and the names a0..a7 / b0..b7 must be unused in that scope. */
#define IDCT_DEQUANT_2X4_START \
    int a0 = dct[0] + dct[1]; \
    int a1 = dct[2] + dct[3]; \
    int a2 = dct[4] + dct[5]; \
    int a3 = dct[6] + dct[7]; \
    int a4 = dct[0] - dct[1]; \
    int a5 = dct[2] - dct[3]; \
    int a6 = dct[4] - dct[5]; \
    int a7 = dct[6] - dct[7]; \
    int b0 = a0 + a1; \
    int b1 = a2 + a3; \
    int b2 = a4 + a5; \
    int b3 = a6 + a7; \
    int b4 = a0 - a1; \
    int b5 = a2 - a3; \
    int b6 = a4 - a5; \
    int b7 = a6 - a7;
162
163 static void idct_dequant_2x4_dc( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp )
164 {
165     IDCT_DEQUANT_2X4_START
166     int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
167     dct4x4[0][0] = ((b0 + b1) * dmf + 32) >> 6;
168     dct4x4[1][0] = ((b2 + b3) * dmf + 32) >> 6;
169     dct4x4[2][0] = ((b0 - b1) * dmf + 32) >> 6;
170     dct4x4[3][0] = ((b2 - b3) * dmf + 32) >> 6;
171     dct4x4[4][0] = ((b4 - b5) * dmf + 32) >> 6;
172     dct4x4[5][0] = ((b6 - b7) * dmf + 32) >> 6;
173     dct4x4[6][0] = ((b4 + b5) * dmf + 32) >> 6;
174     dct4x4[7][0] = ((b6 + b7) * dmf + 32) >> 6;
175 }
176
177 static void idct_dequant_2x4_dconly( dctcoef dct[8], int dequant_mf[6][16], int i_qp )
178 {
179     IDCT_DEQUANT_2X4_START
180     int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
181     dct[0] = ((b0 + b1) * dmf + 32) >> 6;
182     dct[1] = ((b2 + b3) * dmf + 32) >> 6;
183     dct[2] = ((b0 - b1) * dmf + 32) >> 6;
184     dct[3] = ((b2 - b3) * dmf + 32) >> 6;
185     dct[4] = ((b4 - b5) * dmf + 32) >> 6;
186     dct[5] = ((b6 - b7) * dmf + 32) >> 6;
187     dct[6] = ((b4 + b5) * dmf + 32) >> 6;
188     dct[7] = ((b6 + b7) * dmf + 32) >> 6;
189 }
190
191 static ALWAYS_INLINE void optimize_chroma_idct_dequant_2x4( dctcoef out[8], dctcoef dct[8], int dmf )
192 {
193     IDCT_DEQUANT_2X4_START
194     out[0] = ((b0 + b1) * dmf + 2080) >> 6; /* 2080 = 32 + (32<<6) */
195     out[1] = ((b2 + b3) * dmf + 2080) >> 6;
196     out[2] = ((b0 - b1) * dmf + 2080) >> 6;
197     out[3] = ((b2 - b3) * dmf + 2080) >> 6;
198     out[4] = ((b4 - b5) * dmf + 2080) >> 6;
199     out[5] = ((b6 - b7) * dmf + 2080) >> 6;
200     out[6] = ((b4 + b5) * dmf + 2080) >> 6;
201     out[7] = ((b6 + b7) * dmf + 2080) >> 6;
202 }
203 #undef IDCT_DEQUANT_2X4_START
204
205 static ALWAYS_INLINE void optimize_chroma_idct_dequant_2x2( dctcoef out[4], dctcoef dct[4], int dmf )
206 {
207     int d0 = dct[0] + dct[1];
208     int d1 = dct[2] + dct[3];
209     int d2 = dct[0] - dct[1];
210     int d3 = dct[2] - dct[3];
211     out[0] = ((d0 + d1) * dmf >> 5) + 32;
212     out[1] = ((d0 - d1) * dmf >> 5) + 32;
213     out[2] = ((d2 + d3) * dmf >> 5) + 32;
214     out[3] = ((d2 - d3) * dmf >> 5) + 32;
215 }
216
217 static ALWAYS_INLINE int optimize_chroma_round( dctcoef *ref, dctcoef *dct, int dequant_mf, int chroma422 )
218 {
219     dctcoef out[8];
220
221     if( chroma422 )
222         optimize_chroma_idct_dequant_2x4( out, dct, dequant_mf );
223     else
224         optimize_chroma_idct_dequant_2x2( out, dct, dequant_mf );
225
226     int sum = 0;
227     for( int i = 0; i < (chroma422?8:4); i++ )
228         sum |= ref[i] ^ out[i];
229     return sum >> 6;
230 }
231
/* Shrink the magnitudes of the chroma DC coefficients in `dct` (4 of them,
 * or 8 when chroma422 is set) as far as possible without changing the
 * dequantized/transformed output as judged by optimize_chroma_round().
 * `dct` is modified in place.
 * Returns 1 if at least one coefficient had to be kept nonzero, else 0. */
static ALWAYS_INLINE int optimize_chroma_dc_internal( dctcoef *dct, int dequant_mf, int chroma422 )
{
    /* dequant_mf = h->dequant4_mf[CQM_4IC + b_inter][i_qp%6][0] << i_qp/6, max 32*64 */
    dctcoef dct_orig[8];
    int coeff, nz;

    /* Reference output computed from the unmodified coefficients. */
    if( chroma422 )
        optimize_chroma_idct_dequant_2x4( dct_orig, dct, dequant_mf );
    else
        optimize_chroma_idct_dequant_2x2( dct_orig, dct, dequant_mf );

    /* If the DC coefficients already round to zero, terminate early. */
    int sum = 0;
    for( int i = 0; i < (chroma422?8:4); i++ )
        sum |= dct_orig[i];
    if( !(sum >> 6) )
        return 0;

    /* Start with the highest frequency coefficient... is this the best option? */
    for( nz = 0, coeff = (chroma422?7:3); coeff >= 0; coeff-- )
    {
        int level = dct[coeff];
        int sign = level>>31 | 1; /* dct[coeff] < 0 ? -1 : 1 */

        /* Step this coefficient toward zero one unit at a time. */
        while( level )
        {
            dct[coeff] = level - sign;
            if( optimize_chroma_round( dct_orig, dct, dequant_mf, chroma422 ) )
            {
                /* The step changed the output: undo it and keep this level. */
                nz = 1;
                dct[coeff] = level;
                break;
            }
            level -= sign;
        }
    }

    return nz;
}
271
272 static int optimize_chroma_2x2_dc( dctcoef dct[4], int dequant_mf )
273 {
274     return optimize_chroma_dc_internal( dct, dequant_mf, 0 );
275 }
276
277 static int optimize_chroma_2x4_dc( dctcoef dct[8], int dequant_mf )
278 {
279     return optimize_chroma_dc_internal( dct, dequant_mf, 1 );
280 }
281
282 static void x264_denoise_dct( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size )
283 {
284     for( int i = 0; i < size; i++ )
285     {
286         int level = dct[i];
287         int sign = level>>31;
288         level = (level+sign)^sign;
289         sum[i] += level;
290         level -= offset[i];
291         dct[i] = level<0 ? 0 : (level^sign)-sign;
292     }
293 }
294
/* (ref: JVT-B118)
 * x264_mb_decimate_score: given a block of DCT coefficients, return a score
 * used to decide whether the whole block can be zeroed out
 * (a low score means the block should be set to zero).
 * Used for inter macroblocks (luma and chroma):
 *  luma: for an 8x8 block: if score < 4 -> zero
 *        for the complete mb: if score < 6 -> zero
 *  chroma: for the complete mb: if score < 7 -> zero
 */
303
/* Score added per nonzero coefficient by x264_decimate_score_internal,
 * indexed by the length of the zero run counted for that coefficient.
 * Used whenever i_max is not 64. */
const uint8_t x264_decimate_table4[16] =
{
    3,2,2,1,1,1,0,0,0,0,0,0,0,0,0,0
};
/* Same lookup for 64-coefficient blocks (selected when i_max == 64). */
const uint8_t x264_decimate_table8[64] =
{
    3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,
    1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
};
315
316 static int ALWAYS_INLINE x264_decimate_score_internal( dctcoef *dct, int i_max )
317 {
318     const uint8_t *ds_table = (i_max == 64) ? x264_decimate_table8 : x264_decimate_table4;
319     int i_score = 0;
320     int idx = i_max - 1;
321
322     while( idx >= 0 && dct[idx] == 0 )
323         idx--;
324     while( idx >= 0 )
325     {
326         int i_run;
327
328         if( (unsigned)(dct[idx--] + 1) > 2 )
329             return 9;
330
331         i_run = 0;
332         while( idx >= 0 && dct[idx] == 0 )
333         {
334             idx--;
335             i_run++;
336         }
337         i_score += ds_table[i_run];
338     }
339
340     return i_score;
341 }
342
343 static int x264_decimate_score15( dctcoef *dct )
344 {
345     return x264_decimate_score_internal( dct+1, 15 );
346 }
347 static int x264_decimate_score16( dctcoef *dct )
348 {
349     return x264_decimate_score_internal( dct, 16 );
350 }
351 static int x264_decimate_score64( dctcoef *dct )
352 {
353     return x264_decimate_score_internal( dct, 64 );
354 }
355
356 #define last(num)\
357 static int x264_coeff_last##num( dctcoef *l )\
358 {\
359     int i_last = num-1;\
360     while( i_last >= 0 && l[i_last] == 0 )\
361         i_last--;\
362     return i_last;\
363 }
364
365 last(4)
366 last(8)
367 last(15)
368 last(16)
369 last(64)
370
/* Generates x264_coeff_level_run<num>(): collects the nonzero coefficients
 * among the first <num> entries of `dct`, from the last one down to the
 * first.
 *   runlevel->last  = index of the last nonzero coefficient,
 *   runlevel->level = the nonzero values in that (descending-index) order,
 *   runlevel->mask  = bitmask with bit i set for every stored index i.
 * Returns the number of values stored.
 *
 * NOTE: the do/while body runs before i_last is tested, and
 * x264_coeff_last<num>() returns -1 for an all-zero block, so the block
 * must contain at least one nonzero coefficient; otherwise dct[-1] is read
 * and a shift by -1 is evaluated. */
#define level_run(num)\
static int x264_coeff_level_run##num( dctcoef *dct, x264_run_level_t *runlevel )\
{\
    int i_last = runlevel->last = x264_coeff_last##num(dct);\
    int i_total = 0;\
    int mask = 0;\
    do\
    {\
        runlevel->level[i_total++] = dct[i_last];\
        mask |= 1 << (i_last);\
        while( --i_last >= 0 && dct[i_last] == 0 );\
    } while( i_last >= 0 );\
    runlevel->mask = mask;\
    return i_total;\
}

level_run(4)
level_run(8)
level_run(15)
level_run(16)
391
/* INIT_TRELLIS(cpu): on x86-64 builds, point the six trellis_cabac_* entries
 * of `pf` (which must be in scope where the macro is used) at the routines
 * carrying the given instruction-set suffix; on other builds it does
 * nothing.  Both variants are wrapped in do/while(0) so the macro is a
 * single statement and stays correct under an unbraced if/else. */
#if ARCH_X86_64
#define INIT_TRELLIS(cpu)\
    do {\
        pf->trellis_cabac_4x4 = x264_trellis_cabac_4x4_##cpu;\
        pf->trellis_cabac_8x8 = x264_trellis_cabac_8x8_##cpu;\
        pf->trellis_cabac_4x4_psy = x264_trellis_cabac_4x4_psy_##cpu;\
        pf->trellis_cabac_8x8_psy = x264_trellis_cabac_8x8_psy_##cpu;\
        pf->trellis_cabac_dc = x264_trellis_cabac_dc_##cpu;\
        pf->trellis_cabac_chroma_422_dc = x264_trellis_cabac_chroma_422_dc_##cpu;\
    } while( 0 )
#else
#define INIT_TRELLIS(...) do {} while( 0 )
#endif
403
/* Fill the quantization function table `pf`.
 *
 * The portable C implementations from this file are installed first.  The
 * entries are then overridden with architecture-specific routines according
 * to the flags set in `cpu`; the checks run in a fixed order, so a later
 * matching block replaces pointers set by an earlier one.  `h` is only
 * consulted for h->param.i_cqm_preset when choosing dequant routines.
 * At the end the remaining coeff_last / coeff_level_run categories are
 * aliased to the luma entries. */
void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
{
    /* C reference implementations. */
    pf->quant_8x8 = quant_8x8;
    pf->quant_4x4 = quant_4x4;
    pf->quant_4x4_dc = quant_4x4_dc;
    pf->quant_2x2_dc = quant_2x2_dc;

    pf->dequant_4x4 = dequant_4x4;
    pf->dequant_4x4_dc = dequant_4x4_dc;
    pf->dequant_8x8 = dequant_8x8;

    pf->idct_dequant_2x4_dc = idct_dequant_2x4_dc;
    pf->idct_dequant_2x4_dconly = idct_dequant_2x4_dconly;

    pf->optimize_chroma_2x2_dc = optimize_chroma_2x2_dc;
    pf->optimize_chroma_2x4_dc = optimize_chroma_2x4_dc;

    pf->denoise_dct = x264_denoise_dct;
    pf->decimate_score15 = x264_decimate_score15;
    pf->decimate_score16 = x264_decimate_score16;
    pf->decimate_score64 = x264_decimate_score64;

    pf->coeff_last4 = x264_coeff_last4;
    pf->coeff_last8 = x264_coeff_last8;
    pf->coeff_last[  DCT_LUMA_AC] = x264_coeff_last15;
    pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16;
    pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64;
    pf->coeff_level_run4 = x264_coeff_level_run4;
    pf->coeff_level_run8 = x264_coeff_level_run8;
    pf->coeff_level_run[  DCT_LUMA_AC] = x264_coeff_level_run15;
    pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16;

    /* High-bit-depth builds: only x86 overrides are present in this branch. */
#if HIGH_BIT_DEPTH
#if HAVE_MMX
    INIT_TRELLIS( sse2 );
    if( cpu&X264_CPU_MMX2 )
    {
#if ARCH_X86
        pf->denoise_dct = x264_denoise_dct_mmx;
        pf->decimate_score15 = x264_decimate_score15_mmx2;
        pf->decimate_score16 = x264_decimate_score16_mmx2;
        if( cpu&X264_CPU_SLOW_CTZ )
        {
            pf->decimate_score15 = x264_decimate_score15_mmx2_slowctz;
            pf->decimate_score16 = x264_decimate_score16_mmx2_slowctz;
        }
        pf->decimate_score64 = x264_decimate_score64_mmx2;
        pf->coeff_last8 = x264_coeff_last8_mmx2;
        pf->coeff_last[  DCT_LUMA_AC] = x264_coeff_last15_mmx2;
        pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2;
        pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmx2;
        pf->coeff_level_run8 = x264_coeff_level_run8_mmx2;
        pf->coeff_level_run[  DCT_LUMA_AC] = x264_coeff_level_run15_mmx2;
        pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2;
#endif
        pf->coeff_last4 = x264_coeff_last4_mmx2;
        pf->coeff_level_run4 = x264_coeff_level_run4_mmx2;
        if( cpu&X264_CPU_LZCNT )
            pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt;
    }
    if( cpu&X264_CPU_SSE2 )
    {
        pf->quant_4x4 = x264_quant_4x4_sse2;
        pf->quant_8x8 = x264_quant_8x8_sse2;
        pf->quant_2x2_dc = x264_quant_2x2_dc_sse2;
        pf->quant_4x4_dc = x264_quant_4x4_dc_sse2;
        pf->dequant_4x4 = x264_dequant_4x4_sse2;
        pf->dequant_8x8 = x264_dequant_8x8_sse2;
        pf->dequant_4x4_dc = x264_dequant_4x4dc_sse2;
        pf->denoise_dct = x264_denoise_dct_sse2;
        pf->decimate_score15 = x264_decimate_score15_sse2;
        pf->decimate_score16 = x264_decimate_score16_sse2;
        pf->decimate_score64 = x264_decimate_score64_sse2;
        if( cpu&X264_CPU_SLOW_CTZ )
        {
            pf->decimate_score15 = x264_decimate_score15_sse2_slowctz;
            pf->decimate_score16 = x264_decimate_score16_sse2_slowctz;
        }
        pf->coeff_last8 = x264_coeff_last8_sse2;
        pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
        pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
        pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
        pf->coeff_level_run8 = x264_coeff_level_run8_sse2;
        pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2;
        pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
        if( cpu&X264_CPU_LZCNT )
        {
            pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt;
            pf->coeff_last8 = x264_coeff_last8_sse2_lzcnt;
            pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt;
            pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt;
            pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt;
            pf->coeff_level_run8 = x264_coeff_level_run8_sse2_lzcnt;
            pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt;
            pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt;
        }
    }
    if( cpu&X264_CPU_SSSE3 )
    {
        pf->quant_4x4 = x264_quant_4x4_ssse3;
        pf->quant_8x8 = x264_quant_8x8_ssse3;
        pf->quant_2x2_dc = x264_quant_2x2_dc_ssse3;
        pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3;
        pf->denoise_dct = x264_denoise_dct_ssse3;
        pf->decimate_score15 = x264_decimate_score15_ssse3;
        pf->decimate_score16 = x264_decimate_score16_ssse3;
        if( cpu&X264_CPU_SLOW_CTZ )
        {
            pf->decimate_score15 = x264_decimate_score15_ssse3_slowctz;
            pf->decimate_score16 = x264_decimate_score16_ssse3_slowctz;
        }
        pf->decimate_score64 = x264_decimate_score64_ssse3;
        INIT_TRELLIS( ssse3 );
    }
    if( cpu&X264_CPU_SSE4 )
    {
        pf->quant_2x2_dc = x264_quant_2x2_dc_sse4;
        pf->quant_4x4_dc = x264_quant_4x4_dc_sse4;
        pf->quant_4x4 = x264_quant_4x4_sse4;
        pf->quant_8x8 = x264_quant_8x8_sse4;
    }
    if( cpu&X264_CPU_AVX )
    {
        pf->denoise_dct = x264_denoise_dct_avx;
    }
    if( cpu&X264_CPU_XOP )
    {
        pf->dequant_4x4_dc = x264_dequant_4x4dc_xop;
        if( h->param.i_cqm_preset != X264_CQM_FLAT )
        {
            pf->dequant_4x4 = x264_dequant_4x4_xop;
            pf->dequant_8x8 = x264_dequant_8x8_xop;
        }
    }
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
    /* Non-high-bit-depth builds: x86, AltiVec and ARM overrides. */
#if HAVE_MMX
    INIT_TRELLIS( sse2 );
    if( cpu&X264_CPU_MMX )
    {
#if ARCH_X86
        pf->quant_4x4 = x264_quant_4x4_mmx;
        pf->quant_8x8 = x264_quant_8x8_mmx;
        pf->dequant_4x4 = x264_dequant_4x4_mmx;
        pf->dequant_4x4_dc = x264_dequant_4x4dc_mmx2;
        pf->dequant_8x8 = x264_dequant_8x8_mmx;
        if( h->param.i_cqm_preset == X264_CQM_FLAT )
        {
            pf->dequant_4x4 = x264_dequant_4x4_flat16_mmx;
            pf->dequant_8x8 = x264_dequant_8x8_flat16_mmx;
        }
        pf->denoise_dct = x264_denoise_dct_mmx;
#endif
    }

    if( cpu&X264_CPU_MMX2 )
    {
        pf->quant_2x2_dc = x264_quant_2x2_dc_mmx2;
#if ARCH_X86
        pf->quant_4x4_dc = x264_quant_4x4_dc_mmx2;
        pf->decimate_score15 = x264_decimate_score15_mmx2;
        pf->decimate_score16 = x264_decimate_score16_mmx2;
        if( cpu&X264_CPU_SLOW_CTZ )
        {
            pf->decimate_score15 = x264_decimate_score15_mmx2_slowctz;
            pf->decimate_score16 = x264_decimate_score16_mmx2_slowctz;
        }
        pf->decimate_score64 = x264_decimate_score64_mmx2;
        pf->coeff_last[  DCT_LUMA_AC] = x264_coeff_last15_mmx2;
        pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2;
        pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmx2;
        pf->coeff_level_run[  DCT_LUMA_AC] = x264_coeff_level_run15_mmx2;
        pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2;
#endif
        pf->coeff_last4 = x264_coeff_last4_mmx2;
        pf->coeff_last8 = x264_coeff_last8_mmx2;
        pf->coeff_level_run4 = x264_coeff_level_run4_mmx2;
        pf->coeff_level_run8 = x264_coeff_level_run8_mmx2;
        if( cpu&X264_CPU_LZCNT )
        {
            pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt;
            pf->coeff_last8 = x264_coeff_last8_mmx2_lzcnt;
            pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt;
            pf->coeff_level_run8 = x264_coeff_level_run8_mmx2_lzcnt;
        }
    }

    if( cpu&X264_CPU_SSE2 )
    {
        pf->quant_4x4_dc = x264_quant_4x4_dc_sse2;
        pf->quant_4x4 = x264_quant_4x4_sse2;
        pf->quant_8x8 = x264_quant_8x8_sse2;
        pf->dequant_4x4 = x264_dequant_4x4_sse2;
        pf->dequant_4x4_dc = x264_dequant_4x4dc_sse2;
        pf->dequant_8x8 = x264_dequant_8x8_sse2;
        if( h->param.i_cqm_preset == X264_CQM_FLAT )
        {
            pf->dequant_4x4 = x264_dequant_4x4_flat16_sse2;
            pf->dequant_8x8 = x264_dequant_8x8_flat16_sse2;
        }
        pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_sse2;
        pf->denoise_dct = x264_denoise_dct_sse2;
        pf->decimate_score15 = x264_decimate_score15_sse2;
        pf->decimate_score16 = x264_decimate_score16_sse2;
        pf->decimate_score64 = x264_decimate_score64_sse2;
        if( cpu&X264_CPU_SLOW_CTZ )
        {
            pf->decimate_score15 = x264_decimate_score15_sse2_slowctz;
            pf->decimate_score16 = x264_decimate_score16_sse2_slowctz;
        }
        pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
        pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
        pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
        pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2;
        pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
        if( cpu&X264_CPU_LZCNT )
        {
            pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt;
            pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt;
            pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt;
            pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt;
            pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt;
        }
    }

    if( cpu&X264_CPU_SSSE3 )
    {
        pf->quant_2x2_dc = x264_quant_2x2_dc_ssse3;
        pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3;
        pf->quant_4x4 = x264_quant_4x4_ssse3;
        pf->quant_8x8 = x264_quant_8x8_ssse3;
        pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_ssse3;
        pf->denoise_dct = x264_denoise_dct_ssse3;
        pf->decimate_score15 = x264_decimate_score15_ssse3;
        pf->decimate_score16 = x264_decimate_score16_ssse3;
        if( cpu&X264_CPU_SLOW_CTZ )
        {
            pf->decimate_score15 = x264_decimate_score15_ssse3_slowctz;
            pf->decimate_score16 = x264_decimate_score16_ssse3_slowctz;
        }
        pf->decimate_score64 = x264_decimate_score64_ssse3;
        INIT_TRELLIS( ssse3 );
    }

    if( cpu&X264_CPU_SSE4 )
    {
        pf->quant_4x4_dc = x264_quant_4x4_dc_sse4;
        pf->quant_4x4 = x264_quant_4x4_sse4;
        pf->quant_8x8 = x264_quant_8x8_sse4;
        pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_sse4;
    }

    if( cpu&X264_CPU_AVX )
    {
        pf->dequant_4x4_dc = x264_dequant_4x4dc_avx;
        if( h->param.i_cqm_preset != X264_CQM_FLAT )
        {
            pf->dequant_4x4 = x264_dequant_4x4_avx;
            pf->dequant_8x8 = x264_dequant_8x8_avx;
        }
        pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_avx;
        pf->denoise_dct = x264_denoise_dct_avx;
    }

    if( cpu&X264_CPU_XOP )
    {
        if( h->param.i_cqm_preset != X264_CQM_FLAT )
        {
            pf->dequant_4x4 = x264_dequant_4x4_xop;
            pf->dequant_8x8 = x264_dequant_8x8_xop;
        }
    }
#endif // HAVE_MMX

#if HAVE_ALTIVEC
    if( cpu&X264_CPU_ALTIVEC ) {
        pf->quant_2x2_dc = x264_quant_2x2_dc_altivec;
        pf->quant_4x4_dc = x264_quant_4x4_dc_altivec;
        pf->quant_4x4 = x264_quant_4x4_altivec;
        pf->quant_8x8 = x264_quant_8x8_altivec;

        pf->dequant_4x4 = x264_dequant_4x4_altivec;
        pf->dequant_8x8 = x264_dequant_8x8_altivec;
    }
#endif

#if HAVE_ARMV6
    if( cpu&X264_CPU_ARMV6 )
        pf->coeff_last4 = x264_coeff_last4_arm;

    if( cpu&X264_CPU_NEON )
    {
        pf->quant_2x2_dc   = x264_quant_2x2_dc_neon;
        pf->quant_4x4      = x264_quant_4x4_neon;
        pf->quant_4x4_dc   = x264_quant_4x4_dc_neon;
        pf->quant_8x8      = x264_quant_8x8_neon;
        pf->dequant_4x4    = x264_dequant_4x4_neon;
        pf->dequant_4x4_dc = x264_dequant_4x4_dc_neon;
        pf->dequant_8x8    = x264_dequant_8x8_neon;
        pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_neon;
        pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_neon;
        pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_neon;
    }
#endif
#endif // HIGH_BIT_DEPTH
    /* The DC and chroma categories reuse whichever luma routine was
     * selected above for the matching block size. */
    pf->coeff_last[DCT_LUMA_DC]     = pf->coeff_last[DCT_CHROMAU_DC]  = pf->coeff_last[DCT_CHROMAV_DC] =
    pf->coeff_last[DCT_CHROMAU_4x4] = pf->coeff_last[DCT_CHROMAV_4x4] = pf->coeff_last[DCT_LUMA_4x4];
    pf->coeff_last[DCT_CHROMA_AC]   = pf->coeff_last[DCT_CHROMAU_AC]  =
    pf->coeff_last[DCT_CHROMAV_AC]  = pf->coeff_last[DCT_LUMA_AC];
    pf->coeff_last[DCT_CHROMAU_8x8] = pf->coeff_last[DCT_CHROMAV_8x8] = pf->coeff_last[DCT_LUMA_8x8];

    pf->coeff_level_run[DCT_LUMA_DC]     = pf->coeff_level_run[DCT_CHROMAU_DC]  = pf->coeff_level_run[DCT_CHROMAV_DC] =
    pf->coeff_level_run[DCT_CHROMAU_4x4] = pf->coeff_level_run[DCT_CHROMAV_4x4] = pf->coeff_level_run[DCT_LUMA_4x4];
    pf->coeff_level_run[DCT_CHROMA_AC]   = pf->coeff_level_run[DCT_CHROMAU_AC]  =
    pf->coeff_level_run[DCT_CHROMAV_AC]  = pf->coeff_level_run[DCT_LUMA_AC];
}