]> git.sesse.net Git - x264/blob - common/quant.c
Bump dates to 2015
[x264] / common / quant.c
1 /*****************************************************************************
2  * quant.c: quantization and level-run
3  *****************************************************************************
4  * Copyright (C) 2005-2015 x264 project
5  *
6  * Authors: Loren Merritt <lorenm@u.washington.edu>
7  *          Fiona Glaser <fiona@x264.com>
8  *          Christian Heine <sennindemokrit@gmx.net>
9  *          Henrik Gramner <henrik@gramner.com>
10  *
11  * This program is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU General Public License as published by
13  * the Free Software Foundation; either version 2 of the License, or
14  * (at your option) any later version.
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public License
22  * along with this program; if not, write to the Free Software
23  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
24  *
25  * This program is also available under a commercial proprietary license.
26  * For more information, contact us at licensing@x264.com.
27  *****************************************************************************/
28
29 #include "common.h"
30
31 #if HAVE_MMX
32 #include "x86/quant.h"
33 #endif
34 #if ARCH_PPC
35 #   include "ppc/quant.h"
36 #endif
37 #if ARCH_ARM
38 #   include "arm/quant.h"
39 #endif
40 #if ARCH_AARCH64
41 #   include "aarch64/quant.h"
42 #endif
43
/* Quantize one coefficient in place.
 *   coef: integer lvalue holding the coefficient (evaluated several times, so
 *         it must not have side effects)
 *   mf:   multiplier applied before the >>16 scaling
 *   f:    rounding bias added to the magnitude before multiplying
 * The magnitude is quantized and the original sign is restored afterwards.
 * The result is OR-ed into a variable named `nz`, which must be in scope at
 * the point of use, so callers can tell whether any coefficient survived.
 * Wrapped in do/while(0) so that it behaves as a single statement. */
#define QUANT_ONE( coef, mf, f ) \
do { \
    if( (coef) > 0 ) \
        (coef) = ((f) + (coef)) * (mf) >> 16; \
    else \
        (coef) = - (((f) - (coef)) * (mf) >> 16); \
    nz |= (coef); \
} while( 0 )
52
53 static int quant_8x8( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] )
54 {
55     int nz = 0;
56     for( int i = 0; i < 64; i++ )
57         QUANT_ONE( dct[i], mf[i], bias[i] );
58     return !!nz;
59 }
60
61 static int quant_4x4( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] )
62 {
63     int nz = 0;
64     for( int i = 0; i < 16; i++ )
65         QUANT_ONE( dct[i], mf[i], bias[i] );
66     return !!nz;
67 }
68
69 static int quant_4x4x4( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] )
70 {
71     int nza = 0;
72     for( int j = 0; j < 4; j++ )
73     {
74         int nz = 0;
75         for( int i = 0; i < 16; i++ )
76             QUANT_ONE( dct[j][i], mf[i], bias[i] );
77         nza |= (!!nz)<<j;
78     }
79     return nza;
80 }
81
82 static int quant_4x4_dc( dctcoef dct[16], int mf, int bias )
83 {
84     int nz = 0;
85     for( int i = 0; i < 16; i++ )
86         QUANT_ONE( dct[i], mf, bias );
87     return !!nz;
88 }
89
90 static int quant_2x2_dc( dctcoef dct[4], int mf, int bias )
91 {
92     int nz = 0;
93     QUANT_ONE( dct[0], mf, bias );
94     QUANT_ONE( dct[1], mf, bias );
95     QUANT_ONE( dct[2], mf, bias );
96     QUANT_ONE( dct[3], mf, bias );
97     return !!nz;
98 }
99
/* Dequantize dct[x] in place when the scaling exponent i_qbits is >= 0.
 * Expects dct, dequant_mf, i_mf and i_qbits in scope.
 * Scaling is done by multiplying with (1 << i_qbits) rather than shifting the
 * product, because left-shifting a negative value is undefined in C; the
 * result is the same for every product that does not overflow. */
#define DEQUANT_SHL( x ) \
    ( dct[x] = ( dct[x] * dequant_mf[i_mf][x] ) * ( 1 << i_qbits ) )

/* Dequantize dct[x] in place when i_qbits is negative: add the rounding
 * term f and shift right by -i_qbits.
 * Expects dct, dequant_mf, i_mf, i_qbits and f in scope. */
#define DEQUANT_SHR( x ) \
    ( dct[x] = ( dct[x] * dequant_mf[i_mf][x] + f ) >> (-i_qbits) )
105
106 static void dequant_4x4( dctcoef dct[16], int dequant_mf[6][16], int i_qp )
107 {
108     const int i_mf = i_qp%6;
109     const int i_qbits = i_qp/6 - 4;
110
111     if( i_qbits >= 0 )
112     {
113         for( int i = 0; i < 16; i++ )
114             DEQUANT_SHL( i );
115     }
116     else
117     {
118         const int f = 1 << (-i_qbits-1);
119         for( int i = 0; i < 16; i++ )
120             DEQUANT_SHR( i );
121     }
122 }
123
124 static void dequant_8x8( dctcoef dct[64], int dequant_mf[6][64], int i_qp )
125 {
126     const int i_mf = i_qp%6;
127     const int i_qbits = i_qp/6 - 6;
128
129     if( i_qbits >= 0 )
130     {
131         for( int i = 0; i < 64; i++ )
132             DEQUANT_SHL( i );
133     }
134     else
135     {
136         const int f = 1 << (-i_qbits-1);
137         for( int i = 0; i < 64; i++ )
138             DEQUANT_SHR( i );
139     }
140 }
141
142 static void dequant_4x4_dc( dctcoef dct[16], int dequant_mf[6][16], int i_qp )
143 {
144     const int i_qbits = i_qp/6 - 6;
145
146     if( i_qbits >= 0 )
147     {
148         const int i_dmf = dequant_mf[i_qp%6][0] << i_qbits;
149         for( int i = 0; i < 16; i++ )
150             dct[i] *= i_dmf;
151     }
152     else
153     {
154         const int i_dmf = dequant_mf[i_qp%6][0];
155         const int f = 1 << (-i_qbits-1);
156         for( int i = 0; i < 16; i++ )
157             dct[i] = ( dct[i] * i_dmf + f ) >> (-i_qbits);
158     }
159 }
160
/* Declares the butterfly intermediates for the 2x4 transform of dct[0..7].
 * First stage: pairwise sums (sNM) and differences (dNM) of adjacent inputs.
 * Second stage: b0..b7, the values consumed by the code following the macro.
 * Every input is read before the macro ends, so the user may overwrite dct
 * afterwards. Expects an array named dct in scope. */
#define IDCT_DEQUANT_2X4_START \
    int s01 = dct[0] + dct[1], d01 = dct[0] - dct[1]; \
    int s23 = dct[2] + dct[3], d23 = dct[2] - dct[3]; \
    int s45 = dct[4] + dct[5], d45 = dct[4] - dct[5]; \
    int s67 = dct[6] + dct[7], d67 = dct[6] - dct[7]; \
    int b0 = s01 + s23, b4 = s01 - s23; \
    int b1 = s45 + s67, b5 = s45 - s67; \
    int b2 = d01 + d23, b6 = d01 - d23; \
    int b3 = d45 + d67, b7 = d45 - d67;
178
179 static void idct_dequant_2x4_dc( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp )
180 {
181     IDCT_DEQUANT_2X4_START
182     int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
183     dct4x4[0][0] = ((b0 + b1) * dmf + 32) >> 6;
184     dct4x4[1][0] = ((b2 + b3) * dmf + 32) >> 6;
185     dct4x4[2][0] = ((b0 - b1) * dmf + 32) >> 6;
186     dct4x4[3][0] = ((b2 - b3) * dmf + 32) >> 6;
187     dct4x4[4][0] = ((b4 - b5) * dmf + 32) >> 6;
188     dct4x4[5][0] = ((b6 - b7) * dmf + 32) >> 6;
189     dct4x4[6][0] = ((b4 + b5) * dmf + 32) >> 6;
190     dct4x4[7][0] = ((b6 + b7) * dmf + 32) >> 6;
191 }
192
193 static void idct_dequant_2x4_dconly( dctcoef dct[8], int dequant_mf[6][16], int i_qp )
194 {
195     IDCT_DEQUANT_2X4_START
196     int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
197     dct[0] = ((b0 + b1) * dmf + 32) >> 6;
198     dct[1] = ((b2 + b3) * dmf + 32) >> 6;
199     dct[2] = ((b0 - b1) * dmf + 32) >> 6;
200     dct[3] = ((b2 - b3) * dmf + 32) >> 6;
201     dct[4] = ((b4 - b5) * dmf + 32) >> 6;
202     dct[5] = ((b6 - b7) * dmf + 32) >> 6;
203     dct[6] = ((b4 + b5) * dmf + 32) >> 6;
204     dct[7] = ((b6 + b7) * dmf + 32) >> 6;
205 }
206
207 static ALWAYS_INLINE void optimize_chroma_idct_dequant_2x4( dctcoef out[8], dctcoef dct[8], int dmf )
208 {
209     IDCT_DEQUANT_2X4_START
210     out[0] = ((b0 + b1) * dmf + 2080) >> 6; /* 2080 = 32 + (32<<6) */
211     out[1] = ((b2 + b3) * dmf + 2080) >> 6;
212     out[2] = ((b0 - b1) * dmf + 2080) >> 6;
213     out[3] = ((b2 - b3) * dmf + 2080) >> 6;
214     out[4] = ((b4 - b5) * dmf + 2080) >> 6;
215     out[5] = ((b6 - b7) * dmf + 2080) >> 6;
216     out[6] = ((b4 + b5) * dmf + 2080) >> 6;
217     out[7] = ((b6 + b7) * dmf + 2080) >> 6;
218 }
219 #undef IDCT_DEQUANT_2X4_START
220
221 static ALWAYS_INLINE void optimize_chroma_idct_dequant_2x2( dctcoef out[4], dctcoef dct[4], int dmf )
222 {
223     int d0 = dct[0] + dct[1];
224     int d1 = dct[2] + dct[3];
225     int d2 = dct[0] - dct[1];
226     int d3 = dct[2] - dct[3];
227     out[0] = ((d0 + d1) * dmf >> 5) + 32;
228     out[1] = ((d0 - d1) * dmf >> 5) + 32;
229     out[2] = ((d2 + d3) * dmf >> 5) + 32;
230     out[3] = ((d2 - d3) * dmf >> 5) + 32;
231 }
232
233 static ALWAYS_INLINE int optimize_chroma_round( dctcoef *ref, dctcoef *dct, int dequant_mf, int chroma422 )
234 {
235     dctcoef out[8];
236
237     if( chroma422 )
238         optimize_chroma_idct_dequant_2x4( out, dct, dequant_mf );
239     else
240         optimize_chroma_idct_dequant_2x2( out, dct, dequant_mf );
241
242     int sum = 0;
243     for( int i = 0; i < (chroma422?8:4); i++ )
244         sum |= ref[i] ^ out[i];
245     return sum >> 6;
246 }
247
248 static ALWAYS_INLINE int optimize_chroma_dc_internal( dctcoef *dct, int dequant_mf, int chroma422 )
249 {
250     /* dequant_mf = h->dequant4_mf[CQM_4IC + b_inter][i_qp%6][0] << i_qp/6, max 32*64 */
251     dctcoef dct_orig[8];
252     int coeff, nz;
253
254     if( chroma422 )
255         optimize_chroma_idct_dequant_2x4( dct_orig, dct, dequant_mf );
256     else
257         optimize_chroma_idct_dequant_2x2( dct_orig, dct, dequant_mf );
258
259     /* If the DC coefficients already round to zero, terminate early. */
260     int sum = 0;
261     for( int i = 0; i < (chroma422?8:4); i++ )
262         sum |= dct_orig[i];
263     if( !(sum >> 6) )
264         return 0;
265
266     /* Start with the highest frequency coefficient... is this the best option? */
267     for( nz = 0, coeff = (chroma422?7:3); coeff >= 0; coeff-- )
268     {
269         int level = dct[coeff];
270         int sign = level>>31 | 1; /* dct[coeff] < 0 ? -1 : 1 */
271
272         while( level )
273         {
274             dct[coeff] = level - sign;
275             if( optimize_chroma_round( dct_orig, dct, dequant_mf, chroma422 ) )
276             {
277                 nz = 1;
278                 dct[coeff] = level;
279                 break;
280             }
281             level -= sign;
282         }
283     }
284
285     return nz;
286 }
287
288 static int optimize_chroma_2x2_dc( dctcoef dct[4], int dequant_mf )
289 {
290     return optimize_chroma_dc_internal( dct, dequant_mf, 0 );
291 }
292
293 static int optimize_chroma_2x4_dc( dctcoef dct[8], int dequant_mf )
294 {
295     return optimize_chroma_dc_internal( dct, dequant_mf, 1 );
296 }
297
298 static void x264_denoise_dct( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size )
299 {
300     for( int i = 0; i < size; i++ )
301     {
302         int level = dct[i];
303         int sign = level>>31;
304         level = (level+sign)^sign;
305         sum[i] += level;
306         level -= offset[i];
307         dct[i] = level<0 ? 0 : (level^sign)-sign;
308     }
309 }
310
/* (ref: JVT-B118)
 * x264_decimate_score*: given the DCT coefficients of a block, return a score
 * used to decide whether the whole block can be zeroed out
 * (a low score means the block may be set to zero).
 * Used for inter macroblocks (luma and chroma):
 *  luma: for an 8x8 block: if score < 4 -> zero it
 *        for the complete mb: if score < 6 -> zero it
 *  chroma: for the complete mb: if score < 7 -> zero it
 */
319
/* Score contribution for blocks of up to 16 coefficients, indexed by the
 * length of the run of zeros counted by x264_decimate_score_internal(). */
const uint8_t x264_decimate_table4[16] =
{
    3,                   /* run  0     */
    2,2,                 /* runs 1-2   */
    1,1,1,               /* runs 3-5   */
    0,0,0,0,0,0,0,0,0,0  /* runs 6-15  */
};
/* Score contribution for 64-coefficient blocks, indexed by the length of the
 * run of zeros counted by x264_decimate_score_internal(). */
const uint8_t x264_decimate_table8[64] =
{
    3,3,3,3,                          /* runs  0-3  */
    2,2,2,2,2,2,2,2,                  /* runs  4-11 */
    1,1,1,1,1,1,1,1,1,1,1,1,          /* runs 12-23 */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  /* runs 24-39 */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  /* runs 40-55 */
    0,0,0,0,0,0,0,0                   /* runs 56-63 */
};
331
332 static int ALWAYS_INLINE x264_decimate_score_internal( dctcoef *dct, int i_max )
333 {
334     const uint8_t *ds_table = (i_max == 64) ? x264_decimate_table8 : x264_decimate_table4;
335     int i_score = 0;
336     int idx = i_max - 1;
337
338     while( idx >= 0 && dct[idx] == 0 )
339         idx--;
340     while( idx >= 0 )
341     {
342         int i_run;
343
344         if( (unsigned)(dct[idx--] + 1) > 2 )
345             return 9;
346
347         i_run = 0;
348         while( idx >= 0 && dct[idx] == 0 )
349         {
350             idx--;
351             i_run++;
352         }
353         i_score += ds_table[i_run];
354     }
355
356     return i_score;
357 }
358
359 static int x264_decimate_score15( dctcoef *dct )
360 {
361     return x264_decimate_score_internal( dct+1, 15 );
362 }
363 static int x264_decimate_score16( dctcoef *dct )
364 {
365     return x264_decimate_score_internal( dct, 16 );
366 }
367 static int x264_decimate_score64( dctcoef *dct )
368 {
369     return x264_decimate_score_internal( dct, 64 );
370 }
371
372 #define last(num)\
373 static int x264_coeff_last##num( dctcoef *l )\
374 {\
375     int i_last = num-1;\
376     while( i_last >= 0 && l[i_last] == 0 )\
377         i_last--;\
378     return i_last;\
379 }
380
381 last(4)
382 last(8)
383 last(15)
384 last(16)
385 last(64)
386
387 #define level_run(num)\
388 static int x264_coeff_level_run##num( dctcoef *dct, x264_run_level_t *runlevel )\
389 {\
390     int i_last = runlevel->last = x264_coeff_last##num(dct);\
391     int i_total = 0;\
392     int mask = 0;\
393     do\
394     {\
395         runlevel->level[i_total++] = dct[i_last];\
396         mask |= 1 << (i_last);\
397         while( --i_last >= 0 && dct[i_last] == 0 );\
398     } while( i_last >= 0 );\
399     runlevel->mask = mask;\
400     return i_total;\
401 }
402
403 level_run(4)
404 level_run(8)
405 level_run(15)
406 level_run(16)
407
/* INIT_TRELLIS(cpu): point the six trellis_cabac_* members of pf at the
 * x264_trellis_cabac_*_<cpu> functions. Expects a pointer named pf in scope.
 * Expands to nothing unless ARCH_X86_64 is set. */
#if ARCH_X86_64
#define INIT_TRELLIS(cpu)\
    pf->trellis_cabac_dc = x264_trellis_cabac_dc_##cpu;\
    pf->trellis_cabac_chroma_422_dc = x264_trellis_cabac_chroma_422_dc_##cpu;\
    pf->trellis_cabac_4x4 = x264_trellis_cabac_4x4_##cpu;\
    pf->trellis_cabac_4x4_psy = x264_trellis_cabac_4x4_psy_##cpu;\
    pf->trellis_cabac_8x8 = x264_trellis_cabac_8x8_##cpu;\
    pf->trellis_cabac_8x8_psy = x264_trellis_cabac_8x8_psy_##cpu;
#else
#define INIT_TRELLIS(...)
#endif
419
420 void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
421 {
422     pf->quant_8x8 = quant_8x8;
423     pf->quant_4x4 = quant_4x4;
424     pf->quant_4x4x4 = quant_4x4x4;
425     pf->quant_4x4_dc = quant_4x4_dc;
426     pf->quant_2x2_dc = quant_2x2_dc;
427
428     pf->dequant_4x4 = dequant_4x4;
429     pf->dequant_4x4_dc = dequant_4x4_dc;
430     pf->dequant_8x8 = dequant_8x8;
431
432     pf->idct_dequant_2x4_dc = idct_dequant_2x4_dc;
433     pf->idct_dequant_2x4_dconly = idct_dequant_2x4_dconly;
434
435     pf->optimize_chroma_2x2_dc = optimize_chroma_2x2_dc;
436     pf->optimize_chroma_2x4_dc = optimize_chroma_2x4_dc;
437
438     pf->denoise_dct = x264_denoise_dct;
439     pf->decimate_score15 = x264_decimate_score15;
440     pf->decimate_score16 = x264_decimate_score16;
441     pf->decimate_score64 = x264_decimate_score64;
442
443     pf->coeff_last4 = x264_coeff_last4;
444     pf->coeff_last8 = x264_coeff_last8;
445     pf->coeff_last[  DCT_LUMA_AC] = x264_coeff_last15;
446     pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16;
447     pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64;
448     pf->coeff_level_run4 = x264_coeff_level_run4;
449     pf->coeff_level_run8 = x264_coeff_level_run8;
450     pf->coeff_level_run[  DCT_LUMA_AC] = x264_coeff_level_run15;
451     pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16;
452
453 #if HIGH_BIT_DEPTH
454 #if HAVE_MMX
455     INIT_TRELLIS( sse2 );
456     if( cpu&X264_CPU_MMX2 )
457     {
458 #if ARCH_X86
459         pf->denoise_dct = x264_denoise_dct_mmx;
460         pf->decimate_score15 = x264_decimate_score15_mmx2;
461         pf->decimate_score16 = x264_decimate_score16_mmx2;
462         pf->decimate_score64 = x264_decimate_score64_mmx2;
463         pf->coeff_last8 = x264_coeff_last8_mmx2;
464         pf->coeff_last[  DCT_LUMA_AC] = x264_coeff_last15_mmx2;
465         pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2;
466         pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmx2;
467         pf->coeff_level_run8 = x264_coeff_level_run8_mmx2;
468         pf->coeff_level_run[  DCT_LUMA_AC] = x264_coeff_level_run15_mmx2;
469         pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2;
470 #endif
471         pf->coeff_last4 = x264_coeff_last4_mmx2;
472         pf->coeff_level_run4 = x264_coeff_level_run4_mmx2;
473         if( cpu&X264_CPU_LZCNT )
474             pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt;
475     }
476     if( cpu&X264_CPU_SSE2 )
477     {
478         pf->quant_4x4 = x264_quant_4x4_sse2;
479         pf->quant_4x4x4 = x264_quant_4x4x4_sse2;
480         pf->quant_8x8 = x264_quant_8x8_sse2;
481         pf->quant_2x2_dc = x264_quant_2x2_dc_sse2;
482         pf->quant_4x4_dc = x264_quant_4x4_dc_sse2;
483         pf->dequant_4x4 = x264_dequant_4x4_sse2;
484         pf->dequant_8x8 = x264_dequant_8x8_sse2;
485         pf->dequant_4x4_dc = x264_dequant_4x4dc_sse2;
486         pf->denoise_dct = x264_denoise_dct_sse2;
487         pf->decimate_score15 = x264_decimate_score15_sse2;
488         pf->decimate_score16 = x264_decimate_score16_sse2;
489         pf->decimate_score64 = x264_decimate_score64_sse2;
490         pf->coeff_last8 = x264_coeff_last8_sse2;
491         pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
492         pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
493         pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
494         pf->coeff_level_run8 = x264_coeff_level_run8_sse2;
495         pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2;
496         pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
497         if( cpu&X264_CPU_LZCNT )
498         {
499             pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt;
500             pf->coeff_last8 = x264_coeff_last8_sse2_lzcnt;
501             pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt;
502             pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt;
503             pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt;
504             pf->coeff_level_run8 = x264_coeff_level_run8_sse2_lzcnt;
505             pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt;
506             pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt;
507         }
508     }
509     if( cpu&X264_CPU_SSSE3 )
510     {
511         pf->quant_4x4 = x264_quant_4x4_ssse3;
512         pf->quant_4x4x4 = x264_quant_4x4x4_ssse3;
513         pf->quant_8x8 = x264_quant_8x8_ssse3;
514         pf->quant_2x2_dc = x264_quant_2x2_dc_ssse3;
515         pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3;
516         pf->denoise_dct = x264_denoise_dct_ssse3;
517         pf->decimate_score15 = x264_decimate_score15_ssse3;
518         pf->decimate_score16 = x264_decimate_score16_ssse3;
519         pf->decimate_score64 = x264_decimate_score64_ssse3;
520         INIT_TRELLIS( ssse3 );
521     }
522     if( cpu&X264_CPU_SSE4 )
523     {
524         pf->quant_2x2_dc = x264_quant_2x2_dc_sse4;
525         pf->quant_4x4_dc = x264_quant_4x4_dc_sse4;
526         pf->quant_4x4 = x264_quant_4x4_sse4;
527         pf->quant_4x4x4 = x264_quant_4x4x4_sse4;
528         pf->quant_8x8 = x264_quant_8x8_sse4;
529     }
530     if( cpu&X264_CPU_AVX )
531     {
532         pf->denoise_dct = x264_denoise_dct_avx;
533     }
534     if( cpu&X264_CPU_XOP )
535     {
536         pf->dequant_4x4_dc = x264_dequant_4x4dc_xop;
537         if( h->param.i_cqm_preset != X264_CQM_FLAT )
538         {
539             pf->dequant_4x4 = x264_dequant_4x4_xop;
540             pf->dequant_8x8 = x264_dequant_8x8_xop;
541         }
542     }
543     if( cpu&X264_CPU_AVX2 )
544     {
545         pf->quant_4x4 = x264_quant_4x4_avx2;
546         pf->quant_4x4_dc = x264_quant_4x4_dc_avx2;
547         pf->quant_8x8 = x264_quant_8x8_avx2;
548         pf->quant_4x4x4 = x264_quant_4x4x4_avx2;
549         pf->dequant_4x4 = x264_dequant_4x4_avx2;
550         pf->dequant_8x8 = x264_dequant_8x8_avx2;
551         pf->dequant_4x4_dc = x264_dequant_4x4dc_avx2;
552         pf->denoise_dct = x264_denoise_dct_avx2;
553     }
554 #endif // HAVE_MMX
555 #else // !HIGH_BIT_DEPTH
556 #if HAVE_MMX
557     INIT_TRELLIS( sse2 );
558     if( cpu&X264_CPU_MMX )
559     {
560 #if ARCH_X86
561         pf->dequant_4x4 = x264_dequant_4x4_mmx;
562         pf->dequant_4x4_dc = x264_dequant_4x4dc_mmx2;
563         pf->dequant_8x8 = x264_dequant_8x8_mmx;
564         if( h->param.i_cqm_preset == X264_CQM_FLAT )
565         {
566             pf->dequant_4x4 = x264_dequant_4x4_flat16_mmx;
567             pf->dequant_8x8 = x264_dequant_8x8_flat16_mmx;
568         }
569         pf->denoise_dct = x264_denoise_dct_mmx;
570 #endif
571     }
572
573     if( cpu&X264_CPU_MMX2 )
574     {
575         pf->quant_2x2_dc = x264_quant_2x2_dc_mmx2;
576 #if ARCH_X86
577         pf->quant_4x4 = x264_quant_4x4_mmx2;
578         pf->quant_8x8 = x264_quant_8x8_mmx2;
579         pf->quant_4x4_dc = x264_quant_4x4_dc_mmx2;
580         pf->decimate_score15 = x264_decimate_score15_mmx2;
581         pf->decimate_score16 = x264_decimate_score16_mmx2;
582         pf->decimate_score64 = x264_decimate_score64_mmx2;
583         pf->coeff_last[  DCT_LUMA_AC] = x264_coeff_last15_mmx2;
584         pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2;
585         pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmx2;
586         pf->coeff_level_run[  DCT_LUMA_AC] = x264_coeff_level_run15_mmx2;
587         pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2;
588 #endif
589         pf->coeff_last4 = x264_coeff_last4_mmx2;
590         pf->coeff_last8 = x264_coeff_last8_mmx2;
591         pf->coeff_level_run4 = x264_coeff_level_run4_mmx2;
592         pf->coeff_level_run8 = x264_coeff_level_run8_mmx2;
593         if( cpu&X264_CPU_LZCNT )
594         {
595             pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt;
596             pf->coeff_last8 = x264_coeff_last8_mmx2_lzcnt;
597             pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt;
598             pf->coeff_level_run8 = x264_coeff_level_run8_mmx2_lzcnt;
599         }
600     }
601
602     if( cpu&X264_CPU_SSE2 )
603     {
604         pf->quant_4x4_dc = x264_quant_4x4_dc_sse2;
605         pf->quant_4x4 = x264_quant_4x4_sse2;
606         pf->quant_4x4x4 = x264_quant_4x4x4_sse2;
607         pf->quant_8x8 = x264_quant_8x8_sse2;
608         pf->dequant_4x4 = x264_dequant_4x4_sse2;
609         pf->dequant_4x4_dc = x264_dequant_4x4dc_sse2;
610         pf->dequant_8x8 = x264_dequant_8x8_sse2;
611         if( h->param.i_cqm_preset == X264_CQM_FLAT )
612         {
613             pf->dequant_4x4 = x264_dequant_4x4_flat16_sse2;
614             pf->dequant_8x8 = x264_dequant_8x8_flat16_sse2;
615         }
616         pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_sse2;
617         pf->denoise_dct = x264_denoise_dct_sse2;
618         pf->decimate_score15 = x264_decimate_score15_sse2;
619         pf->decimate_score16 = x264_decimate_score16_sse2;
620         pf->decimate_score64 = x264_decimate_score64_sse2;
621         pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
622         pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
623         pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
624         pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2;
625         pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
626         if( cpu&X264_CPU_LZCNT )
627         {
628             pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt;
629             pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt;
630             pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt;
631             pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt;
632             pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt;
633         }
634     }
635
636     if( cpu&X264_CPU_SSSE3 )
637     {
638         pf->quant_2x2_dc = x264_quant_2x2_dc_ssse3;
639         pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3;
640         pf->quant_4x4 = x264_quant_4x4_ssse3;
641         pf->quant_4x4x4 = x264_quant_4x4x4_ssse3;
642         pf->quant_8x8 = x264_quant_8x8_ssse3;
643         pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_ssse3;
644         pf->denoise_dct = x264_denoise_dct_ssse3;
645         pf->decimate_score15 = x264_decimate_score15_ssse3;
646         pf->decimate_score16 = x264_decimate_score16_ssse3;
647         pf->decimate_score64 = x264_decimate_score64_ssse3;
648         INIT_TRELLIS( ssse3 );
649         pf->coeff_level_run4 = x264_coeff_level_run4_ssse3;
650         pf->coeff_level_run8 = x264_coeff_level_run8_ssse3;
651         pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3;
652         pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3;
653         if( cpu&X264_CPU_LZCNT )
654         {
655             pf->coeff_level_run4 = x264_coeff_level_run4_ssse3;
656             pf->coeff_level_run8 = x264_coeff_level_run8_ssse3;
657             pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3_lzcnt;
658             pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3_lzcnt;
659         }
660     }
661
662     if( cpu&X264_CPU_SSE4 )
663     {
664         pf->quant_4x4_dc = x264_quant_4x4_dc_sse4;
665         pf->quant_4x4 = x264_quant_4x4_sse4;
666         pf->quant_8x8 = x264_quant_8x8_sse4;
667         pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_sse4;
668     }
669
670     if( cpu&X264_CPU_AVX )
671     {
672         pf->dequant_4x4_dc = x264_dequant_4x4dc_avx;
673         if( h->param.i_cqm_preset != X264_CQM_FLAT )
674         {
675             pf->dequant_4x4 = x264_dequant_4x4_avx;
676             pf->dequant_8x8 = x264_dequant_8x8_avx;
677         }
678         pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_avx;
679         pf->denoise_dct = x264_denoise_dct_avx;
680     }
681
682     if( cpu&X264_CPU_XOP )
683     {
684         if( h->param.i_cqm_preset != X264_CQM_FLAT )
685         {
686             pf->dequant_4x4 = x264_dequant_4x4_xop;
687             pf->dequant_8x8 = x264_dequant_8x8_xop;
688         }
689     }
690
691     if( cpu&X264_CPU_AVX2 )
692     {
693         pf->quant_4x4 = x264_quant_4x4_avx2;
694         pf->quant_4x4_dc = x264_quant_4x4_dc_avx2;
695         pf->quant_8x8 = x264_quant_8x8_avx2;
696         pf->quant_4x4x4 = x264_quant_4x4x4_avx2;
697         pf->dequant_4x4 = x264_dequant_4x4_avx2;
698         pf->dequant_8x8 = x264_dequant_8x8_avx2;
699         pf->dequant_4x4_dc = x264_dequant_4x4dc_avx2;
700         if( h->param.i_cqm_preset == X264_CQM_FLAT )
701         {
702             pf->dequant_4x4 = x264_dequant_4x4_flat16_avx2;
703             pf->dequant_8x8 = x264_dequant_8x8_flat16_avx2;
704         }
705         pf->decimate_score64 = x264_decimate_score64_avx2;
706         pf->denoise_dct = x264_denoise_dct_avx2;
707         if( cpu&X264_CPU_LZCNT )
708         {
709             pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2_lzcnt;
710             pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_avx2_lzcnt;
711             pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_avx2_lzcnt;
712         }
713     }
714 #endif // HAVE_MMX
715
716 #if HAVE_ALTIVEC
717     if( cpu&X264_CPU_ALTIVEC )
718     {
719         pf->quant_2x2_dc = x264_quant_2x2_dc_altivec;
720         pf->quant_4x4_dc = x264_quant_4x4_dc_altivec;
721         pf->quant_4x4 = x264_quant_4x4_altivec;
722         pf->quant_8x8 = x264_quant_8x8_altivec;
723
724         pf->dequant_4x4 = x264_dequant_4x4_altivec;
725         pf->dequant_8x8 = x264_dequant_8x8_altivec;
726     }
727 #endif
728
729 #if HAVE_ARMV6
730     if( cpu&X264_CPU_ARMV6 )
731     {
732         pf->coeff_last4 = x264_coeff_last4_arm;
733         pf->coeff_last8 = x264_coeff_last8_arm;
734     }
735 #endif
736 #if HAVE_ARMV6 || ARCH_AARCH64
737     if( cpu&X264_CPU_NEON )
738     {
739         pf->quant_2x2_dc   = x264_quant_2x2_dc_neon;
740         pf->quant_4x4      = x264_quant_4x4_neon;
741         pf->quant_4x4_dc   = x264_quant_4x4_dc_neon;
742         pf->quant_4x4x4    = x264_quant_4x4x4_neon;
743         pf->quant_8x8      = x264_quant_8x8_neon;
744         pf->dequant_4x4    = x264_dequant_4x4_neon;
745         pf->dequant_4x4_dc = x264_dequant_4x4_dc_neon;
746         pf->dequant_8x8    = x264_dequant_8x8_neon;
747         pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_neon;
748         pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_neon;
749         pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_neon;
750     }
751 #endif
752 #if ARCH_AARCH64
753     if( cpu&X264_CPU_ARMV8 )
754     {
755         pf->coeff_last4 = x264_coeff_last4_aarch64;
756         pf->coeff_last8 = x264_coeff_last8_aarch64;
757         pf->coeff_level_run4 = x264_coeff_level_run4_aarch64;
758     }
759     if( cpu&X264_CPU_NEON )
760     {
761         pf->coeff_level_run8 = x264_coeff_level_run8_neon;
762         pf->coeff_level_run[  DCT_LUMA_AC] = x264_coeff_level_run15_neon;
763         pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_neon;
764         pf->decimate_score15 = x264_decimate_score15_neon;
765         pf->decimate_score16 = x264_decimate_score16_neon;
766         pf->decimate_score64 = x264_decimate_score64_neon;
767         pf->denoise_dct = x264_denoise_dct_neon;
768     }
769 #endif
770 #endif // HIGH_BIT_DEPTH
771     pf->coeff_last[DCT_LUMA_DC]     = pf->coeff_last[DCT_CHROMAU_DC]  = pf->coeff_last[DCT_CHROMAV_DC] =
772     pf->coeff_last[DCT_CHROMAU_4x4] = pf->coeff_last[DCT_CHROMAV_4x4] = pf->coeff_last[DCT_LUMA_4x4];
773     pf->coeff_last[DCT_CHROMA_AC]   = pf->coeff_last[DCT_CHROMAU_AC]  =
774     pf->coeff_last[DCT_CHROMAV_AC]  = pf->coeff_last[DCT_LUMA_AC];
775     pf->coeff_last[DCT_CHROMAU_8x8] = pf->coeff_last[DCT_CHROMAV_8x8] = pf->coeff_last[DCT_LUMA_8x8];
776
777     pf->coeff_level_run[DCT_LUMA_DC]     = pf->coeff_level_run[DCT_CHROMAU_DC]  = pf->coeff_level_run[DCT_CHROMAV_DC] =
778     pf->coeff_level_run[DCT_CHROMAU_4x4] = pf->coeff_level_run[DCT_CHROMAV_4x4] = pf->coeff_level_run[DCT_LUMA_4x4];
779     pf->coeff_level_run[DCT_CHROMA_AC]   = pf->coeff_level_run[DCT_CHROMAU_AC]  =
780     pf->coeff_level_run[DCT_CHROMAV_AC]  = pf->coeff_level_run[DCT_LUMA_AC];
781 }