1 ;*****************************************************************************
2 ;* dct-a.asm: x86 transform and zigzag
3 ;*****************************************************************************
4 ;* Copyright (C) 2003-2016 x264 project
6 ;* Authors: Holger Lubitz <holger@lubitz.org>
7 ;* Loren Merritt <lorenm@u.washington.edu>
8 ;* Laurent Aimar <fenrir@via.ecp.fr>
9 ;* Min Chen <chenm001@163.com>
10 ;* Fiona Glaser <fiona@x264.com>
12 ;* This program is free software; you can redistribute it and/or modify
13 ;* it under the terms of the GNU General Public License as published by
14 ;* the Free Software Foundation; either version 2 of the License, or
15 ;* (at your option) any later version.
17 ;* This program is distributed in the hope that it will be useful,
18 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ;* GNU General Public License for more details.
22 ;* You should have received a copy of the GNU General Public License
23 ;* along with this program; if not, write to the Free Software
24 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
26 ;* This program is also available under a commercial proprietary license.
27 ;* For more information, contact us at licensing@x264.com.
28 ;*****************************************************************************
31 %include "x86util.asm"
; Constant tables used by the DCT-DC and zigzag-scan routines below.
; pw_* are per-word sign patterns (+1/-1 multipliers), pb_sub4* are pshufb
; byte-reorder tables for the 4x4 frame/field zigzag orders, and the
; pb_scan* tables are word-granularity shuffle masks (built via the
; SHUFFLE_MASK_W helper from x86util.asm — presumably expanding each word
; index to two byte indices; confirm against x86util.asm).
; In the pb_scan8field* masks an entry of 0x80 selects "zero the output
; lane" per the pshufb/vpperm high-bit convention.
34 pw_ppmmmmpp: dw 1,1,-1,-1,-1,-1,1,1
35 pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
36 pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15
; mask that clears the DC coefficient (word 0) and keeps the 7 AC words
37 pb_subacmask: dw 0,-1,-1,-1,-1,-1,-1,-1
38 pb_scan4framea: SHUFFLE_MASK_W 6,3,7,0,4,1,2,5
39 pb_scan4frameb: SHUFFLE_MASK_W 0,4,1,2,5,6,3,7
40 pb_scan4frame2a: SHUFFLE_MASK_W 0,4,1,2,5,8,12,9
41 pb_scan4frame2b: SHUFFLE_MASK_W 6,3,7,10,13,14,11,15
; intermediate ("t") shuffle stages for the 8x8 frame zigzag
43 pb_scan8framet1: SHUFFLE_MASK_W 0, 1, 6, 7, 8, 9, 13, 14
44 pb_scan8framet2: SHUFFLE_MASK_W 2, 3, 4, 7, 9, 15, 10, 14
45 pb_scan8framet3: SHUFFLE_MASK_W 0, 1, 5, 6, 8, 11, 12, 13
46 pb_scan8framet4: SHUFFLE_MASK_W 0, 3, 4, 5, 8, 11, 12, 15
47 pb_scan8framet5: SHUFFLE_MASK_W 1, 2, 6, 7, 9, 10, 13, 14
48 pb_scan8framet6: SHUFFLE_MASK_W 0, 3, 4, 5, 10, 11, 12, 15
49 pb_scan8framet7: SHUFFLE_MASK_W 1, 2, 6, 7, 8, 9, 14, 15
50 pb_scan8framet8: SHUFFLE_MASK_W 0, 1, 2, 7, 8, 10, 11, 14
51 pb_scan8framet9: SHUFFLE_MASK_W 1, 4, 5, 7, 8, 13, 14, 15
; final output-ordering shuffles for the 8x8 frame zigzag
53 pb_scan8frame1: SHUFFLE_MASK_W 0, 8, 1, 2, 9, 12, 4, 13
54 pb_scan8frame2: SHUFFLE_MASK_W 4, 0, 1, 5, 8, 10, 12, 14
55 pb_scan8frame3: SHUFFLE_MASK_W 12, 10, 8, 6, 2, 3, 7, 9
56 pb_scan8frame4: SHUFFLE_MASK_W 0, 1, 8, 12, 4, 13, 9, 2
57 pb_scan8frame5: SHUFFLE_MASK_W 5, 14, 10, 3, 11, 15, 6, 7
58 pb_scan8frame6: SHUFFLE_MASK_W 6, 8, 12, 13, 9, 7, 5, 3
59 pb_scan8frame7: SHUFFLE_MASK_W 1, 3, 5, 7, 10, 14, 15, 11
60 pb_scan8frame8: SHUFFLE_MASK_W 10, 3, 11, 14, 5, 6, 15, 7
; 8x8 field (interlaced) zigzag shuffles; 0x80 lanes are zeroed and later
; filled in by the complementary a/b mask of the same stage
62 pb_scan8field1 : SHUFFLE_MASK_W 0, 1, 2, 8, 9, 3, 4, 10
63 pb_scan8field2a: SHUFFLE_MASK_W 0x80, 11, 5, 6, 7, 12,0x80,0x80
64 pb_scan8field2b: SHUFFLE_MASK_W 0,0x80,0x80,0x80,0x80,0x80, 1, 8
65 pb_scan8field3a: SHUFFLE_MASK_W 10, 5, 6, 7, 11,0x80,0x80,0x80
66 pb_scan8field3b: SHUFFLE_MASK_W 0x80,0x80,0x80,0x80,0x80, 1, 8, 2
67 pb_scan8field4a: SHUFFLE_MASK_W 4, 5, 6, 7, 11,0x80,0x80,0x80
68 pb_scan8field6 : SHUFFLE_MASK_W 4, 5, 6, 7, 11,0x80,0x80, 12
69 pb_scan8field7 : SHUFFLE_MASK_W 5, 6, 7, 11,0x80,0x80, 12, 13
85 cextern deinterleave_shufd
; NOTE(review): fragment of a 1-D 4-point Walsh-Hadamard butterfly macro;
; the %macro header is outside this view — confirm against full file.
90 SUMSUB_BADC %1, %5, %4, %3, %2, %6
91 SUMSUB_BADC %1, %5, %3, %4, %2, %6
95 %macro SUMSUB_17BIT 4 ; a, b, tmp, 0x8000
116 ;-----------------------------------------------------------------------------
117 ; void dct4x4dc( dctcoef d[4][4] )
118 ;-----------------------------------------------------------------------------
; high-bit-depth variant: dword-width WHT, row pass / transpose / column pass
120 cglobal dct4x4dc, 1,1,5
125 WALSH4_1D d, 0,1,2,3,4
126 TRANSPOSE4x4D 0,1,2,3,4
128 WALSH4_1D d, 0,1,2,3,4
138 %endmacro ; DCT4x4_DC
; 8-bit variant: word-width WHT
147 cglobal dct4x4dc, 1,1
; bias into unsigned range so pavgw can be used for the rounded >>1
152 movq m7, [pw_8000] ; convert to unsigned and back, so that pavgw works
153 WALSH4_1D w, 0,1,2,3,4
154 TRANSPOSE4x4W 0,1,2,3,4
155 SUMSUB_BADC w, 1, 0, 3, 2, 4
165 %endif ; HIGH_BIT_DEPTH
168 ;-----------------------------------------------------------------------------
169 ; void idct4x4dc( int32_t d[4][4] )
170 ;-----------------------------------------------------------------------------
; inverse DC transform, dword coefficients (high bit depth)
172 cglobal idct4x4dc, 1,1
177 WALSH4_1D d,0,1,2,3,4
178 TRANSPOSE4x4D 0,1,2,3,4
179 WALSH4_1D d,0,1,2,3,4
185 %endmacro ; IDCT4x4DC
193 ;-----------------------------------------------------------------------------
194 ; void idct4x4dc( int16_t d[4][4] )
195 ;-----------------------------------------------------------------------------
; inverse DC transform, word coefficients (8-bit depth)
197 cglobal idct4x4dc, 1,1
202 WALSH4_1D w,0,1,2,3,4
203 TRANSPOSE4x4W 0,1,2,3,4
204 WALSH4_1D w,0,1,2,3,4
210 %endif ; HIGH_BIT_DEPTH
213 ;-----------------------------------------------------------------------------
214 ; void sub4x4_dct( dctcoef dct[4][4], pixel *pix1, pixel *pix2 )
215 ;-----------------------------------------------------------------------------
; r0 = output coefficients, r1 = source pixels (FENC), r2 = prediction (FDEC).
; Rows are loaded out of order (0,3,1,2) to feed the butterfly directly.
217 cglobal sub4x4_dct, 3,3
219 LOAD_DIFF m0, m4, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
220 LOAD_DIFF m3, m4, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
221 LOAD_DIFF m1, m4, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
222 LOAD_DIFF m2, m4, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
224 TRANSPOSE4x4W 0,1,2,3,4
; sums/differences of outer (0,3) and inner (1,2) rows
226 SUMSUB_BADC w, 3, 0, 2, 1
; widen to dwords before the final stage (high-bit-depth path)
228 DCT_UNPACK m2, m4, m5
229 DCT_UNPACK m3, m6, m7
230 mova [r0+ 0], m2 ; s03 + s12
232 mova [r0+32], m3 ; s03 - s12
235 DCT_UNPACK m0, m2, m4
236 DCT_UNPACK m1, m3, m5
237 SUMSUB2_AB d, 0, 1, 4
238 SUMSUB2_AB d, 2, 3, 5
239 mova [r0+16], m0 ; d03*2 + d12
241 mova [r0+48], m4 ; d03 - 2*d12
; 8-bit variant: batched load of all four difference rows
247 cglobal sub4x4_dct, 3,3
252 LOAD_DIFF8x4 0, 3, 1, 2, 4, 5, r1, r2
254 TRANSPOSE4x4W 0,1,2,3,4
267 %endif ; HIGH_BIT_DEPTH
270 ;-----------------------------------------------------------------------------
271 ; void add4x4_idct( pixel *p_dst, dctcoef dct[4][4] )
272 ;-----------------------------------------------------------------------------
; helper: add two idct rows to two dst rows, clipping to the pixel range
273 %macro STORE_DIFFx2 6
280 CLIPW %1, %4, [pw_pixel_max]
; high-bit-depth variant: dword IDCT, row pass / transpose / column pass
286 cglobal add4x4_idct, 2,2,6
; point r0 at the middle so both halves are reachable with small offsets
287 add r0, 2*FDEC_STRIDEB
293 IDCT4_1D d,0,1,2,3,4,5
294 TRANSPOSE4x4D 0,1,2,3,4
296 IDCT4_1D d,0,1,2,3,4,5
298 STORE_DIFFx2 m0, m1, m4, m5, [r0-2*FDEC_STRIDEB], [r0-1*FDEC_STRIDEB]
299 STORE_DIFFx2 m2, m3, m4, m5, [r0+0*FDEC_STRIDEB], [r0+1*FDEC_STRIDEB]
308 %else ; !HIGH_BIT_DEPTH
; 8-bit word-width variant
311 cglobal add4x4_idct, 2,2
318 IDCT4_1D w,0,1,2,3,4,5
319 TRANSPOSE4x4W 0,1,2,3,4
321 IDCT4_1D w,0,1,2,3,4,5
322 STORE_DIFF m0, m4, m7, [r0+0*FDEC_STRIDE]
323 STORE_DIFF m1, m4, m7, [r0+1*FDEC_STRIDE]
324 STORE_DIFF m2, m4, m7, [r0+2*FDEC_STRIDE]
325 STORE_DIFF m3, m4, m7, [r0+3*FDEC_STRIDE]
; SSE variant keeping two coefficient rows per register (row pairs packed
; high/low), avoiding a full 4x4 transpose
329 cglobal add4x4_idct, 2,2,6
330 mova m1, [r1+0x00] ; row1/row0
331 mova m3, [r1+0x10] ; row3/row2
332 psraw m0, m1, 1 ; row1>>1/...
333 psraw m2, m3, 1 ; row3>>1/...
; movsd merges: keep row0/row2 unshifted in the low half
334 movsd m0, m1 ; row1>>1/row0
335 movsd m2, m3 ; row3>>1/row2
336 psubw m0, m3 ; row1>>1-row3/row0-2
337 paddw m2, m1 ; row3>>1+row1/row0+2
338 SBUTTERFLY2 wd, 0, 2, 1
340 pshuflw m1, m2, q2301
341 pshufhw m2, m2, q2301
; second (vertical) pass, same row-pair layout
347 paddw m1, m0 ; row1/row0 corrected
348 psraw m0, 1 ; row1>>1/...
349 psraw m3, m2, 1 ; row3>>1/...
350 movsd m0, m1 ; row1>>1/row0
351 movsd m3, m2 ; row3>>1/row2
352 psubw m0, m2 ; row1>>1-row3/row0-2
353 paddw m3, m1 ; row3>>1+row1/row0+2
354 SBUTTERFLY2 qdq, 0, 3, 1
; gather the four 4-pixel dst rows and add the residual
357 movd m4, [r0+FDEC_STRIDE*0]
358 movd m1, [r0+FDEC_STRIDE*1]
359 movd m2, [r0+FDEC_STRIDE*2]
360 movd m5, [r0+FDEC_STRIDE*3]
361 punpckldq m1, m4 ; row0/row1
363 punpckldq m2, m5 ; row3/row2
370 packuswb m0, m3 ; row0/row1/row3/row2
; scatter back in the same (permuted) row order
371 pextrd [r0+FDEC_STRIDE*0], m0, 3
372 pextrd [r0+FDEC_STRIDE*1], m0, 2
373 movd [r0+FDEC_STRIDE*2], m0
374 pextrd [r0+FDEC_STRIDE*3], m0, 1
; AVX2 helpers: each ymm holds two 8-coefficient rows (one per 128-bit lane),
; so loads/stores pair an xmm op with a vinserti128/vextracti128.
383 %macro STOREx2_AVX2 9
384 movq xm%3, [r0+%5*FDEC_STRIDE]
385 vinserti128 m%3, m%3, [r0+%6*FDEC_STRIDE], 1
386 movq xm%4, [r0+%7*FDEC_STRIDE]
387 vinserti128 m%4, m%4, [r0+%8*FDEC_STRIDE], 1
395 vextracti128 xm%2, m%1, 1
396 movq [r0+%5*FDEC_STRIDE], xm%1
397 movq [r0+%6*FDEC_STRIDE], xm%2
398 movhps [r0+%7*FDEC_STRIDE], xm%1
399 movhps [r0+%8*FDEC_STRIDE], xm%2
; AVX2 add8x8_idct: two 4x4 IDCTs side by side in the ymm lanes
403 cglobal add8x8_idct, 2,3,8
404 add r0, 4*FDEC_STRIDE
406 TAIL_CALL .skip_prologue, 0
; .skip_prologue is exported so the NxN wrapper macros can chain calls
407 global current_function %+ .skip_prologue
414 vinserti128 m0, m0, [r1+ 64], 1
415 vinserti128 m1, m1, [r1+ 96], 1
416 vinserti128 m2, m2, [r1+ 80], 1
417 vinserti128 m3, m3, [r1+112], 1
418 SBUTTERFLY qdq, 0, 1, 4
419 SBUTTERFLY qdq, 2, 3, 4
420 IDCT4_1D w,0,1,2,3,4,5
421 TRANSPOSE2x4x4W 0,1,2,3,4
423 IDCT4_1D w,0,1,2,3,4,5
424 STOREx2_AVX2 0, 1, 4, 5, -4, 0, -3, 1, 7
425 STOREx2_AVX2 2, 3, 4, 5, -2, 2, -1, 3, 7
428 ; 2xdst, 2xtmp, 4xsrcrow, 1xzero
; load four enc/dec row pairs and form the pixel differences (AVX2)
429 %macro LOAD_DIFF8x2_AVX2 9
430 movq xm%1, [r1+%5*FENC_STRIDE]
431 movq xm%2, [r1+%6*FENC_STRIDE]
432 vinserti128 m%1, m%1, [r1+%7*FENC_STRIDE], 1
433 vinserti128 m%2, m%2, [r1+%8*FENC_STRIDE], 1
; FDEC rows are offset by -4 because the caller pre-advanced r2
436 movq xm%3, [r2+(%5-4)*FDEC_STRIDE]
437 movq xm%4, [r2+(%6-4)*FDEC_STRIDE]
438 vinserti128 m%3, m%3, [r2+(%7-4)*FDEC_STRIDE], 1
439 vinserti128 m%4, m%4, [r2+(%8-4)*FDEC_STRIDE], 1
; store 4 ymm of coefficients as two 4x4 sub-blocks per register
447 %macro STORE8_DCT_AVX2 5
448 SBUTTERFLY qdq, %1, %2, %5
449 SBUTTERFLY qdq, %3, %4, %5
454 vextracti128 [r0+ 64], m%1, 1
455 vextracti128 [r0+ 80], m%3, 1
456 vextracti128 [r0+ 96], m%2, 1
457 vextracti128 [r0+112], m%4, 1
460 %macro STORE16_DCT_AVX2 5
461 SBUTTERFLY qdq, %1, %2, %5
462 SBUTTERFLY qdq, %3, %4, %5
; low lanes go to the previous 128-byte block, high lanes to the current one
463 mova [r0+ 0-128], xm%1
464 mova [r0+16-128], xm%3
465 mova [r0+32-128], xm%2
466 mova [r0+48-128], xm%4
467 vextracti128 [r0+ 0], m%1, 1
468 vextracti128 [r0+16], m%3, 1
469 vextracti128 [r0+32], m%2, 1
470 vextracti128 [r0+48], m%4, 1
; AVX2 sub8x8_dct: diff, 2x(4x4) DCT in parallel, store
474 cglobal sub8x8_dct, 3,3,7
476 add r2, 4*FDEC_STRIDE
477 LOAD_DIFF8x2_AVX2 0, 1, 4, 5, 0, 1, 4, 5, 6
478 LOAD_DIFF8x2_AVX2 2, 3, 4, 5, 2, 3, 6, 7, 6
479 DCT4_1D 0, 1, 2, 3, 4
480 TRANSPOSE2x4x4W 0, 1, 2, 3, 4
481 DCT4_1D 0, 1, 2, 3, 4
482 STORE8_DCT_AVX2 0, 1, 2, 3, 4
; AVX2 sub16x16_dct: iterates 16-wide row bands, advancing src/dst each pass
486 cglobal sub16x16_dct, 3,3,6
488 add r2, 4*FDEC_STRIDE
491 add r1, 4*FENC_STRIDE
492 add r2, 4*FDEC_STRIDE
495 add r1, 4*FENC_STRIDE
496 add r2, 4*FDEC_STRIDE
499 add r1, 4*FENC_STRIDE
500 add r2, 4*FDEC_STRIDE
504 LOAD_DIFF16x2_AVX2 0, 1, 4, 5, 0, 1
505 LOAD_DIFF16x2_AVX2 2, 3, 4, 5, 2, 3
506 DCT4_1D 0, 1, 2, 3, 4
507 TRANSPOSE2x4x4W 0, 1, 2, 3, 4
508 DCT4_1D 0, 1, 2, 3, 4
509 STORE16_DCT_AVX2 0, 1, 2, 3, 4
511 %endif ; HIGH_BIT_DEPTH
514 ;-----------------------------------------------------------------------------
515 ; void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
516 ;-----------------------------------------------------------------------------
; SUB_NxN_DCT: builds an NxN transform out of four calls to the half-size
; transform's .skip_prologue entry, stepping r1/r2 between quadrants.
519 %if HIGH_BIT_DEPTH == 0
523 add r2, 4*FDEC_STRIDE
526 %endif ; !HIGH_BIT_DEPTH
528 call %2.skip_prologue
530 add r1, %4-%5-%6*FENC_STRIDE
531 add r2, %4-%5-%6*FDEC_STRIDE
532 call %2.skip_prologue
534 add r1, (%4-%6)*FENC_STRIDE-%5-%4
535 add r2, (%4-%6)*FDEC_STRIDE-%5-%4
536 call %2.skip_prologue
538 add r1, %4-%5-%6*FENC_STRIDE
539 add r2, %4-%5-%6*FDEC_STRIDE
; last quadrant is a tail call so the epilogue runs exactly once
540 TAIL_CALL %2.skip_prologue, 1
543 ;-----------------------------------------------------------------------------
544 ; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
545 ;-----------------------------------------------------------------------------
; ADD_NxN_IDCT: same quadrant-chaining scheme for the inverse transform
546 %macro ADD_NxN_IDCT 6-7
556 %if mmsize>=16 && %3!=256
557 add r0, 4*FDEC_STRIDE
560 call %2.skip_prologue
561 add r0, %4-%5-%6*FDEC_STRIDE
563 call %2.skip_prologue
564 add r0, (%4-%6)*FDEC_STRIDE-%5-%4
566 call %2.skip_prologue
567 add r0, %4-%5-%6*FDEC_STRIDE
569 TAIL_CALL %2.skip_prologue, 1
; ---- high-bit-depth instantiations ----
574 SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx, 64, 8, 0, 0, 0
575 SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx, 64, 16, 8, 8, 0
577 ADD_NxN_IDCT add8x8_idct_sse2, add4x4_idct_sse2, 64, 8, 0, 0, 6
578 ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2, 64, 16, 8, 8, 6
579 ADD_NxN_IDCT add8x8_idct_avx, add4x4_idct_avx, 64, 8, 0, 0, 6
580 ADD_NxN_IDCT add16x16_idct_avx, add8x8_idct_avx, 64, 16, 8, 8, 6
581 cextern add8x8_idct8_sse2.skip_prologue
582 cextern add8x8_idct8_avx.skip_prologue
583 ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2, 256, 16, 0, 0, 16
584 ADD_NxN_IDCT add16x16_idct8_avx, add8x8_idct8_avx, 256, 16, 0, 0, 16
585 cextern sub8x8_dct8_sse2.skip_prologue
586 cextern sub8x8_dct8_sse4.skip_prologue
587 cextern sub8x8_dct8_avx.skip_prologue
588 SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2, 256, 16, 0, 0, 14
589 SUB_NxN_DCT sub16x16_dct8_sse4, sub8x8_dct8_sse4, 256, 16, 0, 0, 14
590 SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx, 256, 16, 0, 0, 14
591 %else ; !HIGH_BIT_DEPTH
; ---- 8-bit instantiations ----
594 SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx, 32, 4, 0, 0, 0
595 ADD_NxN_IDCT add8x8_idct_mmx, add4x4_idct_mmx, 32, 4, 0, 0
596 SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx, 32, 8, 4, 4, 0
597 ADD_NxN_IDCT add16x16_idct_mmx, add8x8_idct_mmx, 32, 8, 4, 4
599 cextern sub8x8_dct8_mmx.skip_prologue
600 cextern add8x8_idct8_mmx.skip_prologue
601 SUB_NxN_DCT sub16x16_dct8_mmx, sub8x8_dct8_mmx, 128, 8, 0, 0, 0
602 ADD_NxN_IDCT add16x16_idct8_mmx, add8x8_idct8_mmx, 128, 8, 0, 0
606 cextern sub8x8_dct_sse2.skip_prologue
607 cextern sub8x8_dct_ssse3.skip_prologue
608 cextern sub8x8_dct_avx.skip_prologue
609 cextern sub8x8_dct_xop.skip_prologue
610 SUB_NxN_DCT sub16x16_dct_sse2, sub8x8_dct_sse2, 128, 8, 0, 0, 10
611 SUB_NxN_DCT sub16x16_dct_ssse3, sub8x8_dct_ssse3, 128, 8, 0, 0, 10
612 SUB_NxN_DCT sub16x16_dct_avx, sub8x8_dct_avx, 128, 8, 0, 0, 10
613 SUB_NxN_DCT sub16x16_dct_xop, sub8x8_dct_xop, 128, 8, 0, 0, 10
615 cextern add8x8_idct_sse2.skip_prologue
616 cextern add8x8_idct_avx.skip_prologue
617 ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2, 128, 8, 0, 0
618 ADD_NxN_IDCT add16x16_idct_avx, add8x8_idct_avx, 128, 8, 0, 0
620 cextern add8x8_idct8_sse2.skip_prologue
621 cextern add8x8_idct8_avx.skip_prologue
622 ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2, 128, 8, 0, 0
623 ADD_NxN_IDCT add16x16_idct8_avx, add8x8_idct8_avx, 128, 8, 0, 0
625 cextern sub8x8_dct8_sse2.skip_prologue
626 cextern sub8x8_dct8_ssse3.skip_prologue
627 cextern sub8x8_dct8_avx.skip_prologue
628 SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2, 128, 8, 0, 0, 11
629 SUB_NxN_DCT sub16x16_dct8_ssse3, sub8x8_dct8_ssse3, 128, 8, 0, 0, 11
630 SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx, 128, 8, 0, 0, 11
633 ADD_NxN_IDCT add16x16_idct_avx2, add8x8_idct_avx2, 128, 8, 0, 0
634 %endif ; HIGH_BIT_DEPTH
637 ;-----------------------------------------------------------------------------
638 ; void add8x8_idct_dc( pixel *p_dst, dctcoef *dct2x2 )
639 ;-----------------------------------------------------------------------------
; ADD_DC (high bit depth): add a broadcast DC value to 4 rows of pixels,
; with saturation (paddsw) — clipping to pixel range is presumably handled
; by surrounding elided code; confirm against full file.
641 mova m0, [%1+FDEC_STRIDEB*0] ; 8pixels
642 mova m1, [%1+FDEC_STRIDEB*1]
643 mova m2, [%1+FDEC_STRIDEB*2]
647 paddsw %2, [%1+FDEC_STRIDEB*3]
652 mova [%1+FDEC_STRIDEB*0], m0
653 mova [%1+FDEC_STRIDEB*1], m1
654 mova [%1+FDEC_STRIDEB*2], m2
655 mova [%1+FDEC_STRIDEB*3], %2
; high-bit-depth add8x8_idct_dc: dequant-shift the 4 DCs, broadcast each
; DC across a 4x4 quadrant, then add
659 cglobal add8x8_idct_dc, 2,2,7
660 mova m6, [pw_pixel_max]
664 psrad m3, 6 ; dc0 0 dc1 0 dc2 0 dc3 0
665 pshuflw m4, m3, q2200 ; dc0 dc0 dc1 dc1 _ _ _ _
666 pshufhw m3, m3, q2200 ; _ _ _ _ dc2 dc2 dc3 dc3
667 pshufd m4, m4, q1100 ; dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
668 pshufd m3, m3, q3322 ; dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
669 ADD_DC r0+FDEC_STRIDEB*0, m4
670 ADD_DC r0+FDEC_STRIDEB*4, m3
; 16x16 version: same broadcast, looped over the four 8x8 quadrants via r2
673 cglobal add16x16_idct_dc, 2,3,8
675 mova m6, [pw_pixel_max]
681 psrad m3, 6 ; dc0 0 dc1 0 dc2 0 dc3 0
682 pshuflw m4, m3, q2200 ; dc0 dc0 dc1 dc1 _ _ _ _
683 pshufhw m3, m3, q2200 ; _ _ _ _ dc2 dc2 dc3 dc3
684 pshufd m4, m4, q1100 ; dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
685 pshufd m3, m3, q3322 ; dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
686 ADD_DC r0+FDEC_STRIDEB*0, m4
687 ADD_DC r0+SIZEOF_PIXEL*8, m3
689 add r0, 4*FDEC_STRIDEB
693 %endmacro ; ADD_IDCT_DC
700 %else ;!HIGH_BIT_DEPTH
; ADD_DC (8-bit): unsigned-saturating byte add across 4 rows
702 mova m4, [%3+FDEC_STRIDE*0]
703 mova m5, [%3+FDEC_STRIDE*1]
704 mova m6, [%3+FDEC_STRIDE*2]
708 paddusb %1, [%3+FDEC_STRIDE*3]
713 mova [%3+FDEC_STRIDE*0], m4
714 mova [%3+FDEC_STRIDE*1], m5
715 mova [%3+FDEC_STRIDE*2], m6
716 mova [%3+FDEC_STRIDE*3], m%1
; NOTE(review): the line above is transcribed from the dump as "%1";
; stores %1 back to the fourth row.
720 cglobal add8x8_idct_dc, 2,2
723 add r0, FDEC_STRIDE*4
735 ADD_DC m0, m1, r0-FDEC_STRIDE*4
; SSSE3 variant: pmulhrsw does the rounded dequant shift in one op
740 cglobal add8x8_idct_dc, 2,2
743 add r0, FDEC_STRIDE*4
744 pmulhrsw m0, [pw_512]
746 mova m5, [pb_unpackbd1]
; gather 8 rows two at a time (movh low half, movhps high half)
751 movh m2, [r0+FDEC_STRIDE*-4]
752 movh m3, [r0+FDEC_STRIDE*-3]
753 movh m4, [r0+FDEC_STRIDE*-2]
754 movh m5, [r0+FDEC_STRIDE*-1]
755 movhps m2, [r0+FDEC_STRIDE* 0]
756 movhps m3, [r0+FDEC_STRIDE* 1]
757 movhps m4, [r0+FDEC_STRIDE* 2]
758 movhps m5, [r0+FDEC_STRIDE* 3]
767 movh [r0+FDEC_STRIDE*-4], m2
768 movh [r0+FDEC_STRIDE*-3], m3
769 movh [r0+FDEC_STRIDE*-2], m4
770 movh [r0+FDEC_STRIDE*-1], m5
771 movhps [r0+FDEC_STRIDE* 0], m2
772 movhps [r0+FDEC_STRIDE* 1], m3
773 movhps [r0+FDEC_STRIDE* 2], m4
774 movhps [r0+FDEC_STRIDE* 3], m5
; 16x16 DC add, MMX/looped version
778 cglobal add16x16_idct_dc, 2,3
797 add r0, FDEC_STRIDE*4
; SSE2 version
803 cglobal add16x16_idct_dc, 2,2,8
805 add r0, FDEC_STRIDE*4
808 add r0, FDEC_STRIDE*4
827 ADD_DC m0, m1, r0+FDEC_STRIDE*-4
; SSSE3 version with pshufb-based DC broadcast tables
832 cglobal add16x16_idct_dc, 2,2,8
834 add r0, FDEC_STRIDE*4
837 add r0, FDEC_STRIDE*4
841 pmulhrsw m0, [pw_512]
843 mova m5, [pb_unpackbd1]
844 mova m6, [pb_unpackbd2]
851 ADD_DC m0, m1, r0+FDEC_STRIDE*-4
; AVX2 helper: two rows per ymm (upper half of the 16x16 block in r2's lane)
862 mova xm4, [r0+FDEC_STRIDE*0+%3]
863 mova xm5, [r0+FDEC_STRIDE*1+%3]
864 vinserti128 m4, m4, [r2+FDEC_STRIDE*0+%3], 1
865 vinserti128 m5, m5, [r2+FDEC_STRIDE*1+%3], 1
870 mova [r0+FDEC_STRIDE*0+%3], xm4
871 mova [r0+FDEC_STRIDE*1+%3], xm5
872 vextracti128 [r2+FDEC_STRIDE*0+%3], m4, 1
873 vextracti128 [r2+FDEC_STRIDE*1+%3], m5, 1
; AVX2 add16x16_idct_dc
877 cglobal add16x16_idct_dc, 2,3,6
878 add r0, FDEC_STRIDE*4
881 pmulhrsw m0, [pw_512]
883 mova m4, [pb_unpackbd1]
884 mova m5, [pb_unpackbd2]
887 pshufb m2, m0, m4 ; row0, row2
888 pshufb m3, m1, m4 ; row0, row2
889 pshufb m0, m5 ; row1, row3
890 pshufb m1, m5 ; row1, row3
891 lea r2, [r0+FDEC_STRIDE*8]
892 ADD_DC_AVX2 m2, m3, FDEC_STRIDE*-4
893 ADD_DC_AVX2 m2, m3, FDEC_STRIDE*-2
894 ADD_DC_AVX2 m0, m1, FDEC_STRIDE* 0
895 ADD_DC_AVX2 m0, m1, FDEC_STRIDE* 2
898 %endif ; HIGH_BIT_DEPTH
900 ;-----------------------------------------------------------------------------
901 ; void sub8x8_dct_dc( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 )
902 ;-----------------------------------------------------------------------------
; load two enc rows (%1/m1) and two dec rows (m2/m3) for DC summation
904 %macro DCTDC_2ROW_MMX 4
905 mova %1, [r1+FENC_STRIDE*(0+%3)]
906 mova m1, [r1+FENC_STRIDE*(1+%3)]
907 mova m2, [r2+FDEC_STRIDE*(0+%4)]
908 mova m3, [r2+FDEC_STRIDE*(1+%4)]
; 2x2 Hadamard on the four quadrant DC sums held in one register
924 %macro DCT2x2 2 ; reg s1/s0, reg s3/s2 (!=m0/m1)
925 PSHUFLW m1, %1, q2200 ; s1 s1 s0 s0
926 PSHUFLW m0, %2, q2301 ; s3 __ s2 __
927 paddw m1, %2 ; s1 s13 s0 s02
928 psubw m1, m0 ; d13 s13 d02 s02
929 PSHUFLW m0, m1, q1010 ; d02 s02 d02 s02
930 psrlq m1, 32 ; __ __ d13 s13
931 paddw m0, m1 ; d02 s02 d02+d13 s02+s13
932 psllq m1, 32 ; d13 s13
933 psubw m0, m1 ; d02-d13 s02-s13 d02+d13 s02+s13
936 %if HIGH_BIT_DEPTH == 0
; MMX version: accumulate row sums quadrant by quadrant
938 cglobal sub8x8_dct_dc, 3,3
939 DCTDC_2ROW_MMX m0, m4, 0, 0
940 DCTDC_2ROW_MMX m5, m6, 2, 2
944 add r2, FDEC_STRIDE*4
945 DCTDC_2ROW_MMX m7, m4, 4, 0
946 DCTDC_2ROW_MMX m5, m6, 6, 2
; SSE2 helper: two rows at a time via psadbw-style accumulation into %4
954 %macro DCTDC_2ROW_SSE2 4
955 movh m1, [r1+FENC_STRIDE*(0+%1)]
956 movh m2, [r1+FENC_STRIDE*(1+%1)]
958 movh m2, [r2+FDEC_STRIDE*(0+%2)]
959 punpckldq m2, [r2+FDEC_STRIDE*(1+%2)]
962 ACCUM paddd, %4, 1, %3
; SSE2 sub8x8_dct_dc
967 cglobal sub8x8_dct_dc, 3,3
969 DCTDC_2ROW_SSE2 0, 0, 0, 3
970 DCTDC_2ROW_SSE2 2, 2, 1, 3
971 add r2, FDEC_STRIDE*4
972 DCTDC_2ROW_SSE2 4, 0, 0, 4
973 DCTDC_2ROW_SSE2 6, 2, 1, 4
; sub8x16_dct_dc: 2x4 DC blocks; note the mid-block pointer re-bias at *8
980 %macro SUB8x16_DCT_DC 0
981 cglobal sub8x16_dct_dc, 3,3
983 DCTDC_2ROW_SSE2 0, 0, 0, 3
984 DCTDC_2ROW_SSE2 2, 2, 1, 3
985 add r1, FENC_STRIDE*8
986 add r2, FDEC_STRIDE*8
987 DCTDC_2ROW_SSE2 -4, -4, 0, 4
988 DCTDC_2ROW_SSE2 -2, -2, 1, 4
990 DCTDC_2ROW_SSE2 0, 0, 0, 5
991 DCTDC_2ROW_SSE2 2, 2, 1, 5
992 add r2, FDEC_STRIDE*4
993 DCTDC_2ROW_SSE2 4, 0, 0, 4
994 DCTDC_2ROW_SSE2 6, 2, 1, 4
; psignw (SSSE3+) vs pmullw fallback for applying the sign patterns
997 %define %%sign psignw
999 %define %%sign pmullw
1001 SUMSUB_BA d, 5, 3, 0
1003 pshuflw m0, m5, q2301
1004 pshufhw m0, m0, q2301
1005 %%sign m5, [pw_pmpmpmpm]
1007 pshufd m1, m0, q1320
1008 pshufd m0, m0, q0231
1009 %%sign m1, [pw_ppppmmmm]
1013 %endmacro ; SUB8x16_DCT_DC
1020 %endif ; !HIGH_BIT_DEPTH
; high-bit-depth helper: sum 4 rows of (enc - dec) into %1
1022 %macro DCTDC_4ROW_SSE2 2
1023 mova %1, [r1+FENC_STRIDEB*%2]
1024 mova m0, [r2+FDEC_STRIDEB*%2]
1027 paddw %1, [r1+FENC_STRIDEB*Y]
1028 paddw m0, [r2+FDEC_STRIDEB*Y]
1032 pshufd m0, %1, q2301
; high-bit-depth sub8x8_dct_dc using the pw_ppmmmmpp sign table
1037 %macro SUB8x8_DCT_DC_10 0
1038 cglobal sub8x8_dct_dc, 3,3,3
1039 DCTDC_4ROW_SSE2 m1, 0
1040 DCTDC_4ROW_SSE2 m2, 4
1041 mova m0, [pw_ppmmmmpp]
1044 pshufd m0, m1, q2200 ; -1 -1 +0 +0
1045 pshufd m1, m1, q0033 ; +0 +0 +1 +1
1047 pshufd m0, m2, q1023 ; -2 +2 -3 +3
; high-bit-depth sub8x16_dct_dc: same scheme over 4 row groups
1056 %macro SUB8x16_DCT_DC_10 0
1057 cglobal sub8x16_dct_dc, 3,3,6
1058 DCTDC_4ROW_SSE2 m1, 0
1059 DCTDC_4ROW_SSE2 m2, 4
1060 DCTDC_4ROW_SSE2 m3, 8
1061 DCTDC_4ROW_SSE2 m4, 12
1062 mova m0, [pw_ppmmmmpp]
1065 pshufd m5, m1, q2200 ; -1 -1 +0 +0
1066 pshufd m1, m1, q0033 ; +0 +0 +1 +1
1068 pshufd m5, m2, q1023 ; -2 +2 -3 +3
1070 paddd m1, m5 ; a6 a2 a4 a0
1073 pshufd m5, m3, q2200
1074 pshufd m3, m3, q0033
1076 pshufd m5, m4, q1023
1078 paddd m3, m5 ; a7 a3 a5 a1
1081 pshufd m0, m0, q3120
1082 pshufd m1, m1, q3120
1083 punpcklqdq m2, m0, m1
1095 ;-----------------------------------------------------------------------------
1096 ; void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
1097 ;-----------------------------------------------------------------------------
; SSE2 version: builds the frame zigzag order out of PALIGNR row rotations
; and word unpacks, then scatters with movlps/movhps at fixed scan offsets
1099 cglobal zigzag_scan_8x8_frame, 2,2,8
1101 movdqa xmm1, [r1+16]
1103 PALIGNR xmm1, xmm1, 14, xmm2
1106 movdqa xmm2, [r1+32]
1107 movdqa xmm3, [r1+48]
1108 PALIGNR xmm2, xmm2, 12, xmm4
1110 PALIGNR xmm3, xmm3, 10, xmm4
1113 punpckhwd xmm0, xmm1
1114 punpckhwd xmm2, xmm3
; lower half of the 8x8 block (rows 4-7)
1133 movdqa xmm4, [r1+64]
1134 movdqa xmm5, [r1+80]
1135 movdqa xmm6, [r1+96]
1136 movdqa xmm7, [r1+112]
1144 PALIGNR xmm4, xmm4, 14, xmm3
1146 PALIGNR xmm5, xmm5, 12, xmm3
1148 PALIGNR xmm6, xmm6, 10, xmm3
1151 PALIGNR xmm7, xmm7, 8, xmm3
1155 punpcklqdq xmm7, xmm7
1159 punpckhwd xmm4, xmm5
1160 punpckhwd xmm6, xmm7
1175 pshufw mm4, mm4, q1230
; final dword interleave + within-word reversal before the scatter stores
1183 punpckhdq xmm3, xmm0, xmm2
1184 punpckldq xmm0, xmm2
1185 punpckhdq xmm7, xmm4, xmm6
1186 punpckldq xmm4, xmm6
1187 pshufhw xmm0, xmm0, q0123
1188 pshuflw xmm4, xmm4, q0123
1189 pshufhw xmm3, xmm3, q0123
1190 pshuflw xmm7, xmm7, q0123
; offsets are zigzag positions (x2 for int16)
1192 movlps [r0+2*10], xmm0
1193 movhps [r0+2*17], xmm0
1194 movlps [r0+2*21], xmm3
1195 movlps [r0+2*28], xmm4
1196 movhps [r0+2*32], xmm3
1197 movhps [r0+2*39], xmm4
1198 movlps [r0+2*43], xmm7
1199 movhps [r0+2*50], xmm7
1204 %if HIGH_BIT_DEPTH == 0
1211 ;-----------------------------------------------------------------------------
1212 ; void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[8][8] )
1213 ;-----------------------------------------------------------------------------
; the table below is the source index (in raster order) of each output
; position of the frame zigzag
1215 ; 0 8 1 2 9 16 24 17
1216 ; 10 3 4 11 18 25 32 40
1217 ; 33 26 19 12 5 6 13 20
1218 ; 27 34 41 48 56 49 42 35
1219 ; 28 21 14 7 15 22 29 36
1220 ; 43 50 57 58 51 44 37 30
1221 ; 23 31 38 45 52 59 60 53
1222 ; 46 39 47 54 61 62 55 63
; generic version parameterized on coefficient width:
; %1 = lanes per load, %2-%4 = unpack granularities, %5 = pshuf width
1223 %macro SCAN_8x8_FRAME 5
1224 cglobal zigzag_scan_8x8_frame, 2,2,8
1226 mova m1, [r1+ 8*SIZEOF_DCTCOEF]
1227 movu m2, [r1+14*SIZEOF_DCTCOEF]
1228 movu m3, [r1+21*SIZEOF_DCTCOEF]
1229 mova m4, [r1+28*SIZEOF_DCTCOEF]
1230 punpckl%4 m5, m0, m1
1232 punpckh%4 m6, m1, m0
1236 mova m7, [r1+52*SIZEOF_DCTCOEF]
1237 mova m0, [r1+60*SIZEOF_DCTCOEF]
1244 mova [r0+ 4*SIZEOF_DCTCOEF], m1
1245 mova [r0+ 8*SIZEOF_DCTCOEF], m6
1248 mova m1, [r1+32*SIZEOF_DCTCOEF]
1249 movu m5, [r1+39*SIZEOF_DCTCOEF]
1250 movu m2, [r1+46*SIZEOF_DCTCOEF]
1251 movu [r0+35*SIZEOF_DCTCOEF], m3
1252 movu [r0+47*SIZEOF_DCTCOEF], m4
1255 punpckh%3 m3, m5, m5
1258 mova [r0+52*SIZEOF_DCTCOEF], m6
1259 movu [r0+13*SIZEOF_DCTCOEF], m5
1260 movu m4, [r1+11*SIZEOF_DCTCOEF]
1261 movu m6, [r1+25*SIZEOF_DCTCOEF]
1265 mova m3, [r1+ 4*SIZEOF_DCTCOEF]
1266 movu m7, [r1+18*SIZEOF_DCTCOEF]
1268 movu [r0+25*SIZEOF_DCTCOEF], m1
1275 punpckh%3 m3, m6, m4
1276 punpckh%3 m7, m5, m1
1279 movu m4, [r1+35*SIZEOF_DCTCOEF]
1280 movu m1, [r1+49*SIZEOF_DCTCOEF]
1281 pshuf%5 m6, m6, q0123
1282 pshuf%5 m5, m5, q0123
1283 mova [r0+60*SIZEOF_DCTCOEF], m0
1284 mova [r0+56*SIZEOF_DCTCOEF], m2
1285 movu m0, [r1+42*SIZEOF_DCTCOEF]
1286 mova m2, [r1+56*SIZEOF_DCTCOEF]
1287 movu [r0+17*SIZEOF_DCTCOEF], m3
1288 mova [r0+32*SIZEOF_DCTCOEF], m7
1289 movu [r0+10*SIZEOF_DCTCOEF], m6
1290 movu [r0+21*SIZEOF_DCTCOEF], m5
1291 punpckh%4 m3, m0, m4
1292 punpckh%4 m7, m2, m1
1295 punpckl%3 m4, m2, m0
1296 punpckl%3 m1, m7, m3
1299 pshuf%5 m2, m2, q0123
1300 pshuf%5 m7, m7, q0123
1301 mova [r0+28*SIZEOF_DCTCOEF], m4
1302 movu [r0+43*SIZEOF_DCTCOEF], m1
1303 movu [r0+39*SIZEOF_DCTCOEF], m2
1304 movu [r0+50*SIZEOF_DCTCOEF], m7
; instantiations: dword-coefficient (4-lane) and word-coefficient (16-lane)
1310 SCAN_8x8_FRAME 4 , dq, qdq, dq, d
1312 SCAN_8x8_FRAME 4 , dq, qdq, dq, d
1315 SCAN_8x8_FRAME 16, q , dq , wd, w
1318 ;-----------------------------------------------------------------------------
1319 ; void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[4][4] )
1320 ;-----------------------------------------------------------------------------
; width-generic 4x4 frame zigzag built from unpacks
1322 cglobal zigzag_scan_4x4_frame, 2,2,6
1323 mova m0, [r1+ 0*SIZEOF_DCTCOEF]
1324 mova m1, [r1+ 4*SIZEOF_DCTCOEF]
1325 mova m2, [r1+ 8*SIZEOF_DCTCOEF]
1326 mova m3, [r1+12*SIZEOF_DCTCOEF]
1327 punpckl%4 m4, m0, m1
1330 mova [r0+ 0*SIZEOF_DCTCOEF], m4
1332 punpckh%4 m4, m2, m3
1335 punpckl%4 m5, m1, m3
1340 mova [r0+ 4*SIZEOF_DCTCOEF], m5
1341 mova [r0+ 8*SIZEOF_DCTCOEF], m1
1342 mova [r0+12*SIZEOF_DCTCOEF], m3
; dword and word instantiations
1348 SCAN_4x4 4, dq, qdq, dq
1350 SCAN_4x4 4, dq, qdq, dq
1353 SCAN_4x4 16, q , dq , wd
1355 ;-----------------------------------------------------------------------------
1356 ; void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
1357 ;-----------------------------------------------------------------------------
; SSSE3 version: a pair of pshufb table lookups does the whole reorder
1358 %macro SCAN_4x4_FRAME 0
1359 cglobal zigzag_scan_4x4_frame, 2,2
1362 pshufb m1, [pb_scan4frameb]
1363 pshufb m0, [pb_scan4framea]
; XOP version: vpperm selects across both source registers at once
1379 cglobal zigzag_scan_4x4_frame, 2,2
1382 vpperm m2, m0, m1, [pb_scan4frame2a]
1383 vpperm m1, m0, m1, [pb_scan4frame2b]
1387 %endif ; !HIGH_BIT_DEPTH
1390 ;-----------------------------------------------------------------------------
1391 ; void zigzag_scan_4x4_field( int32_t level[16], int32_t dct[4][4] )
1392 ;-----------------------------------------------------------------------------
; field scan only swaps a couple of early coefficients relative to raster
1394 cglobal zigzag_scan_4x4_field, 2,3
1396 pshufd m0, m4, q3102
1408 ;-----------------------------------------------------------------------------
1409 ; void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
1410 ;-----------------------------------------------------------------------------
1411 ; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2
1413 cglobal zigzag_scan_4x4_field, 2,3
1414 pshufw m0, [r1+4], q3102
1425 %endif ; HIGH_BIT_DEPTH
1427 ;-----------------------------------------------------------------------------
1428 ; void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
1429 ;-----------------------------------------------------------------------------
; partial source-index table of the field scan (rows 2-7 shown)
1432 ; 16 11 5 6 7 12 17 24
1433 ; 18 13 14 15 19 25 32 26
1434 ; 20 21 22 23 27 33 40 34
1435 ; 28 29 30 31 35 41 48 42
1436 ; 36 37 38 39 43 49 50 44
1437 ; 45 46 47 51 56 57 52 53
1438 ; 54 55 58 59 60 61 62 63
; width-generic SCAN_8x8 body: %1 pshuf/pinsr width, %2/%3 unpack widths,
; %4 shift mnemonic suffix, %5 per-element shift amount.
; Each store's trailing comment lists the source indices it emits.
1441 cglobal zigzag_scan_8x8_field, 2,3,8
1442 mova m0, [r1+ 0*SIZEOF_DCTCOEF] ; 03 02 01 00
1443 mova m1, [r1+ 4*SIZEOF_DCTCOEF] ; 07 06 05 04
1444 mova m2, [r1+ 8*SIZEOF_DCTCOEF] ; 11 10 09 08
1445 pshuf%1 m3, m0, q3333 ; 03 03 03 03
1446 movd r2d, m2 ; 09 08
1447 pshuf%1 m2, m2, q0321 ; 08 11 10 09
1448 punpckl%2 m3, m1 ; 05 03 04 03
1449 pinsr%1 m0, r2d, 3 ; 08 02 01 00
1450 punpckl%2 m4, m2, m3 ; 04 10 03 09
1451 pshuf%1 m4, m4, q2310 ; 10 04 03 09
1452 mova [r0+ 0*SIZEOF_DCTCOEF], m0 ; 08 02 01 00
1453 mova [r0+ 4*SIZEOF_DCTCOEF], m4 ; 10 04 03 09
1454 mova m3, [r1+12*SIZEOF_DCTCOEF] ; 15 14 13 12
1455 mova m5, [r1+16*SIZEOF_DCTCOEF] ; 19 18 17 16
1456 punpckl%3 m6, m5 ; 17 16 XX XX
1457 psrl%4 m1, %5 ; XX 07 06 05
1458 punpckh%2 m6, m2 ; 08 17 11 16
1459 punpckl%3 m6, m1 ; 06 05 11 16
1460 mova [r0+ 8*SIZEOF_DCTCOEF], m6 ; 06 05 11 16
1461 psrl%4 m1, %5 ; XX XX 07 06
1462 punpckl%2 m1, m5 ; 17 07 16 06
1463 mova m0, [r1+20*SIZEOF_DCTCOEF] ; 23 22 21 20
1464 mova m2, [r1+24*SIZEOF_DCTCOEF] ; 27 26 25 24
1465 punpckh%3 m1, m1 ; 17 07 17 07
1466 punpckl%2 m6, m3, m2 ; 25 13 24 12
1468 mova [r0+24*SIZEOF_DCTCOEF], m0 ; 23 22 21 20
1469 punpckl%2 m1, m6 ; 24 17 12 07
1470 mova [r0+12*SIZEOF_DCTCOEF], m1
1471 pinsr%1 m3, r2d, 0 ; 15 14 13 18
1472 mova [r0+16*SIZEOF_DCTCOEF], m3 ; 15 14 13 18
1473 mova m7, [r1+28*SIZEOF_DCTCOEF]
1474 mova m0, [r1+32*SIZEOF_DCTCOEF] ; 35 34 33 32
1475 psrl%4 m5, %5*3 ; XX XX XX 19
1476 pshuf%1 m1, m2, q3321 ; 27 27 26 25
1477 punpckl%2 m5, m0 ; 33 XX 32 19
1478 psrl%4 m2, %5*3 ; XX XX XX 27
1479 punpckl%2 m5, m1 ; 26 32 25 19
1480 mova [r0+32*SIZEOF_DCTCOEF], m7
1481 mova [r0+20*SIZEOF_DCTCOEF], m5 ; 26 32 25 19
1482 mova m7, [r1+36*SIZEOF_DCTCOEF]
1483 mova m1, [r1+40*SIZEOF_DCTCOEF] ; 43 42 41 40
1484 pshuf%1 m3, m0, q3321 ; 35 35 34 33
1485 punpckl%2 m2, m1 ; 41 XX 40 27
1486 mova [r0+40*SIZEOF_DCTCOEF], m7
1487 punpckl%2 m2, m3 ; 34 40 33 27
1488 mova [r0+28*SIZEOF_DCTCOEF], m2
1489 mova m7, [r1+44*SIZEOF_DCTCOEF] ; 47 46 45 44
1490 mova m2, [r1+48*SIZEOF_DCTCOEF] ; 51 50 49 48
1491 psrl%4 m0, %5*3 ; XX XX XX 35
1492 punpckl%2 m0, m2 ; 49 XX 48 35
1493 pshuf%1 m3, m1, q3321 ; 43 43 42 41
1494 punpckl%2 m0, m3 ; 42 48 41 35
1495 mova [r0+36*SIZEOF_DCTCOEF], m0
1496 pextr%1 r2d, m2, 3 ; 51
1497 psrl%4 m1, %5*3 ; XX XX XX 43
1498 punpckl%2 m1, m7 ; 45 XX 44 43
1499 psrl%4 m2, %5 ; XX 51 50 49
1500 punpckl%2 m1, m2 ; 50 44 49 43
1501 pshuf%1 m1, m1, q2310 ; 44 50 49 43
1502 mova [r0+44*SIZEOF_DCTCOEF], m1
1503 psrl%4 m7, %5 ; XX 47 46 45
1504 pinsr%1 m7, r2d, 3 ; 51 47 46 45
1505 mova [r0+48*SIZEOF_DCTCOEF], m7
; last three output groups need only qword-level reshuffling
1506 mova m0, [r1+56*SIZEOF_DCTCOEF] ; 59 58 57 56
1507 mova m1, [r1+52*SIZEOF_DCTCOEF] ; 55 54 53 52
1508 mova m7, [r1+60*SIZEOF_DCTCOEF]
1509 punpckl%3 m2, m0, m1 ; 53 52 57 56
1510 punpckh%3 m1, m0 ; 59 58 55 54
1511 mova [r0+52*SIZEOF_DCTCOEF], m2
1512 mova [r0+56*SIZEOF_DCTCOEF], m1
1513 mova [r0+60*SIZEOF_DCTCOEF], m7
; dword and word instantiations
1518 SCAN_8x8 d, dq, qdq, dq, 4
1520 SCAN_8x8 d, dq, qdq, dq, 4
1523 SCAN_8x8 w, wd, dq , q , 16
1526 ;-----------------------------------------------------------------------------
1527 ; void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *src, uint8_t *dst )
1528 ;-----------------------------------------------------------------------------
; fused subtract + zigzag: diff src/dst pixels, reorder via pb_sub4%2 table,
; and write the reconstructed predictor back to dst in the same pass.
; %1 = ""/"ac" (ac variant masks out DC), %2 = frame/field scan order.
1529 %macro ZIGZAG_SUB_4x4 2
1531 cglobal zigzag_sub_4x4%1_%2, 4,4,8
1533 cglobal zigzag_sub_4x4%1_%2, 3,3,8
1535 movd m0, [r1+0*FENC_STRIDE]
1536 movd m1, [r1+1*FENC_STRIDE]
1537 movd m2, [r1+2*FENC_STRIDE]
1538 movd m3, [r1+3*FENC_STRIDE]
1539 movd m4, [r2+0*FDEC_STRIDE]
1540 movd m5, [r2+1*FDEC_STRIDE]
1541 movd m6, [r2+2*FDEC_STRIDE]
1542 movd m7, [r2+3*FDEC_STRIDE]
; copy the source pixels into the decoded frame
1543 movd [r2+0*FDEC_STRIDE], m0
1544 movd [r2+1*FDEC_STRIDE], m1
1545 movd [r2+2*FDEC_STRIDE], m2
1546 movd [r2+3*FDEC_STRIDE], m3
1553 mova m7, [pb_sub4%2]
1557 punpckhbw m1, m0, m4
; ac variant: zero the DC coefficient
1563 pand m0, [pb_subacmask]
1579 %if HIGH_BIT_DEPTH == 0
1581 ZIGZAG_SUB_4x4 , frame
1582 ZIGZAG_SUB_4x4 ac, frame
1583 ZIGZAG_SUB_4x4 , field
1584 ZIGZAG_SUB_4x4 ac, field
1586 ZIGZAG_SUB_4x4 , frame
1587 ZIGZAG_SUB_4x4 ac, frame
1588 ZIGZAG_SUB_4x4 , field
1589 ZIGZAG_SUB_4x4 ac, field
1590 %endif ; !HIGH_BIT_DEPTH
1592 %if HIGH_BIT_DEPTH == 0
; XOP 8x8 field scan: vpperm with the pb_scan8field* tables; r2 anchors
; RIP-independent addressing of the table block via the off() helper
1594 cglobal zigzag_scan_8x8_field, 2,3,7
1595 lea r2, [pb_scan8field1]
1596 %define off(m) (r2+m-pb_scan8field1)
1599 vpperm m5, m0, m1, [off(pb_scan8field1)]
1601 vpperm m0, m0, m1, [off(pb_scan8field2a)]
1604 vpperm m5, m2, m3, [off(pb_scan8field2b)]
1607 mova m4, [off(pb_scan8field3b)]
1608 vpperm m1, m1, m2, [off(pb_scan8field3a)]
1610 vpperm m5, m3, m0, m4
1613 ; 4b, 5b are the same as pb_scan8field3b.
1614 ; 5a is the same as pb_scan8field4a.
1615 mova m5, [off(pb_scan8field4a)]
1616 vpperm m2, m2, m3, m5
1618 vpperm m6, m0, m1, m4
1621 vpperm m3, m3, m0, m5
1623 vpperm m5, m1, m2, m4
1626 vpperm m5, m0, m1, [off(pb_scan8field6)]
1628 vpperm m5, m1, m2, [off(pb_scan8field7)]
; XOP 8x8 frame scan: staged vpperm passes; each comment lists the
; source indices produced at that point
1640 cglobal zigzag_scan_8x8_frame, 2,3,8
1641 lea r2, [pb_scan8frame1]
1642 %define off(m) (r2+m-pb_scan8frame1)
1645 vpperm m7, m7, m3, [off(pb_scan8framet1)] ; 8 9 14 15 16 17 21 22
1647 vpperm m0, m3, m2, [off(pb_scan8framet2)] ; 18 19 20 23 25 31 26 30
1650 vpperm m3, m4, m1, [off(pb_scan8framet3)] ; 32 33 37 38 40 43 44 45
1651 vpperm m6, m0, m3, [off(pb_scan8framet4)] ; 18 23 25 31 32 38 40 45
1652 vpperm m5, m0, m3, [off(pb_scan8framet5)] ; 19 20 26 30 33 37 43 44
1653 vpperm m3, m2, m4, [off(pb_scan8framet6)] ; 24 27 28 29 34 35 36 39
1655 vpperm m4, m1, m4, [off(pb_scan8framet7)] ; 41 42 46 47 48 49 54 55
1657 vpperm m2, m1, m3, [off(pb_scan8framet8)] ; 0 1 2 7 24 28 29 36
1658 vpperm m1, m2, m7, [off(pb_scan8frame1)] ; 0 8 1 2 9 16 24 17
1661 movhps m0, [r1+ 20] ; 3 4 5 6 10 11 12 13
1662 vpperm m1, m0, m6, [off(pb_scan8frame2)] ; 10 3 4 11 18 25 32 40
1664 vpperm m1, m0, m5, [off(pb_scan8frame3)] ; 33 26 19 12 5 6 13 20
1666 vpperm m1, m2, m7, [off(pb_scan8frame5)] ; 28 21 14 7 15 22 29 36
1669 movhps m0, [r1+114] ; 50 51 52 53 57 58 59 60
1670 vpperm m1, m5, m0, [off(pb_scan8frame6)] ; 43 50 57 58 51 44 37 30
1672 vpperm m1, m6, m0, [off(pb_scan8frame7)] ; 23 31 38 45 52 59 60 53
1675 vpperm m0, m3, m1, [off(pb_scan8framet9)] ; 27 34 35 39 56 61 62 63
1676 vpperm m1, m0, m4, [off(pb_scan8frame4)] ; 27 34 41 48 56 49 42 35
1678 vpperm m1, m0, m4, [off(pb_scan8frame8)] ; 46 39 47 54 61 62 55 63
1684 ;-----------------------------------------------------------------------------
1685 ; void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
1686 ;-----------------------------------------------------------------------------
; deinterleave one group of 4 coefficient quadruples via a 4x4 transpose;
; outputs land 32 pixels apart so the four 4x4 blocks end up contiguous
1688 mova m0, [r1+(%1*4+ 0)*SIZEOF_PIXEL]
1689 mova m1, [r1+(%1*4+ 8)*SIZEOF_PIXEL]
1690 mova m2, [r1+(%1*4+16)*SIZEOF_PIXEL]
1691 mova m3, [r1+(%1*4+24)*SIZEOF_PIXEL]
1692 TRANSPOSE4x4%2 0,1,2,3,4
1693 mova [r0+(%1+ 0)*SIZEOF_PIXEL], m0
1694 mova [r0+(%1+32)*SIZEOF_PIXEL], m1
1695 mova [r0+(%1+64)*SIZEOF_PIXEL], m2
1696 mova [r0+(%1+96)*SIZEOF_PIXEL], m3
1703 %macro ZIGZAG_8x8_CAVLC 1
1704 cglobal zigzag_interleave_8x8_cavlc, 3,3,8
; SSE2 variant: SBUTTERFLY word interleaves replace the transpose, halves
; stored with movh/movhps
1735 %macro INTERLEAVE_XMM 1
1736 mova m0, [r1+%1*4+ 0]
1737 mova m1, [r1+%1*4+16]
1738 mova m4, [r1+%1*4+32]
1739 mova m5, [r1+%1*4+48]
1740 SBUTTERFLY wd, 0, 1, 6
1741 SBUTTERFLY wd, 4, 5, 7
1742 SBUTTERFLY wd, 0, 1, 6
1743 SBUTTERFLY wd, 4, 5, 7
1745 movhps [r0+%1+ 32], m0
1746 movh [r0+%1+ 64], m1
1747 movhps [r0+%1+ 96], m1
1749 movhps [r0+%1+ 40], m4
1750 movh [r0+%1+ 72], m5
1751 movhps [r0+%1+104], m5
1758 %if HIGH_BIT_DEPTH == 0
1759 %macro ZIGZAG_8x8_CAVLC 0
1760 cglobal zigzag_interleave_8x8_cavlc, 3,3,8
; AVX2 variant: whole-ymm butterflies plus a shufd table from dct-a's
; cextern'd deinterleave_shufd
1782 cglobal zigzag_interleave_8x8_cavlc, 3,3,6
1787 mova m5, [deinterleave_shufd]
1788 SBUTTERFLY wd, 0, 1, 4
1789 SBUTTERFLY wd, 2, 3, 4
1790 SBUTTERFLY wd, 0, 1, 4
1791 SBUTTERFLY wd, 2, 3, 4
1798 vextracti128 [r0+ 32], m0, 1
1799 vextracti128 [r0+ 48], m2, 1
1802 vextracti128 [r0+ 96], m1, 1
1803 vextracti128 [r0+112], m3, 1
; compute the per-block nonzero flags (nnz) by narrowing the coefficients
1805 packsswb m0, m2 ; nnz0, nnz1
1806 packsswb m1, m3 ; nnz2, nnz3
1807 packsswb m0, m1 ; {nnz0,nnz2}, {nnz1,nnz3}
1808 vpermq m0, m0, q3120 ; {nnz0,nnz1}, {nnz2,nnz3}
1818 %endif ; !HIGH_BIT_DEPTH