1 ;*****************************************************************************
2 ;* dct-a.asm: x86 transform and zigzag
3 ;*****************************************************************************
4 ;* Copyright (C) 2003-2016 x264 project
6 ;* Authors: Holger Lubitz <holger@lubitz.org>
7 ;* Loren Merritt <lorenm@u.washington.edu>
8 ;* Laurent Aimar <fenrir@via.ecp.fr>
9 ;* Min Chen <chenm001@163.com>
10 ;* Fiona Glaser <fiona@x264.com>
12 ;* This program is free software; you can redistribute it and/or modify
13 ;* it under the terms of the GNU General Public License as published by
14 ;* the Free Software Foundation; either version 2 of the License, or
15 ;* (at your option) any later version.
17 ;* This program is distributed in the hope that it will be useful,
18 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ;* GNU General Public License for more details.
22 ;* You should have received a copy of the GNU General Public License
23 ;* along with this program; if not, write to the Free Software
24 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
26 ;* This program is also available under a commercial proprietary license.
27 ;* For more information, contact us at licensing@x264.com.
28 ;*****************************************************************************
31 %include "x86util.asm"
; Constant tables used by the DCT-DC and zigzag routines below.
; pw_ppmmmmpp: word sign pattern (+ + - - - - + +), used as a pmullw/psignw
; multiplier in the DC transforms.
34 pw_ppmmmmpp: dw 1,1,-1,-1,-1,-1,1,1
; Byte permutation orders for the combined zigzag_sub 4x4 kernels
; (frame and field scan orders respectively), applied via pshufb.
35 pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
36 pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15
; Word mask: first word 0, rest all-ones — used with pand to clear the DC
; coefficient in the "ac" zigzag_sub variants (see the pand at the
; ZIGZAG_SUB_4x4 macro further down).
37 pb_subacmask: dw 0,-1,-1,-1,-1,-1,-1,-1
; Word-granularity shuffle masks (SHUFFLE_MASK_W is defined in x86util.asm)
; encoding the 4x4 frame zigzag order for pshufb/vpperm implementations.
38 pb_scan4framea: SHUFFLE_MASK_W 6,3,7,0,4,1,2,5
39 pb_scan4frameb: SHUFFLE_MASK_W 0,4,1,2,5,6,3,7
40 pb_scan4frame2a: SHUFFLE_MASK_W 0,4,1,2,5,8,12,9
41 pb_scan4frame2b: SHUFFLE_MASK_W 6,3,7,10,13,14,11,15
; Intermediate ("t") masks for the two-stage 8x8 frame zigzag: a first
; vpperm pass gathers coefficients with these, then the pb_scan8frameN
; masks below place them in final scan order.
43 pb_scan8framet1: SHUFFLE_MASK_W 0, 1, 6, 7, 8, 9, 13, 14
44 pb_scan8framet2: SHUFFLE_MASK_W 2 , 3, 4, 7, 9, 15, 10, 14
45 pb_scan8framet3: SHUFFLE_MASK_W 0, 1, 5, 6, 8, 11, 12, 13
46 pb_scan8framet4: SHUFFLE_MASK_W 0, 3, 4, 5, 8, 11, 12, 15
47 pb_scan8framet5: SHUFFLE_MASK_W 1, 2, 6, 7, 9, 10, 13, 14
48 pb_scan8framet6: SHUFFLE_MASK_W 0, 3, 4, 5, 10, 11, 12, 15
49 pb_scan8framet7: SHUFFLE_MASK_W 1, 2, 6, 7, 8, 9, 14, 15
50 pb_scan8framet8: SHUFFLE_MASK_W 0, 1, 2, 7, 8, 10, 11, 14
51 pb_scan8framet9: SHUFFLE_MASK_W 1, 4, 5, 7, 8, 13, 14, 15
; Final-placement masks for the 8x8 frame zigzag (one per output octet).
53 pb_scan8frame1: SHUFFLE_MASK_W 0, 8, 1, 2, 9, 12, 4, 13
54 pb_scan8frame2: SHUFFLE_MASK_W 4, 0, 1, 5, 8, 10, 12, 14
55 pb_scan8frame3: SHUFFLE_MASK_W 12, 10, 8, 6, 2, 3, 7, 9
56 pb_scan8frame4: SHUFFLE_MASK_W 0, 1, 8, 12, 4, 13, 9, 2
57 pb_scan8frame5: SHUFFLE_MASK_W 5, 14, 10, 3, 11, 15, 6, 7
58 pb_scan8frame6: SHUFFLE_MASK_W 6, 8, 12, 13, 9, 7, 5, 3
59 pb_scan8frame7: SHUFFLE_MASK_W 1, 3, 5, 7, 10, 14, 15, 11
60 pb_scan8frame8: SHUFFLE_MASK_W 10, 3, 11, 14, 5, 6, 15, 7
; Masks for the 8x8 field zigzag. 0x80 entries set the high bit of each
; mask byte, which pshufb/vpperm treats as "write zero" — presumably so the
; lane can be merged from the companion a/b mask; confirm against x86util's
; SHUFFLE_MASK_W definition.
62 pb_scan8field1 : SHUFFLE_MASK_W 0, 1, 2, 8, 9, 3, 4, 10
63 pb_scan8field2a: SHUFFLE_MASK_W 0x80, 11, 5, 6, 7, 12,0x80,0x80
64 pb_scan8field2b: SHUFFLE_MASK_W 0,0x80,0x80,0x80,0x80,0x80, 1, 8
65 pb_scan8field3a: SHUFFLE_MASK_W 10, 5, 6, 7, 11,0x80,0x80,0x80
66 pb_scan8field3b: SHUFFLE_MASK_W 0x80,0x80,0x80,0x80,0x80, 1, 8, 2
67 pb_scan8field4a: SHUFFLE_MASK_W 4, 5, 6, 7, 11,0x80,0x80,0x80
68 pb_scan8field6 : SHUFFLE_MASK_W 4, 5, 6, 7, 11,0x80,0x80, 12
69 pb_scan8field7 : SHUFFLE_MASK_W 5, 6, 7, 11,0x80,0x80, 12, 13
85 cextern deinterleave_shufd
90 SUMSUB_BADC %1, %5, %4, %3, %2, %6
91 SUMSUB_BADC %1, %5, %3, %4, %2, %6
95 %macro SUMSUB_17BIT 4 ; a, b, tmp, 0x8000
116 ;-----------------------------------------------------------------------------
117 ; void dct4x4dc( dctcoef d[4][4] )
118 ;-----------------------------------------------------------------------------
120 cglobal dct4x4dc, 1,1,5
125 WALSH4_1D d, 0,1,2,3,4
126 TRANSPOSE4x4D 0,1,2,3,4
128 WALSH4_1D d, 0,1,2,3,4
138 %endmacro ; DCT4x4_DC
147 cglobal dct4x4dc, 1,1
152 movq m7, [pw_8000] ; convert to unsigned and back, so that pavgw works
153 WALSH4_1D w, 0,1,2,3,4
154 TRANSPOSE4x4W 0,1,2,3,4
155 SUMSUB_BADC w, 1, 0, 3, 2, 4
165 %endif ; HIGH_BIT_DEPTH
168 ;-----------------------------------------------------------------------------
169 ; void idct4x4dc( int32_t d[4][4] )
170 ;-----------------------------------------------------------------------------
172 cglobal idct4x4dc, 1,1
177 WALSH4_1D d,0,1,2,3,4
178 TRANSPOSE4x4D 0,1,2,3,4
179 WALSH4_1D d,0,1,2,3,4
185 %endmacro ; IDCT4x4DC
193 ;-----------------------------------------------------------------------------
194 ; void idct4x4dc( int16_t d[4][4] )
195 ;-----------------------------------------------------------------------------
197 cglobal idct4x4dc, 1,1
202 WALSH4_1D w,0,1,2,3,4
203 TRANSPOSE4x4W 0,1,2,3,4
204 WALSH4_1D w,0,1,2,3,4
210 %endif ; HIGH_BIT_DEPTH
212 ;-----------------------------------------------------------------------------
213 ; void dct2x4dc( dctcoef dct[8], dctcoef dct4x4[8][16] )
214 ;-----------------------------------------------------------------------------
216 DECLARE_REG_TMP 6 ; Avoid some REX prefixes to reduce code size
221 %macro INSERT_COEFF 3 ; dst, src, imm
252 cglobal dct2x4dc, 2,3
254 INSERT_COEFF m0, [r1+0*16*SIZEOF_DCTCOEF], 0
255 INSERT_COEFF m0, [r1+1*16*SIZEOF_DCTCOEF], 2
256 add r1, 4*16*SIZEOF_DCTCOEF
257 INSERT_COEFF m0, [r1-2*16*SIZEOF_DCTCOEF], 1
258 INSERT_COEFF m0, [r1-1*16*SIZEOF_DCTCOEF], 3
259 INSERT_COEFF m1, [r1+0*16*SIZEOF_DCTCOEF], 0
260 INSERT_COEFF m1, [r1+1*16*SIZEOF_DCTCOEF], 2
261 INSERT_COEFF m1, [r1+2*16*SIZEOF_DCTCOEF], 1
262 INSERT_COEFF m1, [r1+3*16*SIZEOF_DCTCOEF], 3
263 SUMSUB_BA %1, 1, 0, 2
264 SBUTTERFLY %2, 1, 0, 2
265 SUMSUB_BA %1, 0, 1, 2
266 SBUTTERFLY %2, 0, 1, 2
267 SUMSUB_BA %1, 1, 0, 2
268 pshuf%1 m0, m0, q1032
285 ;-----------------------------------------------------------------------------
286 ; void sub4x4_dct( dctcoef dct[4][4], pixel *pix1, pixel *pix2 )
287 ;-----------------------------------------------------------------------------
289 cglobal sub4x4_dct, 3,3
291 LOAD_DIFF m0, m4, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
292 LOAD_DIFF m3, m4, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
293 LOAD_DIFF m1, m4, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
294 LOAD_DIFF m2, m4, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
296 TRANSPOSE4x4W 0,1,2,3,4
298 SUMSUB_BADC w, 3, 0, 2, 1
300 DCT_UNPACK m2, m4, m5
301 DCT_UNPACK m3, m6, m7
302 mova [r0+ 0], m2 ; s03 + s12
304 mova [r0+32], m3 ; s03 - s12
307 DCT_UNPACK m0, m2, m4
308 DCT_UNPACK m1, m3, m5
309 SUMSUB2_AB d, 0, 1, 4
310 SUMSUB2_AB d, 2, 3, 5
311 mova [r0+16], m0 ; d03*2 + d12
313 mova [r0+48], m4 ; d03 - 2*d12
319 cglobal sub4x4_dct, 3,3
324 LOAD_DIFF8x4 0, 3, 1, 2, 4, 5, r1, r2
326 TRANSPOSE4x4W 0,1,2,3,4
339 %endif ; HIGH_BIT_DEPTH
342 ;-----------------------------------------------------------------------------
343 ; void add4x4_idct( pixel *p_dst, dctcoef dct[4][4] )
344 ;-----------------------------------------------------------------------------
345 %macro STORE_DIFFx2 6
352 CLIPW %1, %4, [pw_pixel_max]
358 cglobal add4x4_idct, 2,2,6
359 add r0, 2*FDEC_STRIDEB
365 IDCT4_1D d,0,1,2,3,4,5
366 TRANSPOSE4x4D 0,1,2,3,4
368 IDCT4_1D d,0,1,2,3,4,5
370 STORE_DIFFx2 m0, m1, m4, m5, [r0-2*FDEC_STRIDEB], [r0-1*FDEC_STRIDEB]
371 STORE_DIFFx2 m2, m3, m4, m5, [r0+0*FDEC_STRIDEB], [r0+1*FDEC_STRIDEB]
380 %else ; !HIGH_BIT_DEPTH
383 cglobal add4x4_idct, 2,2
390 IDCT4_1D w,0,1,2,3,4,5
391 TRANSPOSE4x4W 0,1,2,3,4
393 IDCT4_1D w,0,1,2,3,4,5
394 STORE_DIFF m0, m4, m7, [r0+0*FDEC_STRIDE]
395 STORE_DIFF m1, m4, m7, [r0+1*FDEC_STRIDE]
396 STORE_DIFF m2, m4, m7, [r0+2*FDEC_STRIDE]
397 STORE_DIFF m3, m4, m7, [r0+3*FDEC_STRIDE]
401 cglobal add4x4_idct, 2,2,6
402 mova m1, [r1+0x00] ; row1/row0
403 mova m3, [r1+0x10] ; row3/row2
404 psraw m0, m1, 1 ; row1>>1/...
405 psraw m2, m3, 1 ; row3>>1/...
406 movsd m0, m1 ; row1>>1/row0
407 movsd m2, m3 ; row3>>1/row2
408 psubw m0, m3 ; row1>>1-row3/row0-2
409 paddw m2, m1 ; row3>>1+row1/row0+2
410 SBUTTERFLY2 wd, 0, 2, 1
412 pshuflw m1, m2, q2301
413 pshufhw m2, m2, q2301
419 paddw m1, m0 ; row1/row0 corrected
420 psraw m0, 1 ; row1>>1/...
421 psraw m3, m2, 1 ; row3>>1/...
422 movsd m0, m1 ; row1>>1/row0
423 movsd m3, m2 ; row3>>1/row2
424 psubw m0, m2 ; row1>>1-row3/row0-2
425 paddw m3, m1 ; row3>>1+row1/row0+2
426 SBUTTERFLY2 qdq, 0, 3, 1
429 movd m4, [r0+FDEC_STRIDE*0]
430 movd m1, [r0+FDEC_STRIDE*1]
431 movd m2, [r0+FDEC_STRIDE*2]
432 movd m5, [r0+FDEC_STRIDE*3]
433 punpckldq m1, m4 ; row0/row1
435 punpckldq m2, m5 ; row3/row2
442 packuswb m0, m3 ; row0/row1/row3/row2
443 pextrd [r0+FDEC_STRIDE*0], m0, 3
444 pextrd [r0+FDEC_STRIDE*1], m0, 2
445 movd [r0+FDEC_STRIDE*2], m0
446 pextrd [r0+FDEC_STRIDE*3], m0, 1
; STOREx2_AVX2: load two pairs of 8-pixel destination rows into the two
; 128-bit lanes of m%3/m%4, then (after the residual add, elided in this
; listing between the load and store halves) write the reconstructed rows
; back. %1/%2 = data/temp regs, %3/%4 = row temps, %5..%8 = row offsets
; relative to r0 in FDEC_STRIDE units. NOTE(review): lines between the
; loads and the vextracti128 are elided here — the clip/add step is not
; visible.
455 %macro STOREx2_AVX2 9
456 movq xm%3, [r0+%5*FDEC_STRIDE]
457 vinserti128 m%3, m%3, [r0+%6*FDEC_STRIDE], 1
458 movq xm%4, [r0+%7*FDEC_STRIDE]
459 vinserti128 m%4, m%4, [r0+%8*FDEC_STRIDE], 1
; high lane of m%1 holds the second row pair; split it out for the stores
467 vextracti128 xm%2, m%1, 1
468 movq [r0+%5*FDEC_STRIDE], xm%1
469 movq [r0+%6*FDEC_STRIDE], xm%2
470 movhps [r0+%7*FDEC_STRIDE], xm%1
471 movhps [r0+%8*FDEC_STRIDE], xm%2
475 cglobal add8x8_idct, 2,3,8
476 add r0, 4*FDEC_STRIDE
478 TAIL_CALL .skip_prologue, 0
479 global current_function %+ .skip_prologue
486 vinserti128 m0, m0, [r1+ 64], 1
487 vinserti128 m1, m1, [r1+ 96], 1
488 vinserti128 m2, m2, [r1+ 80], 1
489 vinserti128 m3, m3, [r1+112], 1
490 SBUTTERFLY qdq, 0, 1, 4
491 SBUTTERFLY qdq, 2, 3, 4
492 IDCT4_1D w,0,1,2,3,4,5
493 TRANSPOSE2x4x4W 0,1,2,3,4
495 IDCT4_1D w,0,1,2,3,4,5
496 STOREx2_AVX2 0, 1, 4, 5, -4, 0, -3, 1, 7
497 STOREx2_AVX2 2, 3, 4, 5, -2, 2, -1, 3, 7
; LOAD_DIFF8x2_AVX2: gather four 8-pixel rows from the encode frame (r1)
; and the matching four rows from the decode frame (r2, offset by -4 rows
; since the caller pre-advanced r2 by 4*FDEC_STRIDE) into ymm lane pairs.
; The subtraction producing the residual is elided from this listing
; (between lines 505 and 508 / after 511).
500 ; 2xdst, 2xtmp, 4xsrcrow, 1xzero
501 %macro LOAD_DIFF8x2_AVX2 9
502 movq xm%1, [r1+%5*FENC_STRIDE]
503 movq xm%2, [r1+%6*FENC_STRIDE]
504 vinserti128 m%1, m%1, [r1+%7*FENC_STRIDE], 1
505 vinserti128 m%2, m%2, [r1+%8*FENC_STRIDE], 1
; corresponding reconstructed rows; (%n-4) compensates the r2 pre-advance
508 movq xm%3, [r2+(%5-4)*FDEC_STRIDE]
509 movq xm%4, [r2+(%6-4)*FDEC_STRIDE]
510 vinserti128 m%3, m%3, [r2+(%7-4)*FDEC_STRIDE], 1
511 vinserti128 m%4, m%4, [r2+(%8-4)*FDEC_STRIDE], 1
; STORE8_DCT_AVX2: interleave the four coefficient registers into 4x4
; sub-blocks (SBUTTERFLY qdq pairs 64-bit halves) and store the 8x8 DCT
; as four 4x4 blocks at r0. Low-lane stores (r0+0..48) are elided from
; this listing; only the high-lane vextracti128 stores are visible.
519 %macro STORE8_DCT_AVX2 5
520 SBUTTERFLY qdq, %1, %2, %5
521 SBUTTERFLY qdq, %3, %4, %5
; second pair of 4x4 blocks comes from the upper 128-bit lanes
526 vextracti128 [r0+ 64], m%1, 1
527 vextracti128 [r0+ 80], m%3, 1
528 vextracti128 [r0+ 96], m%2, 1
529 vextracti128 [r0+112], m%4, 1
; STORE16_DCT_AVX2: same interleave as STORE8_DCT_AVX2, but for the 16x16
; case — low 128-bit lanes go to r0-128..r0-80, high lanes to r0..r0+48
; (the caller advances r0 between invocations).
532 %macro STORE16_DCT_AVX2 5
533 SBUTTERFLY qdq, %1, %2, %5
534 SBUTTERFLY qdq, %3, %4, %5
; low lanes: first group of 4x4 blocks
535 mova [r0+ 0-128], xm%1
536 mova [r0+16-128], xm%3
537 mova [r0+32-128], xm%2
538 mova [r0+48-128], xm%4
; high lanes: second group of 4x4 blocks
539 vextracti128 [r0+ 0], m%1, 1
540 vextracti128 [r0+16], m%3, 1
541 vextracti128 [r0+32], m%2, 1
542 vextracti128 [r0+48], m%4, 1
546 cglobal sub8x8_dct, 3,3,7
548 add r2, 4*FDEC_STRIDE
549 LOAD_DIFF8x2_AVX2 0, 1, 4, 5, 0, 1, 4, 5, 6
550 LOAD_DIFF8x2_AVX2 2, 3, 4, 5, 2, 3, 6, 7, 6
551 DCT4_1D 0, 1, 2, 3, 4
552 TRANSPOSE2x4x4W 0, 1, 2, 3, 4
553 DCT4_1D 0, 1, 2, 3, 4
554 STORE8_DCT_AVX2 0, 1, 2, 3, 4
558 cglobal sub16x16_dct, 3,3,6
560 add r2, 4*FDEC_STRIDE
563 add r1, 4*FENC_STRIDE
564 add r2, 4*FDEC_STRIDE
567 add r1, 4*FENC_STRIDE
568 add r2, 4*FDEC_STRIDE
571 add r1, 4*FENC_STRIDE
572 add r2, 4*FDEC_STRIDE
576 LOAD_DIFF16x2_AVX2 0, 1, 4, 5, 0, 1
577 LOAD_DIFF16x2_AVX2 2, 3, 4, 5, 2, 3
578 DCT4_1D 0, 1, 2, 3, 4
579 TRANSPOSE2x4x4W 0, 1, 2, 3, 4
580 DCT4_1D 0, 1, 2, 3, 4
581 STORE16_DCT_AVX2 0, 1, 2, 3, 4
583 %endif ; HIGH_BIT_DEPTH
586 ;-----------------------------------------------------------------------------
587 ; void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
588 ;-----------------------------------------------------------------------------
591 %if HIGH_BIT_DEPTH == 0
595 add r2, 4*FDEC_STRIDE
598 %endif ; !HIGH_BIT_DEPTH
600 call %2.skip_prologue
602 add r1, %4-%5-%6*FENC_STRIDE
603 add r2, %4-%5-%6*FDEC_STRIDE
604 call %2.skip_prologue
606 add r1, (%4-%6)*FENC_STRIDE-%5-%4
607 add r2, (%4-%6)*FDEC_STRIDE-%5-%4
608 call %2.skip_prologue
610 add r1, %4-%5-%6*FENC_STRIDE
611 add r2, %4-%5-%6*FDEC_STRIDE
612 TAIL_CALL %2.skip_prologue, 1
615 ;-----------------------------------------------------------------------------
616 ; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
617 ;-----------------------------------------------------------------------------
618 %macro ADD_NxN_IDCT 6-7
628 %if mmsize>=16 && %3!=256
629 add r0, 4*FDEC_STRIDE
632 call %2.skip_prologue
633 add r0, %4-%5-%6*FDEC_STRIDE
635 call %2.skip_prologue
636 add r0, (%4-%6)*FDEC_STRIDE-%5-%4
638 call %2.skip_prologue
639 add r0, %4-%5-%6*FDEC_STRIDE
641 TAIL_CALL %2.skip_prologue, 1
646 SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx, 64, 8, 0, 0, 0
647 SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx, 64, 16, 8, 8, 0
649 ADD_NxN_IDCT add8x8_idct_sse2, add4x4_idct_sse2, 64, 8, 0, 0, 6
650 ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2, 64, 16, 8, 8, 6
651 ADD_NxN_IDCT add8x8_idct_avx, add4x4_idct_avx, 64, 8, 0, 0, 6
652 ADD_NxN_IDCT add16x16_idct_avx, add8x8_idct_avx, 64, 16, 8, 8, 6
653 cextern add8x8_idct8_sse2.skip_prologue
654 cextern add8x8_idct8_avx.skip_prologue
655 ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2, 256, 16, 0, 0, 16
656 ADD_NxN_IDCT add16x16_idct8_avx, add8x8_idct8_avx, 256, 16, 0, 0, 16
657 cextern sub8x8_dct8_sse2.skip_prologue
658 cextern sub8x8_dct8_sse4.skip_prologue
659 cextern sub8x8_dct8_avx.skip_prologue
660 SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2, 256, 16, 0, 0, 14
661 SUB_NxN_DCT sub16x16_dct8_sse4, sub8x8_dct8_sse4, 256, 16, 0, 0, 14
662 SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx, 256, 16, 0, 0, 14
663 %else ; !HIGH_BIT_DEPTH
666 SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx, 32, 4, 0, 0, 0
667 ADD_NxN_IDCT add8x8_idct_mmx, add4x4_idct_mmx, 32, 4, 0, 0
668 SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx, 32, 8, 4, 4, 0
669 ADD_NxN_IDCT add16x16_idct_mmx, add8x8_idct_mmx, 32, 8, 4, 4
671 cextern sub8x8_dct8_mmx.skip_prologue
672 cextern add8x8_idct8_mmx.skip_prologue
673 SUB_NxN_DCT sub16x16_dct8_mmx, sub8x8_dct8_mmx, 128, 8, 0, 0, 0
674 ADD_NxN_IDCT add16x16_idct8_mmx, add8x8_idct8_mmx, 128, 8, 0, 0
678 cextern sub8x8_dct_sse2.skip_prologue
679 cextern sub8x8_dct_ssse3.skip_prologue
680 cextern sub8x8_dct_avx.skip_prologue
681 cextern sub8x8_dct_xop.skip_prologue
682 SUB_NxN_DCT sub16x16_dct_sse2, sub8x8_dct_sse2, 128, 8, 0, 0, 10
683 SUB_NxN_DCT sub16x16_dct_ssse3, sub8x8_dct_ssse3, 128, 8, 0, 0, 10
684 SUB_NxN_DCT sub16x16_dct_avx, sub8x8_dct_avx, 128, 8, 0, 0, 10
685 SUB_NxN_DCT sub16x16_dct_xop, sub8x8_dct_xop, 128, 8, 0, 0, 10
687 cextern add8x8_idct_sse2.skip_prologue
688 cextern add8x8_idct_avx.skip_prologue
689 ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2, 128, 8, 0, 0
690 ADD_NxN_IDCT add16x16_idct_avx, add8x8_idct_avx, 128, 8, 0, 0
692 cextern add8x8_idct8_sse2.skip_prologue
693 cextern add8x8_idct8_avx.skip_prologue
694 ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2, 128, 8, 0, 0
695 ADD_NxN_IDCT add16x16_idct8_avx, add8x8_idct8_avx, 128, 8, 0, 0
697 cextern sub8x8_dct8_sse2.skip_prologue
698 cextern sub8x8_dct8_ssse3.skip_prologue
699 cextern sub8x8_dct8_avx.skip_prologue
700 SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2, 128, 8, 0, 0, 11
701 SUB_NxN_DCT sub16x16_dct8_ssse3, sub8x8_dct8_ssse3, 128, 8, 0, 0, 11
702 SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx, 128, 8, 0, 0, 11
705 ADD_NxN_IDCT add16x16_idct_avx2, add8x8_idct_avx2, 128, 8, 0, 0
706 %endif ; HIGH_BIT_DEPTH
709 ;-----------------------------------------------------------------------------
710 ; void add8x8_idct_dc( pixel *p_dst, dctcoef *dct2x2 )
711 ;-----------------------------------------------------------------------------
713 mova m0, [%1+FDEC_STRIDEB*0] ; 8pixels
714 mova m1, [%1+FDEC_STRIDEB*1]
715 mova m2, [%1+FDEC_STRIDEB*2]
719 paddsw %2, [%1+FDEC_STRIDEB*3]
724 mova [%1+FDEC_STRIDEB*0], m0
725 mova [%1+FDEC_STRIDEB*1], m1
726 mova [%1+FDEC_STRIDEB*2], m2
727 mova [%1+FDEC_STRIDEB*3], %2
731 cglobal add8x8_idct_dc, 2,2,7
732 mova m6, [pw_pixel_max]
736 psrad m3, 6 ; dc0 0 dc1 0 dc2 0 dc3 0
737 pshuflw m4, m3, q2200 ; dc0 dc0 dc1 dc1 _ _ _ _
738 pshufhw m3, m3, q2200 ; _ _ _ _ dc2 dc2 dc3 dc3
739 pshufd m4, m4, q1100 ; dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
740 pshufd m3, m3, q3322 ; dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
741 ADD_DC r0+FDEC_STRIDEB*0, m4
742 ADD_DC r0+FDEC_STRIDEB*4, m3
745 cglobal add16x16_idct_dc, 2,3,8
747 mova m6, [pw_pixel_max]
753 psrad m3, 6 ; dc0 0 dc1 0 dc2 0 dc3 0
754 pshuflw m4, m3, q2200 ; dc0 dc0 dc1 dc1 _ _ _ _
755 pshufhw m3, m3, q2200 ; _ _ _ _ dc2 dc2 dc3 dc3
756 pshufd m4, m4, q1100 ; dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
757 pshufd m3, m3, q3322 ; dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
758 ADD_DC r0+FDEC_STRIDEB*0, m4
759 ADD_DC r0+SIZEOF_PIXEL*8, m3
761 add r0, 4*FDEC_STRIDEB
765 %endmacro ; ADD_IDCT_DC
772 %else ;!HIGH_BIT_DEPTH
774 mova m4, [%3+FDEC_STRIDE*0]
775 mova m5, [%3+FDEC_STRIDE*1]
776 mova m6, [%3+FDEC_STRIDE*2]
780 paddusb %1, [%3+FDEC_STRIDE*3]
785 mova [%3+FDEC_STRIDE*0], m4
786 mova [%3+FDEC_STRIDE*1], m5
787 mova [%3+FDEC_STRIDE*2], m6
788 mova [%3+FDEC_STRIDE*3], %1
792 cglobal add8x8_idct_dc, 2,2
795 add r0, FDEC_STRIDE*4
807 ADD_DC m0, m1, r0-FDEC_STRIDE*4
812 cglobal add8x8_idct_dc, 2,2
815 add r0, FDEC_STRIDE*4
816 pmulhrsw m0, [pw_512]
818 mova m5, [pb_unpackbd1]
823 movh m2, [r0+FDEC_STRIDE*-4]
824 movh m3, [r0+FDEC_STRIDE*-3]
825 movh m4, [r0+FDEC_STRIDE*-2]
826 movh m5, [r0+FDEC_STRIDE*-1]
827 movhps m2, [r0+FDEC_STRIDE* 0]
828 movhps m3, [r0+FDEC_STRIDE* 1]
829 movhps m4, [r0+FDEC_STRIDE* 2]
830 movhps m5, [r0+FDEC_STRIDE* 3]
839 movh [r0+FDEC_STRIDE*-4], m2
840 movh [r0+FDEC_STRIDE*-3], m3
841 movh [r0+FDEC_STRIDE*-2], m4
842 movh [r0+FDEC_STRIDE*-1], m5
843 movhps [r0+FDEC_STRIDE* 0], m2
844 movhps [r0+FDEC_STRIDE* 1], m3
845 movhps [r0+FDEC_STRIDE* 2], m4
846 movhps [r0+FDEC_STRIDE* 3], m5
850 cglobal add16x16_idct_dc, 2,3
869 add r0, FDEC_STRIDE*4
875 cglobal add16x16_idct_dc, 2,2,8
877 add r0, FDEC_STRIDE*4
880 add r0, FDEC_STRIDE*4
899 ADD_DC m0, m1, r0+FDEC_STRIDE*-4
904 cglobal add16x16_idct_dc, 2,2,8
906 add r0, FDEC_STRIDE*4
909 add r0, FDEC_STRIDE*4
913 pmulhrsw m0, [pw_512]
915 mova m5, [pb_unpackbd1]
916 mova m6, [pb_unpackbd2]
923 ADD_DC m0, m1, r0+FDEC_STRIDE*-4
934 mova xm4, [r0+FDEC_STRIDE*0+%3]
935 mova xm5, [r0+FDEC_STRIDE*1+%3]
936 vinserti128 m4, m4, [r2+FDEC_STRIDE*0+%3], 1
937 vinserti128 m5, m5, [r2+FDEC_STRIDE*1+%3], 1
942 mova [r0+FDEC_STRIDE*0+%3], xm4
943 mova [r0+FDEC_STRIDE*1+%3], xm5
944 vextracti128 [r2+FDEC_STRIDE*0+%3], m4, 1
945 vextracti128 [r2+FDEC_STRIDE*1+%3], m5, 1
949 cglobal add16x16_idct_dc, 2,3,6
950 add r0, FDEC_STRIDE*4
953 pmulhrsw m0, [pw_512]
955 mova m4, [pb_unpackbd1]
956 mova m5, [pb_unpackbd2]
959 pshufb m2, m0, m4 ; row0, row2
960 pshufb m3, m1, m4 ; row0, row2
961 pshufb m0, m5 ; row1, row3
962 pshufb m1, m5 ; row1, row3
963 lea r2, [r0+FDEC_STRIDE*8]
964 ADD_DC_AVX2 m2, m3, FDEC_STRIDE*-4
965 ADD_DC_AVX2 m2, m3, FDEC_STRIDE*-2
966 ADD_DC_AVX2 m0, m1, FDEC_STRIDE* 0
967 ADD_DC_AVX2 m0, m1, FDEC_STRIDE* 2
970 %endif ; HIGH_BIT_DEPTH
972 ;-----------------------------------------------------------------------------
973 ; void sub8x8_dct_dc( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 )
974 ;-----------------------------------------------------------------------------
; DCTDC_2ROW_MMX: load two encode-frame rows (r1, row offset %3) and the
; two matching decode-frame rows (r2, row offset %4) for the sub8x8_dct_dc
; row sums. The accumulate/subtract tail of this macro is elided from this
; listing.
976 %macro DCTDC_2ROW_MMX 4
977 mova %1, [r1+FENC_STRIDE*(0+%3)]
978 mova m1, [r1+FENC_STRIDE*(1+%3)]
979 mova m2, [r2+FDEC_STRIDE*(0+%4)]
980 mova m3, [r2+FDEC_STRIDE*(1+%4)]
; DCT2x2: 2x2 Hadamard transform of four DC sums packed as words in the
; two argument registers (%1 = s1/s0, %2 = s3/s2; must not alias m0/m1).
; Result lands in m0 as (d02-d13, s02-s13, d02+d13, s02+s13), i.e. the
; four 2x2 DC coefficients. (%endmacro is elided from this listing.)
996 %macro DCT2x2 2 ; reg s1/s0, reg s3/s2 (!=m0/m1)
997 PSHUFLW m1, %1, q2200 ; s1 s1 s0 s0
998 PSHUFLW m0, %2, q2301 ; s3 __ s2 __
; first butterfly: sums into the high word of each pair, diffs below
999 paddw m1, %2 ; s1 s13 s0 s02
1000 psubw m1, m0 ; d13 s13 d02 s02
1001 PSHUFLW m0, m1, q1010 ; d02 s02 d02 s02
1002 psrlq m1, 32 ; __ __ d13 s13
; second butterfly combines the 02 and 13 partial results
1003 paddw m0, m1 ; d02 s02 d02+d13 s02+s13
1004 psllq m1, 32 ; d13 s13
1005 psubw m0, m1 ; d02-d13 s02-s13 d02+d13 s02+s13
1008 %if HIGH_BIT_DEPTH == 0
1010 cglobal sub8x8_dct_dc, 3,3
1011 DCTDC_2ROW_MMX m0, m4, 0, 0
1012 DCTDC_2ROW_MMX m5, m6, 2, 2
1016 add r2, FDEC_STRIDE*4
1017 DCTDC_2ROW_MMX m7, m4, 4, 0
1018 DCTDC_2ROW_MMX m5, m6, 6, 2
; DCTDC_2ROW_SSE2: load two encode rows (offset %1) and two decode rows
; (offset %2), then accumulate into register %4 via ACCUM (x86util), with
; %3 selecting first-use vs accumulate. NOTE(review): the lines combining
; m1/m2 between the loads (original lines 1029, 1032-1033) are elided in
; this listing, so m2 appearing to be overwritten at line 1030 is an
; artifact of the elision, not a bug in the original.
1026 %macro DCTDC_2ROW_SSE2 4
1027 movh m1, [r1+FENC_STRIDE*(0+%1)]
1028 movh m2, [r1+FENC_STRIDE*(1+%1)]
1030 movh m2, [r2+FDEC_STRIDE*(0+%2)]
1031 punpckldq m2, [r2+FDEC_STRIDE*(1+%2)]
1034 ACCUM paddd, %4, 1, %3
1039 cglobal sub8x8_dct_dc, 3,3
1041 DCTDC_2ROW_SSE2 0, 0, 0, 3
1042 DCTDC_2ROW_SSE2 2, 2, 1, 3
1043 add r2, FDEC_STRIDE*4
1044 DCTDC_2ROW_SSE2 4, 0, 0, 4
1045 DCTDC_2ROW_SSE2 6, 2, 1, 4
1052 %macro SUB8x16_DCT_DC 0
1053 cglobal sub8x16_dct_dc, 3,3
1055 DCTDC_2ROW_SSE2 0, 0, 0, 3
1056 DCTDC_2ROW_SSE2 2, 2, 1, 3
1057 add r1, FENC_STRIDE*8
1058 add r2, FDEC_STRIDE*8
1059 DCTDC_2ROW_SSE2 -4, -4, 0, 4
1060 DCTDC_2ROW_SSE2 -2, -2, 1, 4
1061 shufps m3, m4, q2020
1062 DCTDC_2ROW_SSE2 0, 0, 0, 5
1063 DCTDC_2ROW_SSE2 2, 2, 1, 5
1064 add r2, FDEC_STRIDE*4
1065 DCTDC_2ROW_SSE2 4, 0, 0, 4
1066 DCTDC_2ROW_SSE2 6, 2, 1, 4
1067 shufps m5, m4, q2020
1069 %define %%sign psignw
1071 %define %%sign pmullw
1073 SUMSUB_BA d, 5, 3, 0
1075 pshuflw m0, m5, q2301
1076 pshufhw m0, m0, q2301
1077 %%sign m5, [pw_pmpmpmpm]
1079 pshufd m1, m0, q1320
1080 pshufd m0, m0, q0231
1081 %%sign m1, [pw_ppppmmmm]
1085 %endmacro ; SUB8x16_DCT_DC
1092 %endif ; !HIGH_BIT_DEPTH
; DCTDC_4ROW_SSE2 (high-bit-depth path): sum four encode rows into %1 and
; the four matching decode rows into m0, starting at row %2; FENC/FDEC
; stride in bytes (STRIDEB). The loop/rep construct binding Y and the
; subtraction tail are elided from this listing.
1094 %macro DCTDC_4ROW_SSE2 2
1095 mova %1, [r1+FENC_STRIDEB*%2]
1096 mova m0, [r2+FDEC_STRIDEB*%2]
; Y presumably iterates over the remaining rows of the 4-row group —
; the %assign/%rep lines are not visible here
1099 paddw %1, [r1+FENC_STRIDEB*Y]
1100 paddw m0, [r2+FDEC_STRIDEB*Y]
1104 pshufd m0, %1, q2301
1109 %macro SUB8x8_DCT_DC_10 0
1110 cglobal sub8x8_dct_dc, 3,3,3
1111 DCTDC_4ROW_SSE2 m1, 0
1112 DCTDC_4ROW_SSE2 m2, 4
1113 mova m0, [pw_ppmmmmpp]
1116 pshufd m0, m1, q2200 ; -1 -1 +0 +0
1117 pshufd m1, m1, q0033 ; +0 +0 +1 +1
1119 pshufd m0, m2, q1023 ; -2 +2 -3 +3
1128 %macro SUB8x16_DCT_DC_10 0
1129 cglobal sub8x16_dct_dc, 3,3,6
1130 DCTDC_4ROW_SSE2 m1, 0
1131 DCTDC_4ROW_SSE2 m2, 4
1132 DCTDC_4ROW_SSE2 m3, 8
1133 DCTDC_4ROW_SSE2 m4, 12
1134 mova m0, [pw_ppmmmmpp]
1137 pshufd m5, m1, q2200 ; -1 -1 +0 +0
1138 pshufd m1, m1, q0033 ; +0 +0 +1 +1
1140 pshufd m5, m2, q1023 ; -2 +2 -3 +3
1142 paddd m1, m5 ; a6 a2 a4 a0
1145 pshufd m5, m3, q2200
1146 pshufd m3, m3, q0033
1148 pshufd m5, m4, q1023
1150 paddd m3, m5 ; a7 a3 a5 a1
1153 pshufd m0, m0, q3120
1154 pshufd m1, m1, q3120
1155 punpcklqdq m2, m0, m1
1167 ;-----------------------------------------------------------------------------
1168 ; void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
1169 ;-----------------------------------------------------------------------------
1171 cglobal zigzag_scan_8x8_frame, 2,2,8
1173 movdqa xmm1, [r1+16]
1175 PALIGNR xmm1, xmm1, 14, xmm2
1178 movdqa xmm2, [r1+32]
1179 movdqa xmm3, [r1+48]
1180 PALIGNR xmm2, xmm2, 12, xmm4
1182 PALIGNR xmm3, xmm3, 10, xmm4
1185 punpckhwd xmm0, xmm1
1186 punpckhwd xmm2, xmm3
1205 movdqa xmm4, [r1+64]
1206 movdqa xmm5, [r1+80]
1207 movdqa xmm6, [r1+96]
1208 movdqa xmm7, [r1+112]
1216 PALIGNR xmm4, xmm4, 14, xmm3
1218 PALIGNR xmm5, xmm5, 12, xmm3
1220 PALIGNR xmm6, xmm6, 10, xmm3
1223 PALIGNR xmm7, xmm7, 8, xmm3
1227 punpcklqdq xmm7, xmm7
1231 punpckhwd xmm4, xmm5
1232 punpckhwd xmm6, xmm7
1247 pshufw mm4, mm4, q1230
1255 punpckhdq xmm3, xmm0, xmm2
1256 punpckldq xmm0, xmm2
1257 punpckhdq xmm7, xmm4, xmm6
1258 punpckldq xmm4, xmm6
1259 pshufhw xmm0, xmm0, q0123
1260 pshuflw xmm4, xmm4, q0123
1261 pshufhw xmm3, xmm3, q0123
1262 pshuflw xmm7, xmm7, q0123
1264 movlps [r0+2*10], xmm0
1265 movhps [r0+2*17], xmm0
1266 movlps [r0+2*21], xmm3
1267 movlps [r0+2*28], xmm4
1268 movhps [r0+2*32], xmm3
1269 movhps [r0+2*39], xmm4
1270 movlps [r0+2*43], xmm7
1271 movhps [r0+2*50], xmm7
1276 %if HIGH_BIT_DEPTH == 0
1283 ;-----------------------------------------------------------------------------
1284 ; void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[8][8] )
1285 ;-----------------------------------------------------------------------------
1287 ; 0 8 1 2 9 16 24 17
1288 ; 10 3 4 11 18 25 32 40
1289 ; 33 26 19 12 5 6 13 20
1290 ; 27 34 41 48 56 49 42 35
1291 ; 28 21 14 7 15 22 29 36
1292 ; 43 50 57 58 51 44 37 30
1293 ; 23 31 38 45 52 59 60 53
1294 ; 46 39 47 54 61 62 55 63
1295 %macro SCAN_8x8_FRAME 5
1296 cglobal zigzag_scan_8x8_frame, 2,2,8
1298 mova m1, [r1+ 8*SIZEOF_DCTCOEF]
1299 movu m2, [r1+14*SIZEOF_DCTCOEF]
1300 movu m3, [r1+21*SIZEOF_DCTCOEF]
1301 mova m4, [r1+28*SIZEOF_DCTCOEF]
1302 punpckl%4 m5, m0, m1
1304 punpckh%4 m6, m1, m0
1308 mova m7, [r1+52*SIZEOF_DCTCOEF]
1309 mova m0, [r1+60*SIZEOF_DCTCOEF]
1316 mova [r0+ 4*SIZEOF_DCTCOEF], m1
1317 mova [r0+ 8*SIZEOF_DCTCOEF], m6
1320 mova m1, [r1+32*SIZEOF_DCTCOEF]
1321 movu m5, [r1+39*SIZEOF_DCTCOEF]
1322 movu m2, [r1+46*SIZEOF_DCTCOEF]
1323 movu [r0+35*SIZEOF_DCTCOEF], m3
1324 movu [r0+47*SIZEOF_DCTCOEF], m4
1327 punpckh%3 m3, m5, m5
1330 mova [r0+52*SIZEOF_DCTCOEF], m6
1331 movu [r0+13*SIZEOF_DCTCOEF], m5
1332 movu m4, [r1+11*SIZEOF_DCTCOEF]
1333 movu m6, [r1+25*SIZEOF_DCTCOEF]
1337 mova m3, [r1+ 4*SIZEOF_DCTCOEF]
1338 movu m7, [r1+18*SIZEOF_DCTCOEF]
1340 movu [r0+25*SIZEOF_DCTCOEF], m1
1347 punpckh%3 m3, m6, m4
1348 punpckh%3 m7, m5, m1
1351 movu m4, [r1+35*SIZEOF_DCTCOEF]
1352 movu m1, [r1+49*SIZEOF_DCTCOEF]
1353 pshuf%5 m6, m6, q0123
1354 pshuf%5 m5, m5, q0123
1355 mova [r0+60*SIZEOF_DCTCOEF], m0
1356 mova [r0+56*SIZEOF_DCTCOEF], m2
1357 movu m0, [r1+42*SIZEOF_DCTCOEF]
1358 mova m2, [r1+56*SIZEOF_DCTCOEF]
1359 movu [r0+17*SIZEOF_DCTCOEF], m3
1360 mova [r0+32*SIZEOF_DCTCOEF], m7
1361 movu [r0+10*SIZEOF_DCTCOEF], m6
1362 movu [r0+21*SIZEOF_DCTCOEF], m5
1363 punpckh%4 m3, m0, m4
1364 punpckh%4 m7, m2, m1
1367 punpckl%3 m4, m2, m0
1368 punpckl%3 m1, m7, m3
1371 pshuf%5 m2, m2, q0123
1372 pshuf%5 m7, m7, q0123
1373 mova [r0+28*SIZEOF_DCTCOEF], m4
1374 movu [r0+43*SIZEOF_DCTCOEF], m1
1375 movu [r0+39*SIZEOF_DCTCOEF], m2
1376 movu [r0+50*SIZEOF_DCTCOEF], m7
1382 SCAN_8x8_FRAME 4 , dq, qdq, dq, d
1384 SCAN_8x8_FRAME 4 , dq, qdq, dq, d
1387 SCAN_8x8_FRAME 16, q , dq , wd, w
1390 ;-----------------------------------------------------------------------------
1391 ; void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[4][4] )
1392 ;-----------------------------------------------------------------------------
1394 cglobal zigzag_scan_4x4_frame, 2,2,6
1395 mova m0, [r1+ 0*SIZEOF_DCTCOEF]
1396 mova m1, [r1+ 4*SIZEOF_DCTCOEF]
1397 mova m2, [r1+ 8*SIZEOF_DCTCOEF]
1398 mova m3, [r1+12*SIZEOF_DCTCOEF]
1399 punpckl%4 m4, m0, m1
1402 mova [r0+ 0*SIZEOF_DCTCOEF], m4
1404 punpckh%4 m4, m2, m3
1407 punpckl%4 m5, m1, m3
1412 mova [r0+ 4*SIZEOF_DCTCOEF], m5
1413 mova [r0+ 8*SIZEOF_DCTCOEF], m1
1414 mova [r0+12*SIZEOF_DCTCOEF], m3
1420 SCAN_4x4 4, dq, qdq, dq
1422 SCAN_4x4 4, dq, qdq, dq
1425 SCAN_4x4 16, q , dq , wd
1427 ;-----------------------------------------------------------------------------
1428 ; void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
1429 ;-----------------------------------------------------------------------------
1430 %macro SCAN_4x4_FRAME 0
1431 cglobal zigzag_scan_4x4_frame, 2,2
1434 pshufb m1, [pb_scan4frameb]
1435 pshufb m0, [pb_scan4framea]
1451 cglobal zigzag_scan_4x4_frame, 2,2
1454 vpperm m2, m0, m1, [pb_scan4frame2a]
1455 vpperm m1, m0, m1, [pb_scan4frame2b]
1459 %endif ; !HIGH_BIT_DEPTH
1462 ;-----------------------------------------------------------------------------
1463 ; void zigzag_scan_4x4_field( int32_t level[16], int32_t dct[4][4] )
1464 ;-----------------------------------------------------------------------------
1466 cglobal zigzag_scan_4x4_field, 2,3
1468 pshufd m0, m4, q3102
1480 ;-----------------------------------------------------------------------------
1481 ; void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
1482 ;-----------------------------------------------------------------------------
1483 ; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2
1485 cglobal zigzag_scan_4x4_field, 2,3
1486 pshufw m0, [r1+4], q3102
1497 %endif ; HIGH_BIT_DEPTH
1499 ;-----------------------------------------------------------------------------
1500 ; void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
1501 ;-----------------------------------------------------------------------------
1504 ; 16 11 5 6 7 12 17 24
1505 ; 18 13 14 15 19 25 32 26
1506 ; 20 21 22 23 27 33 40 34
1507 ; 28 29 30 31 35 41 48 42
1508 ; 36 37 38 39 43 49 50 44
1509 ; 45 46 47 51 56 57 52 53
1510 ; 54 55 58 59 60 61 62 63
1513 cglobal zigzag_scan_8x8_field, 2,3,8
1514 mova m0, [r1+ 0*SIZEOF_DCTCOEF] ; 03 02 01 00
1515 mova m1, [r1+ 4*SIZEOF_DCTCOEF] ; 07 06 05 04
1516 mova m2, [r1+ 8*SIZEOF_DCTCOEF] ; 11 10 09 08
1517 pshuf%1 m3, m0, q3333 ; 03 03 03 03
1518 movd r2d, m2 ; 09 08
1519 pshuf%1 m2, m2, q0321 ; 08 11 10 09
1520 punpckl%2 m3, m1 ; 05 03 04 03
1521 pinsr%1 m0, r2d, 3 ; 08 02 01 00
1522 punpckl%2 m4, m2, m3 ; 04 10 03 09
1523 pshuf%1 m4, m4, q2310 ; 10 04 03 09
1524 mova [r0+ 0*SIZEOF_DCTCOEF], m0 ; 08 02 01 00
1525 mova [r0+ 4*SIZEOF_DCTCOEF], m4 ; 10 04 03 09
1526 mova m3, [r1+12*SIZEOF_DCTCOEF] ; 15 14 13 12
1527 mova m5, [r1+16*SIZEOF_DCTCOEF] ; 19 18 17 16
1528 punpckl%3 m6, m5 ; 17 16 XX XX
1529 psrl%4 m1, %5 ; XX 07 06 05
1530 punpckh%2 m6, m2 ; 08 17 11 16
1531 punpckl%3 m6, m1 ; 06 05 11 16
1532 mova [r0+ 8*SIZEOF_DCTCOEF], m6 ; 06 05 11 16
1533 psrl%4 m1, %5 ; XX XX 07 06
1534 punpckl%2 m1, m5 ; 17 07 16 06
1535 mova m0, [r1+20*SIZEOF_DCTCOEF] ; 23 22 21 20
1536 mova m2, [r1+24*SIZEOF_DCTCOEF] ; 27 26 25 24
1537 punpckh%3 m1, m1 ; 17 07 17 07
1538 punpckl%2 m6, m3, m2 ; 25 13 24 12
1540 mova [r0+24*SIZEOF_DCTCOEF], m0 ; 23 22 21 20
1541 punpckl%2 m1, m6 ; 24 17 12 07
1542 mova [r0+12*SIZEOF_DCTCOEF], m1
1543 pinsr%1 m3, r2d, 0 ; 15 14 13 18
1544 mova [r0+16*SIZEOF_DCTCOEF], m3 ; 15 14 13 18
1545 mova m7, [r1+28*SIZEOF_DCTCOEF]
1546 mova m0, [r1+32*SIZEOF_DCTCOEF] ; 35 34 33 32
1547 psrl%4 m5, %5*3 ; XX XX XX 19
1548 pshuf%1 m1, m2, q3321 ; 27 27 26 25
1549 punpckl%2 m5, m0 ; 33 XX 32 19
1550 psrl%4 m2, %5*3 ; XX XX XX 27
1551 punpckl%2 m5, m1 ; 26 32 25 19
1552 mova [r0+32*SIZEOF_DCTCOEF], m7
1553 mova [r0+20*SIZEOF_DCTCOEF], m5 ; 26 32 25 19
1554 mova m7, [r1+36*SIZEOF_DCTCOEF]
1555 mova m1, [r1+40*SIZEOF_DCTCOEF] ; 43 42 41 40
1556 pshuf%1 m3, m0, q3321 ; 35 35 34 33
1557 punpckl%2 m2, m1 ; 41 XX 40 27
1558 mova [r0+40*SIZEOF_DCTCOEF], m7
1559 punpckl%2 m2, m3 ; 34 40 33 27
1560 mova [r0+28*SIZEOF_DCTCOEF], m2
1561 mova m7, [r1+44*SIZEOF_DCTCOEF] ; 47 46 45 44
1562 mova m2, [r1+48*SIZEOF_DCTCOEF] ; 51 50 49 48
1563 psrl%4 m0, %5*3 ; XX XX XX 35
1564 punpckl%2 m0, m2 ; 49 XX 48 35
1565 pshuf%1 m3, m1, q3321 ; 43 43 42 41
1566 punpckl%2 m0, m3 ; 42 48 41 35
1567 mova [r0+36*SIZEOF_DCTCOEF], m0
1568 pextr%1 r2d, m2, 3 ; 51
1569 psrl%4 m1, %5*3 ; XX XX XX 43
1570 punpckl%2 m1, m7 ; 45 XX 44 43
1571 psrl%4 m2, %5 ; XX 51 50 49
1572 punpckl%2 m1, m2 ; 50 44 49 43
1573 pshuf%1 m1, m1, q2310 ; 44 50 49 43
1574 mova [r0+44*SIZEOF_DCTCOEF], m1
1575 psrl%4 m7, %5 ; XX 47 46 45
1576 pinsr%1 m7, r2d, 3 ; 51 47 46 45
1577 mova [r0+48*SIZEOF_DCTCOEF], m7
1578 mova m0, [r1+56*SIZEOF_DCTCOEF] ; 59 58 57 56
1579 mova m1, [r1+52*SIZEOF_DCTCOEF] ; 55 54 53 52
1580 mova m7, [r1+60*SIZEOF_DCTCOEF]
1581 punpckl%3 m2, m0, m1 ; 53 52 57 56
1582 punpckh%3 m1, m0 ; 59 58 55 54
1583 mova [r0+52*SIZEOF_DCTCOEF], m2
1584 mova [r0+56*SIZEOF_DCTCOEF], m1
1585 mova [r0+60*SIZEOF_DCTCOEF], m7
1590 SCAN_8x8 d, dq, qdq, dq, 4
1592 SCAN_8x8 d, dq, qdq, dq, 4
1595 SCAN_8x8 w, wd, dq , q , 16
1598 ;-----------------------------------------------------------------------------
1599 ; void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *src, uint8_t *dst )
1600 ;-----------------------------------------------------------------------------
; Subtract the predicted block (dst) from the source, zigzag-scan the
; residual into level[], and write the source pixels back to dst.
; %1 = "" or "ac" (ac variant clears the DC coefficient via pb_subacmask)
; %2 = "frame" or "field" (selects the pb_sub4frame/pb_sub4field scan mask)
1601 %macro ZIGZAG_SUB_4x4 2
; NOTE(review): the %if/%else selecting between the two cglobal declarations
; (4 vs 3 GPR args) is elided from this excerpt — TODO confirm the guard.
1603 cglobal zigzag_sub_4x4%1_%2, 4,4,8
1605 cglobal zigzag_sub_4x4%1_%2, 3,3,8
; Load the 4x4 source (r1, FENC stride) and prediction (r2, FDEC stride),
; one row per register.
1607 movd m0, [r1+0*FENC_STRIDE]
1608 movd m1, [r1+1*FENC_STRIDE]
1609 movd m2, [r1+2*FENC_STRIDE]
1610 movd m3, [r1+3*FENC_STRIDE]
1611 movd m4, [r2+0*FDEC_STRIDE]
1612 movd m5, [r2+1*FDEC_STRIDE]
1613 movd m6, [r2+2*FDEC_STRIDE]
1614 movd m7, [r2+3*FDEC_STRIDE]
; Copy source rows into the decoded-pixel buffer (dst <- src).
1615 movd [r2+0*FDEC_STRIDE], m0
1616 movd [r2+1*FDEC_STRIDE], m1
1617 movd [r2+2*FDEC_STRIDE], m2
1618 movd [r2+3*FDEC_STRIDE], m3
; (lines elided) m7 = byte shuffle mask implementing the %2 scan order.
1625 mova m7, [pb_sub4%2]
1629 punpckhbw m1, m0, m4
; pand with pb_subacmask (dw 0,-1,..,-1) zeroes the DC slot — presumably
; only assembled for the "ac" variant; the guarding %if is elided here.
1635 pand m0, [pb_subacmask]
; Instantiations: 8-bit depth only.  The INIT_* lines selecting the SIMD
; flavour for each group of four are elided from this excerpt.
1651 %if HIGH_BIT_DEPTH == 0
1653 ZIGZAG_SUB_4x4 , frame
1654 ZIGZAG_SUB_4x4 ac, frame
1655 ZIGZAG_SUB_4x4 , field
1656 ZIGZAG_SUB_4x4 ac, field
1658 ZIGZAG_SUB_4x4 , frame
1659 ZIGZAG_SUB_4x4 ac, frame
1660 ZIGZAG_SUB_4x4 , field
1661 ZIGZAG_SUB_4x4 ac, field
1662 %endif ; !HIGH_BIT_DEPTH
1664 %if HIGH_BIT_DEPTH == 0
; void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[64] )
; NOTE(review): built on vpperm, AMD XOP's two-source byte permute, so this
; body presumably sits in an INIT_XMM xop section — the INIT line is elided,
; as are the loads/stores between the permutes (note the line-number gaps).
1666 cglobal zigzag_scan_8x8_field, 2,3,7
; r2 anchors the permute-mask tables; off() addresses each mask relative to
; pb_scan8field1 so every mask is a single memory operand.
1667 lea r2, [pb_scan8field1]
1668 %define off(m) (r2+m-pb_scan8field1)
1671 vpperm m5, m0, m1, [off(pb_scan8field1)]
1673 vpperm m0, m0, m1, [off(pb_scan8field2a)]
1676 vpperm m5, m2, m3, [off(pb_scan8field2b)]
; Masks reused across steps are hoisted into m4/m5 (see comments below).
1679 mova m4, [off(pb_scan8field3b)]
1680 vpperm m1, m1, m2, [off(pb_scan8field3a)]
1682 vpperm m5, m3, m0, m4
1685 ; 4b, 5b are the same as pb_scan8field3b.
1686 ; 5a is the same as pb_scan8field4a.
1687 mova m5, [off(pb_scan8field4a)]
1688 vpperm m2, m2, m3, m5
1690 vpperm m6, m0, m1, m4
1693 vpperm m3, m3, m0, m5
1695 vpperm m5, m1, m2, m4
1698 vpperm m5, m0, m1, [off(pb_scan8field6)]
1700 vpperm m5, m1, m2, [off(pb_scan8field7)]
; void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[64] )
; NOTE(review): XOP (vpperm) frame-scan counterpart of the field scan above.
; Loads/stores between the permutes are elided (line-number gaps); the
; right-hand comments list the scan indices produced in each output vector.
; Strategy: a first pass of "t"-masks (pb_scan8framet*) gathers coefficients
; into intermediate groupings, then pb_scan8frame1..8 emit the final rows.
1712 cglobal zigzag_scan_8x8_frame, 2,3,8
; r2 anchors the mask tables; off() addresses each mask relative to it.
1713 lea r2, [pb_scan8frame1]
1714 %define off(m) (r2+m-pb_scan8frame1)
1717 vpperm m7, m7, m3, [off(pb_scan8framet1)] ; 8 9 14 15 16 17 21 22
1719 vpperm m0, m3, m2, [off(pb_scan8framet2)] ; 18 19 20 23 25 31 26 30
1722 vpperm m3, m4, m1, [off(pb_scan8framet3)] ; 32 33 37 38 40 43 44 45
1723 vpperm m6, m0, m3, [off(pb_scan8framet4)] ; 18 23 25 31 32 38 40 45
1724 vpperm m5, m0, m3, [off(pb_scan8framet5)] ; 19 20 26 30 33 37 43 44
1725 vpperm m3, m2, m4, [off(pb_scan8framet6)] ; 24 27 28 29 34 35 36 39
1727 vpperm m4, m1, m4, [off(pb_scan8framet7)] ; 41 42 46 47 48 49 54 55
1729 vpperm m2, m1, m3, [off(pb_scan8framet8)] ; 0 1 2 7 24 28 29 36
; Second pass: emit the scan-ordered output rows.
1730 vpperm m1, m2, m7, [off(pb_scan8frame1)] ; 0 8 1 2 9 16 24 17
1733 movhps m0, [r1+ 20] ; 3 4 5 6 10 11 12 13
1734 vpperm m1, m0, m6, [off(pb_scan8frame2)] ; 10 3 4 11 18 25 32 40
1736 vpperm m1, m0, m5, [off(pb_scan8frame3)] ; 33 26 19 12 5 6 13 20
1738 vpperm m1, m2, m7, [off(pb_scan8frame5)] ; 28 21 14 7 15 22 29 36
1741 movhps m0, [r1+114] ; 50 51 52 53 57 58 59 60
1742 vpperm m1, m5, m0, [off(pb_scan8frame6)] ; 43 50 57 58 51 44 37 30
1744 vpperm m1, m6, m0, [off(pb_scan8frame7)] ; 23 31 38 45 52 59 60 53
1747 vpperm m0, m3, m1, [off(pb_scan8framet9)] ; 27 34 35 39 56 61 62 63
1748 vpperm m1, m0, m4, [off(pb_scan8frame4)] ; 27 34 41 48 56 49 42 35
1750 vpperm m1, m0, m4, [off(pb_scan8frame8)] ; 46 39 47 54 61 62 55 63
1756 ;-----------------------------------------------------------------------------
1757 ; void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
1758 ;-----------------------------------------------------------------------------
; NOTE(review): body of a helper macro whose %macro header is elided.
; Loads four consecutive rows of one sub-block from src (r1), transposes
; them, and scatters the result to dst (r0) with a 32-coefficient stride,
; producing the CAVLC sub-block interleaving.
; %1 = sub-block offset, %2 = TRANSPOSE4x4 variant suffix — TODO confirm
; parameter roles against the elided header.
1760 mova m0, [r1+(%1*4+ 0)*SIZEOF_PIXEL]
1761 mova m1, [r1+(%1*4+ 8)*SIZEOF_PIXEL]
1762 mova m2, [r1+(%1*4+16)*SIZEOF_PIXEL]
1763 mova m3, [r1+(%1*4+24)*SIZEOF_PIXEL]
1764 TRANSPOSE4x4%2 0,1,2,3,4
1765 mova [r0+(%1+ 0)*SIZEOF_PIXEL], m0
1766 mova [r0+(%1+32)*SIZEOF_PIXEL], m1
1767 mova [r0+(%1+64)*SIZEOF_PIXEL], m2
1768 mova [r0+(%1+96)*SIZEOF_PIXEL], m3
; NOTE(review): high-bit-depth zigzag_interleave_8x8_cavlc — the macro body
; and %endmacro are elided from this excerpt; only the header survives.
1775 %macro ZIGZAG_8x8_CAVLC 1
1776 cglobal zigzag_interleave_8x8_cavlc, 3,3,8
; Interleave two 4x8 word strips: the repeated SBUTTERFLY wd pairs are the
; classic unpack-twice transpose on (m0,m1) and (m4,m5); results are
; scattered to dst with a 32-byte row stride.  Some loads/stores are elided
; from this view (line-number gaps), as is the %endmacro.
; %1 = byte offset of the strip within src/dst.
1807 %macro INTERLEAVE_XMM 1
1808 mova m0, [r1+%1*4+ 0]
1809 mova m1, [r1+%1*4+16]
1810 mova m4, [r1+%1*4+32]
1811 mova m5, [r1+%1*4+48]
1812 SBUTTERFLY wd, 0, 1, 6
1813 SBUTTERFLY wd, 4, 5, 7
1814 SBUTTERFLY wd, 0, 1, 6
1815 SBUTTERFLY wd, 4, 5, 7
1817 movhps [r0+%1+ 32], m0
1818 movh [r0+%1+ 64], m1
1819 movhps [r0+%1+ 96], m1
1821 movhps [r0+%1+ 40], m4
1822 movh [r0+%1+ 72], m5
1823 movhps [r0+%1+104], m5
1830 %if HIGH_BIT_DEPTH == 0
; 8-bit zigzag_interleave_8x8_cavlc variants.  Both bodies are heavily
; elided in this excerpt (line-number gaps); %endmacro/RET are not visible.
1831 %macro ZIGZAG_8x8_CAVLC 0
1832 cglobal zigzag_interleave_8x8_cavlc, 3,3,8
; vextracti128/vpermq below are AVX2 instructions, so this second version is
; presumably inside an INIT_YMM avx2 section — the INIT line is elided.
1854 cglobal zigzag_interleave_8x8_cavlc, 3,3,6
1859 mova m5, [deinterleave_shufd]
; Unpack-twice word transpose across (m0,m1) and (m2,m3).
1860 SBUTTERFLY wd, 0, 1, 4
1861 SBUTTERFLY wd, 2, 3, 4
1862 SBUTTERFLY wd, 0, 1, 4
1863 SBUTTERFLY wd, 2, 3, 4
; Store the upper 128-bit lanes of each ymm to the odd output rows.
1870 vextracti128 [r0+ 32], m0, 1
1871 vextracti128 [r0+ 48], m2, 1
1874 vextracti128 [r0+ 96], m1, 1
1875 vextracti128 [r0+112], m3, 1
; Saturating word->byte packs fold each sub-block's coefficients together
; for the nnz flags; vpermq repairs the lane order after the in-lane packs.
1877 packsswb m0, m2 ; nnz0, nnz1
1878 packsswb m1, m3 ; nnz2, nnz3
1879 packsswb m0, m1 ; {nnz0,nnz2}, {nnz1,nnz3}
1880 vpermq m0, m0, q3120 ; {nnz0,nnz1}, {nnz2,nnz3}
1890 %endif ; !HIGH_BIT_DEPTH