1 ;*****************************************************************************
2 ;* dct-a.asm: x86 transform and zigzag
3 ;*****************************************************************************
4 ;* Copyright (C) 2003-2011 x264 project
6 ;* Authors: Holger Lubitz <holger@lubitz.org>
7 ;* Loren Merritt <lorenm@u.washington.edu>
8 ;* Laurent Aimar <fenrir@via.ecp.fr>
9 ;*          Min Chen <chenm001@163.com>
10 ;* Fiona Glaser <fiona@x264.com>
12 ;* This program is free software; you can redistribute it and/or modify
13 ;* it under the terms of the GNU General Public License as published by
14 ;* the Free Software Foundation; either version 2 of the License, or
15 ;* (at your option) any later version.
17 ;* This program is distributed in the hope that it will be useful,
18 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ;* GNU General Public License for more details.
22 ;* You should have received a copy of the GNU General Public License
23 ;* along with this program; if not, write to the Free Software
24 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
26 ;* This program is also available under a commercial proprietary license.
27 ;* For more information, contact us at licensing@x264.com.
28 ;*****************************************************************************
31 %include "x86util.asm"
33 %macro SHUFFLE_16BIT 8
; NOTE(review): the SHUFFLE_16BIT body is not visible in this excerpt; from its
; uses just below it appears to expand 8 word indices into a 16-entry byte
; shuffle mask for pshufb -- confirm against the full file / x86util.asm.

; Constant tables used by the zigzag and idct_dc code below.
; pb_sub4frame/pb_sub4field: pshufb masks reordering 16 bytes into the 4x4
; frame/field zigzag order (loaded in ZIGZAG_SUB_4x4 below).
42 pb_sub4frame:   db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
43 pb_sub4field:   db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15
; Word mask with the first (DC) word zeroed; pand'ed in the "ac" zigzag variant.
44 pb_subacmask:   dw 0,-1,-1,-1,-1,-1,-1,-1
45 pb_scan4framea: SHUFFLE_16BIT 6,3,7,0,4,1,2,5
46 pb_scan4frameb: SHUFFLE_16BIT 0,4,1,2,5,6,3,7
; Each of bytes 0..3 (resp. 4..7) replicated 4x: broadcasts one 16-bit DC per
; 4-pixel group when used as a pshufb mask (see add*_idct_dc_ssse3/avx below).
47 pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
48 pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
; (fragment) Two SUMSUB_BADC passes form a 4-element butterfly; presumably the
; body of the WALSH4_1D macro invoked below -- body lines missing in this excerpt.
63     SUMSUB_BADC %1, %5, %4, %3, %2, %6
64     SUMSUB_BADC %1, %5, %3, %4, %2, %6
68 %macro SUMSUB_17BIT 4 ; a, b, tmp, 0x8000
89 ;-----------------------------------------------------------------------------
90 ; void dct4x4dc( dctcoef d[4][4] )
91 ;-----------------------------------------------------------------------------
; High-bit-depth variant: 2-D 4x4 Hadamard of the DC coefficients, done as
; 1-D transform on rows, transpose, 1-D transform on columns (dword lanes).
93 cglobal dct4x4dc_%1, 1,1,5
98     WALSH4_1D  d, 0,1,2,3,4
99     TRANSPOSE4x4D 0,1,2,3,4
101     WALSH4_1D  d, 0,1,2,3,4
111 %endmacro ; DCT4x4_DC
; 8-bit-depth variant: same row/transpose/column Hadamard structure, on words.
120 cglobal dct4x4dc_mmx, 1,1
125     movq   m7, [pw_8000] ; convert to unsigned and back, so that pavgw works
126     WALSH4_1D  w, 0,1,2,3,4
127     TRANSPOSE4x4W 0,1,2,3,4
128     SUMSUB_BADC w, 1, 0, 3, 2, 4
138 %endif ; HIGH_BIT_DEPTH
140 %ifdef HIGH_BIT_DEPTH
141 ;-----------------------------------------------------------------------------
142 ; void idct4x4dc( int32_t d[4][4] )
143 ;-----------------------------------------------------------------------------
; Inverse DC Hadamard; the Hadamard transform is its own inverse up to scaling,
; hence the same WALSH4_1D/transpose/WALSH4_1D pattern as the forward version.
145 cglobal idct4x4dc_%1, 1,1
150     WALSH4_1D  d,0,1,2,3,4
151     TRANSPOSE4x4D 0,1,2,3,4
152     WALSH4_1D  d,0,1,2,3,4
158 %endmacro ; IDCT4x4DC
167 ;-----------------------------------------------------------------------------
168 ; void idct4x4dc( int16_t d[4][4] )
169 ;-----------------------------------------------------------------------------
; 8-bit-depth (16-bit coefficient) variant.
170 cglobal idct4x4dc_mmx, 1,1
175     WALSH4_1D  w,0,1,2,3,4
176     TRANSPOSE4x4W 0,1,2,3,4
177     WALSH4_1D  w,0,1,2,3,4
183 %endif ; HIGH_BIT_DEPTH
186 %ifdef HIGH_BIT_DEPTH
187 ;-----------------------------------------------------------------------------
188 ; void sub4x4_dct( dctcoef dct[4][4], pixel *pix1, pixel *pix2 )
189 ;-----------------------------------------------------------------------------
; Loads pixel differences (pix1 - pix2) row by row; note rows are loaded in
; 0,3,1,2 register order, which matches the butterfly operand order below.
190 cglobal sub4x4_dct_mmx, 3,3
192     LOAD_DIFF  m0, m4, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
193     LOAD_DIFF  m3, m4, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
194     LOAD_DIFF  m1, m4, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
195     LOAD_DIFF  m2, m4, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
197     TRANSPOSE4x4W 0,1,2,3,4
199     SUMSUB_BADC w, 3, 0, 2, 1
; DCT_UNPACK widens the 16-bit sums/differences before the 32-bit stores below
; (high-bit-depth coefficients are 32-bit) -- TODO confirm against x86util.asm.
201     DCT_UNPACK m2, m4, m5
202     DCT_UNPACK m3, m6, m7
203     mova  [r0+ 0], m2 ; s03 + s12
205     mova  [r0+32], m3 ; s03 - s12
208     DCT_UNPACK m0, m2, m4
209     DCT_UNPACK m1, m3, m5
210     SUMSUB2_AB  d, 0, 1, 4
211     SUMSUB2_AB  d, 2, 3, 5
212     mova  [r0+16], m0 ; d03*2 + d12
214     mova  [r0+48], m4 ; d03 - 2*d12
; 8-bit-depth variant (template; %1 selects the instruction set).
220 cglobal sub4x4_dct_%1, 3,3
223     LOAD_DIFF  m0, m4, m5, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
224     LOAD_DIFF  m3, m4, m5, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
225     LOAD_DIFF  m1, m4, m5, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
226     LOAD_DIFF  m2, m4, m5, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
; SSSE3 path: one combined 8x4 difference load instead of four LOAD_DIFFs.
229     LOAD_DIFF8x4_SSSE3 0, 3, 1, 2, 4, 5, r1, r2
232     TRANSPOSE4x4W 0,1,2,3,4
243 %endif ; HIGH_BIT_DEPTH
245 %ifdef HIGH_BIT_DEPTH
246 ;-----------------------------------------------------------------------------
247 ; void add4x4_idct( pixel *p_dst, dctcoef dct[4][4] )
248 ;-----------------------------------------------------------------------------
; Adds residual to prediction and clamps each pixel to [0, pw_pixel_max].
249 %macro STORE_DIFFx2 6
256     CLIPW     %1, %4, [pw_pixel_max]
262 cglobal add4x4_idct_%1, 2,2,6
; Bias r0 so the stores below can address rows with small +/- offsets.
263     add   r0, 4*FDEC_STRIDE
; Standard 1-D idct on rows, transpose, 1-D idct on columns (dword lanes).
269     IDCT4_1D d,0,1,2,3,4,5
270     TRANSPOSE4x4D 0,1,2,3,4
272     IDCT4_1D d,0,1,2,3,4,5
274     STORE_DIFFx2 m0, m1, m4, m5, [r0-4*FDEC_STRIDE], [r0-2*FDEC_STRIDE]
275     STORE_DIFFx2 m2, m3, m4, m5, [r0+0*FDEC_STRIDE], [r0+2*FDEC_STRIDE]
284 %else ; !HIGH_BIT_DEPTH
; 8-bit-depth MMX variant: idct rows, transpose, idct columns, then add+clamp
; each row into the destination.
286 cglobal add4x4_idct_mmx, 2,2
293     IDCT4_1D w,0,1,2,3,4,5
294     TRANSPOSE4x4W 0,1,2,3,4
296     IDCT4_1D w,0,1,2,3,4,5
297     STORE_DIFF  m0, m4, m7, [r0+0*FDEC_STRIDE]
298     STORE_DIFF  m1, m4, m7, [r0+1*FDEC_STRIDE]
299     STORE_DIFF  m2, m4, m7, [r0+2*FDEC_STRIDE]
300     STORE_DIFF  m3, m4, m7, [r0+3*FDEC_STRIDE]
; SSE template: processes two rows per xmm register (row1/row0 packed in one
; register), so each IDCT4_1D stage is expressed as paired shifts/adds below.
304 cglobal add4x4_idct_%1, 2,2,6
305     mova  m1, [r1+0x00]     ; row1/row0
306     mova  m3, [r1+0x10]     ; row3/row2
307     psraw m0, m1, 1         ; row1>>1/...
308     psraw m2, m3, 1         ; row3>>1/...
309     movsd m0, m1            ; row1>>1/row0
310     movsd m2, m3            ; row3>>1/row2
311     psubw m0, m3            ; row1>>1-row3/row0-2
312     paddw m2, m1            ; row3>>1+row1/row0+2
313     SBUTTERFLY2 wd, 0, 2, 1
315     pshuflw m1, m2, 10110001b
316     pshufhw m2, m2, 10110001b
322     paddw m1, m0            ; row1/row0 corrected
323     psraw m0, 1             ; row1>>1/...
324     psraw m3, m2, 1         ; row3>>1/...
325     movsd m0, m1            ; row1>>1/row0
326     movsd m3, m2            ; row3>>1/row2
327     psubw m0, m2            ; row1>>1-row3/row0-2
328     paddw m3, m1            ; row3>>1+row1/row0+2
329     SBUTTERFLY2 qdq, 0, 3, 1
; Load the four 4-pixel prediction rows, add the residual, repack and store.
332     movd  m4, [r0+FDEC_STRIDE*0]
333     movd  m1, [r0+FDEC_STRIDE*1]
334     movd  m2, [r0+FDEC_STRIDE*2]
335     movd  m5, [r0+FDEC_STRIDE*3]
336     punpckldq m1, m4            ; row0/row1
338     punpckldq m2, m5            ; row3/row2
345     packuswb m0, m3             ; row0/row1/row3/row2
; Note the swapped dword order from the packing above: dword 3 is row0,
; dword 2 row1, dword 0 row2, dword 1 row3.
346     pextrd [r0+FDEC_STRIDE*0], m0, 3
347     pextrd [r0+FDEC_STRIDE*1], m0, 2
348     movd   [r0+FDEC_STRIDE*2], m0
349     pextrd [r0+FDEC_STRIDE*3], m0, 1
357 %endif ; HIGH_BIT_DEPTH
360 ;-----------------------------------------------------------------------------
361 ; void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
362 ;-----------------------------------------------------------------------------
; (fragment of the SUB_NxN_DCT macro) Builds an NxN dct from four calls to a
; smaller-block dct: between calls, r1/r2 are advanced by the %4/%5/%6 offsets
; to walk the sub-blocks; r0 advancement lines are not visible in this excerpt.
364 cglobal %1, 3,3,11*(mmsize/16)
365 %ifndef HIGH_BIT_DEPTH
369     add  r2, 4*FDEC_STRIDE
372 %endif ; !HIGH_BIT_DEPTH
379     add  r1, %4-%5-%6*FENC_STRIDE
380     add  r2, %4-%5-%6*FDEC_STRIDE
383     add  r1, (%4-%6)*FENC_STRIDE-%5-%4
384     add  r2, (%4-%6)*FDEC_STRIDE-%5-%4
387     add  r1, %4-%5-%6*FENC_STRIDE
388     add  r2, %4-%5-%6*FDEC_STRIDE
398 ;-----------------------------------------------------------------------------
399 ; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
400 ;-----------------------------------------------------------------------------
; Mirror of SUB_NxN_DCT: composes an NxN idct from four smaller-block idcts,
; stepping r0 across the destination sub-blocks.
401 %macro ADD_NxN_IDCT 6-7
402 %ifdef HIGH_BIT_DEPTH
403 cglobal %1, 2,2,6*(mmsize/16)
405 cglobal %1, 2,2,11*(mmsize/16)
409     add  r0, 4*FDEC_STRIDE
416     add  r0, %4-%5-%6*FDEC_STRIDE
419     add  r0, (%4-%6)*FDEC_STRIDE-%5-%4
422     add  r0, %4-%5-%6*FDEC_STRIDE
; Instantiations: larger transforms are built by tail-calling the smaller
; transform's .skip_prologue entry point (declared via cextern when the
; smaller function lives in another object file).
433 %ifdef HIGH_BIT_DEPTH
435 SUB_NxN_DCT  sub8x8_dct_mmx,    sub4x4_dct_mmx.skip_prologue,  64,  8, 0, 0
436 SUB_NxN_DCT  sub16x16_dct_mmx,  sub8x8_dct_mmx.skip_prologue,  64, 16, 8, 8
438 ADD_NxN_IDCT add8x8_idct_sse2,  add4x4_idct_sse2.skip_prologue,64,  8, 0, 0
439 ADD_NxN_IDCT add16x16_idct_sse2,add8x8_idct_sse2.skip_prologue,64, 16, 8, 8
441 ADD_NxN_IDCT add8x8_idct_avx,   add4x4_idct_avx.skip_prologue, 64,  8, 0, 0
442 ADD_NxN_IDCT add16x16_idct_avx ,add8x8_idct_avx.skip_prologue, 64, 16, 8, 8
443 %else ; !HIGH_BIT_DEPTH
445 SUB_NxN_DCT  sub8x8_dct_mmx,    sub4x4_dct_mmx.skip_prologue,  32, 4, 0, 0
446 ADD_NxN_IDCT add8x8_idct_mmx,   add4x4_idct_mmx.skip_prologue, 32, 4, 0, 0
447 SUB_NxN_DCT  sub16x16_dct_mmx,  sub8x8_dct_mmx.skip_prologue,  32, 8, 4, 4
448 ADD_NxN_IDCT add16x16_idct_mmx, add8x8_idct_mmx.skip_prologue, 32, 8, 4, 4
450 cextern sub8x8_dct8_mmx.skip_prologue
451 cextern add8x8_idct8_mmx.skip_prologue
452 SUB_NxN_DCT  sub16x16_dct8_mmx,  sub8x8_dct8_mmx.skip_prologue,  128, 8, 0, 0
453 ADD_NxN_IDCT add16x16_idct8_mmx, add8x8_idct8_mmx.skip_prologue, 128, 8, 0, 0
457 cextern sub8x8_dct_sse2.skip_prologue
458 cextern sub8x8_dct_ssse3.skip_prologue
459 cextern sub8x8_dct_avx.skip_prologue
460 SUB_NxN_DCT  sub16x16_dct_sse2,  sub8x8_dct_sse2.skip_prologue,  128, 8, 0, 0
461 SUB_NxN_DCT  sub16x16_dct_ssse3, sub8x8_dct_ssse3.skip_prologue, 128, 8, 0, 0
462 SUB_NxN_DCT  sub16x16_dct_avx,   sub8x8_dct_avx.skip_prologue,   128, 8, 0, 0
464 cextern add8x8_idct_sse2.skip_prologue
465 cextern add8x8_idct_avx.skip_prologue
466 ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2.skip_prologue, 128, 8, 0, 0
467 ADD_NxN_IDCT add16x16_idct_avx,  add8x8_idct_avx.skip_prologue,  128, 8, 0, 0
469 cextern add8x8_idct8_sse2.skip_prologue
470 cextern add8x8_idct8_avx.skip_prologue
471 ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2.skip_prologue, 128, 8, 0, 0
472 ADD_NxN_IDCT add16x16_idct8_avx,  add8x8_idct8_avx.skip_prologue,  128, 8, 0, 0
474 cextern sub8x8_dct8_sse2.skip_prologue
475 cextern sub8x8_dct8_ssse3.skip_prologue
476 cextern sub8x8_dct8_avx.skip_prologue
477 SUB_NxN_DCT  sub16x16_dct8_sse2,  sub8x8_dct8_sse2.skip_prologue,  128, 8, 0, 0
478 SUB_NxN_DCT  sub16x16_dct8_ssse3, sub8x8_dct8_ssse3.skip_prologue, 128, 8, 0, 0
479 SUB_NxN_DCT  sub16x16_dct8_avx,   sub8x8_dct8_avx.skip_prologue,   128, 8, 0, 0
480 %endif ; HIGH_BIT_DEPTH
482 %ifdef HIGH_BIT_DEPTH
484 ;-----------------------------------------------------------------------------
485 ; void add8x8_idct_dc( pixel *p_dst, dctcoef *dct2x2 )
486 ;-----------------------------------------------------------------------------
; ADD_DC (high-depth): add a broadcast DC value %2 to 4 rows of 8 pixels at %1,
; with saturating word adds; clamping lines are not visible in this excerpt.
488     mova   m0, [%1+FDEC_STRIDEB*0] ; 8pixels
489     mova   m1, [%1+FDEC_STRIDEB*1]
490     mova   m2, [%1+FDEC_STRIDEB*2]
494     paddsw %2, [%1+FDEC_STRIDEB*3]
499     mova   [%1+FDEC_STRIDEB*0], m0
500     mova   [%1+FDEC_STRIDEB*1], m1
501     mova   [%1+FDEC_STRIDEB*2], m2
502     mova   [%1+FDEC_STRIDEB*3], %2
506 cglobal add8x8_idct_dc_%1, 2,2,7
507     mova  m6, [pw_pixel_max]
; Descale the four DCs (>>6) then broadcast each DC across a quadword so each
; covers one 4x4 quadrant's row of pixels.
511     psrad     m3, 6             ; dc0   0 dc1   0 dc2   0 dc3   0
512     pshuflw   m4, m3, 10100000b ; dc0 dc0 dc1 dc1   _   _   _   _
513     pshufhw   m3, m3, 10100000b ;   _   _   _   _ dc2 dc2 dc3 dc3
514     pshufd    m4, m4, 01010000b ; dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
515     pshufd    m3, m3, 11111010b ; dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
516     ADD_DC r0+FDEC_STRIDEB*0, m4
517     ADD_DC r0+FDEC_STRIDEB*4, m3
520 cglobal add16x16_idct_dc_%1, 2,3,8
522     mova      m6, [pw_pixel_max]
528     psrad     m3, 6             ; dc0   0 dc1   0 dc2   0 dc3   0
529     pshuflw   m4, m3, 10100000b ; dc0 dc0 dc1 dc1   _   _   _   _
530     pshufhw   m3, m3, 10100000b ;   _   _   _   _ dc2 dc2 dc3 dc3
531     pshufd    m4, m4, 01010000b ; dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
532     pshufd    m3, m3, 11111010b ; dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
533     ADD_DC r0+FDEC_STRIDEB*0, m4
534     ADD_DC r0+SIZEOF_PIXEL*8, m3
536     add r0, 4*FDEC_STRIDEB
540 %endmacro ; ADD_IDCT_DC
547 %else ;!HIGH_BIT_DEPTH
; ADD_DC (8-bit depth): saturating unsigned byte adds of a packed DC into four
; destination rows; negative-DC handling is presumably in the missing lines
; (paddusb alone cannot subtract) -- TODO confirm against the full file.
549     movq  mm4, [%3+FDEC_STRIDE*0]
550     movq  mm5, [%3+FDEC_STRIDE*1]
551     movq  mm6, [%3+FDEC_STRIDE*2]
555     paddusb %1, [%3+FDEC_STRIDE*3]
560     movq  [%3+FDEC_STRIDE*0], mm4
561     movq  [%3+FDEC_STRIDE*1], mm5
562     movq  [%3+FDEC_STRIDE*2], mm6
563     movq  [%3+FDEC_STRIDE*3], %1
566 cglobal add8x8_idct_dc_mmx, 2,2
569     add       r0, FDEC_STRIDE*4
; 0xFA = 11 11 10 10b: duplicates the two high words (DC pairs) of each source.
577     pshufw    mm2, mm0, 0xFA
578     pshufw    mm3, mm1, 0xFA
581     ADD_DC    mm0, mm1, r0-FDEC_STRIDE*4
585 cglobal add8x8_idct_dc_ssse3, 2,2
588     add       r0, FDEC_STRIDE*4
; pshufb mask broadcasting each DC byte across its 4-pixel group.
592     movdqa    xmm5, [pb_idctdc_unpack]
597     movq      xmm2, [r0+FDEC_STRIDE*-4]
598     movq      xmm3, [r0+FDEC_STRIDE*-3]
599     movq      xmm4, [r0+FDEC_STRIDE*-2]
600     movq      xmm5, [r0+FDEC_STRIDE*-1]
601     movhps    xmm2, [r0+FDEC_STRIDE* 0]
602     movhps    xmm3, [r0+FDEC_STRIDE* 1]
603     movhps    xmm4, [r0+FDEC_STRIDE* 2]
604     movhps    xmm5, [r0+FDEC_STRIDE* 3]
613     movq      [r0+FDEC_STRIDE*-4], xmm2
614     movq      [r0+FDEC_STRIDE*-3], xmm3
615     movq      [r0+FDEC_STRIDE*-2], xmm4
616     movq      [r0+FDEC_STRIDE*-1], xmm5
617     movhps    [r0+FDEC_STRIDE* 0], xmm2
618     movhps    [r0+FDEC_STRIDE* 1], xmm3
619     movhps    [r0+FDEC_STRIDE* 2], xmm4
620     movhps    [r0+FDEC_STRIDE* 3], xmm5
623 cglobal add16x16_idct_dc_mmx, 2,3
635     pshufw       mm2, mm0, 0xFA
636     pshufw       mm3, mm1, 0xFA
640     ADD_DC       mm2, mm3, r0+8
642     add          r0, FDEC_STRIDE*4
; Load/store 4 full 16-pixel rows at offset %1 and apply DC values %2/%3.
647 %macro IDCT_DC_STORE 3
648     movdqa    xmm4, [r0+%1+FDEC_STRIDE*0]
649     movdqa    xmm5, [r0+%1+FDEC_STRIDE*1]
650     movdqa    xmm6, [r0+%1+FDEC_STRIDE*2]
651     movdqa    xmm7, [r0+%1+FDEC_STRIDE*3]
660     movdqa    [r0+%1+FDEC_STRIDE*0], xmm4
661     movdqa    [r0+%1+FDEC_STRIDE*1], xmm5
662     movdqa    [r0+%1+FDEC_STRIDE*2], xmm6
663     movdqa    [r0+%1+FDEC_STRIDE*3], xmm7
666 cglobal add16x16_idct_dc_sse2, 2,2,8
668     add        r0, FDEC_STRIDE*4
674     add        r0, FDEC_STRIDE*4
685     psubw      xmm1, xmm3, xmm0
; Duplicate each DC byte into word lanes for the row-wide adds in the stores.
688     punpckhbw  xmm1, xmm0, xmm0
690     punpckhbw  xmm3, xmm2, xmm2
693     IDCT_DC_STORE FDEC_STRIDE*-4, xmm0, xmm1
694     IDCT_DC_STORE 0, xmm2, xmm3
698 cglobal add16x16_idct_dc_%1, 2,2,8
700     add      r0, FDEC_STRIDE*4
706     add      r0, FDEC_STRIDE*4
713     movdqa   xmm5, [ pb_idctdc_unpack]
714     movdqa   xmm6, [pb_idctdc_unpack2]
717     pshufb   xmm2, xmm0, xmm6
719     pshufb   xmm3, xmm1, xmm6
721     IDCT_DC_STORE FDEC_STRIDE*-4, xmm0, xmm1
722     IDCT_DC_STORE 0, xmm2, xmm3
731 %endif ; HIGH_BIT_DEPTH
733 ;-----------------------------------------------------------------------------
734 ; void sub8x8_dct_dc( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 )
735 ;-----------------------------------------------------------------------------
; Loads two enc/dec row pairs; the difference/accumulate steps are in the
; missing lines of this excerpt.
737 %macro DCTDC_2ROW_MMX 3
738     movq      %1, [r1+FENC_STRIDE*(0+%3)]
739     movq      m1, [r1+FENC_STRIDE*(1+%3)]
740     movq      m2, [r2+FDEC_STRIDE*(0+%3)]
741     movq      m3, [r2+FDEC_STRIDE*(1+%3)]
; 2x2 Hadamard of the four quadrant sums, done with word shuffles/adds.
757 %macro DCT2x2 2 ; reg s1/s0 (!=m1), reg s3/s2
758     pshufw    mm1, %1, 10100000b  ;  s1  s1  s0  s0
759     pshufw    mm0, %2, 10110001b  ;  s3  __  s2  __
760     paddw     mm1, %2             ;  s1 s13  s0 s02
761     psubw     mm1, mm0            ; d13 s13 d02 s02
762     pshufw    mm0, mm1, 01000100b ; d02 s02 d02 s02
763     psrlq     mm1, 32             ;  __  __ d13 s13
764     paddw     mm0, mm1            ; d02 s02 d02+d13 s02+s13
765     psllq     mm1, 32             ; d13 s13
766     psubw     mm0, mm1            ; d02-d13 s02-s13 d02+d13 s02+s13
770 cglobal sub8x8_dct_dc_mmxext, 3,3
771     DCTDC_2ROW_MMX m0, m4, 0
772     DCTDC_2ROW_MMX m5, m6, 2
; Advance to the lower 8x4 half of the block.
776     add       r1, FENC_STRIDE*4
777     add       r2, FDEC_STRIDE*4
778     DCTDC_2ROW_MMX m7, m4, 0
779     DCTDC_2ROW_MMX m5, m6, 2
; SSE2 equivalent of DCTDC_2ROW_MMX.
788 %macro DCTDC_2ROW_SSE2 3
789     movq      m0, [r1+FENC_STRIDE*(0+%1)]
790     movq      m1, [r1+FENC_STRIDE*(1+%1)]
791     movq      m2, [r2+FDEC_STRIDE*(0+%1)]
792     movq      m3, [r2+FDEC_STRIDE*(1+%1)]
806 cglobal sub8x8_dct_dc_sse2, 3,3,8
808     DCTDC_2ROW_SSE2 0, 0, m4
809     DCTDC_2ROW_SSE2 2, 1, m4
810     add       r1, FENC_STRIDE*4
811     add       r2, FDEC_STRIDE*4
813     DCTDC_2ROW_SSE2 0, 0, m5
814     DCTDC_2ROW_SSE2 2, 1, m5
824 ;-----------------------------------------------------------------------------
825 ; void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
826 ;-----------------------------------------------------------------------------
; Reorders the 8x8 dct coefficients into frame-zigzag order using PALIGNR to
; rotate rows into the diagonal access pattern; many intermediate lines are
; not visible in this excerpt.
828 cglobal zigzag_scan_8x8_frame_%1, 2,2,8
832     PALIGNR   xmm1, xmm1, 14, xmm2
837     PALIGNR   xmm2, xmm2, 12, xmm4
839     PALIGNR   xmm3, xmm3, 10, xmm4
865     movdqa    xmm7, [r1+112]
873     PALIGNR   xmm4, xmm4, 14, xmm3
875     PALIGNR   xmm5, xmm5, 12, xmm3
877     PALIGNR   xmm6, xmm6, 10, xmm3
880     PALIGNR   xmm7, xmm7, 8, xmm3
884     punpcklqdq xmm7, xmm7
904     pshufw    mm4, mm4, 0x6c
912     punpckhdq xmm3, xmm0, xmm2
914     punpckhdq xmm7, xmm4, xmm6
; 0x1b reverses the four words within a half-register (anti-diagonal runs).
916     pshufhw   xmm0, xmm0, 0x1b
917     pshuflw   xmm4, xmm4, 0x1b
918     pshufhw   xmm3, xmm3, 0x1b
919     pshuflw   xmm7, xmm7, 0x1b
; Scatter the assembled runs to their zigzag positions (byte offset = 2*index).
921     movlps    [r0+2*10], xmm0
922     movhps    [r0+2*17], xmm0
923     movlps    [r0+2*21], xmm3
924     movlps    [r0+2*28], xmm4
925     movhps    [r0+2*32], xmm3
926     movhps    [r0+2*39], xmm4
927     movlps    [r0+2*43], xmm7
928     movhps    [r0+2*50], xmm7
933 %ifndef HIGH_BIT_DEPTH
935 %define PALIGNR PALIGNR_MMX
937 %define PALIGNR PALIGNR_SSSE3
941 ;-----------------------------------------------------------------------------
942 ; void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[8][8] )
943 ;-----------------------------------------------------------------------------
; Generic frame-zigzag template; %2..%6 parameterize coefficient size and the
; punpck/pshuf suffixes so one body serves 16-bit (mmx) and 32-bit (sse2/avx).
944 %macro SCAN_8x8_FRAME 6
945 cglobal zigzag_scan_8x8_frame_%1, 2,2,8*(mmsize/16)
947     mova m1, [r1+ 8*SIZEOF_DCTCOEF]
948     movu m2, [r1+14*SIZEOF_DCTCOEF]
949     movu m3, [r1+21*SIZEOF_DCTCOEF]
950     mova m4, [r1+28*SIZEOF_DCTCOEF]
957     mova m7, [r1+52*SIZEOF_DCTCOEF]
958     mova m0, [r1+60*SIZEOF_DCTCOEF]
965     mova [r0+ 4*SIZEOF_DCTCOEF], m1
966     mova [r0+ 8*SIZEOF_DCTCOEF], m6
969     mova m1, [r1+32*SIZEOF_DCTCOEF]
970     movu m5, [r1+39*SIZEOF_DCTCOEF]
971     movu m2, [r1+46*SIZEOF_DCTCOEF]
972     movu [r0+35*SIZEOF_DCTCOEF], m3
973     movu [r0+47*SIZEOF_DCTCOEF], m4
979     mova [r0+52*SIZEOF_DCTCOEF], m6
980     movu [r0+13*SIZEOF_DCTCOEF], m5
981     movu m4, [r1+11*SIZEOF_DCTCOEF]
982     movu m6, [r1+25*SIZEOF_DCTCOEF]
986     mova m3, [r1+ 4*SIZEOF_DCTCOEF]
987     movu m7, [r1+18*SIZEOF_DCTCOEF]
989     movu [r0+25*SIZEOF_DCTCOEF], m1
1000     movu m4, [r1+35*SIZEOF_DCTCOEF]
1001     movu m1, [r1+49*SIZEOF_DCTCOEF]
1002     pshuf%6 m6, m6, 0x1b
1003     pshuf%6 m5, m5, 0x1b
1004     mova [r0+60*SIZEOF_DCTCOEF], m0
1005     mova [r0+56*SIZEOF_DCTCOEF], m2
1006     movu m0, [r1+42*SIZEOF_DCTCOEF]
1007     mova m2, [r1+56*SIZEOF_DCTCOEF]
1008     movu [r0+17*SIZEOF_DCTCOEF], m3
1009     mova [r0+32*SIZEOF_DCTCOEF], m7
1010     movu [r0+10*SIZEOF_DCTCOEF], m6
1011     movu [r0+21*SIZEOF_DCTCOEF], m5
1012     punpckh%5 m3, m0, m4
1013     punpckh%5 m7, m2, m1
1016     punpckl%4 m4, m2, m0
1017     punpckl%4 m1, m7, m3
1020     pshuf%6 m2, m2, 0x1b
1021     pshuf%6 m7, m7, 0x1b
1022     mova [r0+28*SIZEOF_DCTCOEF], m4
1023     movu [r0+43*SIZEOF_DCTCOEF], m1
1024     movu [r0+39*SIZEOF_DCTCOEF], m2
1025     movu [r0+50*SIZEOF_DCTCOEF], m7
1029 %ifdef HIGH_BIT_DEPTH
1031 SCAN_8x8_FRAME sse2  , 4 , dq, qdq, dq, d
1033 SCAN_8x8_FRAME avx   , 4 , dq, qdq, dq, d
1036 SCAN_8x8_FRAME mmxext, 16, q , dq , wd, w
1039 ;-----------------------------------------------------------------------------
1040 ; void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[4][4] )
1041 ;-----------------------------------------------------------------------------
; Generic 4x4 frame-zigzag template (fragment: the %macro header and most of
; the shuffle body are not visible in this excerpt).
1043 cglobal zigzag_scan_4x4_frame_%1, 2,2,8*(mmsize)/16
1045     mova       m1, [r1+ 4*SIZEOF_DCTCOEF]
1046     mova       m2, [r1+ 8*SIZEOF_DCTCOEF]
1047     mova       m3, [r1+12*SIZEOF_DCTCOEF]
1048     punpckl%5  m4, m0, m1
1064     mova [r0+ 4*SIZEOF_DCTCOEF], m5
1065     mova [r0+ 8*SIZEOF_DCTCOEF], m1
1066     mova [r0+12*SIZEOF_DCTCOEF], m3
1070 %ifdef HIGH_BIT_DEPTH
1072 SCAN_4x4 sse2, 4 , dq, qdq, dq
1074 SCAN_4x4 avx , 4 , dq, qdq, dq
1077 SCAN_4x4 mmx , 16, q , dq , wd
1079 ;-----------------------------------------------------------------------------
1080 ; void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
1081 ;-----------------------------------------------------------------------------
; SSSE3 path: the whole 4x4 scan collapses to two pshufb table lookups plus a
; palignr splice across the two halves.
1082 %macro SCAN_4x4_FRAME 1
1083 cglobal zigzag_scan_4x4_frame_%1, 2,2
1084     movdqa    xmm1, [r1+16]
1086     pshufb    xmm1, [pb_scan4frameb]
1087     pshufb    xmm0, [pb_scan4framea]
1088     psrldq    xmm2, xmm1, 6
1089     palignr   xmm1, xmm0, 6
1091     palignr   xmm2, xmm0, 10
1093     movdqa    [r0+16], xmm2
1098 SCAN_4x4_FRAME ssse3
1101 %endif ; !HIGH_BIT_DEPTH
1103 %ifdef HIGH_BIT_DEPTH
1105 ;-----------------------------------------------------------------------------
1106 ; void zigzag_scan_4x4_field( int32_t level[16], int32_t dct[4][4] )
1107 ;-----------------------------------------------------------------------------
1108 cglobal zigzag_scan_4x4_field_sse2, 2,3
1122 ;-----------------------------------------------------------------------------
1123 ; void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
1124 ;-----------------------------------------------------------------------------
1125 ; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2
1126 cglobal zigzag_scan_4x4_field_mmxext, 2,3
; 0xd2 = 11 01 00 10b: reorders words 4..7 into field-scan order.
1127     pshufw     mm0, [r1+4], 0xd2
1138 %endif ; HIGH_BIT_DEPTH
1140 ;-----------------------------------------------------------------------------
1141 ; void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
1142 ;-----------------------------------------------------------------------------
; Target output order (source index written to each destination slot); the
; first row of the table is among the lines missing from this excerpt.
1145 ; 16 11  5  6  7 12 17 24
1146 ; 18 13 14 15 19 25 32 26
1147 ; 20 21 22 23 27 33 40 34
1148 ; 28 29 30 31 35 41 48 42
1149 ; 36 37 38 39 43 49 50 44
1150 ; 45 46 47 51 56 57 52 53
1151 ; 54 55 58 59 60 61 62 63
; Template over coefficient size: %2..%6 select the w/d instruction suffixes
; and shift amounts.  End-of-line comments show register contents as
; "elem3 elem2 elem1 elem0" (source coefficient indices).
1154 cglobal zigzag_scan_8x8_field_%1, 2,3,8*(mmsize/16)
1155     mova       m0, [r1+ 0*SIZEOF_DCTCOEF]       ; 03 02 01 00
1156     mova       m1, [r1+ 4*SIZEOF_DCTCOEF]       ; 07 06 05 04
1157     mova       m2, [r1+ 8*SIZEOF_DCTCOEF]       ; 11 10 09 08
1158     pshuf%2    m3, m0, 011111111b               ; 03 03 03 03
1160     pshuf%2    m2, m2, 000111001b               ; 08 11 10 09
1161     punpckl%3  m3, m1                           ; 05 03 04 03
1162     pinsr%2    m0, r2d, 3                       ; 08 02 01 00
1163     punpckl%3  m4, m2, m3                       ; 04 10 03 09
1164     pshuf%2    m4, m4, 010110100b               ; 10 04 03 09
1165     mova       [r0+ 0*SIZEOF_DCTCOEF], m0       ; 08 02 01 00
1166     mova       [r0+ 4*SIZEOF_DCTCOEF], m4       ; 10 04 03 09
1167     mova       m3, [r1+12*SIZEOF_DCTCOEF]       ; 15 14 13 12
1168     mova       m5, [r1+16*SIZEOF_DCTCOEF]       ; 19 18 17 16
1169     punpckl%4  m6, m5                           ; 17 16 XX XX
1170     psrl%5     m1, %6                           ; XX 07 06 05
1171     punpckh%3  m6, m2                           ; 08 17 11 16
1172     punpckl%4  m6, m1                           ; 06 05 11 16
1173     mova       [r0+ 8*SIZEOF_DCTCOEF], m6       ; 06 05 11 16
1174     psrl%5     m1, %6                           ; XX XX 07 06
1175     punpckl%3  m1, m5                           ; 17 07 16 06
1176     mova       m0, [r1+20*SIZEOF_DCTCOEF]       ; 23 22 21 20
1177     mova       m2, [r1+24*SIZEOF_DCTCOEF]       ; 27 26 25 24
1178     punpckh%4  m1, m1                           ; 17 07 17 07
1179     punpckl%3  m6, m3, m2                       ; 25 13 24 12
1181     mova       [r0+24*SIZEOF_DCTCOEF], m0       ; 23 22 21 20
1182     punpckl%3  m1, m6                           ; 24 17 12 07
1183     mova       [r0+12*SIZEOF_DCTCOEF], m1
1184     pinsr%2    m3, r2d, 0                       ; 15 14 13 18
1185     mova       [r0+16*SIZEOF_DCTCOEF], m3       ; 15 14 13 18
1186     mova       m7, [r1+28*SIZEOF_DCTCOEF]
1187     mova       m0, [r1+32*SIZEOF_DCTCOEF]       ; 35 34 33 32
1188     psrl%5     m5, %6*3                         ; XX XX XX 19
1189     pshuf%2    m1, m2, 011111001b               ; 27 27 26 25
1190     punpckl%3  m5, m0                           ; 33 XX 32 19
1191     psrl%5     m2, %6*3                         ; XX XX XX 27
1192     punpckl%3  m5, m1                           ; 26 32 25 19
1193     mova       [r0+32*SIZEOF_DCTCOEF], m7
1194     mova       [r0+20*SIZEOF_DCTCOEF], m5       ; 26 32 25 19
1195     mova       m7, [r1+36*SIZEOF_DCTCOEF]
1196     mova       m1, [r1+40*SIZEOF_DCTCOEF]       ; 43 42 41 40
1197     pshuf%2    m3, m0, 011111001b               ; 35 35 34 33
1198     punpckl%3  m2, m1                           ; 41 XX 40 27
1199     mova       [r0+40*SIZEOF_DCTCOEF], m7
1200     punpckl%3  m2, m3                           ; 34 40 33 27
1201     mova       [r0+28*SIZEOF_DCTCOEF], m2
1202     mova       m7, [r1+44*SIZEOF_DCTCOEF]       ; 47 46 45 44
1203     mova       m2, [r1+48*SIZEOF_DCTCOEF]       ; 51 50 49 48
1204     psrl%5     m0, %6*3                         ; XX XX XX 35
1205     punpckl%3  m0, m2                           ; 49 XX 48 35
1206     pshuf%2    m3, m1, 011111001b               ; 43 43 42 41
1207     punpckl%3  m0, m3                           ; 42 48 41 35
1208     mova       [r0+36*SIZEOF_DCTCOEF], m0
1209     pextr%2    r2d, m2, 3                       ; 51
1210     psrl%5     m1, %6*3                         ; XX XX XX 43
1211     punpckl%3  m1, m7                           ; 45 XX 44 43
1212     psrl%5     m2, %6                           ; XX 51 50 49
1213     punpckl%3  m1, m2                           ; 50 44 49 43
1214     pshuf%2    m1, m1, 010110100b               ; 44 50 49 43
1215     mova       [r0+44*SIZEOF_DCTCOEF], m1
1216     psrl%5     m7, %6                           ; XX 47 46 45
1217     pinsr%2    m7, r2d, 3                       ; 51 47 46 45
1218     mova       [r0+48*SIZEOF_DCTCOEF], m7
; Final three groups need only quadword-level rearrangement.
1219     mova       m0, [r1+56*SIZEOF_DCTCOEF]       ; 59 58 57 56
1220     mova       m1, [r1+52*SIZEOF_DCTCOEF]       ; 55 54 53 52
1221     mova       m7, [r1+60*SIZEOF_DCTCOEF]
1222     punpckl%4  m2, m0, m1                       ; 53 52 57 56
1223     punpckh%4  m1, m0                           ; 59 58 55 54
1224     mova       [r0+52*SIZEOF_DCTCOEF], m2
1225     mova       [r0+56*SIZEOF_DCTCOEF], m1
1226     mova       [r0+60*SIZEOF_DCTCOEF], m7
1229 %ifdef HIGH_BIT_DEPTH
1231 SCAN_8x8 sse4  , d, dq, qdq, dq, 4
1233 SCAN_8x8 avx   , d, dq, qdq, dq, 4
1236 SCAN_8x8 mmxext, w, wd, dq , q , 16
1239 ;-----------------------------------------------------------------------------
1240 ; void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *src, uint8_t *dst )
1241 ;-----------------------------------------------------------------------------
; Fused subtract + zigzag: computes src-dst differences, writes src into dst
; (reconstruction), then scans the differences via a single pshufb.
; %1 = "" or "ac" (AC-only: DC word masked out), %2 = frame/field, %3 = isa.
1242 %macro ZIGZAG_SUB_4x4 3
1244 cglobal zigzag_sub_4x4%1_%2_%3, 4,4,8
1246 cglobal zigzag_sub_4x4%1_%2_%3, 3,3,8
1248     movd      xmm0, [r1+0*FENC_STRIDE]
1249     movd      xmm1, [r1+1*FENC_STRIDE]
1250     movd      xmm2, [r1+2*FENC_STRIDE]
1251     movd      xmm3, [r1+3*FENC_STRIDE]
1252     movd      xmm4, [r2+0*FDEC_STRIDE]
1253     movd      xmm5, [r2+1*FDEC_STRIDE]
1254     movd      xmm6, [r2+2*FDEC_STRIDE]
1255     movd      xmm7, [r2+3*FDEC_STRIDE]
; Copy the encoded pixels into the decode buffer (in-loop reconstruction).
1256     movd      [r2+0*FDEC_STRIDE], xmm0
1257     movd      [r2+1*FDEC_STRIDE], xmm1
1258     movd      [r2+2*FDEC_STRIDE], xmm2
1259     movd      [r2+3*FDEC_STRIDE], xmm3
; Gather the 4x4 block of each source into one xmm register.
1260     punpckldq xmm0, xmm1
1261     punpckldq xmm2, xmm3
1262     punpckldq xmm4, xmm5
1263     punpckldq xmm6, xmm7
1264     punpcklqdq xmm0, xmm2
1265     punpcklqdq xmm4, xmm6
; Zigzag the bytes before widening, so the subtraction output is already in
; scan order.
1267     movdqa    xmm7, [pb_sub4frame]
1269     movdqa    xmm7, [pb_sub4field]
1274     punpckhbw xmm1, xmm0, xmm6
1275     punpckhbw xmm5, xmm4, xmm6
1276     punpcklbw xmm0, xmm6
1277     punpcklbw xmm4, xmm6
; "ac" variant: zero the DC (first) coefficient.
1282     pand      xmm0, [pb_subacmask]
1286     movdqa    [r0+16], xmm1
1299 ZIGZAG_SUB_4x4   , frame, ssse3
1300 ZIGZAG_SUB_4x4 ac, frame, ssse3
1301 ZIGZAG_SUB_4x4   , field, ssse3
1302 ZIGZAG_SUB_4x4 ac, field, ssse3
1304 ZIGZAG_SUB_4x4   , frame, avx
1305 ZIGZAG_SUB_4x4 ac, frame, avx
1306 ZIGZAG_SUB_4x4   , field, avx
1307 ZIGZAG_SUB_4x4 ac, field, avx
1309 ;-----------------------------------------------------------------------------
1310 ; void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
1311 ;-----------------------------------------------------------------------------
; Deinterleaves 8x8 coefficients into four 4x4 lists (CAVLC codes 8x8 blocks
; as four interleaved 4x4 scans): a 4x4 transpose moves every 4th coefficient
; into the same output list.
1313     mova       m0, [r1+(%1*4+ 0)*SIZEOF_PIXEL]
1314     mova       m1, [r1+(%1*4+ 8)*SIZEOF_PIXEL]
1315     mova       m2, [r1+(%1*4+16)*SIZEOF_PIXEL]
1316     mova       m3, [r1+(%1*4+24)*SIZEOF_PIXEL]
1317     TRANSPOSE4x4%2 0,1,2,3,4
1318     mova       [r0+(%1+ 0)*SIZEOF_PIXEL], m0
1319     mova       [r0+(%1+32)*SIZEOF_PIXEL], m1
1320     mova       [r0+(%1+64)*SIZEOF_PIXEL], m2
1321     mova       [r0+(%1+96)*SIZEOF_PIXEL], m3
1334 %macro ZIGZAG_8x8_CAVLC 2
1335 cglobal zigzag_interleave_8x8_cavlc_%1, 3,3,8*(mmsize/16)
1344 %ifdef HIGH_BIT_DEPTH
1356 %ifdef HIGH_BIT_DEPTH
1358 ZIGZAG_8x8_CAVLC sse2, D
1360 ZIGZAG_8x8_CAVLC avx , D
1363 ZIGZAG_8x8_CAVLC mmx , W
; XMM variant: same deinterleave done with word-level SBUTTERFLY shuffles; the
; low/high halves of each register go to output lists 0/1 and 2/3.
1366 %macro INTERLEAVE_XMM 1
1367     mova   m0, [r1+%1*4+ 0]
1368     mova   m1, [r1+%1*4+16]
1369     mova   m4, [r1+%1*4+32]
1370     mova   m5, [r1+%1*4+48]
1371     SBUTTERFLY wd, 0, 1, 6
1372     SBUTTERFLY wd, 4, 5, 7
1373     SBUTTERFLY wd, 0, 1, 6
1374     SBUTTERFLY wd, 4, 5, 7
1376     movhps [r0+%1+ 32], m0
1377     movq   [r0+%1+ 64], m1
1378     movhps [r0+%1+ 96], m1
1380     movhps [r0+%1+ 40], m4
1381     movq   [r0+%1+ 72], m5
1382     movhps [r0+%1+104], m5
1396 %ifndef HIGH_BIT_DEPTH
1397 %macro ZIGZAG_8x8_CAVLC 1
1398 cglobal zigzag_interleave_8x8_cavlc_%1, 3,3,8
1415 ZIGZAG_8x8_CAVLC sse2
1417 ZIGZAG_8x8_CAVLC avx
1418 %endif ; !HIGH_BIT_DEPTH