1 ;*****************************************************************************
2 ;* dct-a.asm: x86 transform and zigzag
3 ;*****************************************************************************
4 ;* Copyright (C) 2003-2010 x264 project
6 ;* Authors: Holger Lubitz <holger@lubitz.org>
7 ;* Loren Merritt <lorenm@u.washington.edu>
8 ;* Laurent Aimar <fenrir@via.ecp.fr>
9 ;* Min Chen <chenm001@163.com>
10 ;* Fiona Glaser <fiona@x264.com>
12 ;* This program is free software; you can redistribute it and/or modify
13 ;* it under the terms of the GNU General Public License as published by
14 ;* the Free Software Foundation; either version 2 of the License, or
15 ;* (at your option) any later version.
17 ;* This program is distributed in the hope that it will be useful,
18 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ;* GNU General Public License for more details.
22 ;* You should have received a copy of the GNU General Public License
23 ;* along with this program; if not, write to the Free Software
24 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
26 ;* This program is also available under a commercial proprietary license.
27 ;* For more information, contact us at licensing@x264.com.
28 ;*****************************************************************************
31 %include "x86util.asm"
; SHUFFLE_16BIT: expands 8 word indices into a 16-byte pshufb shuffle constant
; (macro body not visible in this chunk -- TODO confirm).
33 %macro SHUFFLE_16BIT 8
; Scan-order and unpack tables used by the zigzag and idct_dc routines below.
42 pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
43 pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15
44 pb_subacmask: dw 0,-1,-1,-1,-1,-1,-1,-1 ; zeroes word 0 (the DC coefficient)
45 pb_scan4framea: SHUFFLE_16BIT 6,3,7,0,4,1,2,5
46 pb_scan4frameb: SHUFFLE_16BIT 0,4,1,2,5,6,3,7
47 pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3 ; broadcast bytes 0-3, 4x each
48 pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7 ; broadcast bytes 4-7, 4x each
; Butterfly steps shared by the Walsh-Hadamard 1-D passes; these lines belong
; to a macro whose header/footer are not visible in this chunk.
63 SUMSUB_BADC %1, m%5, m%4, m%3, m%2, m%6
64 SUMSUB_BADC %1, m%5, m%3, m%4, m%2, m%6
; SUMSUB_17BIT: sum/difference of values with 17-bit range; per the parameter
; comment, arg 4 carries the 0x8000 bias constant. Body not visible here.
68 %macro SUMSUB_17BIT 4 ; a, b, tmp, 0x8000
90 ;-----------------------------------------------------------------------------
91 ; void dct4x4dc( dctcoef d[4][4] )
92 ;-----------------------------------------------------------------------------
; In-place 4x4 Hadamard transform of the DC block: rows, transpose, then the
; second pass covers the original columns. r0 = d (the only argument).
93 cglobal dct4x4dc_sse2, 1,1,5
98 WALSH4_1D d, 0,1,2,3,4 ; 1-D pass on rows (dword lanes)
99 TRANSPOSE4x4D 0,1,2,3,4
101 WALSH4_1D d, 0,1,2,3,4 ; 1-D pass on (transposed) columns
; 16-bit coefficient variant of dct4x4dc (Walsh-Hadamard on words).
114 cglobal dct4x4dc_mmx, 1,1
119 movq m7, [pw_8000] ; convert to unsigned and back, so that pavgw works
120 WALSH4_1D w, 0,1,2,3,4 ; row pass (word lanes)
121 TRANSPOSE4x4W 0,1,2,3,4
122 SUMSUB_BADC w, m1, m0, m3, m2, m4 ; column-pass butterflies
132 %endif ; HIGH_BIT_DEPTH
134 %ifdef HIGH_BIT_DEPTH
135 ;-----------------------------------------------------------------------------
136 ; void idct4x4dc( int32_t d[4][4] )
137 ;-----------------------------------------------------------------------------
; Inverse DC Hadamard: same row / transpose / column structure as the forward
; DC transform above.
139 cglobal idct4x4dc_sse2, 1,1
144 WALSH4_1D d,0,1,2,3,4 ; row pass (dwords)
145 TRANSPOSE4x4D 0,1,2,3,4
146 WALSH4_1D d,0,1,2,3,4 ; column pass
155 ;-----------------------------------------------------------------------------
156 ; void idct4x4dc( int16_t d[4][4] )
157 ;-----------------------------------------------------------------------------
; 16-bit coefficient variant of idct4x4dc.
158 cglobal idct4x4dc_mmx, 1,1
163 WALSH4_1D w,0,1,2,3,4 ; row pass (words)
164 TRANSPOSE4x4W 0,1,2,3,4
165 WALSH4_1D w,0,1,2,3,4 ; column pass
171 %endif ; HIGH_BIT_DEPTH
174 %ifdef HIGH_BIT_DEPTH
175 ;-----------------------------------------------------------------------------
176 ; void sub4x4_dct( dctcoef dct[4][4], pixel *pix1, pixel *pix2 )
177 ;-----------------------------------------------------------------------------
; dct = forward 4x4 DCT of (pix1 - pix2); high-bit-depth path widens the
; 16-bit butterfly results to 32-bit dctcoefs via DCT_UNPACK before storing.
178 cglobal sub4x4_dct_mmx, 3,3
180 LOAD_DIFF m0, m4, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
181 LOAD_DIFF m3, m4, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
182 LOAD_DIFF m1, m4, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
183 LOAD_DIFF m2, m4, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
185 TRANSPOSE4x4W 0,1,2,3,4
187 SUMSUB_BADC w, m3, m0, m2, m1 ; butterflies producing s03/d03, s12/d12
188 SUMSUB_BA w, m2, m3, m4
189 DCT_UNPACK m2, m4, m5 ; widen words to dwords
190 DCT_UNPACK m3, m6, m7
191 mova [r0+ 0], m2 ; s03 + s12
193 mova [r0+32], m3 ; s03 - s12
196 DCT_UNPACK m0, m2, m4
197 DCT_UNPACK m1, m3, m5
198 SUMSUB2_AB d, m0, m1, m4 ; weighted sum/diff on dwords
199 SUMSUB2_AB d, m2, m3, m5
200 mova [r0+16], m0 ; d03*2 + d12
202 mova [r0+48], m4 ; d03 - 2*d12
; 8-bit-depth sub4x4_dct template; %1 selects the instruction-set suffix.
; (The enclosing %macro header is outside this fragment.)
208 cglobal sub4x4_dct_%1, 3,3
211 LOAD_DIFF m0, m4, m5, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
212 LOAD_DIFF m3, m4, m5, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
213 LOAD_DIFF m1, m4, m5, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
214 LOAD_DIFF m2, m4, m5, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
217 LOAD_DIFF8x4_SSSE3 0, 3, 1, 2, 4, 5, r1, r2 ; SSSE3 path: one macro loads all 4 rows
220 TRANSPOSE4x4W 0,1,2,3,4
231 %endif ; HIGH_BIT_DEPTH
233 %ifdef HIGH_BIT_DEPTH
234 ;-----------------------------------------------------------------------------
235 ; void add4x4_idct( pixel *p_dst, dctcoef dct[4][4] )
236 ;-----------------------------------------------------------------------------
; STORE_DIFFx2: adds two rows of idct output to two destination rows,
; clipping the results to the valid pixel range.
237 %macro STORE_DIFFx2 6
244 CLIPW %1, %4, [pw_pixel_max] ; clamp to [0, pixel_max]
; r0 = dst pixels, r1 = dct coefficients (dwords in the high-bit-depth path).
250 cglobal add4x4_idct_sse2, 2,2,6
251 add r0, 4*FDEC_STRIDE ; bias dst so rows are reached with +/- offsets
257 IDCT4_1D d,0,1,2,3,4,5 ; row pass
258 TRANSPOSE4x4D 0,1,2,3,4
260 IDCT4_1D d,0,1,2,3,4,5 ; column pass
262 STORE_DIFFx2 m0, m1, m4, m5, [r0-4*FDEC_STRIDE], [r0-2*FDEC_STRIDE]
263 STORE_DIFFx2 m2, m3, m4, m5, [r0+0*FDEC_STRIDE], [r0+2*FDEC_STRIDE]
; 8-bit-depth add4x4_idct: 16-bit idct, then STORE_DIFF per row
; (STORE_DIFF presumably adds+clips into dst -- defined elsewhere in the file).
267 cglobal add4x4_idct_mmx, 2,2
274 IDCT4_1D w,0,1,2,3,4,5 ; row pass
275 TRANSPOSE4x4W 0,1,2,3,4
277 IDCT4_1D w,0,1,2,3,4,5 ; column pass
278 STORE_DIFF m0, m4, m7, [r0+0*FDEC_STRIDE]
279 STORE_DIFF m1, m4, m7, [r0+1*FDEC_STRIDE]
280 STORE_DIFF m2, m4, m7, [r0+2*FDEC_STRIDE]
281 STORE_DIFF m3, m4, m7, [r0+3*FDEC_STRIDE]
; SSE4 add4x4_idct: keeps two rows packed per xmm register and uses
; movsd/pshuflw lane tricks instead of a full 4x4 transpose; results are
; packed to bytes (packuswb) and scattered with movd/pextrd.
285 cglobal add4x4_idct_sse4, 2,2,6
286 mova m0, [r1+0x00] ; row1/row0
287 mova m2, [r1+0x10] ; row3/row2
288 mova m1, m0 ; row1/row0
289 psraw m0, 1 ; row1>>1/...
290 mova m3, m2 ; row3/row2
291 psraw m2, 1 ; row3>>1/...
292 movsd m0, m1 ; row1>>1/row0
293 movsd m2, m3 ; row3>>1/row2
294 psubw m0, m3 ; row1>>1-row3/row0-2
295 paddw m2, m1 ; row3>>1+row1/row0+2
296 SBUTTERFLY2 wd, 0, 2, 1
297 SUMSUB_BA w, m2, m0, m1
298 pshuflw m1, m2, 10110001b
299 pshufhw m2, m2, 10110001b
305 paddw m1, m0 ; row1/row0 corrected
306 psraw m0, 1 ; row1>>1/...
307 mova m3, m2 ; row3/row2
308 psraw m2, 1 ; row3>>1/...
309 movsd m0, m1 ; row1>>1/row0
310 movsd m2, m3 ; row3>>1/row2
311 psubw m0, m3 ; row1>>1-row3/row0-2
312 paddw m2, m1 ; row3>>1+row1/row0+2
313 SBUTTERFLY2 qdq, 0, 2, 1
314 SUMSUB_BA w, m2, m0, m1
316 movd m4, [r0+FDEC_STRIDE*0]
317 movd m1, [r0+FDEC_STRIDE*1]
318 movd m3, [r0+FDEC_STRIDE*2]
319 movd m5, [r0+FDEC_STRIDE*3]
320 punpckldq m1, m4 ; row0/row1
322 punpckldq m3, m5 ; row3/row2
329 packuswb m0, m2 ; row0/row1/row3/row2
330 pextrd [r0+FDEC_STRIDE*0], m0, 3
331 pextrd [r0+FDEC_STRIDE*1], m0, 2
332 movd [r0+FDEC_STRIDE*2], m0
333 pextrd [r0+FDEC_STRIDE*3], m0, 1
335 %endif ; HIGH_BIT_DEPTH
338 ;-----------------------------------------------------------------------------
339 ; void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
340 ;-----------------------------------------------------------------------------
; SUB_NxN_DCT wrapper fragment: invokes the smaller dct kernel on each
; sub-block, stepping r1/r2 between sub-blocks using the %4/%5/%6 offset
; parameters (output size / column step / row step -- see instantiations below).
342 cglobal %1, 3,3,11*(mmsize/16)
343 %ifndef HIGH_BIT_DEPTH
347 add r2, 4*FDEC_STRIDE ; bias dec pointer (8-bit path only)
350 %endif ; !HIGH_BIT_DEPTH
357 add r1, %4-%5-%6*FENC_STRIDE ; advance to next sub-block (right)
358 add r2, %4-%5-%6*FDEC_STRIDE
361 add r1, (%4-%6)*FENC_STRIDE-%5-%4 ; wrap to next sub-block row
362 add r2, (%4-%6)*FDEC_STRIDE-%5-%4
365 add r1, %4-%5-%6*FENC_STRIDE
366 add r2, %4-%5-%6*FDEC_STRIDE
376 ;-----------------------------------------------------------------------------
377 ; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
378 ;-----------------------------------------------------------------------------
; ADD_NxN_IDCT wrapper: invokes the smaller idct kernel on each sub-block,
; stepping r0 between sub-blocks with the %4/%5/%6 offset parameters.
379 %macro ADD_NxN_IDCT 6-7
380 %ifdef HIGH_BIT_DEPTH
381 cglobal %1, 2,2,6*(mmsize/16)
383 cglobal %1, 2,2,11*(mmsize/16)
387 add r0, 4*FDEC_STRIDE ; bias dst pointer (8-bit path)
394 add r0, %4-%5-%6*FDEC_STRIDE ; advance to next sub-block (right)
397 add r0, (%4-%6)*FDEC_STRIDE-%5-%4 ; wrap to next sub-block row
400 add r0, %4-%5-%6*FDEC_STRIDE
; Instantiations: the 8x8/16x16 dct and idct entry points are composed from
; the smaller kernels by tail-calling their .skip_prologue labels.
411 %ifdef HIGH_BIT_DEPTH
413 SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx.skip_prologue, 64, 8, 0, 0
414 SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx.skip_prologue, 64, 16, 8, 8
416 ADD_NxN_IDCT add8x8_idct_sse2, add4x4_idct_sse2.skip_prologue,64, 8, 0, 0
417 ADD_NxN_IDCT add16x16_idct_sse2,add8x8_idct_sse2.skip_prologue,64, 16, 8, 8
418 %else ; !HIGH_BIT_DEPTH
420 SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx.skip_prologue, 32, 4, 0, 0
421 ADD_NxN_IDCT add8x8_idct_mmx, add4x4_idct_mmx.skip_prologue, 32, 4, 0, 0
422 SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx.skip_prologue, 32, 8, 4, 4
423 ADD_NxN_IDCT add16x16_idct_mmx, add8x8_idct_mmx.skip_prologue, 32, 8, 4, 4
425 cextern sub8x8_dct8_mmx.skip_prologue
426 cextern add8x8_idct8_mmx.skip_prologue
427 SUB_NxN_DCT sub16x16_dct8_mmx, sub8x8_dct8_mmx.skip_prologue, 128, 8, 0, 0
428 ADD_NxN_IDCT add16x16_idct8_mmx, add8x8_idct8_mmx.skip_prologue, 128, 8, 0, 0
433 cextern sub8x8_dct_sse2.skip_prologue
434 cextern sub8x8_dct_ssse3.skip_prologue
435 SUB_NxN_DCT sub16x16_dct_sse2, sub8x8_dct_sse2.skip_prologue, 128, 8, 0, 0
436 SUB_NxN_DCT sub16x16_dct_ssse3, sub8x8_dct_ssse3.skip_prologue, 128, 8, 0, 0
437 cextern add8x8_idct_sse2.skip_prologue
438 ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2.skip_prologue, 2*64, 8, 0, 0
440 cextern sub8x8_dct8_sse2.skip_prologue
441 cextern add8x8_idct8_sse2.skip_prologue
442 SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2.skip_prologue, 128, 8, 0, 0
443 ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2.skip_prologue, 128, 8, 0, 0
445 cextern sub8x8_dct8_ssse3.skip_prologue
446 SUB_NxN_DCT sub16x16_dct8_ssse3, sub8x8_dct8_ssse3.skip_prologue, 128, 8, 0, 0
447 %endif ; HIGH_BIT_DEPTH
449 %ifdef HIGH_BIT_DEPTH
451 ;-----------------------------------------------------------------------------
452 ; void add8x8_idct_dc( pixel *p_dst, dctcoef *dct2x2 )
453 ;-----------------------------------------------------------------------------
; ADD_DC fragment: add a broadcast DC value (%2) to 4 pixel rows at %1 with
; signed saturation (paddsw); interior add lines are not visible here.
455 mova m0, [%1+FDEC_STRIDEB*0] ; 8pixels
456 mova m1, [%1+FDEC_STRIDEB*1]
457 mova m2, [%1+FDEC_STRIDEB*2]
461 paddsw %2, [%1+FDEC_STRIDEB*3]
466 mova [%1+FDEC_STRIDEB*0], m0
467 mova [%1+FDEC_STRIDEB*1], m1
468 mova [%1+FDEC_STRIDEB*2], m2
469 mova [%1+FDEC_STRIDEB*3], %2
; Scale the 2x2 DC block (>>6), broadcast each dc to a full row, add to dst.
473 cglobal add8x8_idct_dc_sse2, 2,2,7
474 mova m6, [pw_pixel_max] ; presumably the clip limit for ADD_DC -- confirm
478 psrad m3, 6 ; dc0 0 dc1 0 dc2 0 dc3 0
479 pshuflw m4, m3, 10100000b ; dc0 dc0 dc1 dc1 _ _ _ _
480 pshufhw m3, m3, 10100000b ; _ _ _ _ dc2 dc2 dc3 dc3
481 pshufd m4, m4, 01010000b ; dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
482 pshufd m3, m3, 11111010b ; dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
483 ADD_DC r0+FDEC_STRIDEB*0, m4
484 ADD_DC r0+FDEC_STRIDEB*4, m3
; 16x16 variant: second ADD_DC targets the 8 columns to the right instead of
; the lower half, and r0 advances 4 rows per iteration.
487 cglobal add16x16_idct_dc_sse2, 2,3,8
489 mova m6, [pw_pixel_max]
495 psrad m3, 6 ; dc0 0 dc1 0 dc2 0 dc3 0
496 pshuflw m4, m3, 10100000b ; dc0 dc0 dc1 dc1 _ _ _ _
497 pshufhw m3, m3, 10100000b ; _ _ _ _ dc2 dc2 dc3 dc3
498 pshufd m4, m4, 01010000b ; dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
499 pshufd m3, m3, 11111010b ; dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
500 ADD_DC r0+FDEC_STRIDEB*0, m4
501 ADD_DC r0+SIZEOF_PIXEL*8, m3 ; right 8 columns
503 add r0, 4*FDEC_STRIDEB ; next 4-row stripe
508 %else ;!HIGH_BIT_DEPTH
; ADD_DC (8-bit): add broadcast DC bytes (%1) to 4 rows at %3 with unsigned
; saturation (paddusb path; the matching subtract half is not visible here).
510 movq mm4, [%3+FDEC_STRIDE*0]
511 movq mm5, [%3+FDEC_STRIDE*1]
512 movq mm6, [%3+FDEC_STRIDE*2]
516 paddusb %1, [%3+FDEC_STRIDE*3]
521 movq [%3+FDEC_STRIDE*0], mm4
522 movq [%3+FDEC_STRIDE*1], mm5
523 movq [%3+FDEC_STRIDE*2], mm6
524 movq [%3+FDEC_STRIDE*3], %1
527 cglobal add8x8_idct_dc_mmx, 2,2
530 add r0, FDEC_STRIDE*4 ; bias dst so rows use +/- offsets
538 pshufw mm2, mm0, 0xFA ; select words 2,2,3,3
539 pshufw mm3, mm1, 0xFA
542 ADD_DC mm0, mm1, r0-FDEC_STRIDE*4 ; upper half
; SSSE3 variant: pshufb expands the dc words to per-pixel bytes in one shot.
546 cglobal add8x8_idct_dc_ssse3, 2,2
549 add r0, FDEC_STRIDE*4
553 movdqa xmm5, [pb_idctdc_unpack] ; byte-broadcast shuffle table
558 movq xmm2, [r0+FDEC_STRIDE*-4]
559 movq xmm3, [r0+FDEC_STRIDE*-3]
560 movq xmm4, [r0+FDEC_STRIDE*-2]
561 movq xmm5, [r0+FDEC_STRIDE*-1]
562 movhps xmm2, [r0+FDEC_STRIDE* 0]
563 movhps xmm3, [r0+FDEC_STRIDE* 1]
564 movhps xmm4, [r0+FDEC_STRIDE* 2]
565 movhps xmm5, [r0+FDEC_STRIDE* 3]
574 movq [r0+FDEC_STRIDE*-4], xmm2
575 movq [r0+FDEC_STRIDE*-3], xmm3
576 movq [r0+FDEC_STRIDE*-2], xmm4
577 movq [r0+FDEC_STRIDE*-1], xmm5
578 movhps [r0+FDEC_STRIDE* 0], xmm2
579 movhps [r0+FDEC_STRIDE* 1], xmm3
580 movhps [r0+FDEC_STRIDE* 2], xmm4
581 movhps [r0+FDEC_STRIDE* 3], xmm5
; 16x16 8-bit dc add: loops over 4-row stripes, left 8 columns and right 8
; columns handled by separate ADD_DC invocations.
584 cglobal add16x16_idct_dc_mmx, 2,3
596 pshufw mm2, mm0, 0xFA ; select words 2,2,3,3
597 pshufw mm3, mm1, 0xFA
601 ADD_DC mm2, mm3, r0+8 ; right 8 columns
603 add r0, FDEC_STRIDE*4 ; next 4-row stripe
; IDCT_DC_STORE: load/store 4 aligned 16-byte rows at r0+%1; the add of the
; dc bytes in %2/%3 happens in lines not shown in this chunk.
608 %macro IDCT_DC_STORE 3
609 movdqa xmm4, [r0+%1+FDEC_STRIDE*0]
610 movdqa xmm5, [r0+%1+FDEC_STRIDE*1]
611 movdqa xmm6, [r0+%1+FDEC_STRIDE*2]
612 movdqa xmm7, [r0+%1+FDEC_STRIDE*3]
621 movdqa [r0+%1+FDEC_STRIDE*0], xmm4
622 movdqa [r0+%1+FDEC_STRIDE*1], xmm5
623 movdqa [r0+%1+FDEC_STRIDE*2], xmm6
624 movdqa [r0+%1+FDEC_STRIDE*3], xmm7
627 cglobal add16x16_idct_dc_sse2, 2,2,8
629 add r0, FDEC_STRIDE*4 ; bias dst pointer
635 add r0, FDEC_STRIDE*4
657 IDCT_DC_STORE FDEC_STRIDE*-4, xmm0, xmm1
658 IDCT_DC_STORE 0, xmm2, xmm3
661 cglobal add16x16_idct_dc_ssse3, 2,2,8
663 add r0, FDEC_STRIDE*4
669 add r0, FDEC_STRIDE*4
676 movdqa xmm5, [ pb_idctdc_unpack] ; dc broadcast tables (bytes 0-3 / 4-7)
677 movdqa xmm6, [pb_idctdc_unpack2]
686 IDCT_DC_STORE FDEC_STRIDE*-4, xmm0, xmm1
687 IDCT_DC_STORE 0, xmm2, xmm3
690 %endif ; HIGH_BIT_DEPTH
692 ;-----------------------------------------------------------------------------
693 ; void sub8x8_dct_dc( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 )
694 ;-----------------------------------------------------------------------------
; DCTDC_2ROW_MMX: load two rows each of source (r1) and reference (r2);
; %3 is the starting row offset, %1 receives the first source row.
696 %macro DCTDC_2ROW_MMX 3
697 movq %1, [r1+FENC_STRIDE*(0+%3)]
698 movq m1, [r1+FENC_STRIDE*(1+%3)]
699 movq m2, [r2+FDEC_STRIDE*(0+%3)]
700 movq m3, [r2+FDEC_STRIDE*(1+%3)]
; DCT2x2: 2x2 Hadamard of the four 4x4 sums, done with pshufw shuffles
; (lane contents documented inline, low word on the right).
716 %macro DCT2x2 2 ; reg s1/s0 (!=m1), reg s3/s2
717 pshufw mm1, %1, 10100000b ; s1 s1 s0 s0
718 pshufw mm0, %2, 10110001b ; s3 __ s2 __
719 paddw mm1, %2 ; s1 s13 s0 s02
720 psubw mm1, mm0 ; d13 s13 d02 s02
721 pshufw mm0, mm1, 01000100b ; d02 s02 d02 s02
722 psrlq mm1, 32 ; __ __ d13 s13
723 paddw mm0, mm1 ; d02 s02 d02+d13 s02+s13
724 psllq mm1, 32 ; d13 s13
725 psubw mm0, mm1 ; d02-d13 s02-s13 d02+d13 s02+s13
; Accumulate the upper 4 rows, then the lower 4 rows, then combine via DCT2x2.
729 cglobal sub8x8_dct_dc_mmxext, 3,3
730 DCTDC_2ROW_MMX m0, m4, 0
731 DCTDC_2ROW_MMX m5, m6, 2
735 add r1, FENC_STRIDE*4 ; advance to lower half
736 add r2, FDEC_STRIDE*4
737 DCTDC_2ROW_MMX m7, m4, 0
738 DCTDC_2ROW_MMX m5, m6, 2
; SSE2 variant of the 2-row loader.
747 %macro DCTDC_2ROW_SSE2 3
748 movq m0, [r1+FENC_STRIDE*(0+%1)]
749 movq m1, [r1+FENC_STRIDE*(1+%1)]
750 movq m2, [r2+FDEC_STRIDE*(0+%1)]
751 movq m3, [r2+FDEC_STRIDE*(1+%1)]
765 cglobal sub8x8_dct_dc_sse2, 3,3,8
767 DCTDC_2ROW_SSE2 0, 0, m4
768 DCTDC_2ROW_SSE2 2, 1, m4
769 add r1, FENC_STRIDE*4
770 add r2, FDEC_STRIDE*4
772 DCTDC_2ROW_SSE2 0, 0, m5
773 DCTDC_2ROW_SSE2 2, 1, m5
783 ;-----------------------------------------------------------------------------
784 ; void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
785 ;-----------------------------------------------------------------------------
; SSE template (%1 = sse2/ssse3): reorders an 8x8 dct into frame zigzag order
; with palignr/pshuf shuffles; PALIGNR abstracts the ssse3/sse2 difference.
787 cglobal zigzag_scan_8x8_frame_%1, 2,2,8
791 PALIGNR xmm1, xmm1, 14, xmm2
796 PALIGNR xmm2, xmm2, 12, xmm4
798 PALIGNR xmm3, xmm3, 10, xmm4
824 movdqa xmm7, [r1+112] ; last dct row
832 PALIGNR xmm4, xmm4, 14, xmm3
834 PALIGNR xmm5, xmm5, 12, xmm3
836 PALIGNR xmm6, xmm6, 10, xmm3
839 PALIGNR xmm7, xmm7, 8, xmm3
843 punpcklqdq xmm7, xmm7
863 pshufw mm4, mm4, 0x6c
877 pshufhw xmm0, xmm0, 0x1b ; 0x1b reverses the 4 words of a half
878 pshuflw xmm4, xmm4, 0x1b
879 pshufhw xmm3, xmm3, 0x1b
880 pshuflw xmm7, xmm7, 0x1b
882 movlps [r0+2*10], xmm0 ; scatter to zigzag positions (int16 units)
883 movhps [r0+2*17], xmm0
884 movlps [r0+2*21], xmm3
885 movlps [r0+2*28], xmm4
886 movhps [r0+2*32], xmm3
887 movhps [r0+2*39], xmm4
888 movlps [r0+2*43], xmm7
889 movhps [r0+2*50], xmm7
895 %define PALIGNR PALIGNR_MMX
897 %define PALIGNR PALIGNR_SSSE3
900 ;-----------------------------------------------------------------------------
901 ; void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
902 ;-----------------------------------------------------------------------------
; MMX fallback: same output order as the SSE template, built from pshufw
; word shuffles (0x1b reverses the four words of a register).
903 cglobal zigzag_scan_8x8_frame_mmxext, 2,2
965 pshufw mm6, mm6, 0x1b
966 pshufw mm5, mm5, 0x1b
987 pshufw mm2, mm2, 0x1b
988 pshufw mm7, mm7, 0x1b
995 ;-----------------------------------------------------------------------------
996 ; void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
997 ;-----------------------------------------------------------------------------
; r0 = level (output), r1 = dct (input); body not visible in this fragment.
998 cglobal zigzag_scan_4x4_frame_mmx, 2,2
1025 ;-----------------------------------------------------------------------------
1026 ; void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
1027 ;-----------------------------------------------------------------------------
; SSSE3: two pshufb table lookups (pb_scan4framea/b) place the coefficients,
; then palignr stitches the two halves together across the register boundary.
1028 cglobal zigzag_scan_4x4_frame_ssse3, 2,2
1029 movdqa xmm1, [r1+16] ; coefficients 8-15
1031 pshufb xmm1, [pb_scan4frameb]
1032 pshufb xmm0, [pb_scan4framea]
1035 palignr xmm2, xmm0, 6
1037 palignr xmm1, xmm0, 10
1039 movdqa [r0+16], xmm1
1042 ;-----------------------------------------------------------------------------
1043 ; void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
1044 ;-----------------------------------------------------------------------------
1045 ; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2
1046 cglobal zigzag_scan_4x4_field_mmxext, 2,3
1047 pshufw mm0, [r1+4], 0xd2 ; picks words 2,0,1,3 of [r1+4] (low word first)
1059 ;-----------------------------------------------------------------------------
1060 ; void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
1061 ;-----------------------------------------------------------------------------
; Field scan order (source coefficient index per output position; the table
; below is partial in this chunk):
1065 ; 16 11 5 6 7 12 17 24
1066 ; 18 13 14 15 19 25 32 26
1067 ; 20 21 22 23 27 33 40 34
1068 ; 28 29 30 31 35 41 48 42
1069 ; 36 37 38 39 43 49 50 44
1070 ; 45 46 47 51 56 57 52 53
1071 ; 54 55 58 59 60 61 62 63
; Hand-scheduled word gathering. The per-line comments show register contents
; as source coefficient indices, low word on the right.
1073 cglobal zigzag_scan_8x8_field_mmxext, 2,3
1074 movq mm0, [r1+2*0] ; 03 02 01 00
1075 movq mm1, [r1+2*4] ; 07 06 05 04
1076 movq mm2, [r1+2*8] ; 11 10 09 08
1077 pshufw mm3, mm0, 011111111b ; 03 03 03 03
1078 movd r2, mm2 ; 09 08
1079 pshufw mm2, mm2, 000111001b ; 08 11 10 09
1080 punpcklwd mm3, mm1 ; 05 03 04 03
1081 pinsrw mm0, r2, 3 ; 08 02 01 00
1083 punpcklwd mm2, mm3 ; 04 10 03 09
1084 pshufw mm2, mm2, 010110100b ; 10 04 03 09
1085 movq [r0+2*0], mm0 ; 08 02 01 00
1086 movq [r0+2*4], mm2 ; 10 04 03 09
1087 movq mm3, [r1+2*12] ; 15 14 13 12
1088 movq mm5, [r1+2*16] ; 19 18 17 16
1089 punpckldq mm6, mm5 ; 17 16 XX XX
1090 psrlq mm1, 16 ; XX 07 06 05
1091 punpckhwd mm6, mm4 ; 08 17 11 16
1092 punpckldq mm6, mm1 ; 06 05 11 16
1093 movq [r0+2*8], mm6 ; 06 05 11 16
1094 psrlq mm1, 16 ; XX XX 07 06
1095 punpcklwd mm1, mm5 ; 17 07 16 06
1096 movq mm0, [r1+2*20] ; 23 22 21 20
1097 movq mm2, [r1+2*24] ; 27 26 25 24
1099 punpckhdq mm1, mm1 ; 17 07 17 07
1100 punpcklwd mm6, mm2 ; 25 13 24 12
1102 movq [r0+2*24], mm0 ; 23 22 21 20
1103 punpcklwd mm1, mm6 ; 24 17 12 07
1105 pinsrw mm3, r2, 0 ; 15 14 13 18
1106 movq [r0+2*16], mm3 ; 15 14 13 18
1108 movq mm0, [r1+2*32] ; 35 34 33 32
1109 psrlq mm5, 48 ; XX XX XX 19
1110 pshufw mm1, mm2, 011111001b ; 27 27 26 25
1111 punpcklwd mm5, mm0 ; 33 XX 32 19
1112 psrlq mm2, 48 ; XX XX XX 27
1113 punpcklwd mm5, mm1 ; 26 32 25 19
1115 movq [r0+2*20], mm5 ; 26 32 25 19
1117 movq mm1, [r1+2*40] ; 43 42 41 40
1118 pshufw mm3, mm0, 011111001b ; 35 35 34 33
1119 punpcklwd mm2, mm1 ; 41 XX 40 27
1121 punpcklwd mm2, mm3 ; 34 40 33 27
1123 movq mm7, [r1+2*44] ; 47 46 45 44
1124 movq mm2, [r1+2*48] ; 51 50 49 48
1125 psrlq mm0, 48 ; XX XX XX 35
1126 punpcklwd mm0, mm2 ; 49 XX 48 35
1127 pshufw mm3, mm1, 011111001b ; 43 43 42 41
1128 punpcklwd mm0, mm3 ; 42 48 41 35
1130 pextrw r2, mm2, 3 ; 51
1131 psrlq mm1, 48 ; XX XX XX 43
1132 punpcklwd mm1, mm7 ; 45 XX 44 43
1133 psrlq mm2, 16 ; XX 51 50 49
1134 punpcklwd mm1, mm2 ; 50 44 49 43
1135 pshufw mm1, mm1, 010110100b ; 44 50 49 43
1137 psrlq mm7, 16 ; XX 47 46 45
1138 pinsrw mm7, r2, 3 ; 51 47 46 45
1140 movq mm0, [r1+2*56] ; 59 58 57 56
1141 movq mm1, [r1+2*52] ; 55 54 53 52
1144 punpckldq mm2, mm1 ; 53 52 57 56
1145 punpckhdq mm1, mm0 ; 59 58 55 54
1151 ;-----------------------------------------------------------------------------
1152 ; void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *src, uint8_t *dst )
1153 ;-----------------------------------------------------------------------------
; Fused subtract + zigzag: dst pixels are overwritten with src, and the
; src-dst difference is written to level in zigzag order. %1 = ""/"ac"
; (the ac variants mask the DC coefficient via pb_subacmask),
; %2 = frame/field scan order.
1154 %macro ZIGZAG_SUB_4x4 2
1156 cglobal zigzag_sub_4x4%1_%2_ssse3, 4,4,8
1158 cglobal zigzag_sub_4x4%1_%2_ssse3, 3,3,8
1160 movd xmm0, [r1+0*FENC_STRIDE]
1161 movd xmm1, [r1+1*FENC_STRIDE]
1162 movd xmm2, [r1+2*FENC_STRIDE]
1163 movd xmm3, [r1+3*FENC_STRIDE]
1164 movd xmm4, [r2+0*FDEC_STRIDE]
1165 movd xmm5, [r2+1*FDEC_STRIDE]
1166 movd xmm6, [r2+2*FDEC_STRIDE]
1167 movd xmm7, [r2+3*FDEC_STRIDE]
1168 movd [r2+0*FDEC_STRIDE], xmm0 ; copy src rows into dst
1169 movd [r2+1*FDEC_STRIDE], xmm1
1170 movd [r2+2*FDEC_STRIDE], xmm2
1171 movd [r2+3*FDEC_STRIDE], xmm3
1172 punpckldq xmm0, xmm1 ; pack src (xmm0) and dst (xmm4) into one reg each
1173 punpckldq xmm2, xmm3
1174 punpckldq xmm4, xmm5
1175 punpckldq xmm6, xmm7
1176 punpcklqdq xmm0, xmm2
1177 punpcklqdq xmm4, xmm6
1179 movdqa xmm7, [pb_sub4frame] ; scan-order shuffle table
1181 movdqa xmm7, [pb_sub4field]
1188 punpcklbw xmm0, xmm6 ; widen bytes to words for the subtract
1189 punpckhbw xmm1, xmm6
1190 punpcklbw xmm4, xmm6
1191 punpckhbw xmm5, xmm6
1196 pand xmm0, [pb_subacmask] ; ac variant: drop the DC coefficient
1200 movdqa [r0+16], xmm1
1212 ZIGZAG_SUB_4x4 , frame
1213 ZIGZAG_SUB_4x4 ac, frame
1214 ZIGZAG_SUB_4x4 , field
1215 ZIGZAG_SUB_4x4 ac, field
1217 ;-----------------------------------------------------------------------------
1218 ; void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
1219 ;-----------------------------------------------------------------------------
; MMX fragment: transpose a 4x4 group of words so the four 4x4 blocks'
; coefficients are interleaved for CAVLC. (Enclosing macro header not visible.)
1222 movq m0, [r1+%1*4+ 0]
1223 movq m1, [r1+%1*4+ 8]
1224 movq m2, [r1+%1*4+16]
1225 movq m3, [r1+%1*4+24]
1226 TRANSPOSE4x4W 0,1,2,3,4
1245 cglobal zigzag_interleave_8x8_cavlc_mmx, 3,3
; INTERLEAVE_XMM: same interleave on 8-word registers via two SBUTTERFLY
; passes, scattering halves to the four destination sub-blocks.
1262 %macro INTERLEAVE_XMM 1
1263 mova m0, [r1+%1*4+ 0]
1264 mova m1, [r1+%1*4+16]
1265 mova m4, [r1+%1*4+32]
1266 mova m5, [r1+%1*4+48]
1267 SBUTTERFLY wd, 0, 1, 6
1268 SBUTTERFLY wd, 4, 5, 7
1269 SBUTTERFLY wd, 0, 1, 6
1270 SBUTTERFLY wd, 4, 5, 7
1272 movhps [r0+%1+ 32], m0
1273 movq [r0+%1+ 64], m1
1274 movhps [r0+%1+ 96], m1
1276 movhps [r0+%1+ 40], m4
1277 movq [r0+%1+ 72], m5
1278 movhps [r0+%1+104], m5
1293 cglobal zigzag_interleave_8x8_cavlc_sse2, 3,3,8