1 ;*****************************************************************************
2 ;* dct-a.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2003-2008 x264 project
6 ;* Authors: Holger Lubitz <holger@lubitz.org>
7 ;* Loren Merritt <lorenm@u.washington.edu>
8 ;* Laurent Aimar <fenrir@via.ecp.fr>
9 ;* Min Chen <chenm001@163.com>
11 ;* This program is free software; you can redistribute it and/or modify
12 ;* it under the terms of the GNU General Public License as published by
13 ;* the Free Software Foundation; either version 2 of the License, or
14 ;* (at your option) any later version.
16 ;* This program is distributed in the hope that it will be useful,
17 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 ;* GNU General Public License for more details.
21 ;* You should have received a copy of the GNU General Public License
22 ;* along with this program; if not, write to the Free Software
23 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24 ;*****************************************************************************
27 %include "x86util.asm"
; SHUFFLE_16BIT: emits a byte-shuffle control built from 8 word indices
; (macro body not visible in this view of the file).
29 %macro SHUFFLE_16BIT 8
; --- Constant tables used by the DCT / zigzag routines below ---
; pw_32_0: four words of 32 (rounding bias); remainder of the table is
; elided here -- presumably four words of 0, TODO confirm.
38 pw_32_0: times 4 dw 32
; pw_8000: bias to round-trip signed words through unsigned ops (see the
; pavgw comment in x264_dct4x4dc_mmx below).
41 pw_8000: times 8 dw 0x8000
; hsub_mul: interleaved +1/-1 bytes; loaded before LOAD_DIFF8x4_SSSE3,
; presumably as a pmaddubsw multiplier for the SSSE3 diff path -- confirm.
42 hsub_mul: times 8 db 1, -1
; Byte permutations for the 4x4 sub+zigzag paths (frame and field order).
44 pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
45 pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15
; All-ones words except the first: used with pand to clear the DC (first)
; coefficient in the "ac" zigzag variants.
46 pb_subacmask: dw 0,-1,-1,-1,-1,-1,-1,-1
; Word-shuffle controls for the 4x4 frame zigzag (ssse3 pshufb masks).
47 pb_scan4framea: SHUFFLE_16BIT 6,3,7,0,4,1,2,5
48 pb_scan4frameb: SHUFFLE_16BIT 0,4,1,2,5,6,3,7
; Replicate each of 8 DC byte indices 4x, for broadcasting DC values.
49 pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
50 pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
; NOTE(review): fragment of an enclosing transform macro whose %macro line
; is not visible here; two SUMSUB_BADC passes over the four row registers.
57 SUMSUB_BADC m%4, m%3, m%2, m%1, m%5
58 SUMSUB_BADC m%4, m%2, m%3, m%1, m%5
; SUMSUB_17BIT: sum/difference variant taking a 0x8000 bias argument
; (body elided from this view).
62 %macro SUMSUB_17BIT 4 ; a, b, tmp, 0x8000
75 ;-----------------------------------------------------------------------------
76 ; void x264_dct4x4dc_mmx( int16_t d[4][4] )
77 ;-----------------------------------------------------------------------------
; In-place DC transform of a 4x4 block of int16 coefficients; r0 = d.
; NOTE(review): several body lines are elided from this view.
78 cglobal x264_dct4x4dc_mmx, 1,1
83 movq m7, [pw_8000 GLOBAL] ; convert to unsigned and back, so that pavgw works
85 TRANSPOSE4x4W 0,1,2,3,4
86 SUMSUB_BADC m1, m0, m3, m2, m4
97 ;-----------------------------------------------------------------------------
98 ; void x264_idct4x4dc_mmx( int16_t d[4][4] )
99 ;-----------------------------------------------------------------------------
; Inverse DC transform, in place; r0 = d. (Body mostly elided in this view.)
100 cglobal x264_idct4x4dc_mmx, 1,1
106 TRANSPOSE4x4W 0,1,2,3,4
115 ;-----------------------------------------------------------------------------
116 ; void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
117 ;-----------------------------------------------------------------------------
; Template body (note the %1 suffix): r0 = dct, r1 = pix1, r2 = pix2.
; Both the per-row LOAD_DIFF path and the SSSE3 LOAD_DIFF8x4 path appear
; below -- presumably selected by conditional assembly elided from this view.
118 cglobal x264_sub4x4_dct_%1, 3,3
; Load pix1-pix2 differences; rows 0/3 first, then 1/2.
121 LOAD_DIFF m0, m4, m5, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
122 LOAD_DIFF m3, m4, m5, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
123 LOAD_DIFF m1, m4, m5, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
124 LOAD_DIFF m2, m4, m5, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
; SSSE3 path: +1/-1 multiplier table for the fused diff.
126 mova m5, [hsub_mul GLOBAL]
127 LOAD_DIFF8x4_SSSE3 0, 3, 1, 2, 4, 5, r1, r2
130 TRANSPOSE4x4W 0,1,2,3,4
142 ;-----------------------------------------------------------------------------
143 ; void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
144 ;-----------------------------------------------------------------------------
; Inverse-transform dct and add the result to the 4x4 pixel block at r0.
; (IDCT body elided; only rounding, transpose and stores are visible.)
145 cglobal x264_add4x4_idct_mmx, 2,2
153 TRANSPOSE4x4W 0,1,2,3,4
; +32 rounding bias before the final >>6 (presumably inside STORE_DIFF).
154 paddw m0, [pw_32 GLOBAL]
156 STORE_DIFF m0, m4, m7, [r0+0*FDEC_STRIDE]
157 STORE_DIFF m1, m4, m7, [r0+1*FDEC_STRIDE]
158 STORE_DIFF m2, m4, m7, [r0+2*FDEC_STRIDE]
159 STORE_DIFF m3, m4, m7, [r0+3*FDEC_STRIDE]
; SSE4 variant of add4x4_idct: r0 = dst pixels, r1 = dct coefficients.
; Works on two rows per xmm register ("row1/row0" packing); movsd merges
; the unshifted low row back under the shifted high row.
163 cglobal x264_add4x4_idct_sse4, 2,2,6
164 mova m0, [r1+0x00] ; row1/row0
165 mova m2, [r1+0x10] ; row3/row2
166 mova m1, m0 ; row1/row0
167 psraw m0, 1 ; row1>>1/...
168 mova m3, m2 ; row3/row2
169 psraw m2, 1 ; row3>>1/...
170 movsd m0, m1 ; row1>>1/row0
171 movsd m2, m3 ; row3>>1/row2
; Horizontal butterfly stage.
172 psubw m0, m3 ; row1>>1-row3/row0-2
173 paddw m2, m1 ; row3>>1+row1/row0+2
174 SBUTTERFLY2 wd, 0, 2, 1
; Swap word pairs within each half (transpose step).
176 pshuflw m1, m2, 10110001b
177 pshufhw m2, m2, 10110001b
; Vertical stage, with +32 rounding folded into the low half only.
182 mova m1, [pw_32_0 GLOBAL]
183 paddw m1, m0 ; row1/row0 corrected
184 psraw m0, 1 ; row1>>1/...
185 mova m3, m2 ; row3/row2
186 psraw m2, 1 ; row3>>1/...
187 movsd m0, m1 ; row1>>1/row0
188 movsd m2, m3 ; row3>>1/row2
189 psubw m0, m3 ; row1>>1-row3/row0-2
190 paddw m2, m1 ; row3>>1+row1/row0+2
191 SBUTTERFLY2 qdq, 0, 2, 1
; Load the four destination rows and pair them for packed adds.
194 movd m4, [r0+FDEC_STRIDE*0]
195 movd m1, [r0+FDEC_STRIDE*1]
196 movd m3, [r0+FDEC_STRIDE*2]
197 movd m5, [r0+FDEC_STRIDE*3]
198 punpckldq m1, m4 ; row0/row1
200 punpckldq m3, m5 ; row3/row2
; Pack to bytes and scatter rows back with pextrd/movd.
207 packuswb m0, m2 ; row0/row1/row3/row2
208 pextrd [r0+FDEC_STRIDE*0], m0, 3
209 pextrd [r0+FDEC_STRIDE*1], m0, 2
210 movd [r0+FDEC_STRIDE*2], m0
211 pextrd [r0+FDEC_STRIDE*3], m0, 1
215 ;-----------------------------------------------------------------------------
216 ; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
217 ;-----------------------------------------------------------------------------
; NOTE(review): fragment of the SUB_NxN_DCT wrapper macro (its %macro line
; is elided). The add sequences step r1/r2 between sub-blocks using the
; macro's size/offset parameters %4..%6.
223 add r2, 4*FDEC_STRIDE
224 mova m7, [hsub_mul GLOBAL]
232 add r1, %4-%5-%6*FENC_STRIDE
233 add r2, %4-%5-%6*FDEC_STRIDE
236 add r1, (%4-%6)*FENC_STRIDE-%5-%4
237 add r2, (%4-%6)*FDEC_STRIDE-%5-%4
240 add r1, %4-%5-%6*FENC_STRIDE
241 add r2, %4-%5-%6*FDEC_STRIDE
251 ;-----------------------------------------------------------------------------
252 ; void x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
253 ;-----------------------------------------------------------------------------
; Wrapper macro: builds an NxN idct+add by calling a smaller kernel's
; .skip_prologue entry on each sub-block (calls elided from this view);
; the adds advance the destination pointer between sub-blocks.
254 %macro ADD_NxN_IDCT 6-7
258 add r0, 4*FDEC_STRIDE
265 add r0, %4-%5-%6*FDEC_STRIDE
268 add r0, (%4-%6)*FDEC_STRIDE-%5-%4
271 add r0, %4-%5-%6*FDEC_STRIDE
; --- Instantiations: build 8x8/16x16 (i)dcts from the 4x4/8x8 kernels ---
; Args: wrapper name, kernel entry, dct stride (bytes), block size, offsets.
283 SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx.skip_prologue, 32, 4, 0, 0
284 ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx.skip_prologue, 32, 4, 0, 0
285 SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx.skip_prologue, 32, 8, 4, 4
286 ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx.skip_prologue, 32, 8, 4, 4
; 8x8-transform kernels are defined in another file; declare their labels.
288 cextern x264_sub8x8_dct8_mmx.skip_prologue
289 cextern x264_add8x8_idct8_mmx.skip_prologue
290 SUB_NxN_DCT x264_sub16x16_dct8_mmx, x264_sub8x8_dct8_mmx.skip_prologue, 128, 8, 0, 0
291 ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx.skip_prologue, 128, 8, 0, 0
; SSE2/SSSE3 builds of the same wrappers.
296 cextern x264_sub8x8_dct_sse2.skip_prologue
297 cextern x264_sub8x8_dct_ssse3.skip_prologue
298 SUB_NxN_DCT x264_sub16x16_dct_sse2, x264_sub8x8_dct_sse2.skip_prologue, 128, 8, 0, 0
299 SUB_NxN_DCT x264_sub16x16_dct_ssse3, x264_sub8x8_dct_ssse3.skip_prologue, 128, 8, 0, 0
300 cextern x264_add8x8_idct_sse2.skip_prologue
301 ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2.skip_prologue, 2*64, 8, 0, 0
303 cextern x264_sub8x8_dct8_sse2.skip_prologue
304 cextern x264_add8x8_idct8_sse2.skip_prologue
305 SUB_NxN_DCT x264_sub16x16_dct8_sse2, x264_sub8x8_dct8_sse2.skip_prologue, 128, 8, 0, 0
306 ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2.skip_prologue, 128, 8, 0, 0
308 cextern x264_sub8x8_dct8_ssse3.skip_prologue
309 SUB_NxN_DCT x264_sub16x16_dct8_ssse3, x264_sub8x8_dct8_ssse3.skip_prologue, 128, 8, 0, 0
312 ;-----------------------------------------------------------------------------
313 ; void add8x8_idct_dc( uint8_t *p_dst, int16_t *dct2x2 )
314 ;-----------------------------------------------------------------------------
; ADD_DC fragment (%macro line elided): loads four pixel rows at %3, adds
; the broadcast DC values %1/%2 with saturation, and stores them back.
317 movq mm4, [%3+FDEC_STRIDE*0]
318 movq mm5, [%3+FDEC_STRIDE*1]
319 movq mm6, [%3+FDEC_STRIDE*2]
; Row 3 is added in place (paddusb saturates to 0..255).
323 paddusb %1, [%3+FDEC_STRIDE*3]
328 movq [%3+FDEC_STRIDE*0], mm4
329 movq [%3+FDEC_STRIDE*1], mm5
330 movq [%3+FDEC_STRIDE*2], mm6
331 movq [%3+FDEC_STRIDE*3], %1
; Add four 2x2 DC values (r1) to the 8x8 pixel block at r0 (mmx path).
; (Parts of the body are elided in this view.)
334 cglobal x264_add8x8_idct_dc_mmx, 2,2
337 add r0, FDEC_STRIDE*4
; +32 rounding bias before the shift (shift itself elided).
338 paddw mm0, [pw_32 GLOBAL]
; Broadcast the upper DC pair of each register (0xFA = 3,3,2,2).
345 pshufw mm2, mm0, 0xFA
346 pshufw mm3, mm1, 0xFA
349 ADD_DC mm0, mm1, r0-FDEC_STRIDE*4
; SSSE3 variant: unpack DC values with pshufb, then add to all 8 rows
; using movq/movhps pairs (two rows per xmm register).
353 cglobal x264_add8x8_idct_dc_ssse3, 2,2
356 add r0, FDEC_STRIDE*4
357 paddw xmm0, [pw_32 GLOBAL]
; Byte-replication mask: each DC index repeated 4x.
; NOTE(review): xmm5 is reloaded as a pixel row below -- the intervening
; consumers of this mask are elided from this view.
360 movdqa xmm5, [pb_idctdc_unpack GLOBAL]
365 movq xmm2, [r0+FDEC_STRIDE*-4]
366 movq xmm3, [r0+FDEC_STRIDE*-3]
367 movq xmm4, [r0+FDEC_STRIDE*-2]
368 movq xmm5, [r0+FDEC_STRIDE*-1]
369 movhps xmm2, [r0+FDEC_STRIDE* 0]
370 movhps xmm3, [r0+FDEC_STRIDE* 1]
371 movhps xmm4, [r0+FDEC_STRIDE* 2]
372 movhps xmm5, [r0+FDEC_STRIDE* 3]
; Store the updated rows back (adds elided between load and store).
381 movq [r0+FDEC_STRIDE*-4], xmm2
382 movq [r0+FDEC_STRIDE*-3], xmm3
383 movq [r0+FDEC_STRIDE*-2], xmm4
384 movq [r0+FDEC_STRIDE*-1], xmm5
385 movhps [r0+FDEC_STRIDE* 0], xmm2
386 movhps [r0+FDEC_STRIDE* 1], xmm3
387 movhps [r0+FDEC_STRIDE* 2], xmm4
388 movhps [r0+FDEC_STRIDE* 3], xmm5
; 16x16 DC add, mmx path; r2 presumably a loop counter (3 registers).
; (Loop structure elided in this view.)
391 cglobal x264_add16x16_idct_dc_mmx, 2,3
396 paddw mm0, [pw_32 GLOBAL]
403 pshufw mm2, mm0, 0xFA
404 pshufw mm3, mm1, 0xFA
; Right half of the 16-wide row block.
408 ADD_DC mm2, mm3, r0+8
410 add r0, FDEC_STRIDE*4
; IDCT_DC_STORE: load four 16-byte rows at r0+%1, combine with the DC
; values in %2/%3 (the adds are elided here), and store them back.
415 %macro IDCT_DC_STORE 3
416 movdqa xmm4, [r0+%1+FDEC_STRIDE*0]
417 movdqa xmm5, [r0+%1+FDEC_STRIDE*1]
418 movdqa xmm6, [r0+%1+FDEC_STRIDE*2]
419 movdqa xmm7, [r0+%1+FDEC_STRIDE*3]
428 movdqa [r0+%1+FDEC_STRIDE*0], xmm4
429 movdqa [r0+%1+FDEC_STRIDE*1], xmm5
430 movdqa [r0+%1+FDEC_STRIDE*2], xmm6
431 movdqa [r0+%1+FDEC_STRIDE*3], xmm7
; 16x16 DC add, sse2 path (uses 8 xmm registers).
434 cglobal x264_add16x16_idct_dc_sse2, 2,2,8
436 add r0, FDEC_STRIDE*4
442 add r0, FDEC_STRIDE*4
; Rounding bias on both DC vectors before shifting (shift elided).
450 paddw xmm0, [pw_32 GLOBAL]
451 paddw xmm2, [pw_32 GLOBAL]
; Store upper 4 rows (r0-4*stride) and lower 4 rows.
464 IDCT_DC_STORE FDEC_STRIDE*-4, xmm0, xmm1
465 IDCT_DC_STORE 0, xmm2, xmm3
; 16x16 DC add, ssse3 path: DC bytes broadcast via the two unpack masks.
468 cglobal x264_add16x16_idct_dc_ssse3, 2,2,8
470 add r0, FDEC_STRIDE*4
476 add r0, FDEC_STRIDE*4
480 paddw xmm0, [pw_32 GLOBAL]
483 movdqa xmm5, [ pb_idctdc_unpack GLOBAL]
484 movdqa xmm6, [pb_idctdc_unpack2 GLOBAL]
493 IDCT_DC_STORE FDEC_STRIDE*-4, xmm0, xmm1
494 IDCT_DC_STORE 0, xmm2, xmm3
497 ;-----------------------------------------------------------------------------
498 ; void sub8x8_dct_dc( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 )
499 ;-----------------------------------------------------------------------------
; DCTDC_2ROW_MMX: load two enc rows and two dec rows starting at row %3;
; the diff/accumulate steps are elided from this view.
501 %macro DCTDC_2ROW_MMX 3
502 movq %1, [r1+FENC_STRIDE*(0+%3)]
503 movq m1, [r1+FENC_STRIDE*(1+%3)]
504 movq m2, [r2+FDEC_STRIDE*(0+%3)]
505 movq m3, [r2+FDEC_STRIDE*(1+%3)]
; Compute 2x2 DC of the 8x8 difference block: r0 = dct, r1 = pix1, r2 = pix2.
521 cglobal x264_sub8x8_dct_dc_mmxext, 3,3
; Advance to the lower half of the block.
524 add r1, FENC_STRIDE*4
525 add r2, FDEC_STRIDE*4
528 DCTDC_2ROW_MMX m0, m4, 0
529 DCTDC_2ROW_MMX m5, m6, 2
; SSE2 variant of the two-row loader; %1 = starting row offset.
537 %macro DCTDC_2ROW_SSE2 3
538 movq m0, [r1+FENC_STRIDE*(0+%1)]
539 movq m1, [r1+FENC_STRIDE*(1+%1)]
540 movq m2, [r2+FDEC_STRIDE*(0+%1)]
541 movq m3, [r2+FDEC_STRIDE*(1+%1)]
; SSE2 build of sub8x8_dct_dc: top half accumulates into m4, bottom into m5.
555 cglobal x264_sub8x8_dct_dc_sse2, 3,3,8
557 DCTDC_2ROW_SSE2 0, 0, m4
558 DCTDC_2ROW_SSE2 2, 1, m4
559 add r1, FENC_STRIDE*4
560 add r2, FDEC_STRIDE*4
562 DCTDC_2ROW_SSE2 0, 0, m5
563 DCTDC_2ROW_SSE2 2, 1, m5
570 ;-----------------------------------------------------------------------------
571 ; void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[8][8] )
572 ;-----------------------------------------------------------------------------
; Template instantiated for sse2 and ssse3 via the PALIGNR %defines at the
; bottom; r0 = level (output), r1 = dct. Large parts are elided here.
574 cglobal x264_zigzag_scan_8x8_frame_%1, 2,2,8
; Rotate rows by decreasing byte counts to line up the diagonal scan.
578 PALIGNR xmm1, xmm1, 14, xmm2
583 PALIGNR xmm2, xmm2, 12, xmm4
585 PALIGNR xmm3, xmm3, 10, xmm4
611 movdqa xmm7, [r1+112]
619 PALIGNR xmm4, xmm4, 14, xmm3
621 PALIGNR xmm5, xmm5, 12, xmm3
623 PALIGNR xmm6, xmm6, 10, xmm3
626 PALIGNR xmm7, xmm7, 8, xmm3
630 punpcklqdq xmm7, xmm7
650 pshufw mm4, mm4, 0x6c
; 0x1b = reverse word order within the 64-bit half.
664 pshufhw xmm0, xmm0, 0x1b
665 pshuflw xmm4, xmm4, 0x1b
666 pshufhw xmm3, xmm3, 0x1b
667 pshuflw xmm7, xmm7, 0x1b
; Scatter halves to their zigzag positions in the output array.
669 movlps [r0+2*10], xmm0
670 movhps [r0+2*17], xmm0
671 movlps [r0+2*21], xmm3
672 movlps [r0+2*28], xmm4
673 movhps [r0+2*32], xmm3
674 movhps [r0+2*39], xmm4
675 movlps [r0+2*43], xmm7
676 movhps [r0+2*50], xmm7
; Instantiate with mmx-emulated PALIGNR (sse2) and native pshufb (ssse3).
682 %define PALIGNR PALIGNR_MMX
684 %define PALIGNR PALIGNR_SSSE3
687 ;-----------------------------------------------------------------------------
688 ; void x264_zigzag_scan_8x8_frame_mmxext( int16_t level[64], int16_t dct[8][8] )
689 ;-----------------------------------------------------------------------------
; mmxext frame zigzag; body is almost entirely elided from this view.
690 cglobal x264_zigzag_scan_8x8_frame_mmxext, 2,2
; 0x1b reverses the word order of an mmx register.
752 pshufw mm6, mm6, 0x1b
753 pshufw mm5, mm5, 0x1b
774 pshufw mm2, mm2, 0x1b
775 pshufw mm7, mm7, 0x1b
782 ;-----------------------------------------------------------------------------
783 ; void x264_zigzag_scan_4x4_frame_mmx( int16_t level[16], int16_t dct[4][4] )
784 ;-----------------------------------------------------------------------------
; mmx 4x4 frame zigzag (body elided from this view).
785 cglobal x264_zigzag_scan_4x4_frame_mmx, 2,2
812 ;-----------------------------------------------------------------------------
813 ; void x264_zigzag_scan_4x4_frame_ssse3( int16_t level[16], int16_t dct[4][4] )
814 ;-----------------------------------------------------------------------------
; pshufb-based 4x4 frame zigzag using the pb_scan4frame masks, with
; palignr to stitch the two permuted halves into scan order.
815 cglobal x264_zigzag_scan_4x4_frame_ssse3, 2,2
818 pshufb xmm1, [pb_scan4frameb GLOBAL]
819 pshufb xmm0, [pb_scan4framea GLOBAL]
822 palignr xmm2, xmm0, 6
824 palignr xmm1, xmm0, 10
829 ;-----------------------------------------------------------------------------
830 ; void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] )
831 ;-----------------------------------------------------------------------------
832 ; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2
; Field-order 4x4 zigzag; only the first shuffle is visible here.
833 cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
834 pshufw mm0, [r1+4], 0xd2
846 ;-----------------------------------------------------------------------------
847 ; void x264_zigzag_scan_8x8_field_mmxext( int16_t level[64], int16_t dct[8][8] )
848 ;-----------------------------------------------------------------------------
; The table below documents the field scan order being produced; the
; trailing comments on each instruction track which source coefficients
; (by index) occupy each word lane, most-significant first.
852 ; 16 11 5 6 7 12 17 24
853 ; 18 13 14 15 19 25 32 26
854 ; 20 21 22 23 27 33 40 34
855 ; 28 29 30 31 35 41 48 42
856 ; 36 37 38 39 43 49 50 44
857 ; 45 46 47 51 56 57 52 53
858 ; 54 55 58 59 60 61 62 63
; NOTE(review): some lines are elided from this view, so a few registers
; (e.g. mm4, mm6) are consumed here without their visible definition.
860 cglobal x264_zigzag_scan_8x8_field_mmxext, 2,3
861 movq mm0, [r1+2*0] ; 03 02 01 00
862 movq mm1, [r1+2*4] ; 07 06 05 04
863 movq mm2, [r1+2*8] ; 11 10 09 08
864 pshufw mm3, mm0, 011111111b ; 03 03 03 03
866 pshufw mm2, mm2, 000111001b ; 08 11 10 09
867 punpcklwd mm3, mm1 ; 05 03 04 03
868 pinsrw mm0, r2, 3 ; 08 02 01 00
870 punpcklwd mm2, mm3 ; 04 10 03 09
871 pshufw mm2, mm2, 010110100b ; 10 04 03 09
872 movq [r0+2*0], mm0 ; 08 02 01 00
873 movq [r0+2*4], mm2 ; 10 04 03 09
874 movq mm3, [r1+2*12] ; 15 14 13 12
875 movq mm5, [r1+2*16] ; 19 18 17 16
876 punpckldq mm6, mm5 ; 17 16 XX XX
877 psrlq mm1, 16 ; XX 07 06 05
878 punpckhwd mm6, mm4 ; 08 17 11 16
879 punpckldq mm6, mm1 ; 06 05 11 16
880 movq [r0+2*8], mm6 ; 06 05 11 16
881 psrlq mm1, 16 ; XX XX 07 06
882 punpcklwd mm1, mm5 ; 17 07 16 06
883 movq mm0, [r1+2*20] ; 23 22 21 20
884 movq mm2, [r1+2*24] ; 27 26 25 24
886 punpckhdq mm1, mm1 ; 17 07 17 07
887 punpcklwd mm6, mm2 ; 25 13 24 12
889 movq [r0+2*24], mm0 ; 23 22 21 20
890 punpcklwd mm1, mm6 ; 24 17 12 07
892 pinsrw mm3, r2, 0 ; 15 14 13 18
893 movq [r0+2*16], mm3 ; 15 14 13 18
895 movq mm0, [r1+2*32] ; 35 34 33 32
896 psrlq mm5, 48 ; XX XX XX 19
897 pshufw mm1, mm2, 011111001b ; 27 27 26 25
898 punpcklwd mm5, mm0 ; 33 XX 32 19
899 psrlq mm2, 48 ; XX XX XX 27
900 punpcklwd mm5, mm1 ; 26 32 25 19
902 movq [r0+2*20], mm5 ; 26 32 25 19
904 movq mm1, [r1+2*40] ; 43 42 41 40
905 pshufw mm3, mm0, 011111001b ; 35 35 34 33
906 punpcklwd mm2, mm1 ; 41 XX 40 27
908 punpcklwd mm2, mm3 ; 34 40 33 27
910 movq mm7, [r1+2*44] ; 47 46 45 44
911 movq mm2, [r1+2*48] ; 51 50 49 48
912 psrlq mm0, 48 ; XX XX XX 35
913 punpcklwd mm0, mm2 ; 49 XX 48 35
914 pshufw mm3, mm1, 011111001b ; 43 43 42 41
915 punpcklwd mm0, mm3 ; 42 48 41 35
917 pextrw r2, mm2, 3 ; 51
918 psrlq mm1, 48 ; XX XX XX 43
919 punpcklwd mm1, mm7 ; 45 XX 44 43
920 psrlq mm2, 16 ; XX 51 50 49
921 punpcklwd mm1, mm2 ; 50 44 49 43
922 pshufw mm1, mm1, 010110100b ; 44 50 49 43
924 psrlq mm7, 16 ; XX 47 46 45
925 pinsrw mm7, r2, 3 ; 51 47 46 45
927 movq mm0, [r1+2*56] ; 59 58 57 56
928 movq mm1, [r1+2*52] ; 55 54 53 52
931 punpckldq mm2, mm1 ; 53 52 57 56
932 punpckhdq mm1, mm0 ; 59 58 55 54
938 ;-----------------------------------------------------------------------------
939 ; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst )
940 ;-----------------------------------------------------------------------------
; Template: subtract dst from src, zigzag the residual into level, and copy
; src over dst. %1 selects the "ac" variant (extra arg / DC masking),
; %2 selects frame or field scan order.
941 %macro ZIGZAG_SUB_4x4 2
943 cglobal x264_zigzag_sub_4x4%1_%2_ssse3, 4,4,8
945 cglobal x264_zigzag_sub_4x4%1_%2_ssse3, 3,3,8
; Load the 4 src rows (r1) and 4 dst rows (r2)...
947 movd xmm0, [r1+0*FENC_STRIDE]
948 movd xmm1, [r1+1*FENC_STRIDE]
949 movd xmm2, [r1+2*FENC_STRIDE]
950 movd xmm3, [r1+3*FENC_STRIDE]
951 movd xmm4, [r2+0*FDEC_STRIDE]
952 movd xmm5, [r2+1*FDEC_STRIDE]
953 movd xmm6, [r2+2*FDEC_STRIDE]
954 movd xmm7, [r2+3*FDEC_STRIDE]
; ...then overwrite dst with the src pixels.
955 movd [r2+0*FDEC_STRIDE], xmm0
956 movd [r2+1*FDEC_STRIDE], xmm1
957 movd [r2+2*FDEC_STRIDE], xmm2
958 movd [r2+3*FDEC_STRIDE], xmm3
; Gather all 16 src bytes into xmm0 and all 16 dst bytes into xmm4.
963 punpcklqdq xmm0, xmm2
964 punpcklqdq xmm4, xmm6
; Scan-order shuffle mask: frame or field per %2.
966 movdqa xmm7, [pb_sub4frame GLOBAL]
968 movdqa xmm7, [pb_sub4field GLOBAL]
; "ac" variant: zero the DC (first) coefficient.
983 pand xmm0, [pb_subacmask GLOBAL]
; Instantiate all four combinations.
999 ZIGZAG_SUB_4x4 , frame
1000 ZIGZAG_SUB_4x4 ac, frame
1001 ZIGZAG_SUB_4x4 , field
1002 ZIGZAG_SUB_4x4 ac, field
1004 ;-----------------------------------------------------------------------------
1005 ; void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src, uint8_t *nnz )
1006 ;-----------------------------------------------------------------------------
; mmx interleave fragment (%macro line elided): load 4 quadwords of src at
; offset %1*4 and transpose them so each 4x4 block's coefficients interleave.
1009 movq m0, [r1+%1*4+ 0]
1010 movq m1, [r1+%1*4+ 8]
1011 movq m2, [r1+%1*4+16]
1012 movq m3, [r1+%1*4+24]
1013 TRANSPOSE4x4W 0,1,2,3,4
; r0 = dst, r1 = src, r2 = nnz (body largely elided).
1032 cglobal x264_zigzag_interleave_8x8_cavlc_mmx, 3,3
; Presumably part of computing the nnz flags -- confirm against full source.
1042 paddb m5, [pb_1 GLOBAL]
; INTERLEAVE_XMM: sse2 version -- four 16-byte loads, word-interleaved via
; two rounds of SBUTTERFLY, then scattered to dst in 32-byte strides.
1049 %macro INTERLEAVE_XMM 1
1050 mova m0, [r1+%1*4+ 0]
1051 mova m1, [r1+%1*4+16]
1052 mova m4, [r1+%1*4+32]
1053 mova m5, [r1+%1*4+48]
1054 SBUTTERFLY wd, 0, 1, 6
1055 SBUTTERFLY wd, 4, 5, 7
1056 SBUTTERFLY wd, 0, 1, 6
1057 SBUTTERFLY wd, 4, 5, 7
1059 movhps [r0+%1+ 32], m0
1060 movq [r0+%1+ 64], m1
1061 movhps [r0+%1+ 96], m1
1063 movhps [r0+%1+ 40], m4
1064 movq [r0+%1+ 72], m5
1065 movhps [r0+%1+104], m5
1080 cglobal x264_zigzag_interleave_8x8_cavlc_sse2, 3,3,8
1088 paddb m5, [pb_1 GLOBAL]