1 ;*****************************************************************************
2 ;* dct-a.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2003-2008 x264 project
6 ;* Authors: Holger Lubitz <holger@lubitz.org>
7 ;* Loren Merritt <lorenm@u.washington.edu>
8 ;* Laurent Aimar <fenrir@via.ecp.fr>
9 ;* Min Chen <chenm001@163.com>
11 ;* This program is free software; you can redistribute it and/or modify
12 ;* it under the terms of the GNU General Public License as published by
13 ;* the Free Software Foundation; either version 2 of the License, or
14 ;* (at your option) any later version.
16 ;* This program is distributed in the hope that it will be useful,
17 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 ;* GNU General Public License for more details.
21 ;* You should have received a copy of the GNU General Public License
22 ;* along with this program; if not, write to the Free Software
23 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24 ;*****************************************************************************
27 %include "x86util.asm"
; SHUFFLE_16BIT: emits a 16-byte pshufb control vector from 8 word indices.
; NOTE(review): the macro body and its %endmacro are not visible in this chunk — TODO confirm.
29 %macro SHUFFLE_16BIT 8
; Shuffle/mask constants used by the zigzag and idct_dc routines below.
38 pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
39 pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15
; pb_subacmask is word-sized data despite the pb_ prefix: first word 0, seven
; words of -1 — pand with it clears the DC (first) coefficient and keeps the ACs.
40 pb_subacmask: dw 0,-1,-1,-1,-1,-1,-1,-1
41 pb_scan4framea: SHUFFLE_16BIT 6,3,7,0,4,1,2,5
42 pb_scan4frameb: SHUFFLE_16BIT 0,4,1,2,5,6,3,7
; pshufb controls that replicate each of dct words 0..3 (resp. 4..7) four times.
43 pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
44 pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
; NOTE(review): these two SUMSUB_BADC lines sit inside an enclosing macro whose
; header is not visible in this chunk (a 4x4 butterfly ladder) — TODO confirm.
56 SUMSUB_BADC m%4, m%3, m%2, m%1, m%5
57 SUMSUB_BADC m%4, m%2, m%3, m%1, m%5
; SUMSUB_17BIT: sum/difference that survives 17-bit intermediates via the
; 0x8000 bias (see pw_8000 use below); body not visible here — TODO confirm.
61 %macro SUMSUB_17BIT 4 ; a, b, tmp, 0x8000
74 ;-----------------------------------------------------------------------------
75 ; void dct4x4dc( int16_t d[4][4] )
76 ;-----------------------------------------------------------------------------
; In-place transform of the 4x4 luma DC block. r0 = d. Uses m7 = 0x8000 bias so
; pavgw can emulate a 17-bit-safe (a+b)>>1. Several lines of this routine are
; not visible in this chunk.
77 cglobal dct4x4dc_mmx, 1,1
82 movq m7, [pw_8000] ; convert to unsigned and back, so that pavgw works
84 TRANSPOSE4x4W 0,1,2,3,4
85 SUMSUB_BADC m1, m0, m3, m2, m4
96 ;-----------------------------------------------------------------------------
97 ; void idct4x4dc( int16_t d[4][4] )
98 ;-----------------------------------------------------------------------------
; In-place inverse transform of the 4x4 DC block. r0 = d. Body is mostly not
; visible in this chunk; only the row/column transpose step is shown.
99 cglobal idct4x4dc_mmx, 1,1
105 TRANSPOSE4x4W 0,1,2,3,4
114 ;-----------------------------------------------------------------------------
115 ; void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
116 ;-----------------------------------------------------------------------------
; Template body (%1 = cpu suffix): dct[r0] = forward 4x4 DCT of (pix1 - pix2).
; The mmx path loads rows 0,3,1,2 individually; the ssse3 path uses
; LOAD_DIFF8x4_SSSE3 instead. The %if dispatch between them is not visible here.
117 cglobal sub4x4_dct_%1, 3,3
120 LOAD_DIFF m0, m4, m5, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
121 LOAD_DIFF m3, m4, m5, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
122 LOAD_DIFF m1, m4, m5, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
123 LOAD_DIFF m2, m4, m5, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
126 LOAD_DIFF8x4_SSSE3 0, 3, 1, 2, 4, 5, r1, r2
129 TRANSPOSE4x4W 0,1,2,3,4
141 ;-----------------------------------------------------------------------------
142 ; void add4x4_idct( uint8_t *p_dst, int16_t dct[4][4] )
143 ;-----------------------------------------------------------------------------
; Inverse 4x4 transform of dct[r1], result added to the 4 rows at p_dst (r0)
; with saturation via STORE_DIFF. The load/butterfly lines between the
; prologue and the transpose are not visible in this chunk.
144 cglobal add4x4_idct_mmx, 2,2
152 TRANSPOSE4x4W 0,1,2,3,4
155 STORE_DIFF m0, m4, m7, [r0+0*FDEC_STRIDE]
156 STORE_DIFF m1, m4, m7, [r0+1*FDEC_STRIDE]
157 STORE_DIFF m2, m4, m7, [r0+2*FDEC_STRIDE]
158 STORE_DIFF m3, m4, m7, [r0+3*FDEC_STRIDE]
; SSE4 add4x4_idct: processes two 4-coefficient rows per xmm register
; (row1/row0 packed in one reg, row3/row2 in the other), doing the 1-D
; transform twice with a transpose-by-shuffle in between, then adds the
; result to the four destination rows. pextrd (SSE4.1) does the final stores.
162 cglobal add4x4_idct_sse4, 2,2,6
163 mova m0, [r1+0x00] ; row1/row0
164 mova m2, [r1+0x10] ; row3/row2
165 mova m1, m0 ; row1/row0
166 psraw m0, 1 ; row1>>1/...
167 mova m3, m2 ; row3/row2
168 psraw m2, 1 ; row3>>1/...
; movsd keeps the low qword (row0/row2) unshifted — only odd rows get >>1.
169 movsd m0, m1 ; row1>>1/row0
170 movsd m2, m3 ; row3>>1/row2
171 psubw m0, m3 ; row1>>1-row3/row0-2
172 paddw m2, m1 ; row3>>1+row1/row0+2
173 SBUTTERFLY2 wd, 0, 2, 1
; pshuf{l,h}w with 10110001b swaps adjacent word pairs: part of the in-register
; transpose between the two 1-D transform passes.
175 pshuflw m1, m2, 10110001b
176 pshufhw m2, m2, 10110001b
; Second 1-D pass (same butterfly shape as above).
182 paddw m1, m0 ; row1/row0 corrected
183 psraw m0, 1 ; row1>>1/...
184 mova m3, m2 ; row3/row2
185 psraw m2, 1 ; row3>>1/...
186 movsd m0, m1 ; row1>>1/row0
187 movsd m2, m3 ; row3>>1/row2
188 psubw m0, m3 ; row1>>1-row3/row0-2
189 paddw m2, m1 ; row3>>1+row1/row0+2
190 SBUTTERFLY2 qdq, 0, 2, 1
; Load the 4 destination rows, add residual, pack and store back. Note the
; packed row order is row0/row1/row3/row2, hence the pextrd index pattern.
193 movd m4, [r0+FDEC_STRIDE*0]
194 movd m1, [r0+FDEC_STRIDE*1]
195 movd m3, [r0+FDEC_STRIDE*2]
196 movd m5, [r0+FDEC_STRIDE*3]
197 punpckldq m1, m4 ; row0/row1
199 punpckldq m3, m5 ; row3/row2
206 packuswb m0, m2 ; row0/row1/row3/row2
207 pextrd [r0+FDEC_STRIDE*0], m0, 3
208 pextrd [r0+FDEC_STRIDE*1], m0, 2
209 movd [r0+FDEC_STRIDE*2], m0
210 pextrd [r0+FDEC_STRIDE*3], m0, 1
214 ;-----------------------------------------------------------------------------
215 ; void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
216 ;-----------------------------------------------------------------------------
; SUB_NxN_DCT (macro header not visible in this chunk): builds an NxN dct by
; calling a smaller sub-block dct four times, advancing r1/r2 between calls.
; %4/%5/%6 parameterize the dct-buffer stride and the pixel-pointer deltas.
222 add r2, 4*FDEC_STRIDE
231 add r1, %4-%5-%6*FENC_STRIDE
232 add r2, %4-%5-%6*FDEC_STRIDE
235 add r1, (%4-%6)*FENC_STRIDE-%5-%4
236 add r2, (%4-%6)*FDEC_STRIDE-%5-%4
239 add r1, %4-%5-%6*FENC_STRIDE
240 add r2, %4-%5-%6*FDEC_STRIDE
250 ;-----------------------------------------------------------------------------
251 ; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
252 ;-----------------------------------------------------------------------------
; ADD_NxN_IDCT: same 4-sub-block composition for the inverse transform; only
; the destination pointer r0 needs advancing between calls.
253 %macro ADD_NxN_IDCT 6-7
257 add r0, 4*FDEC_STRIDE
264 add r0, %4-%5-%6*FDEC_STRIDE
267 add r0, (%4-%6)*FDEC_STRIDE-%5-%4
270 add r0, %4-%5-%6*FDEC_STRIDE
; Instantiate the composed 8x8/16x16 (i)dct entry points from the 4x4/8x8
; kernels above. The .skip_prologue labels jump past the argument setup of the
; callee; cextern declares the ones defined in other files (dct-64/sse2 objs).
282 SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx.skip_prologue, 32, 4, 0, 0
283 ADD_NxN_IDCT add8x8_idct_mmx, add4x4_idct_mmx.skip_prologue, 32, 4, 0, 0
284 SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx.skip_prologue, 32, 8, 4, 4
285 ADD_NxN_IDCT add16x16_idct_mmx, add8x8_idct_mmx.skip_prologue, 32, 8, 4, 4
287 cextern sub8x8_dct8_mmx.skip_prologue
288 cextern add8x8_idct8_mmx.skip_prologue
289 SUB_NxN_DCT sub16x16_dct8_mmx, sub8x8_dct8_mmx.skip_prologue, 128, 8, 0, 0
290 ADD_NxN_IDCT add16x16_idct8_mmx, add8x8_idct8_mmx.skip_prologue, 128, 8, 0, 0
295 cextern sub8x8_dct_sse2.skip_prologue
296 cextern sub8x8_dct_ssse3.skip_prologue
297 SUB_NxN_DCT sub16x16_dct_sse2, sub8x8_dct_sse2.skip_prologue, 128, 8, 0, 0
298 SUB_NxN_DCT sub16x16_dct_ssse3, sub8x8_dct_ssse3.skip_prologue, 128, 8, 0, 0
299 cextern add8x8_idct_sse2.skip_prologue
300 ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2.skip_prologue, 2*64, 8, 0, 0
302 cextern sub8x8_dct8_sse2.skip_prologue
303 cextern add8x8_idct8_sse2.skip_prologue
304 SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2.skip_prologue, 128, 8, 0, 0
305 ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2.skip_prologue, 128, 8, 0, 0
307 cextern sub8x8_dct8_ssse3.skip_prologue
308 SUB_NxN_DCT sub16x16_dct8_ssse3, sub8x8_dct8_ssse3.skip_prologue, 128, 8, 0, 0
311 ;-----------------------------------------------------------------------------
312 ; void add8x8_idct_dc( uint8_t *p_dst, int16_t *dct2x2 )
313 ;-----------------------------------------------------------------------------
; ADD_DC (macro header not visible): adds a replicated DC value to 4 pixel
; rows at %3 with unsigned saturation. Positive and negative DC are handled by
; a paddusb/psubusb pair; only the paddusb side is visible in this chunk.
316 movq mm4, [%3+FDEC_STRIDE*0]
317 movq mm5, [%3+FDEC_STRIDE*1]
318 movq mm6, [%3+FDEC_STRIDE*2]
322 paddusb %1, [%3+FDEC_STRIDE*3]
327 movq [%3+FDEC_STRIDE*0], mm4
328 movq [%3+FDEC_STRIDE*1], mm5
329 movq [%3+FDEC_STRIDE*2], mm6
330 movq [%3+FDEC_STRIDE*3], %1
; MMX path: r0 is advanced to the middle of the 8x8 block so both halves are
; reachable with +/-4*FDEC_STRIDE offsets. pshufw 0xFA replicates the two high
; words of the 2x2 DC vector across each half-register.
333 cglobal add8x8_idct_dc_mmx, 2,2
336 add r0, FDEC_STRIDE*4
344 pshufw mm2, mm0, 0xFA
345 pshufw mm3, mm1, 0xFA
348 ADD_DC mm0, mm1, r0-FDEC_STRIDE*4
; SSSE3 path: pb_idctdc_unpack (pshufb) replicates each DC word to 4 bytes,
; then all 8 rows are loaded as movq/movhps pairs, biased, and stored back.
; xmm5 is reused: first as the pshufb control, later as a pixel row.
352 cglobal add8x8_idct_dc_ssse3, 2,2
355 add r0, FDEC_STRIDE*4
359 movdqa xmm5, [pb_idctdc_unpack]
364 movq xmm2, [r0+FDEC_STRIDE*-4]
365 movq xmm3, [r0+FDEC_STRIDE*-3]
366 movq xmm4, [r0+FDEC_STRIDE*-2]
367 movq xmm5, [r0+FDEC_STRIDE*-1]
368 movhps xmm2, [r0+FDEC_STRIDE* 0]
369 movhps xmm3, [r0+FDEC_STRIDE* 1]
370 movhps xmm4, [r0+FDEC_STRIDE* 2]
371 movhps xmm5, [r0+FDEC_STRIDE* 3]
; (add/sub-with-saturation lines between load and store are not visible here)
380 movq [r0+FDEC_STRIDE*-4], xmm2
381 movq [r0+FDEC_STRIDE*-3], xmm3
382 movq [r0+FDEC_STRIDE*-2], xmm4
383 movq [r0+FDEC_STRIDE*-1], xmm5
384 movhps [r0+FDEC_STRIDE* 0], xmm2
385 movhps [r0+FDEC_STRIDE* 1], xmm3
386 movhps [r0+FDEC_STRIDE* 2], xmm4
387 movhps [r0+FDEC_STRIDE* 3], xmm5
; 16x16 DC add, MMX: loops over 4 row-groups (r2 presumably the loop counter —
; body largely not visible in this chunk), handling left/right 8-pixel halves
; via two ADD_DC invocations per group.
390 cglobal add16x16_idct_dc_mmx, 2,3
402 pshufw mm2, mm0, 0xFA
403 pshufw mm3, mm1, 0xFA
407 ADD_DC mm2, mm3, r0+8
409 add r0, FDEC_STRIDE*4
; IDCT_DC_STORE: load 4 full 16-byte rows at r0+%1, add the saturated DC bias
; registers %2/%3 (add/sub lines not visible in this chunk), store back.
414 %macro IDCT_DC_STORE 3
415 movdqa xmm4, [r0+%1+FDEC_STRIDE*0]
416 movdqa xmm5, [r0+%1+FDEC_STRIDE*1]
417 movdqa xmm6, [r0+%1+FDEC_STRIDE*2]
418 movdqa xmm7, [r0+%1+FDEC_STRIDE*3]
427 movdqa [r0+%1+FDEC_STRIDE*0], xmm4
428 movdqa [r0+%1+FDEC_STRIDE*1], xmm5
429 movdqa [r0+%1+FDEC_STRIDE*2], xmm6
430 movdqa [r0+%1+FDEC_STRIDE*3], xmm7
; SSE2 16x16 DC add: processes 8 rows per iteration via two IDCT_DC_STORE
; calls (one at -4 strides, one at 0). DC expansion lines not visible here.
433 cglobal add16x16_idct_dc_sse2, 2,2,8
435 add r0, FDEC_STRIDE*4
441 add r0, FDEC_STRIDE*4
463 IDCT_DC_STORE FDEC_STRIDE*-4, xmm0, xmm1
464 IDCT_DC_STORE 0, xmm2, xmm3
; SSSE3 16x16 DC add: same structure as the sse2 version but expands the DC
; words to bytes with the two pb_idctdc_unpack pshufb controls.
467 cglobal add16x16_idct_dc_ssse3, 2,2,8
469 add r0, FDEC_STRIDE*4
475 add r0, FDEC_STRIDE*4
482 movdqa xmm5, [ pb_idctdc_unpack]
483 movdqa xmm6, [pb_idctdc_unpack2]
492 IDCT_DC_STORE FDEC_STRIDE*-4, xmm0, xmm1
493 IDCT_DC_STORE 0, xmm2, xmm3
496 ;-----------------------------------------------------------------------------
497 ; void sub8x8_dct_dc( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 )
498 ;-----------------------------------------------------------------------------
; DCTDC_2ROW_MMX %1=dst-accum reg, %3=row offset: loads two 8-pixel rows from
; enc (r1) and dec (r2); the widen/subtract/accumulate lines are not visible.
500 %macro DCTDC_2ROW_MMX 3
501 movq %1, [r1+FENC_STRIDE*(0+%3)]
502 movq m1, [r1+FENC_STRIDE*(1+%3)]
503 movq m2, [r2+FDEC_STRIDE*(0+%3)]
504 movq m3, [r2+FDEC_STRIDE*(1+%3)]
; DCT2x2: 2x2 Hadamard-style transform of the four 4x4-block sums.
; Inputs are packed s1/s0 and s3/s2 word pairs; output in mm0 holds
; (d02-d13, s02-s13, d02+d13, s02+s13). Clobbers mm0/mm1; %1 must not be mm1.
520 %macro DCT2x2 2 ; reg s1/s0 (!=m1), reg s3/s2
521 pshufw mm1, %1, 10100000b ; s1 s1 s0 s0
522 pshufw mm0, %2, 10110001b ; s3 __ s2 __
523 paddw mm1, %2 ; s1 s13 s0 s02
524 psubw mm1, mm0 ; d13 s13 d02 s02
525 pshufw mm0, mm1, 01000100b ; d02 s02 d02 s02
526 psrlq mm1, 32 ; __ __ d13 s13
527 paddw mm0, mm1 ; d02 s02 d02+d13 s02+s13
528 psllq mm1, 32 ; d13 s13
529 psubw mm0, mm1 ; d02-d13 s02-s13 d02+d13 s02+s13
; Computes the 2x2 DC block of (pix1 - pix2) over an 8x8 area: two row-pair
; accumulations per half, pointers advanced by 4 strides between halves.
; Reduction and DCT2x2/store lines are not visible in this chunk.
533 cglobal sub8x8_dct_dc_mmxext, 3,3
534 DCTDC_2ROW_MMX m0, m4, 0
535 DCTDC_2ROW_MMX m5, m6, 2
539 add r1, FENC_STRIDE*4
540 add r2, FDEC_STRIDE*4
541 DCTDC_2ROW_MMX m7, m4, 0
542 DCTDC_2ROW_MMX m5, m6, 2
; SSE2 variant of DCTDC_2ROW: %1 = row offset. Loads two enc/dec row pairs;
; the widening-subtract/accumulate lines are not visible in this chunk.
551 %macro DCTDC_2ROW_SSE2 3
552 movq m0, [r1+FENC_STRIDE*(0+%1)]
553 movq m1, [r1+FENC_STRIDE*(1+%1)]
554 movq m2, [r2+FDEC_STRIDE*(0+%1)]
555 movq m3, [r2+FDEC_STRIDE*(1+%1)]
; SSE2 sub8x8_dct_dc: same structure as the mmxext version — top half
; accumulated into m4, bottom half into m5, final 2x2 transform not visible.
569 cglobal sub8x8_dct_dc_sse2, 3,3,8
571 DCTDC_2ROW_SSE2 0, 0, m4
572 DCTDC_2ROW_SSE2 2, 1, m4
573 add r1, FENC_STRIDE*4
574 add r2, FDEC_STRIDE*4
576 DCTDC_2ROW_SSE2 0, 0, m5
577 DCTDC_2ROW_SSE2 2, 1, m5
587 ;-----------------------------------------------------------------------------
588 ; void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
589 ;-----------------------------------------------------------------------------
; SSE template (%1 = sse2/ssse3 via the PALIGNR defines below): reorders the
; 8x8 dct into frame zigzag order using byte-rotates (PALIGNR) and word
; shuffles, then scatters via movlps/movhps to zigzag positions. Many
; intermediate lines of this routine are not visible in this chunk.
591 cglobal zigzag_scan_8x8_frame_%1, 2,2,8
595 PALIGNR xmm1, xmm1, 14, xmm2
600 PALIGNR xmm2, xmm2, 12, xmm4
602 PALIGNR xmm3, xmm3, 10, xmm4
628 movdqa xmm7, [r1+112]
636 PALIGNR xmm4, xmm4, 14, xmm3
638 PALIGNR xmm5, xmm5, 12, xmm3
640 PALIGNR xmm6, xmm6, 10, xmm3
643 PALIGNR xmm7, xmm7, 8, xmm3
647 punpcklqdq xmm7, xmm7
667 pshufw mm4, mm4, 0x6c
; 0x1b = reverse the 4 words of a half-register (anti-diagonal runs).
681 pshufhw xmm0, xmm0, 0x1b
682 pshuflw xmm4, xmm4, 0x1b
683 pshufhw xmm3, xmm3, 0x1b
684 pshuflw xmm7, xmm7, 0x1b
; Scatter: store offsets are 2*zigzag_index into level[].
686 movlps [r0+2*10], xmm0
687 movhps [r0+2*17], xmm0
688 movlps [r0+2*21], xmm3
689 movlps [r0+2*28], xmm4
690 movhps [r0+2*32], xmm3
691 movhps [r0+2*39], xmm4
692 movlps [r0+2*43], xmm7
693 movhps [r0+2*50], xmm7
; Bind PALIGNR to the emulated (sse2) or native (ssse3) form before each
; instantiation of the template above. The INIT/instantiation lines between
; these defines are not visible in this chunk.
699 %define PALIGNR PALIGNR_MMX
701 %define PALIGNR PALIGNR_SSSE3
704 ;-----------------------------------------------------------------------------
705 ; void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
706 ;-----------------------------------------------------------------------------
; MMXEXT fallback: same frame zigzag using 64-bit regs and pshufw 0x1b
; word-reversals; body largely not visible in this chunk.
707 cglobal zigzag_scan_8x8_frame_mmxext, 2,2
769 pshufw mm6, mm6, 0x1b
770 pshufw mm5, mm5, 0x1b
791 pshufw mm2, mm2, 0x1b
792 pshufw mm7, mm7, 0x1b
799 ;-----------------------------------------------------------------------------
800 ; void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
801 ;-----------------------------------------------------------------------------
; MMX 4x4 frame zigzag; body not visible in this chunk.
802 cglobal zigzag_scan_4x4_frame_mmx, 2,2
829 ;-----------------------------------------------------------------------------
830 ; void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
831 ;-----------------------------------------------------------------------------
; SSSE3: pshufb with the pb_scan4frame{a,b} controls does most of the
; reordering; palignr splices the two shuffled halves into final scan order.
832 cglobal zigzag_scan_4x4_frame_ssse3, 2,2
835 pshufb xmm1, [pb_scan4frameb]
836 pshufb xmm0, [pb_scan4framea]
839 palignr xmm2, xmm0, 6
841 palignr xmm1, xmm0, 10
846 ;-----------------------------------------------------------------------------
847 ; void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
848 ;-----------------------------------------------------------------------------
849 ; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2
; Field (interlaced) 4x4 scan; body mostly not visible in this chunk.
850 cglobal zigzag_scan_4x4_field_mmxext, 2,3
851 pshufw mm0, [r1+4], 0xd2
863 ;-----------------------------------------------------------------------------
864 ; void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
865 ;-----------------------------------------------------------------------------
; The table below is the field-scan output order being assembled; each movq
; stores 4 coefficients whose source indices are given in the EOL comments
; (word order is high..low within a register). Some lines of this routine are
; not visible in this chunk.
869 ; 16 11 5 6 7 12 17 24
870 ; 18 13 14 15 19 25 32 26
871 ; 20 21 22 23 27 33 40 34
872 ; 28 29 30 31 35 41 48 42
873 ; 36 37 38 39 43 49 50 44
874 ; 45 46 47 51 56 57 52 53
875 ; 54 55 58 59 60 61 62 63
877 cglobal zigzag_scan_8x8_field_mmxext, 2,3
878 movq mm0, [r1+2*0] ; 03 02 01 00
879 movq mm1, [r1+2*4] ; 07 06 05 04
880 movq mm2, [r1+2*8] ; 11 10 09 08
881 pshufw mm3, mm0, 011111111b ; 03 03 03 03
883 pshufw mm2, mm2, 000111001b ; 08 11 10 09
884 punpcklwd mm3, mm1 ; 05 03 04 03
; r2 carries single coefficients between registers via pextrw/pinsrw.
885 pinsrw mm0, r2, 3 ; 08 02 01 00
887 punpcklwd mm2, mm3 ; 04 10 03 09
888 pshufw mm2, mm2, 010110100b ; 10 04 03 09
889 movq [r0+2*0], mm0 ; 08 02 01 00
890 movq [r0+2*4], mm2 ; 10 04 03 09
891 movq mm3, [r1+2*12] ; 15 14 13 12
892 movq mm5, [r1+2*16] ; 19 18 17 16
893 punpckldq mm6, mm5 ; 17 16 XX XX
894 psrlq mm1, 16 ; XX 07 06 05
895 punpckhwd mm6, mm4 ; 08 17 11 16
896 punpckldq mm6, mm1 ; 06 05 11 16
897 movq [r0+2*8], mm6 ; 06 05 11 16
898 psrlq mm1, 16 ; XX XX 07 06
899 punpcklwd mm1, mm5 ; 17 07 16 06
900 movq mm0, [r1+2*20] ; 23 22 21 20
901 movq mm2, [r1+2*24] ; 27 26 25 24
903 punpckhdq mm1, mm1 ; 17 07 17 07
904 punpcklwd mm6, mm2 ; 25 13 24 12
906 movq [r0+2*24], mm0 ; 23 22 21 20
907 punpcklwd mm1, mm6 ; 24 17 12 07
909 pinsrw mm3, r2, 0 ; 15 14 13 18
910 movq [r0+2*16], mm3 ; 15 14 13 18
912 movq mm0, [r1+2*32] ; 35 34 33 32
913 psrlq mm5, 48 ; XX XX XX 19
914 pshufw mm1, mm2, 011111001b ; 27 27 26 25
915 punpcklwd mm5, mm0 ; 33 XX 32 19
916 psrlq mm2, 48 ; XX XX XX 27
917 punpcklwd mm5, mm1 ; 26 32 25 19
919 movq [r0+2*20], mm5 ; 26 32 25 19
921 movq mm1, [r1+2*40] ; 43 42 41 40
922 pshufw mm3, mm0, 011111001b ; 35 35 34 33
923 punpcklwd mm2, mm1 ; 41 XX 40 27
925 punpcklwd mm2, mm3 ; 34 40 33 27
927 movq mm7, [r1+2*44] ; 47 46 45 44
928 movq mm2, [r1+2*48] ; 51 50 49 48
929 psrlq mm0, 48 ; XX XX XX 35
930 punpcklwd mm0, mm2 ; 49 XX 48 35
931 pshufw mm3, mm1, 011111001b ; 43 43 42 41
932 punpcklwd mm0, mm3 ; 42 48 41 35
934 pextrw r2, mm2, 3 ; 51
935 psrlq mm1, 48 ; XX XX XX 43
936 punpcklwd mm1, mm7 ; 45 XX 44 43
937 psrlq mm2, 16 ; XX 51 50 49
938 punpcklwd mm1, mm2 ; 50 44 49 43
939 pshufw mm1, mm1, 010110100b ; 44 50 49 43
941 psrlq mm7, 16 ; XX 47 46 45
942 pinsrw mm7, r2, 3 ; 51 47 46 45
944 movq mm0, [r1+2*56] ; 59 58 57 56
945 movq mm1, [r1+2*52] ; 55 54 53 52
948 punpckldq mm2, mm1 ; 53 52 57 56
949 punpckhdq mm1, mm0 ; 59 58 55 54
955 ;-----------------------------------------------------------------------------
956 ; void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *src, uint8_t *dst )
957 ;-----------------------------------------------------------------------------
; ZIGZAG_SUB_4x4 %1 = ''/'ac', %2 = frame/field: subtracts dst from src,
; zigzags the residual into level[], and copies src into dst (the movd stores
; below). The 'ac' variant masks the DC coefficient with pb_subacmask and
; takes a 4th argument (the two cglobal lines are %if alternatives).
958 %macro ZIGZAG_SUB_4x4 2
960 cglobal zigzag_sub_4x4%1_%2_ssse3, 4,4,8
962 cglobal zigzag_sub_4x4%1_%2_ssse3, 3,3,8
964 movd xmm0, [r1+0*FENC_STRIDE]
965 movd xmm1, [r1+1*FENC_STRIDE]
966 movd xmm2, [r1+2*FENC_STRIDE]
967 movd xmm3, [r1+3*FENC_STRIDE]
968 movd xmm4, [r2+0*FDEC_STRIDE]
969 movd xmm5, [r2+1*FDEC_STRIDE]
970 movd xmm6, [r2+2*FDEC_STRIDE]
971 movd xmm7, [r2+3*FDEC_STRIDE]
; Write the encoded pixels back into the reconstruction buffer.
972 movd [r2+0*FDEC_STRIDE], xmm0
973 movd [r2+1*FDEC_STRIDE], xmm1
974 movd [r2+2*FDEC_STRIDE], xmm2
975 movd [r2+3*FDEC_STRIDE], xmm3
980 punpcklqdq xmm0, xmm2
981 punpcklqdq xmm4, xmm6
983 movdqa xmm7, [pb_sub4frame]
985 movdqa xmm7, [pb_sub4field]
1000 pand xmm0, [pb_subacmask]
1004 movdqa [r0+16], xmm1
; Instantiate all four frame/field x dc/ac variants.
1016 ZIGZAG_SUB_4x4 , frame
1017 ZIGZAG_SUB_4x4 ac, frame
1018 ZIGZAG_SUB_4x4 , field
1019 ZIGZAG_SUB_4x4 ac, field
1021 ;-----------------------------------------------------------------------------
1022 ; void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
1023 ;-----------------------------------------------------------------------------
; MMX interleave step (%1 = coefficient-group offset): load 4 quads from src
; and transpose them; the store/nnz-update lines and the enclosing macro
; header are not visible in this chunk.
1026 movq m0, [r1+%1*4+ 0]
1027 movq m1, [r1+%1*4+ 8]
1028 movq m2, [r1+%1*4+16]
1029 movq m3, [r1+%1*4+24]
1030 TRANSPOSE4x4W 0,1,2,3,4
1049 cglobal zigzag_interleave_8x8_cavlc_mmx, 3,3
; SSE2 interleave: two rounds of word-unpack (SBUTTERFLY wd) per register
; pair give the 4-way word transpose; results are scattered to dst at
; 32-byte row strides via movq/movhps.
1066 %macro INTERLEAVE_XMM 1
1067 mova m0, [r1+%1*4+ 0]
1068 mova m1, [r1+%1*4+16]
1069 mova m4, [r1+%1*4+32]
1070 mova m5, [r1+%1*4+48]
1071 SBUTTERFLY wd, 0, 1, 6
1072 SBUTTERFLY wd, 4, 5, 7
1073 SBUTTERFLY wd, 0, 1, 6
1074 SBUTTERFLY wd, 4, 5, 7
1076 movhps [r0+%1+ 32], m0
1077 movq [r0+%1+ 64], m1
1078 movhps [r0+%1+ 96], m1
1080 movhps [r0+%1+ 40], m4
1081 movq [r0+%1+ 72], m5
1082 movhps [r0+%1+104], m5
; SSE2 entry point for the interleave; body not visible in this chunk.
1097 cglobal zigzag_interleave_8x8_cavlc_sse2, 3,3,8