;*****************************************************************************
;* dct-a.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <hal@duncan.ol.sub.de>
;*          Min Chen <chenm001@163.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************
%include "x86util.asm"

; NOTE(review): the SECTION directive(s) that should precede these constants
; are missing from this chunk — restore from upstream before assembling.

; 0x8000 bias: used by dct4x4dc to map int16 to unsigned so pavgw works (see use below)
pw_8000:          times 8 dw 0x8000
; byte shuffle mask consumed by x264_zigzag_sub_4x4_frame_ssse3 (pshufb)
pb_sub4frame:     db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
; byte shuffle masks consumed by x264_zigzag_scan_4x4_frame_ssse3 (pshufb)
pb_scan4framea:   db 12,13,6,7,14,15,0,1,8,9,2,3,4,5,10,11
pb_scan4frameb:   db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15
; broadcast mask: replicates each of 4 DC bytes 4x, used by add8x8_idct_dc_ssse3
pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
; NOTE(review): the two SUMSUB_BADC lines below are the interior of a macro
; whose %macro header and %endmacro were lost in this chunk (the %N parameter
; references prove they sit inside a macro body). Restore from upstream.
    SUMSUB_BADC m%2, m%1, m%4, m%3       ; butterfly stage 1: pairwise sum/diff
    SUMSUB_BADC m%4, m%2, m%3, m%1       ; butterfly stage 2

; SUMSUB_17BIT: sum/diff that survives 17-bit intermediaries; args per the
; original annotation: a, b, tmp, and a register holding 0x8000.
; NOTE(review): macro body and %endmacro missing from this chunk.
%macro SUMSUB_17BIT 4 ; a, b, tmp, 0x8000
;-----------------------------------------------------------------------------
; void x264_dct4x4dc_mmx( int16_t d[4][4] )
; In-place 4x4 Hadamard transform of the DC coefficients (r0 = d).
; NOTE(review): loads of m0-m3 from [r0], the second transform pass, and the
; stores back to [r0] are missing from this chunk — restore from upstream.
;-----------------------------------------------------------------------------
cglobal x264_dct4x4dc_mmx, 1,1
    movq      m7, [pw_8000 GLOBAL]       ; convert to unsigned and back, so that pavgw works
    TRANSPOSE4x4W 0,1,2,3,4              ; transpose rows<->columns (m4 = scratch)
    SUMSUB_BADC m1, m0, m3, m2           ; first butterfly of the Hadamard pass
;-----------------------------------------------------------------------------
; void x264_idct4x4dc_mmx( int16_t d[4][4] )
; In-place inverse 4x4 Hadamard transform of the DC coefficients (r0 = d).
; NOTE(review): loads, butterfly stages, and stores are missing from this
; chunk — only the transpose survives. Restore from upstream.
;-----------------------------------------------------------------------------
cglobal x264_idct4x4dc_mmx, 1,1
    TRANSPOSE4x4W 0,1,2,3,4              ; transpose rows<->columns (m4 = scratch)
; NOTE(review): the lines below are interiors of the 1-D DCT and 1-D IDCT
; helper macros (the %N parameter references prove they are macro bodies);
; their %macro headers and %endmacro lines were lost in this chunk.
    SUMSUB_BADC m%4, m%1, m%3, m%2       ; butterfly: sums/diffs of row pairs
    SUMSUB2_AB  m%1, m%2, m%5            ; 2a+b / a-2b stage of the H.264 DCT
    SWAP %1, %3, %4, %5, %2              ; rotate register roles for next stage

    SUMSUBD2_AB m%2, m%4, m%6, m%5       ; a+b/2 / a/2-b stage of the IDCT
    SUMSUB_BADC m%2, m%3, m%5, m%1       ; final butterfly
    SWAP %1, %2, %5, %4, %3              ; rotate register roles back
;-----------------------------------------------------------------------------
; void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
; dct = forward 4x4 DCT of (pix1 - pix2); pix1 strides by FENC_STRIDE,
; pix2 by FDEC_STRIDE.
;-----------------------------------------------------------------------------
cglobal x264_sub4x4_dct_mmx, 3,3
    ; m0-m3 = rows 0-3 of the pixel difference (m6/m7 = scratch)
    LOAD_DIFF m0, m6, m7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
    LOAD_DIFF m1, m6, m7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
    LOAD_DIFF m2, m6, m7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
    LOAD_DIFF m3, m6, m7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
    ; NOTE(review): the TRANSPOSE line below references macro parameter %1,
    ; so it belongs to a macro body whose header was lost in this chunk
    ; (along with the DCT passes and the stores to r0). Restore from upstream.
    TRANSPOSE%1 0,1,2,3,4
;-----------------------------------------------------------------------------
; void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
; Inverse 4x4 DCT of dct, rounded (+32 >> 6 convention implied by the pw_32
; bias) and added to the pixels at p_dst.
;-----------------------------------------------------------------------------
cglobal x264_add4x4_idct_mmx, 2,2
    ; NOTE(review): loads of m0-m3 from [r1] and the first IDCT pass are
    ; missing from this chunk; TRANSPOSE%1 references a macro parameter, so
    ; this body sits inside a macro whose header was also lost.
    TRANSPOSE%1 0,1,2,3,4
    paddw     m0, [pw_32 GLOBAL]         ; rounding bias before the >>6 shift
    ; add reconstructed residual rows to the destination pixels
    STORE_DIFF m0, m4, m7, [r0+0*FDEC_STRIDE]
    STORE_DIFF m1, m4, m7, [r0+1*FDEC_STRIDE]
    STORE_DIFF m2, m4, m7, [r0+2*FDEC_STRIDE]
    STORE_DIFF m3, m4, m7, [r0+3*FDEC_STRIDE]
; 8x8 variants: process the block as two 8x4 halves; the visible `add`s
; advance the pixel pointers to the lower half between passes.
; NOTE(review): the actual DCT/IDCT work between and after these lines is
; missing from this chunk — restore from upstream.
cglobal x264_sub8x8_dct_sse2, 3,3
    add r1, 4*FENC_STRIDE                ; pix1 -> lower 8x4 half
    add r2, 4*FDEC_STRIDE                ; pix2 -> lower 8x4 half

cglobal x264_add8x8_idct_sse2, 2,2
    add r0, 4*FDEC_STRIDE                ; dst -> lower 8x4 half
;-----------------------------------------------------------------------------
; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
; SUB_NxN_DCT: builds an NxN DCT out of four calls to a smaller-block DCT.
; Macro args (by position in the instantiations below): 1=new function name,
; 2=sub-block function to tail into, 3=dct buffer step in bytes, 4/5/6=pixel
; pointer adjustment constants between the four sub-blocks.
; NOTE(review): the %macro header, the cglobal line, and the call/jmp lines
; are missing from this chunk — only the pointer-adjustment arithmetic
; between sub-block invocations survives. Restore from upstream.
    add r1, %4-%5-%6*FENC_STRIDE         ; advance pix1 to next sub-block
    add r2, %4-%5-%6*FDEC_STRIDE         ; advance pix2 to next sub-block

    add r1, (%4-%6)*FENC_STRIDE-%5-%4    ; move down to the lower half
    add r2, (%4-%6)*FDEC_STRIDE-%5-%4

    add r1, %4-%5-%6*FENC_STRIDE
    add r2, %4-%5-%6*FDEC_STRIDE

;-----------------------------------------------------------------------------
; void x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
; ADD_NxN_IDCT: same composition pattern as SUB_NxN_DCT, for the inverse
; transform; only the destination pointer (r0) needs adjusting.
; NOTE(review): body between the adjustments missing from this chunk.
%macro ADD_NxN_IDCT 6
    add r0, %4-%5-%6*FDEC_STRIDE

    add r0, (%4-%6)*FDEC_STRIDE-%5-%4

    add r0, %4-%5-%6*FDEC_STRIDE
; Instantiate the composed transforms. Each larger transform tails into the
; `.skip_prologue` entry of its sub-block function so the argument registers
; set up once by cglobal are reused across all four calls.
SUB_NxN_DCT  x264_sub16x16_dct_mmx_stub_align, 0, 0, 0, 0, 0 ; NOTE(review): placeholder? see below
SUB_NxN_DCT  x264_sub8x8_dct_mmx,    x264_sub4x4_dct_mmx  %+ .skip_prologue, 32, 4, 0, 0
ADD_NxN_IDCT x264_add8x8_idct_mmx,   x264_add4x4_idct_mmx %+ .skip_prologue, 32, 4, 0, 0
SUB_NxN_DCT  x264_sub16x16_dct_mmx,  x264_sub8x8_dct_mmx  %+ .skip_prologue, 32, 8, 4, 4
ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx %+ .skip_prologue, 32, 8, 4, 4

; 8x8-dct kernels live in another file; import their .skip_prologue entries.
cextern x264_sub8x8_dct8_mmx.skip_prologue
cextern x264_add8x8_idct8_mmx.skip_prologue
SUB_NxN_DCT  x264_sub16x16_dct8_mmx,  x264_sub8x8_dct8_mmx  %+ .skip_prologue, 128, 8, 0, 0
ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx %+ .skip_prologue, 128, 8, 0, 0
; redirect the sse2 8x8 names to their .skip_prologue entries for the
; instantiations below (these %defines are consumed by the cextern/macro args)
%define x264_sub8x8_dct8_sse2  x264_sub8x8_dct8_sse2.skip_prologue
%define x264_add8x8_idct8_sse2 x264_add8x8_idct8_sse2.skip_prologue

SUB_NxN_DCT  x264_sub16x16_dct_sse2,  x264_sub8x8_dct_sse2  %+ .skip_prologue, 64, 8, 0, 4
ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2 %+ .skip_prologue, 64, 8, 0, 4

cextern x264_sub8x8_dct8_sse2
cextern x264_add8x8_idct8_sse2
SUB_NxN_DCT  x264_sub16x16_dct8_sse2,  x264_sub8x8_dct8_sse2,  128, 8, 0, 0
ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0, 0
;-----------------------------------------------------------------------------
; void add8x8_idct_dc( uint8_t *p_dst, int16_t *dct2x2 )
;-----------------------------------------------------------------------------
; ADD_DC fragment: loads 4 rows from %3, adds the broadcast DC values with
; unsigned saturation, and stores the rows back.
; NOTE(review): the %macro header, the paddusb/psubusb lines for mm4-mm6,
; and %endmacro are missing from this chunk — restore from upstream.
    movq      mm4, [%3+FDEC_STRIDE*0]
    movq      mm5, [%3+FDEC_STRIDE*1]
    movq      mm6, [%3+FDEC_STRIDE*2]

    paddusb   %1, [%3+FDEC_STRIDE*3]     ; saturating add of DC to row 3

    movq      [%3+FDEC_STRIDE*0], mm4
    movq      [%3+FDEC_STRIDE*1], mm5
    movq      [%3+FDEC_STRIDE*2], mm6
    movq      [%3+FDEC_STRIDE*3], %1

cglobal x264_add8x8_idct_dc_mmx, 2,2
    ; NOTE(review): the load of mm0 from [r1] and the >>6 shift are missing.
    add       r0, FDEC_STRIDE*4          ; point at lower half; rows addressed +/-4
    paddw     mm0, [pw_32 GLOBAL]        ; rounding bias for the >>6

    pshufw    mm2, mm0, 0xFA             ; broadcast the two high DC words
    pshufw    mm3, mm1, 0xFA

    ADD_DC    mm0, mm1, r0-FDEC_STRIDE*4 ; apply DCs to the upper 8x4 half
cglobal x264_add8x8_idct_dc_ssse3, 2,2
    ; NOTE(review): the load of xmm0 from [r1], the >>6 shift, the pshufb
    ; that expands DCs via pb_idctdc_unpack, and the paddusb/psubusb pair
    ; between the loads and stores below are missing from this chunk.
    add       r0, FDEC_STRIDE*4          ; rows addressed relative to mid-block
    paddw     xmm0, [pw_32 GLOBAL]       ; rounding bias for the >>6

    movdqa    xmm5, [pb_idctdc_unpack GLOBAL] ; byte-broadcast mask for the 4 DCs

    ; gather 8 rows, two per xmm register (low qword = upper half row,
    ; high qword = lower half row); xmm5 is reused as a row register here
    movq      xmm2, [r0+FDEC_STRIDE*-4]
    movq      xmm3, [r0+FDEC_STRIDE*-3]
    movq      xmm4, [r0+FDEC_STRIDE*-2]
    movq      xmm5, [r0+FDEC_STRIDE*-1]
    movhps    xmm2, [r0+FDEC_STRIDE* 0]
    movhps    xmm3, [r0+FDEC_STRIDE* 1]
    movhps    xmm4, [r0+FDEC_STRIDE* 2]
    movhps    xmm5, [r0+FDEC_STRIDE* 3]

    ; scatter the DC-adjusted rows back
    movq      [r0+FDEC_STRIDE*-4], xmm2
    movq      [r0+FDEC_STRIDE*-3], xmm3
    movq      [r0+FDEC_STRIDE*-2], xmm4
    movq      [r0+FDEC_STRIDE*-1], xmm5
    movhps    [r0+FDEC_STRIDE* 0], xmm2
    movhps    [r0+FDEC_STRIDE* 1], xmm3
    movhps    [r0+FDEC_STRIDE* 2], xmm4
    movhps    [r0+FDEC_STRIDE* 3], xmm5
;-----------------------------------------------------------------------------
; void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[8][8] )
; Macro-templated (%1 = mmx/ssse3 suffix) frame zigzag of an 8x8 coefficient
; block: r0 = level output, r1 = dct input.
; NOTE(review): most of the loads, shuffles, and stores of this routine are
; missing from this chunk — only scattered PALIGNR/pshuf/store lines remain.
;-----------------------------------------------------------------------------
cglobal x264_zigzag_scan_8x8_frame_%1, 2,2
    PALIGNR xmm1, xmm1, 14, xmm2         ; rotate row data into zigzag order
    PALIGNR xmm2, xmm2, 12, xmm4
    PALIGNR xmm3, xmm3, 10, xmm4

    movdqa  xmm7, [r1+112]               ; last dct row (8 coeffs)
    PALIGNR xmm4, xmm4, 14, xmm3
    PALIGNR xmm5, xmm5, 12, xmm3
    PALIGNR xmm6, xmm6, 10, xmm3
    PALIGNR xmm7, xmm7, 8,  xmm3
    punpcklqdq xmm7, xmm7                ; duplicate low qword

    pshufw  mm4, mm4, 0x6c               ; (mmx path) reorder 4 words

    ; reverse word order within halves for the anti-diagonal runs
    pshufhw xmm0, xmm0, 0x1b
    pshuflw xmm4, xmm4, 0x1b
    pshufhw xmm3, xmm3, 0x1b
    pshuflw xmm7, xmm7, 0x1b

    ; scatter 4-coeff groups to their zigzag positions in level[]
    movlps  [r0+2*10], xmm0
    movhps  [r0+2*17], xmm0
    movlps  [r0+2*21], xmm3
    movlps  [r0+2*28], xmm4
    movhps  [r0+2*32], xmm3
    movhps  [r0+2*39], xmm4
    movlps  [r0+2*43], xmm7
    movhps  [r0+2*50], xmm7

; instantiate the template once per PALIGNR implementation
%define PALIGNR PALIGNR_MMX
%define PALIGNR PALIGNR_SSSE3
;-----------------------------------------------------------------------------
; void x264_zigzag_scan_8x8_frame_mmxext( int16_t level[64], int16_t dct[8][8] )
; NOTE(review): nearly the entire body is missing from this chunk — only
; four word-reversal shuffles survive. Restore from upstream.
;-----------------------------------------------------------------------------
cglobal x264_zigzag_scan_8x8_frame_mmxext, 2,2
    pshufw  mm6, mm6, 0x1b               ; reverse 4 words for anti-diagonal run
    pshufw  mm5, mm5, 0x1b

    pshufw  mm2, mm2, 0x1b
    pshufw  mm7, mm7, 0x1b
;-----------------------------------------------------------------------------
; void x264_zigzag_scan_4x4_frame_mmx( int16_t level[16], int16_t dct[4][4] )
; NOTE(review): body missing from this chunk.
;-----------------------------------------------------------------------------
cglobal x264_zigzag_scan_4x4_frame_mmx, 2,2

;-----------------------------------------------------------------------------
; void x264_zigzag_scan_4x4_frame_ssse3( int16_t level[16], int16_t dct[4][4] )
; pshufb does most of the zigzag permutation in two shuffles, with palignr
; stitching the cross-register coefficients together.
; NOTE(review): loads/stores and surrounding lines missing from this chunk.
;-----------------------------------------------------------------------------
cglobal x264_zigzag_scan_4x4_frame_ssse3, 2,2
    pshufb  xmm1, [pb_scan4frameb GLOBAL] ; permute high 8 coeffs
    pshufb  xmm0, [pb_scan4framea GLOBAL] ; permute low 8 coeffs

    palignr xmm2, xmm0, 6                ; splice coefficients crossing registers

    palignr xmm1, xmm0, 10

;-----------------------------------------------------------------------------
; void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] )
;-----------------------------------------------------------------------------
; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2
cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
    pshufw  mm0, [r1+4], 0xd2            ; field-scan reorder of words 2..5
;-----------------------------------------------------------------------------
; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst )
; Computes zigzag(src - dst) into level, and copies src's 4x4 pixels into dst.
; NOTE(review): the subtraction, the pshufb application of xmm7, and the
; final stores to r0 are missing from this chunk — restore from upstream.
;-----------------------------------------------------------------------------
cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3
    ; load 4 rows of source (fenc) and 4 rows of prediction (fdec)
    movd    xmm0, [r1+0*FENC_STRIDE]
    movd    xmm1, [r1+1*FENC_STRIDE]
    movd    xmm2, [r1+2*FENC_STRIDE]
    movd    xmm3, [r1+3*FENC_STRIDE]
    movd    xmm4, [r2+0*FDEC_STRIDE]
    movd    xmm5, [r2+1*FDEC_STRIDE]
    movd    xmm6, [r2+2*FDEC_STRIDE]
    movd    xmm7, [r2+3*FDEC_STRIDE]
    ; copy the source rows into the reconstruction buffer
    movd    [r2+0*FDEC_STRIDE], xmm0
    movd    [r2+1*FDEC_STRIDE], xmm1
    movd    [r2+2*FDEC_STRIDE], xmm2
    movd    [r2+3*FDEC_STRIDE], xmm3

    punpcklqdq xmm0, xmm2                ; pack src rows into one register
    punpcklqdq xmm4, xmm6                ; pack pred rows into one register
    movdqa  xmm7, [pb_sub4frame GLOBAL]  ; zigzag byte-permutation mask
; NOTE(review): this routine runs past the end of the visible chunk, and the
; instruction that initializes r2 (used as a loop counter/index below) is
; missing — restore from upstream before assembling.
cglobal x264_zigzag_interleave_8x8_cavlc_mmx, 2,3
    ; gather one 4-word column group from each of the four 4x4 sub-blocks
    movq    m0, [r1+r2*4+ 0]
    movq    m1, [r1+r2*4+ 8]
    movq    m2, [r1+r2*4+16]
    movq    m3, [r1+r2*4+24]
    TRANSPOSE4x4W 0,1,2,3,4              ; interleave the sub-block coefficients