1 ;*****************************************************************************
2 ;* dct-a.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2003-2008 x264 project
6 ;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
7 ;* Loren Merritt <lorenm@u.washington.edu>
8 ;* Min Chen <chenm001@163.com>
10 ;* This program is free software; you can redistribute it and/or modify
11 ;* it under the terms of the GNU General Public License as published by
12 ;* the Free Software Foundation; either version 2 of the License, or
13 ;* (at your option) any later version.
15 ;* This program is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;* GNU General Public License for more details.
20 ;* You should have received a copy of the GNU General Public License
21 ;* along with this program; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 ;*****************************************************************************
26 %include "x86util.asm"
; Read-only constants, addressed RIP-relative/absolute via the GLOBAL macro.
30 pw_8000: times 8 dw 0x8000 ; 8x 0x8000: signed<->unsigned bias word (see pavgw note below)
31 pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15 ; pshufb control: 4x4 frame zigzag order (byte indices)
32 pb_scan4framea: db 12,13,6,7,14,15,0,1,8,9,2,3,4,5,10,11 ; pshufb control for zigzag_scan_4x4_frame_ssse3, half a
33 pb_scan4frameb: db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15 ; pshufb control for zigzag_scan_4x4_frame_ssse3, half b
; Butterfly helper fragments. NOTE(review): the opening %macro line for the
; first pair and both %endmacro lines are elided in this view of the file.
38 SUMSUB_BADC m%2, m%1, m%4, m%3 ; paired sum/difference butterfly on two register pairs
39 SUMSUB_BADC m%4, m%2, m%3, m%1 ; second butterfly stage -> 4-point Hadamard (order fixed by SWAPs elsewhere)
43 %macro SUMSUB_17BIT 4 ; a, b, tmp, 0x8000
; NOTE(review): body elided in this chunk; name and the 0x8000 argument suggest a
; sum/difference that tolerates 17-bit intermediates via unsigned biasing — confirm
; against the full file.
55 ;-----------------------------------------------------------------------------
56 ; void x264_dct4x4dc_mmx( int16_t d[4][4] )
57 ;-----------------------------------------------------------------------------
; In-place transform of the 4x4 luma DC block (MMX). r0 = d.
; NOTE(review): the loads, first transform pass, and the store/epilogue lines
; are elided in this view — only a slice of the body is shown.
58 cglobal x264_dct4x4dc_mmx, 1,1
63 movq m7, [pw_8000 GLOBAL] ; convert to unsigned and back, so that pavgw works
65 TRANSPOSE4x4W 0,1,2,3,4 ; rows<->columns between the two 1-D passes (m4 = scratch)
66 SUMSUB_BADC m1, m0, m3, m2 ; butterfly stage of the second 1-D pass
77 ;-----------------------------------------------------------------------------
78 ; void x264_idct4x4dc_mmx( int16_t d[4][4] )
79 ;-----------------------------------------------------------------------------
; Inverse of the 4x4 DC transform, in place (MMX). r0 = d.
; NOTE(review): loads, butterflies, and stores are elided in this view.
80 cglobal x264_idct4x4dc_mmx, 1,1
86 TRANSPOSE4x4W 0,1,2,3,4 ; transpose between 1-D passes (m4 = scratch)
; Interiors of the 1-D forward/inverse 4x4 transform macros.
; NOTE(review): the enclosing %macro/%endmacro lines are elided in this view,
; so parameter roles below are inferred — confirm against the full file.
95 SUMSUB_BADC m%4, m%1, m%3, m%2 ; butterfly: sums/differences of the four inputs
97 SUMSUB2_AB m%1, m%2, m%5 ; weighted sum/diff (the 2:1 factors of the H.264 integer DCT)
98 SWAP %1, %3, %4, %5, %2 ; rename-only rotation to put outputs in coefficient order
103 SUMSUBD2_AB m%2, m%4, m%6, m%5 ; halved-operand sum/diff pair (IDCT 1/2 weights)
104 SUMSUB_BADC m%2, m%3, m%5, m%1 ; final butterfly of the inverse pass
105 SWAP %1, %2, %5, %4, %3 ; rename-only rotation to restore row order
108 ;-----------------------------------------------------------------------------
109 ; void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
110 ;-----------------------------------------------------------------------------
; dct = forward 4x4 transform of (pix1 - pix2). r0=dct, r1=pix1 (enc, stride
; FENC_STRIDE), r2=pix2 (dec, stride FDEC_STRIDE).
; NOTE(review): the TRANSPOSE%1 parameter reference implies this cglobal sits
; inside a template %macro whose header is elided in this view, as are the
; transform-pass and store lines.
111 cglobal x264_sub4x4_dct_mmx, 3,3
114 LOAD_DIFF m0, m6, m7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE] ; row 0 residual
115 LOAD_DIFF m1, m6, m7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE] ; row 1 residual
116 LOAD_DIFF m2, m6, m7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE] ; row 2 residual
117 LOAD_DIFF m3, m6, m7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE] ; row 3 residual
119 TRANSPOSE%1 0,1,2,3,4 ; transpose between 1-D passes (%1 selects mmx/sse variant)
129 ;-----------------------------------------------------------------------------
130 ; void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
131 ;-----------------------------------------------------------------------------
; p_dst += idct(dct), clipped to [0,255]. r0=p_dst (stride FDEC_STRIDE), r1=dct.
; NOTE(review): the coefficient loads, IDCT passes, and the template %macro
; context (TRANSPOSE%1) are partly elided in this view.
132 cglobal x264_add4x4_idct_mmx, 2,2
140 TRANSPOSE%1 0,1,2,3,4 ; transpose between the two 1-D inverse passes
141 paddw m0, [pw_32 GLOBAL] ; +32 rounding bias before the final >>6
144 STORE_DIFF m0, m4, m7, [r0+0*FDEC_STRIDE] ; add residual to row 0, saturate, store
145 STORE_DIFF m1, m4, m7, [r0+1*FDEC_STRIDE] ; row 1
146 STORE_DIFF m2, m4, m7, [r0+2*FDEC_STRIDE] ; row 2
147 STORE_DIFF m3, m4, m7, [r0+3*FDEC_STRIDE] ; row 3
; void x264_sub8x8_dct_sse2( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
; NOTE(review): only the pointer advance to the lower 8x4 half is visible here;
; the per-half transform code is elided in this view.
154 cglobal x264_sub8x8_dct_sse2, 3,3
158 add r1, 4*FENC_STRIDE ; advance source pointers 4 rows down
159 add r2, 4*FDEC_STRIDE
; void x264_add8x8_idct_sse2( uint8_t *p_dst, int16_t dct[4][4][4] )
; NOTE(review): body largely elided in this view; only the advance to the
; lower half of the destination is shown.
168 cglobal x264_add8x8_idct_sse2, 2,2
172 add r0, 4*FDEC_STRIDE ; advance destination 4 rows down
185 ;-----------------------------------------------------------------------------
186 ; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
187 ;-----------------------------------------------------------------------------
; Interior of the SUB_NxN_DCT composition macro: builds an NxN DCT out of four
; calls to a smaller-block DCT, stepping the output (r0, via %4/%5 byte offsets
; elided here) and the pixel pointers between quadrants.
; NOTE(review): the %macro header, the call lines, and %endmacro are elided in
; this view; %4..%6 roles (sizeof smaller dct, offset, pixel step) are inferred.
193 add r1, %4-%5-%6*FENC_STRIDE ; pix1 -> next quadrant
194 add r2, %4-%5-%6*FDEC_STRIDE ; pix2 -> next quadrant
197 add r1, (%4-%6)*FENC_STRIDE-%5-%4 ; pix1 -> lower-left quadrant (rewind x, advance y)
198 add r2, (%4-%6)*FDEC_STRIDE-%5-%4
201 add r1, %4-%5-%6*FENC_STRIDE ; pix1 -> final quadrant
202 add r2, %4-%5-%6*FDEC_STRIDE
206 ;-----------------------------------------------------------------------------
207 ; void x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
208 ;-----------------------------------------------------------------------------
; ADD_NxN_IDCT %1=name, %2=sub-idct to call, %3..%6=size/offset/step parameters
; (mirrors SUB_NxN_DCT above). Composes an NxN inverse transform from four
; smaller-block calls, stepping the destination pointer between quadrants.
; NOTE(review): the call lines and %endmacro are elided in this view.
209 %macro ADD_NxN_IDCT 6
213 add r0, %4-%5-%6*FDEC_STRIDE ; pix -> next quadrant
216 add r0, (%4-%6)*FDEC_STRIDE-%5-%4 ; pix -> lower-left quadrant
219 add r0, %4-%5-%6*FDEC_STRIDE ; pix -> final quadrant
; Instantiate the composed NxN transforms. The ".skip_prologue" entry points
; bypass cglobal's register setup since the composing macro has already done it.
225 SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx %+ .skip_prologue, 32, 4, 0, 0
226 ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx %+ .skip_prologue, 32, 4, 0, 0
227 SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx %+ .skip_prologue, 32, 8, 4, 4
228 ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx %+ .skip_prologue, 32, 8, 4, 4
; 8x8-transform building blocks live in another file (dct8 asm) — declare them.
230 cextern x264_sub8x8_dct8_mmx.skip_prologue
231 cextern x264_add8x8_idct8_mmx.skip_prologue
232 SUB_NxN_DCT x264_sub16x16_dct8_mmx, x264_sub8x8_dct8_mmx %+ .skip_prologue, 128, 8, 0, 0
233 ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx %+ .skip_prologue, 128, 8, 0, 0
; Alias the sse2 dct8 symbols to their .skip_prologue entries so the
; instantiations below (which pass the bare names) still skip the prologue.
234 %define x264_sub8x8_dct8_sse2 x264_sub8x8_dct8_sse2.skip_prologue
235 %define x264_add8x8_idct8_sse2 x264_add8x8_idct8_sse2.skip_prologue
238 SUB_NxN_DCT x264_sub16x16_dct_sse2, x264_sub8x8_dct_sse2 %+ .skip_prologue, 64, 8, 0, 4
239 ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2 %+ .skip_prologue, 64, 8, 0, 4
241 cextern x264_sub8x8_dct8_sse2
242 cextern x264_add8x8_idct8_sse2
243 SUB_NxN_DCT x264_sub16x16_dct8_sse2, x264_sub8x8_dct8_sse2, 128, 8, 0, 0
244 ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0, 0
246 ;-----------------------------------------------------------------------------
247 ; void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[8][8] )
248 ;-----------------------------------------------------------------------------
; Template (cglobal ...%1) instantiated below for sse2/ssse3 via the PALIGNR
; %defines at the end. Reorders an 8x8 coefficient block into frame zigzag
; order: r0 = level (output), r1 = dct (input).
; NOTE(review): most loads/stores and interleaving steps are elided in this
; view — only scattered PALIGNR/shuffle/store lines are shown.
250 cglobal x264_zigzag_scan_8x8_frame_%1, 2,2
254 PALIGNR xmm1, xmm1, 14, xmm2 ; rotate words within the row (xmm2 = scratch for the MMX fallback)
259 PALIGNR xmm2, xmm2, 12, xmm4
261 PALIGNR xmm3, xmm3, 10, xmm4
287 movdqa xmm7, [r1+112] ; dct row 7
295 PALIGNR xmm4, xmm4, 14, xmm3
297 PALIGNR xmm5, xmm5, 12, xmm3
299 PALIGNR xmm6, xmm6, 10, xmm3
302 PALIGNR xmm7, xmm7, 8, xmm3
306 punpcklqdq xmm7, xmm7 ; duplicate low qword
326 pshufw mm4, mm4, 0x6c ; word shuffle (MMX half of the routine)
; 0x1b = reverse the 4 words of the (high/low) half — anti-diagonal runs.
340 pshufhw xmm0, xmm0, 0x1b
341 pshuflw xmm4, xmm4, 0x1b
342 pshufhw xmm3, xmm3, 0x1b
343 pshuflw xmm7, xmm7, 0x1b
; Scatter 4-coefficient runs to their zigzag positions (word offsets x2 bytes).
345 movlps [r0+2*10], xmm0
346 movhps [r0+2*17], xmm0
347 movlps [r0+2*21], xmm3
348 movlps [r0+2*28], xmm4
349 movhps [r0+2*32], xmm3
350 movhps [r0+2*39], xmm4
351 movlps [r0+2*43], xmm7
352 movhps [r0+2*50], xmm7
; Instantiate the template: MMX-emulated palignr for sse2, real palignr for ssse3.
358 %define PALIGNR PALIGNR_MMX
360 %define PALIGNR PALIGNR_SSSE3
363 ;-----------------------------------------------------------------------------
364 ; void x264_zigzag_scan_8x8_frame_mmxext( int16_t level[64], int16_t dct[8][8] )
365 ;-----------------------------------------------------------------------------
; MMX2 variant of the 8x8 frame zigzag. r0 = level, r1 = dct.
; NOTE(review): nearly the whole body is elided in this view; only a few word
; shuffles are shown (0x1b reverses the 4 words of an mm register).
366 cglobal x264_zigzag_scan_8x8_frame_mmxext, 2,2
428 pshufw mm6, mm6, 0x1b
429 pshufw mm5, mm5, 0x1b
450 pshufw mm2, mm2, 0x1b
451 pshufw mm7, mm7, 0x1b
458 ;-----------------------------------------------------------------------------
459 ; void x264_zigzag_scan_4x4_frame_mmx( int16_t level[16], int16_t dct[4][4] )
460 ;-----------------------------------------------------------------------------
; 4x4 frame zigzag, MMX. r0 = level, r1 = dct.
; NOTE(review): entire body elided in this view.
461 cglobal x264_zigzag_scan_4x4_frame_mmx, 2,2
488 ;-----------------------------------------------------------------------------
489 ; void x264_zigzag_scan_4x4_frame_ssse3( int16_t level[16], int16_t dct[4][4] )
490 ;-----------------------------------------------------------------------------
; 4x4 frame zigzag via pshufb tables (pb_scan4framea/b declared above) plus two
; palignr merges. r0 = level, r1 = dct.
; NOTE(review): the loads, blend, and stores are elided in this view.
491 cglobal x264_zigzag_scan_4x4_frame_ssse3, 2,2
494 pshufb xmm1, [pb_scan4frameb GLOBAL] ; permute word pairs per scan table b
495 pshufb xmm0, [pb_scan4framea GLOBAL] ; permute word pairs per scan table a
498 palignr xmm2, xmm0, 6 ; splice the two permuted halves into scan order
500 palignr xmm1, xmm0, 10
505 ;-----------------------------------------------------------------------------
506 ; void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] )
507 ;-----------------------------------------------------------------------------
508 ; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2
; Field (interlaced) zigzag of a 4x4 block. r0 = level, r1 = dct.
; NOTE(review): all but the first shuffle is elided in this view.
509 cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
510 pshufw mm0, [r1+4], 0xd2 ; reorder words 2..5 into field-scan order
522 ;-----------------------------------------------------------------------------
523 ; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst )
524 ;-----------------------------------------------------------------------------
; Fused: level = zigzag(src - dst) and dst = src, for a 4x4 block.
; r0 = level, r1 = src (FENC_STRIDE), r2 = dst (FDEC_STRIDE).
; NOTE(review): the subtract/pack/pshufb/store tail is elided in this view.
525 cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3
526 movd xmm0, [r1+0*FENC_STRIDE] ; load the four 4-byte source rows
527 movd xmm1, [r1+1*FENC_STRIDE]
528 movd xmm2, [r1+2*FENC_STRIDE]
529 movd xmm3, [r1+3*FENC_STRIDE]
530 movd xmm4, [r2+0*FDEC_STRIDE] ; load the four predicted (dst) rows
531 movd xmm5, [r2+1*FDEC_STRIDE]
532 movd xmm6, [r2+2*FDEC_STRIDE]
533 movd xmm7, [r2+3*FDEC_STRIDE]
534 movd [r2+0*FDEC_STRIDE], xmm0 ; copy src rows into dst (reconstruction update)
535 movd [r2+1*FDEC_STRIDE], xmm1
536 movd [r2+2*FDEC_STRIDE], xmm2
537 movd [r2+3*FDEC_STRIDE], xmm3
542 punpcklqdq xmm0, xmm2 ; gather all 16 src bytes into one register
543 punpcklqdq xmm4, xmm6 ; gather all 16 dst bytes
544 movdqa xmm7, [pb_sub4frame GLOBAL] ; zigzag byte-shuffle control
; void x264_zigzag_interleave_8x8_cavlc_mmx( ... ) — interleaves 8x8
; coefficients for CAVLC. r0 = dst, r1 = src, r2 = loop counter/index.
; NOTE(review): signature comment, loop label, and everything after the
; transpose are elided in this view — roles of r0/r2 inferred, confirm
; against the full file.
561 cglobal x264_zigzag_interleave_8x8_cavlc_mmx, 2,3
564 movq m0, [r1+r2*4+ 0] ; load four consecutive qwords of coefficients
565 movq m1, [r1+r2*4+ 8]
566 movq m2, [r1+r2*4+16]
567 movq m3, [r1+r2*4+24]
568 TRANSPOSE4x4W 0,1,2,3,4 ; 4x4 word transpose = interleave (m4 scratch)