1 ;*****************************************************************************
2 ;* MMX/SSE2-optimized H.264 iDCT
3 ;*****************************************************************************
4 ;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
5 ;* Copyright (C) 2003-2008 x264 project
7 ;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
8 ;* Loren Merritt <lorenm@u.washington.edu>
9 ;* Holger Lubitz <hal@duncan.ol.sub.de>
;* Min Chen <chenm001@163.com>
12 ;* This file is part of Libav.
14 ;* Libav is free software; you can redistribute it and/or
15 ;* modify it under the terms of the GNU Lesser General Public
16 ;* License as published by the Free Software Foundation; either
17 ;* version 2.1 of the License, or (at your option) any later version.
19 ;* Libav is distributed in the hope that it will be useful,
20 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
21 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 ;* Lesser General Public License for more details.
24 ;* You should have received a copy of the GNU Lesser General Public
25 ;* License along with Libav; if not, write to the Free Software
26 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
27 ;*****************************************************************************
30 %include "x86util.asm"
; FIXME this table is a duplicate from h264data.h, and will be removed once the tables from h264 have been split
; scan8_mem: per-block lookup table mapping a 4x4-block index (0..47) to its
; position in the 8-column "scan8" grid used by the decoder's non-zero-count
; cache. Each byte is encoded as column + row*8; the first 16 entries cover
; the luma 4x4 blocks, the remaining 32 the two chroma planes (rows 6.. and
; 11.. below). Duplicated from h264data.h (see FIXME above) — the two tables
; must stay in sync.
scan8_mem: db 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
db 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
db 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
db 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
db 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
db 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
db 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
db 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
db 4+11*8, 5+11*8, 4+12*8, 5+12*8
db 6+11*8, 7+11*8, 6+12*8, 7+12*8
db 4+13*8, 5+13*8, 4+14*8, 5+14*8
db 6+13*8, 7+13*8, 6+14*8, 7+14*8
; NOTE(review): direct absolute reference; presumably PIC builds substitute a
; base-register form elsewhere in the full file — confirm before reuse.
%define scan8 scan8_mem
58 ; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
66 IDCT4_1D w, 0, 1, 2, 3, 4, 5
68 TRANSPOSE4x4W 0, 1, 2, 3, 4
70 IDCT4_1D w, 0, 1, 2, 3, 4, 5
73 STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
75 STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
79 ; ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
80 cglobal h264_idct_add_8_mmx, 3, 3, 0
132 SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
135 %macro IDCT8_1D_FULL 1
142 IDCT8_1D [%1], [%1+ 64]
145 ; %1=int16_t *block, %2=int16_t *dstblock
146 %macro IDCT8_ADD_MMX_START 2
149 TRANSPOSE4x4W 0, 1, 2, 3, 7
155 TRANSPOSE4x4W 4, 5, 6, 7, 3
162 ; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
163 %macro IDCT8_ADD_MMX_END 3
170 STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
172 STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
177 STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
179 STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
183 ; ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
184 cglobal h264_idct8_add_8_mmx, 3, 4, 0
185 %assign pad 128+4-(stack_offset&7)
189 IDCT8_ADD_MMX_START r1 , rsp
190 IDCT8_ADD_MMX_START r1+8, rsp+64
192 IDCT8_ADD_MMX_END r0 , rsp, r2
193 IDCT8_ADD_MMX_END r3 , rsp+8, r2
198 ; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
199 %macro IDCT8_ADD_SSE 4
202 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
204 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
211 IDCT8_1D [%2], [%2+ 16]
224 STORE_DIFF m0, m6, m7, [%1 ]
225 STORE_DIFF m1, m6, m7, [%1+%3 ]
226 STORE_DIFF m2, m6, m7, [%1+%3*2]
227 STORE_DIFF m3, m6, m7, [%1+%4 ]
236 STORE_DIFF m4, m6, m7, [%1 ]
237 STORE_DIFF m5, m6, m7, [%1+%3 ]
238 STORE_DIFF m0, m6, m7, [%1+%3*2]
239 STORE_DIFF m1, m6, m7, [%1+%4 ]
243 ; ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
244 cglobal h264_idct8_add_8_sse2, 3, 4, 10
245 IDCT8_ADD_SSE r0, r1, r2, r3
248 %macro DC_ADD_MMX2_INIT 2-3
268 %macro DC_ADD_MMX2_OP 4
288 ; ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
289 cglobal h264_idct_dc_add_8_mmx2, 3, 3, 0
290 DC_ADD_MMX2_INIT r1, r2
291 DC_ADD_MMX2_OP movh, r0, r2, r1
294 ; ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
295 cglobal h264_idct8_dc_add_8_mmx2, 3, 3, 0
296 DC_ADD_MMX2_INIT r1, r2
297 DC_ADD_MMX2_OP mova, r0, r2, r1
299 DC_ADD_MMX2_OP mova, r0, r2, r1
302 ; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset,
303 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
304 cglobal h264_idct_add16_8_mmx, 5, 7, 0
310 movzx r6, byte [scan8+r5]
311 movzx r6, byte [r4+r6]
314 mov r6d, dword [r1+r5*4]
324 ; ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset,
325 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
326 cglobal h264_idct8_add4_8_mmx, 5, 7, 0
327 %assign pad 128+4-(stack_offset&7)
335 movzx r6, byte [scan8+r5]
336 movzx r6, byte [r4+r6]
339 mov r6d, dword [r1+r5*4]
342 IDCT8_ADD_MMX_START r2 , rsp
343 IDCT8_ADD_MMX_START r2+8, rsp+64
344 IDCT8_ADD_MMX_END r6 , rsp, r3
345 mov r6d, dword [r1+r5*4]
347 IDCT8_ADD_MMX_END r6 , rsp+8, r3
356 ; ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset,
357 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
358 cglobal h264_idct_add16_8_mmx2, 5, 7, 0
364 movzx r6, byte [scan8+r5]
365 movzx r6, byte [r4+r6]
373 DC_ADD_MMX2_INIT r2, r3, r6
376 %define dst_regd r10d
381 mov dst_regd, dword [r1+r5*4]
382 lea dst_reg, [r0+dst_reg]
383 DC_ADD_MMX2_OP movh, dst_reg, r3, r6
393 mov r6d, dword [r1+r5*4]
403 ; ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset,
404 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
405 cglobal h264_idct_add16intra_8_mmx, 5, 7, 0
411 movzx r6, byte [scan8+r5]
412 movzx r6, byte [r4+r6]
416 mov r6d, dword [r1+r5*4]
426 ; ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset,
427 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
428 cglobal h264_idct_add16intra_8_mmx2, 5, 7, 0
434 movzx r6, byte [scan8+r5]
435 movzx r6, byte [r4+r6]
438 mov r6d, dword [r1+r5*4]
450 DC_ADD_MMX2_INIT r2, r3, r6
453 %define dst_regd r10d
458 mov dst_regd, dword [r1+r5*4]
459 lea dst_reg, [r0+dst_reg]
460 DC_ADD_MMX2_OP movh, dst_reg, r3, r6
471 ; ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset,
472 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
473 cglobal h264_idct8_add4_8_mmx2, 5, 7, 0
474 %assign pad 128+4-(stack_offset&7)
482 movzx r6, byte [scan8+r5]
483 movzx r6, byte [r4+r6]
491 DC_ADD_MMX2_INIT r2, r3, r6
494 %define dst_regd r10d
499 mov dst_regd, dword [r1+r5*4]
500 lea dst_reg, [r0+dst_reg]
501 DC_ADD_MMX2_OP mova, dst_reg, r3, r6
502 lea dst_reg, [dst_reg+r3*4]
503 DC_ADD_MMX2_OP mova, dst_reg, r3, r6
515 mov r6d, dword [r1+r5*4]
518 IDCT8_ADD_MMX_START r2 , rsp
519 IDCT8_ADD_MMX_START r2+8, rsp+64
520 IDCT8_ADD_MMX_END r6 , rsp, r3
521 mov r6d, dword [r1+r5*4]
523 IDCT8_ADD_MMX_END r6 , rsp+8, r3
534 ; ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset,
535 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
536 cglobal h264_idct8_add4_8_sse2, 5, 7, 10
542 movzx r6, byte [scan8+r5]
543 movzx r6, byte [r4+r6]
552 DC_ADD_MMX2_INIT r2, r3, r6
555 %define dst_regd r10d
560 mov dst_regd, dword [r1+r5*4]
561 lea dst_reg, [r0+dst_reg]
562 DC_ADD_MMX2_OP mova, dst_reg, r3, r6
563 lea dst_reg, [dst_reg+r3*4]
564 DC_ADD_MMX2_OP mova, dst_reg, r3, r6
575 mov dst_regd, dword [r1+r5*4]
576 lea dst_reg, [r0+dst_reg]
577 IDCT8_ADD_SSE dst_reg, r2, r3, r6
589 h264_idct_add8_mmx_plane:
591 movzx r6, byte [scan8+r5]
592 movzx r6, byte [r4+r6]
597 mov r0d, dword [r1+r5*4]
600 mov r0, r1m ; XXX r1m here is actually r0m of the calling func
602 add r0, dword [r1+r5*4]
612 ; ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset,
613 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
614 cglobal h264_idct_add8_8_mmx, 5, 7, 0
623 call h264_idct_add8_mmx_plane
631 call h264_idct_add8_mmx_plane
634 h264_idct_add8_mmx2_plane
636 movzx r6, byte [scan8+r5]
637 movzx r6, byte [r4+r6]
641 mov r0d, dword [r1+r5*4]
644 mov r0, r1m ; XXX r1m here is actually r0m of the calling func
646 add r0, dword [r1+r5*4]
658 DC_ADD_MMX2_INIT r2, r3, r6
660 mov r0d, dword [r1+r5*4]
663 mov r0, r1m ; XXX r1m here is actually r0m of the calling func
665 add r0, dword [r1+r5*4]
667 DC_ADD_MMX2_OP movh, r0, r3, r6
675 ; ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset,
676 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
677 cglobal h264_idct_add8_8_mmx2, 5, 7, 0
686 call h264_idct_add8_mmx2_plane
694 call h264_idct_add8_mmx2_plane
698 ; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
699 h264_idct_dc_add8_mmx2:
700 movd m0, [r2 ] ; 0 0 X D
701 punpcklwd m0, [r2+32] ; x X d D
704 punpcklwd m0, m0 ; d d D D
705 pxor m1, m1 ; 0 0 0 0
706 psubw m1, m0 ; -d-d-D-D
707 packuswb m0, m1 ; -d-d-D-D d d D D
708 pshufw m1, m0, 0xFA ; -d-d-d-d-D-D-D-D
709 punpcklwd m0, m0 ; d d d d D D D D
711 DC_ADD_MMX2_OP movq, r0, r3, r6
716 ; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
717 x264_add8x4_idct_sse2:
726 IDCT4_1D w,0,1,2,3,4,5
727 TRANSPOSE2x4x4W 0,1,2,3,4
729 IDCT4_1D w,0,1,2,3,4,5
731 STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
733 STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
736 %macro add16_sse2_cycle 2
737 movzx r0, word [r4+%2]
740 mov r0d, dword [r1+%1*8]
746 call x264_add8x4_idct_sse2
753 ; ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset,
754 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
755 cglobal h264_idct_add16_8_sse2, 5, 5, 8
759 ; unrolling of the loop leads to an average performance gain of
761 add16_sse2_cycle 0, 0xc
762 add16_sse2_cycle 1, 0x14
763 add16_sse2_cycle 2, 0xe
764 add16_sse2_cycle 3, 0x16
765 add16_sse2_cycle 4, 0x1c
766 add16_sse2_cycle 5, 0x24
767 add16_sse2_cycle 6, 0x1e
768 add16_sse2_cycle 7, 0x26
771 %macro add16intra_sse2_cycle 2
772 movzx r0, word [r4+%2]
775 mov r0d, dword [r1+%1*8]
781 call x264_add8x4_idct_sse2
787 mov r0d, dword [r1+%1*8]
793 call h264_idct_dc_add8_mmx2
800 ; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset,
801 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
802 cglobal h264_idct_add16intra_8_sse2, 5, 7, 8
806 add16intra_sse2_cycle 0, 0xc
807 add16intra_sse2_cycle 1, 0x14
808 add16intra_sse2_cycle 2, 0xe
809 add16intra_sse2_cycle 3, 0x16
810 add16intra_sse2_cycle 4, 0x1c
811 add16intra_sse2_cycle 5, 0x24
812 add16intra_sse2_cycle 6, 0x1e
813 add16intra_sse2_cycle 7, 0x26
816 %macro add8_sse2_cycle 2
817 movzx r0, word [r4+%2]
821 mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
826 add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
828 call x264_add8x4_idct_sse2
835 mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
840 add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
842 call h264_idct_dc_add8_mmx2
851 ; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset,
852 ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
853 cglobal h264_idct_add8_8_sse2, 5, 7, 8
858 add8_sse2_cycle 0, 0x34
859 add8_sse2_cycle 1, 0x3c
865 add8_sse2_cycle 2, 0x5c
866 add8_sse2_cycle 3, 0x64
; void ff_h264_luma_dc_dequant_idct_mmx(DCTELEM *output, DCTELEM *input, int qmul)
872 SUMSUB_BADC w, %4, %3, %2, %1, %5
873 SUMSUB_BADC w, %4, %2, %3, %1, %5
899 %macro STORE_WORDS_MMX 5
911 %macro DEQUANT_STORE_MMX 1
912 DEQUANT_MMX m0, m1, %1
913 STORE_WORDS_MMX m0, 0, 1, 4, 5
914 STORE_WORDS_MMX m1, 2, 3, 6, 7
916 DEQUANT_MMX m2, m3, %1
917 STORE_WORDS_MMX m2, 8, 9, 12, 13
918 STORE_WORDS_MMX m3, 10, 11, 14, 15
921 %macro STORE_WORDS_SSE 9
943 %macro DEQUANT_STORE_SSE2 1
965 STORE_WORDS_SSE xmm0, 0, 1, 4, 5, 2, 3, 6, 7
966 STORE_WORDS_SSE xmm2, 8, 9, 12, 13, 10, 11, 14, 15
969 %macro IDCT_DC_DEQUANT 2
970 cglobal h264_luma_dc_dequant_idct_%1, 3,4,%2
976 TRANSPOSE4x4W 0,1,2,3,4
979 ; shift, tmp, output, qmul
981 DECLARE_REG_TMP 0,3,1,2
982 ; we can't avoid this, because r0 is the shift register (ecx) on win64
985 DECLARE_REG_TMP 3,1,0,2
987 DECLARE_REG_TMP 1,3,0,2
1010 DEQUANT_STORE_MMX m6
1013 DEQUANT_STORE_SSE2 xmm6
1019 IDCT_DC_DEQUANT mmx, 0
1020 IDCT_DC_DEQUANT sse2, 7