;*****************************************************************************
;* MMX/SSE2-optimized H.264 iDCT
;*****************************************************************************
;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <hal@duncan.ol.sub.de>
;*          Min Chen <chenm001@163.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"
scan8_mem: db  4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
           db  6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
           db  4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
           db  6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
           db  4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
           db  6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
           db  4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
           db  6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
           db  4+11*8, 5+11*8, 4+12*8, 5+12*8
           db  6+11*8, 7+11*8, 6+12*8, 7+12*8
           db  4+13*8, 5+13*8, 4+14*8, 5+14*8
           db  6+13*8, 7+13*8, 6+14*8, 7+14*8

%define scan8 scan8_mem
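
; scan8 maps a block index i (0-15 luma, 16-23 Cb, 32-39 Cr) to that block's
; byte position inside the caller's nnzc[6*8] table, so the non-zero test used
; throughout this file is, roughly, the C expression (names illustrative):
;     if (nnzc[scan8[i]])
;         idct_add(dst + block_offset[i], block + i*16, stride);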
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    SBUTTERFLY    dq, 0, 2, 4
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    STORE_DIFFx2  m0, m1, m4, m5, m7, 6, %1, %3
    STORE_DIFFx2  m2, m3, m4, m5, m7, 6, %1, %3
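
; For reference, the scalar transform these macros vectorize is the standard
; H.264 4x4 integer iDCT, applied once per row and once per column, followed
; by a rounded 6-bit downshift and a saturated add to the prediction.
; A rough C sketch of one 4-point pass over b[0..3]:
;     const int z0 =  b[0] +  b[2];
;     const int z1 =  b[0] -  b[2];
;     const int z2 = (b[1] >> 1) - b[3];
;     const int z3 =  b[1] + (b[3] >> 1);
;     b[0] = z0 + z3;  b[1] = z1 + z2;
;     b[2] = z1 - z2;  b[3] = z0 - z3;
; and after both passes:
;     dst[x] = av_clip_uint8(dst[x] + ((b[x] + 32) >> 6));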
; void ff_h264_idct_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_add_8, 3, 3, 0

    SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
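
; The 8-point 1D transform ending above is the H.264 8x8 iDCT butterfly; a
; rough scalar sketch (cf. the reference C implementation), per row/column
; b0..b7:
;     a0 =  b0 + b4;              a2 =  b0 - b4;
;     a4 = (b2 >> 1) - b6;        a6 = (b6 >> 1) + b2;
;     a1 = -b3 + b5 - b7 - (b7 >> 1);
;     a3 =  b1 + b7 - b3 - (b3 >> 1);
;     a5 = -b1 + b7 + b5 + (b5 >> 1);
;     a7 =  b3 + b5 + b1 + (b1 >> 1);
;     even: e0 = a0 + a6; e1 = a2 + a4; e2 = a2 - a4; e3 = a0 - a6;
;     odd:  o0 = (a7 >> 2) + a1;  o1 = a3 + (a5 >> 2);
;           o2 = (a3 >> 2) - a5;  o3 = a7 - (a1 >> 2);
;     rows 0/7 = e0 +/- o3, rows 1/6 = e1 +/- o2,
;     rows 2/5 = e2 +/- o1, rows 3/4 = e3 +/- o0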
%macro IDCT8_1D_FULL 1
    IDCT8_1D [%1], [%1+ 64]

; %1=int16_t *block, %2=int16_t *dstblock
%macro IDCT8_ADD_MMX_START 2
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    TRANSPOSE4x4W 4, 5, 6, 7, 3

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_MMX_END 3-4
    STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
    STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
    STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
    STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
; void ff_h264_idct8_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8, 3, 4, 0
    %assign pad 128+4-(stack_offset&7)

    IDCT8_ADD_MMX_START r1  , rsp
    IDCT8_ADD_MMX_START r1+8, rsp+64

    IDCT8_ADD_MMX_END   r0  , rsp,   r2, r1
    IDCT8_ADD_MMX_END   r3  , rsp+8, r2
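
; With only eight 64-bit MMX registers the 8x8 transform cannot be held in
; registers at once, so the block is processed as two 4-column halves
; (r1 and r1+8) through a 128-byte scratch buffer on the stack (rsp/rsp+64),
; then added out in two 4-pixel-wide passes (r0 and r3, the right half).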
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_SSE 4
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8              ; x86_64: spare xmm as scratch
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]  ; x86_32: spill through memory

    IDCT8_1D [%2], [%2+ 16]

    STORE_DIFF m0, m6, m7, [%1     ]
    STORE_DIFF m1, m6, m7, [%1+%3  ]
    STORE_DIFF m2, m6, m7, [%1+%3*2]
    STORE_DIFF m3, m6, m7, [%1+%4  ]

    STORE_DIFF m4, m6, m7, [%1     ]
    STORE_DIFF m5, m6, m7, [%1+%3  ]
    STORE_DIFF m0, m6, m7, [%1+%3*2]
    STORE_DIFF m1, m6, m7, [%1+%4  ]

; void ff_h264_idct8_add_8_sse2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8, 3, 4, 10
    IDCT8_ADD_SSE r0, r1, r2, r3
%macro DC_ADD_MMXEXT_INIT 2

%macro DC_ADD_MMXEXT_OP 4
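
; A DC-only block needs no transform: the single coefficient, rounded and
; shifted down by 6, is added to every pixel with unsigned saturation
; (positive and negated copies of the DC are kept packed so paddusb/psubusb
; can clip in both directions). Scalar equivalent, roughly:
;     int dc = (block[0] + 32) >> 6;
;     for (each pixel p of the 4x4 or 8x8 block)
;         *p = av_clip_uint8(*p + dc);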
; x86_64 variants:
; void ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_dc_add_8, 3, 4, 0
    DC_ADD_MMXEXT_INIT r3, r2
    DC_ADD_MMXEXT_OP movh, r0, r2, r3

; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 3, 4, 0
    DC_ADD_MMXEXT_INIT r3, r2
    DC_ADD_MMXEXT_OP mova, r0, r2, r3 ; rows 0-3
    DC_ADD_MMXEXT_OP mova, r0, r2, r3 ; rows 4-7
; x86_32 variants:
; void ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_dc_add_8, 2, 3, 0
    DC_ADD_MMXEXT_INIT r2, r1
    DC_ADD_MMXEXT_OP movh, r0, r1, r2

; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 2, 3, 0
    DC_ADD_MMXEXT_INIT r2, r1
    DC_ADD_MMXEXT_OP mova, r0, r1, r2 ; rows 0-3
    DC_ADD_MMXEXT_OP mova, r0, r1, r2 ; rows 4-7
; void ff_h264_idct_add16_8_mmx(uint8_t *dst, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    lea   picregq, [scan8_mem]
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]
    mov   r6d, dword [r1+r5*4]
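
; Scalar equivalent of the loop body above (cntr/r5 is the block index,
; idct_add standing for the IDCT4_ADD sequence defined earlier):
;     for (i = 0; i < 16; i++)
;         if (nnzc[scan8[i]])
;             idct_add(dst + block_offset[i], block + i*16, stride);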
; void ff_h264_idct8_add4_8_mmx(uint8_t *dst, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    %assign pad 128+4-(stack_offset&7)

    lea   picregq, [scan8_mem]
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]
    mov   r6d, dword [r1+r5*4]

    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp,   r3, r2
    mov   r6d, dword [r1+r5*4]
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
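
; Same dispatch as add16, but stepping the index by 4: one 8x8 block covers
; four 4x4 slots in scan8/block_offset. Roughly:
;     for (i = 0; i < 16; i += 4)
;         if (nnzc[scan8[i]])
;             idct8_add(dst + block_offset[i], block + i*16, stride);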
; void ff_h264_idct_add16_8_mmxext(uint8_t *dst, const int *block_offset,
;                                  int16_t *block, int stride,
;                                  const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    lea   picregq, [scan8_mem]
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]

    DC_ADD_MMXEXT_INIT r6, r3
    mov   dst2d, dword [r1+r5*4]
    lea   dst2q, [r0+dst2q]
    DC_ADD_MMXEXT_OP movh, dst2q, r3, r6

    mov   r6d, dword [r1+r5*4]
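
; Unlike the plain mmx version, the mmxext variant special-cases blocks whose
; only non-zero coefficient is the DC: those take the cheap DC_ADD path above
; instead of the full 2D transform.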
; void ff_h264_idct_add16intra_8_mmx(uint8_t *dst, const int *block_offset,
;                                    int16_t *block, int stride,
;                                    const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    lea   picregq, [scan8_mem]
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]
    mov   r6d, dword [r1+r5*4]
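
; For intra macroblocks the nnzc entry does not cover the separately coded
; DC, so a second test on block[i*16] routes DC-only blocks to the DC path.
; Scalar equivalent, roughly:
;     if (nnzc[scan8[i]])
;         idct_add   (dst + block_offset[i], block + i*16, stride);
;     else if (block[i*16])
;         idct_dc_add(dst + block_offset[i], block + i*16, stride);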
; void ff_h264_idct_add16intra_8_mmxext(uint8_t *dst, const int *block_offset,
;                                       int16_t *block, int stride,
;                                       const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    lea   picregq, [scan8_mem]
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]
    mov   r6d, dword [r1+r5*4]

    DC_ADD_MMXEXT_INIT r6, r3
    mov   dst2d, dword [r1+r5*4]
    DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
; void ff_h264_idct8_add4_8_mmxext(uint8_t *dst, const int *block_offset,
;                                  int16_t *block, int stride,
;                                  const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    %assign pad 128+4-(stack_offset&7)

    lea   picregq, [scan8_mem]
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]

    DC_ADD_MMXEXT_INIT r6, r3
    mov   dst2d, dword [r1+r5*4]
    lea   dst2q, [r0+dst2q]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6 ; rows 0-3
    lea   dst2q, [dst2q+r3*4]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6 ; rows 4-7

    mov   r6d, dword [r1+r5*4]

    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp,   r3, r2
    mov   r6d, dword [r1+r5*4]
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
; void ff_h264_idct8_add4_8_sse2(uint8_t *dst, const int *block_offset,
;                                int16_t *block, int stride,
;                                const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    lea   picregq, [scan8_mem]
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]

    DC_ADD_MMXEXT_INIT r6, r3
    mov   dst2d, dword [r1+r5*4]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6 ; rows 0-3
    lea   dst2q, [dst2q+r3*4]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6 ; rows 4-7

    mov   dst2d, dword [r1+r5*4]
    IDCT8_ADD_SSE dst2q, r2, r3, r6
h264_idct_add8_mmx_plane:
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]

    mov   r0d, dword [r1+r5*4]

    mov   r0, r1m ; XXX r1m here is actually r0m of the calling func
    add   r0, dword [r1+r5*4]
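
; Unlike the luma functions, idct_add8 receives uint8_t **dest (one base
; pointer per chroma plane), hence the extra indirection above; on x86_32
; the dest array is reloaded from the caller's stack (the r1m note). Scalar
; equivalent of one plane pass, roughly (j = 1 for Cb, 2 for Cr):
;     for (i = j*16; i < j*16 + 4; i++)
;         if (nnzc[scan8[i]] || block[i*16])
;             idct_add(dest[j-1] + block_offset[i], block + i*16, stride);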
; void ff_h264_idct_add8_8_mmx(uint8_t **dest, const int *block_offset,
;                              int16_t *block, int stride,
;                              const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    lea   picregq, [scan8_mem]

    call  h264_idct_add8_mmx_plane
    call  h264_idct_add8_mmx_plane
    RET ; TODO: check rep ret after a function call
cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
; dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    lea   picregq, [scan8_mem]

    add   r2, 512 ; i * 16 * sizeof(dctcoef) ; #define dctcoef int16_t
    call  h264_idct_add8_mmx_plane
    call  h264_idct_add8_mmx_plane

    add   dst2q, gprsize ; dest[1]

    add   r5, 4   ; set to 32
    add   r2, 256 ; set to i * 16 * sizeof(dctcoef)

    call  h264_idct_add8_mmx_plane
    call  h264_idct_add8_mmx_plane
    RET ; TODO: check rep ret after a function call
h264_idct_add8_mmxext_plane:
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]

    mov   r0d, dword [r1+r5*4]

    mov   r0, r1m ; XXX r1m here is actually r0m of the calling func
    add   r0, dword [r1+r5*4]

    DC_ADD_MMXEXT_INIT r6, r3

    mov   r0d, dword [r1+r5*4]

    mov   r0, r1m ; XXX r1m here is actually r0m of the calling func
    add   r0, dword [r1+r5*4]

    DC_ADD_MMXEXT_OP movh, r0, r3, r6
; void ff_h264_idct_add8_8_mmxext(uint8_t **dest, const int *block_offset,
;                                 int16_t *block, int stride,
;                                 const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    lea   picregq, [scan8_mem]

    call  h264_idct_add8_mmxext_plane
    call  h264_idct_add8_mmxext_plane
    RET ; TODO: check rep ret after a function call
; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
h264_idct_dc_add8_mmxext:
    movd      m0, [r2   ]   ;  0 0 X D
    punpcklwd m0, [r2+32]   ;  x X d D
    punpcklwd m0, m0        ;  d d D D
    pxor      m1, m1        ;  0 0 0 0
    psubw     m1, m0        ; -d-d-D-D
    packuswb  m0, m1        ; -d-d-D-D d d D D
    pshufw    m1, m0, 0xFA  ; -d-d-d-d-D-D-D-D
    punpcklwd m0, m0        ;  d d d d D D D D
    DC_ADD_MMXEXT_OP movq, r0, r3, r6
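
; This helper applies idct_dc_add to two horizontally adjacent 4x4 blocks at
; once: the DC of the current block ([r2]) and of the next one ([r2+32],
; i.e. 16 coefficients later) are packed side by side together with their
; negations, so the saturating add/subtract inside DC_ADD_MMXEXT_OP clips
; and applies both DC offsets across an 8-pixel-wide row per movq.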
; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
h264_add8x4_idct_sse2:
    IDCT4_1D w, 0, 1, 2, 3, 4, 5
    TRANSPOSE2x4x4W 0, 1, 2, 3, 4
    IDCT4_1D w, 0, 1, 2, 3, 4, 5
    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
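
; Each xmm register holds one row from each of two horizontally adjacent 4x4
; blocks, so a single call transforms and adds a full 8x4 strip; that is why
; TRANSPOSE2x4x4W transposes the two 4x4 halves independently rather than
; performing a full 8x8 transpose.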
%macro add16_sse2_cycle 2
    movzx r0, word [r4+%2]
    mov   r0d, dword [r1+%1*8]
    call  h264_add8x4_idct_sse2
; void ff_h264_idct_add16_8_sse2(uint8_t *dst, const int *block_offset,
;                                int16_t *block, int stride,
;                                const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8
    ; unrolling of the loop leads to an average performance gain
    add16_sse2_cycle 0, 0xc
    add16_sse2_cycle 1, 0x14
    add16_sse2_cycle 2, 0xe
    add16_sse2_cycle 3, 0x16
    add16_sse2_cycle 4, 0x1c
    add16_sse2_cycle 5, 0x24
    add16_sse2_cycle 6, 0x1e
    add16_sse2_cycle 7, 0x26
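
; The second macro argument is the nnzc byte offset of block 2*%1, i.e.
; scan8[2*%1] (e.g. 0xc = 4+1*8 = scan8[0], 0x14 = 4+2*8 = scan8[2]); since
; horizontally adjacent blocks occupy adjacent bytes in the scan8 layout,
; the 16-bit load in add16_sse2_cycle tests both blocks of one 8x4 strip in
; a single movzx.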
%macro add16intra_sse2_cycle 2
    movzx r0, word [r4+%2]
    mov   r0d, dword [r1+%1*8]
    call  h264_add8x4_idct_sse2
    mov   r0d, dword [r1+%1*8]
    call  h264_idct_dc_add8_mmxext
; void ff_h264_idct_add16intra_8_sse2(uint8_t *dst, const int *block_offset,
;                                     int16_t *block, int stride,
;                                     const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8
    add16intra_sse2_cycle 0, 0xc
    add16intra_sse2_cycle 1, 0x14
    add16intra_sse2_cycle 2, 0xe
    add16intra_sse2_cycle 3, 0x16
    add16intra_sse2_cycle 4, 0x1c
    add16intra_sse2_cycle 5, 0x24
    add16intra_sse2_cycle 6, 0x1e
    add16intra_sse2_cycle 7, 0x26
%macro add8_sse2_cycle 2
    movzx r0, word [r4+%2]

    mov   r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add   r0,  dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    call  h264_add8x4_idct_sse2

    mov   r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add   r0,  dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    call  h264_idct_dc_add8_mmxext
; void ff_h264_idct_add8_8_sse2(uint8_t **dest, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8
    add8_sse2_cycle 0, 0x34
    add8_sse2_cycle 1, 0x3c
    add8_sse2_cycle 2, 0x5c
    add8_sse2_cycle 3, 0x64
; void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul)
    SUMSUB_BADC w, %4, %3, %2, %1, %5
    SUMSUB_BADC w, %4, %2, %3, %1, %5

    pshufd    xmm4, xmm4, 0
    punpcklwd xmm0, xmm5
    punpcklwd xmm1, xmm5
    punpcklwd xmm2, xmm5
    punpcklwd xmm3, xmm5
%macro STORE_WORDS 5-9

%macro DEQUANT_STORE 1
    ; sse2 path: two registers cover all 16 DCs
    STORE_WORDS xmm0,  0,  1,  4,  5,  2,  3,  6,  7
    STORE_WORDS xmm2,  8,  9, 12, 13, 10, 11, 14, 15

    ; mmx path: four registers of four DCs each
    STORE_WORDS m0,  0,  1,  4,  5
    STORE_WORDS m1,  2,  3,  6,  7
    STORE_WORDS m2,  8,  9, 12, 13
    STORE_WORDS m3, 10, 11, 14, 15
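
; STORE_WORDS scatters each dequantized DC back to its block's DC slot at
; block[16*i] (32 bytes apart for int16 coefficients); the index order above
; (0 1 4 5 / 2 3 6 7 / ...) is the H.264 luma 4x4 block numbering. Scalar
; sketch of the whole function: a 4-point Hadamard per row and per column,
;     z0 = a + c;  z1 = a - c;  z2 = b - d;  z3 = b + d;
;     // recombined as z0+z3, z1+z2, z1-z2, z0-z3
; followed by dequantization of each of the 16 sums s:
;     block[16*i] = (s * qmul + 128) >> 8;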
%macro IDCT_DC_DEQUANT 1
cglobal h264_luma_dc_dequant_idct, 3, 4, %1
    ; manually spill XMM registers for Win64 because
    ; the code here is initialized with INIT_MMX
    TRANSPOSE4x4W 0, 1, 2, 3, 4

; shift, tmp, output, qmul
DECLARE_REG_TMP 0, 3, 1, 2 ; Win64
; we can't avoid this, because r0 is the shift register (ecx) on win64
DECLARE_REG_TMP 3, 1, 0, 2 ; Unix x86_64
DECLARE_REG_TMP 1, 3, 0, 2 ; x86_32
; %unmacro STORE_DIFFx2 8 ; remove macro from x86util.asm but yasm doesn't have this yet
%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride

%macro DC_ADD_INIT 1
    lea      %1, [3*stride_q] ; precompute 3*stride for addressing the fourth row

cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_
    movsxdifnidn stride_q, stride_d
    IDCT4_ADD dst_q, block_q, stride_q

cglobal h264_idct_dc_add_8, 3, 4, 6, dst_, block_, stride_
    movsxdifnidn stride_q, stride_d
    movsx    r3d, word [block_q] ; load the DC coefficient
    mov      dword [block_q], 0  ; and clear it in the source block
    DC_ADD_MMXEXT_OP movd, dst_q, stride_q, r3