;*****************************************************************************
;* mc-a.asm: x86 motion compensation
;*****************************************************************************
;* Copyright (C) 2003-2010 x264 project
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Dylan Yudaken <dyudaken@gmail.com>
;*          Min Chen <chenm001@163.com>
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
ch_shuf:     db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
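; ch_shuf pairs horizontally adjacent samples from an interleaved UV source:
; the low half gathers the even (U) bytes (0,2)(2,4)(4,6)(6,8) and the high
; half the odd (V) bytes (1,3)(3,5)(5,7)(7,9), so the ssse3 chroma MC below
; can apply its x/(8-x) weights with one packed multiply-add per register.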
ch_shuf_adj: times 8 db 0
;=============================================================================
; implicit weighted biprediction
;=============================================================================
; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
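; Under those constraints the per-pixel operation is, in scalar form,
;   dst[x] = ( src1[x]*weight1 + src2[x]*weight2 + (1<<5) ) >> 6
; i.e. rounding by 1<<log2_denom and shifting by log2_denom+1.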
DECLARE_REG_TMP 0,1,2,3,4,5,10,11
%macro AVG_START 0-1 0
DECLARE_REG_TMP 1,2,3,4,5,6,1,2
%macro AVG_START 0-1 0
    pshuflw    %1, %2, %3*0x55
    pshufw     %1, %2, %3*0x55
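; The immediate %3*0x55 replicates the 2-bit word index %3 into all four
; fields of the shuffle control (0->0x00, 1->0x55, 2->0xAA, 3->0xFF), so
; SPLATW broadcasts word %3 of %2 to every lane.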
%macro BIWEIGHT_START_MMX 0
    SPLATW m2, m2      ; weight_dst
    psubw  m3, m2      ; weight_src
    mova   m4, [pw_32] ; rounding
%macro BIWEIGHT_SSSE3 2
%macro BIWEIGHT_START_SSSE3 0
    movzx  t6d, byte r6m ; FIXME x86_64
    SPLATW m3, m3      ; weight_dst,src
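; m3 now holds both weights side by side in every word, so the ssse3 path can
; weight and sum an interleaved pair of source bytes with a single packed
; multiply-add instead of the separate multiplies used by the mmx version.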
%macro BIWEIGHT_ROW 4
    BIWEIGHT [%2+mmsize/2], [%3+mmsize/2]
;-----------------------------------------------------------------------------
; void pixel_avg_weight_w16( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight )
;-----------------------------------------------------------------------------
%macro AVG_WEIGHT 2-3 0
cglobal pixel_avg_weight_w%2_%1
%if %2==8 && mmsize==16
    BIWEIGHT [t2+t3], [t4+t5]
    BIWEIGHT_ROW t0+x, t2+x, t4+x, %2
    BIWEIGHT_ROW t0+x+t1, t2+x+t3, t4+x+t5, %2
%define BIWEIGHT BIWEIGHT_MMX
%define BIWEIGHT_START BIWEIGHT_START_MMX
AVG_WEIGHT mmxext, 16
%define pixel_avg_weight_w4_sse2 pixel_avg_weight_w4_mmxext
AVG_WEIGHT sse2, 8, 7
AVG_WEIGHT sse2, 16, 7
%define BIWEIGHT BIWEIGHT_SSSE3
%define BIWEIGHT_START BIWEIGHT_START_SSSE3
AVG_WEIGHT ssse3, 8, 7
AVG_WEIGHT ssse3, 16, 7
;=============================================================================
; P frame explicit weighted prediction
;=============================================================================
%macro WEIGHT_START 1
%if (%1 == 20 || %1 == 12) && mmsize == 16
%macro WEIGHT_START_SSSE3 1
%if %1 == 20 || %1 == 12
;; macro to weight mmsize bytes taking half from %1 and half from %2
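; Roughly, the sequence below computes (scalar form, with the offset folded
; into the rounding constant as noted in the comments):
;   dst[x] = clip_uint8( ( src[x]*scale + (1<<(denom-1)) + (offset<<denom) ) >> denom )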
%macro WEIGHT 2 ; (src1,src2)
    punpcklbw m0, m2        ; setup
    punpcklbw m1, m2        ; setup
    paddsw    m0, m6        ; 1<<(denom-1)+(offset<<denom)
    paddsw    m1, m6        ; 1<<(denom-1)+(offset<<denom)
%macro WEIGHT_SSSE3 2
%macro WEIGHT_SAVE_ROW 3 ; (src,dst,width)
    movd      [%2], %1      ; width 2 can write garbage for last 2 bytes
%macro WEIGHT_ROW 3 ; (src,dst,width)
    WEIGHT    %1, (%1+(mmsize/2))
    packuswb  m0, m1        ; put bytes into m0
    WEIGHT_SAVE_ROW m0, %2, %3
%macro WEIGHT_SAVE_COL 2 ; (dst,size)
    movd      [%1], m0      ; width 2 can write garbage for last 2 bytes
%macro WEIGHT_COL 3 ; (src,dst,width)
%if %3 <= 4 && mmsize == 16
    WEIGHT_SAVE_COL %2, %3
    WEIGHT_SAVE_COL %2, %3
%macro WEIGHT_TWO_ROW 3 ; (src,dst,width)
    WEIGHT_ROW (%1+x),    (%2+x),    mmsize ; weight one mmsize-wide chunk of row 0
    WEIGHT_ROW (%1+r3+x), (%2+r1+x), mmsize ; weight one mmsize-wide chunk of row 1
    WEIGHT_COL (%1+x), (%2+x), (%3-x)
;-----------------------------------------------------------------------------
; void mc_weight_wX( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride, weight_t *weight, int h )
;-----------------------------------------------------------------------------
%define HEIGHT_REG r5d
%define LOAD_HEIGHT mov r4d, r5m
%define HEIGHT_REG r4d
cglobal mc_weight_w%1_%2, NUMREGS, NUMREGS, 7
    WEIGHT_TWO_ROW r2, r0, %1
%define WEIGHT WEIGHT_SSSE3
%define WEIGHT_START WEIGHT_START_SSSE3
%macro OFFSET_TWO_ROW 4
    OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, u, a
    OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, d, d
;-----------------------------------------------------------------------------
; void mc_offset_wX( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride, weight_t *w, int h )
;-----------------------------------------------------------------------------
cglobal mc_offset%3_w%1_%2, NUMREGS, NUMREGS
    OFFSET_TWO_ROW r2, r0, %1, %3
;=============================================================================
; pixel avg
;=============================================================================
;-----------------------------------------------------------------------------
; void pixel_avg_4x4( uint8_t *dst, int dst_stride,
;                     uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride, int weight );
;-----------------------------------------------------------------------------
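; The neutral weight is 32 (i.e. 32/64 for each source): anything else
; branches to the pixel_avg_weight_w* routines (the jne below), while 32
; falls through to the plain pavgb average, with the jz/jmp pair choosing
; between the SSE2 and MMX copies.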
cglobal pixel_avg_%1x%2_%3
    jne pixel_avg_weight_w%1_%3
%if mmsize == 16 && %1 == 16
    jz pixel_avg_w%1_sse2
    jmp pixel_avg_w%1_mmxext
;-----------------------------------------------------------------------------
; void pixel_avg_w4( uint8_t *dst, int dst_stride,
;                    uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride,
;                    int height, int weight );
;-----------------------------------------------------------------------------
AVG_FUNC pixel_avg_w4_mmxext, movd, movd
AVG_FUNC pixel_avg_w8_mmxext, movq, movq
cglobal pixel_avg_w16_mmxext
AVG_FUNC pixel_avg_w16_sse2, movdqu, movdqa
;=============================================================================
; pixel avg2
;=============================================================================
;-----------------------------------------------------------------------------
; void pixel_avg2_w4( uint8_t *dst, int dst_stride,
;                     uint8_t *src1, int src_stride,
;                     uint8_t *src2, int height );
;-----------------------------------------------------------------------------
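; src1 and src2 share src_stride; this is the unweighted (a+b+1)>>1 pavgb
; average used to blend two half-pel planes into the quarter-pel positions.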
cglobal pixel_avg2_w%1_mmxext, 6,7
cglobal pixel_avg2_w%1_mmxext, 6,7
cglobal pixel_avg2_w20_mmxext, 6,7
    pavgb  mm2, [r2+r4+16]
    pavgb  mm5, [r2+r6+16]
cglobal pixel_avg2_w16_sse2, 6,7
cglobal pixel_avg2_w20_%1, 6,7
%ifidn %1, sse2_misalign
    pavgb  mm4, [r2+r4+16]
    pavgb  mm5, [r2+r6+16]
AVG2_W20 sse2_misalign
; Cacheline split code for processors with high latencies for loads
; split over cache lines. See sad-a.asm for a more detailed explanation.
; This particular instance is complicated by the fact that src1 and src2
; can have different alignments. For simplicity and code size, only the
; MMX cacheline workaround is used. As a result, in the case of SSE2
; pixel_avg, the cacheline check functions call the SSE2 version if there
; is no cacheline split, and the MMX workaround if there is.
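; The check itself is cheap: masking the source offset with 0x1f|(cacheline>>1)
; gives its position within a 32- or 64-byte line, and if position + width
; still fits inside one line (the cmp/jle below) the unsplit pixel_avg2_w%1
; routine is used directly; only genuinely split accesses pay for the MMX
; workaround.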
%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
cglobal pixel_avg2_w%1_cache%2_%3
    and    eax, 0x1f|(%2>>1)
    cmp    eax, (32-%1)|(%2>>1)
    jle pixel_avg2_w%1_%3
; w12 isn't needed because w16 is just as fast if there's no cacheline split
    jmp pixel_avg2_w16_cache_mmxext
    jmp pixel_avg2_w%1_cache_mmxext
%macro AVG_CACHELINE_START 0
%assign stack_offset 0
%macro AVG_CACHELINE_LOOP 2
    movq   mm2, [r2+r4+8+%1]
pixel_avg2_w8_cache_mmxext:
    AVG_CACHELINE_LOOP 0, movq
pixel_avg2_w16_cache_mmxext:
    AVG_CACHELINE_LOOP 0, movq
    AVG_CACHELINE_LOOP 8, movq
pixel_avg2_w20_cache_mmxext:
    AVG_CACHELINE_LOOP 0, movq
    AVG_CACHELINE_LOOP 8, movq
    AVG_CACHELINE_LOOP 16, movd
AVG_CACHELINE_CHECK  8, 32, mmxext
AVG_CACHELINE_CHECK 12, 32, mmxext
AVG_CACHELINE_CHECK 16, 32, mmxext
AVG_CACHELINE_CHECK 20, 32, mmxext
AVG_CACHELINE_CHECK 16, 64, mmxext
AVG_CACHELINE_CHECK 20, 64, mmxext
AVG_CACHELINE_CHECK  8, 64, mmxext
AVG_CACHELINE_CHECK 12, 64, mmxext
AVG_CACHELINE_CHECK 16, 64, sse2
AVG_CACHELINE_CHECK 20, 64, sse2
; computed jump assumes this loop is exactly 48 bytes
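; Each align variant generated below is therefore padded to that size, so the
; dispatcher can reach any of them as base + 48*index; the "(offset + align*2)*3"
; lea and the shl by 4 further down are exactly that multiply.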
%macro AVG16_CACHELINE_LOOP_SSSE3 2 ; alignment
avg_w16_align%1_%2_ssse3:
    palignr xmm1, [r2], %1
    movdqa  xmm2, [r2+r4+16]
    palignr xmm1, [r2], %1
    palignr xmm2, [r2+r4], %2
    jg     avg_w16_align%1_%2_ssse3
cglobal pixel_avg2_w16_cache64_ssse3
    jle pixel_avg2_w16_sse2
    lea    r6, [r6*3]        ; (offset + align*2)*3
    shl    r6, 4             ; jump = (offset + align*2)*48
%define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3)
    lea    r11, [avg_w16_addr]
    lea    r6, [avg_w16_addr + r6]
AVG16_CACHELINE_LOOP_SSSE3 j, j
AVG16_CACHELINE_LOOP_SSSE3 j, k
;=============================================================================
; pixel copy
;=============================================================================
;-----------------------------------------------------------------------------
; void mc_copy_w4( uint8_t *dst, int i_dst_stride,
;                  uint8_t *src, int i_src_stride, int i_height )
;-----------------------------------------------------------------------------
cglobal mc_copy_w4_mmx, 4,6
    COPY4 movd, movd, r4, r5
    COPY4 movd, movd, r4, r5
cglobal mc_copy_w8_mmx, 5,7
    COPY4 movq, movq, r5, r6
cglobal mc_copy_w16_mmx, 5,7
    movq   mm5, [r2+r3*2+8]
    movq   [r0+r1*2+8], mm5
%macro COPY_W16_SSE2 2
    COPY4 movdqa, %2, r5, r6
COPY_W16_SSE2 mc_copy_w16_sse2, movdqu
; cacheline split with mmx has too much overhead; the speed benefit is near-zero.
; but with SSE3 the overhead is zero, so there's no reason not to include it.
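; (lddqu may perform the unaligned load as a wider aligned load plus a shift,
; so a cacheline-split access costs nothing extra on the CPUs this path
; targets.)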
COPY_W16_SSE2 mc_copy_w16_sse3, lddqu
COPY_W16_SSE2 mc_copy_w16_aligned_sse2, movdqa
;=============================================================================
; prefetch
;=============================================================================
; FIXME assumes 64 byte cachelines
;-----------------------------------------------------------------------------
; void prefetch_fenc( uint8_t *pix_y, int stride_y,
;                     uint8_t *pix_uv, int stride_uv, int mb_x )
;-----------------------------------------------------------------------------
cglobal prefetch_fenc_mmxext, 5,5
    lea    r2, [r2+rax*2+64]
cglobal prefetch_fenc_mmxext, 0,3
;-----------------------------------------------------------------------------
; void prefetch_ref( uint8_t *pix, int stride, int parity )
;-----------------------------------------------------------------------------
cglobal prefetch_ref_mmxext, 3,3
;=============================================================================
; chroma MC
;=============================================================================
DECLARE_REG_TMP 10,11,6
DECLARE_REG_TMP 0,1,2
%macro MC_CHROMA_START 0
    add    r3, t0           ; src += (dx>>3) + (dy>>3) * src_stride
%macro UNPACK_UNALIGNED_MEM 3
%macro UNPACK_UNALIGNED_LOAD 3
;-----------------------------------------------------------------------------
; void mc_chroma( uint8_t *dstu, uint8_t *dstv, int dst_stride,
;                 uint8_t *src, int src_stride,
;                 int width, int height )
;-----------------------------------------------------------------------------
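; Bilinear chroma interpolation: with dx,dy the 1/8-pel fractional part of the
; MV and A..D the four neighbouring source samples,
;   dst = ( (8-dx)*(8-dy)*A + dx*(8-dy)*B + (8-dx)*dy*C + dx*dy*D + 32 ) >> 6
; The integer part of the MV is folded into the src pointer by MC_CHROMA_START
; above; the x/(8-x) and y/(8-y) factors are what the 0x80008 constant below
; packs into a single register.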
cglobal mc_chroma_%1, 0,6
    add    t2d, 0x80008     ; (x<<24) + ((8-x)<<16) + (y<<8) + (8-y)
    jl mc_chroma_mmxext %+ .skip_prologue
    cmp dword r7m, 4        ; flags were clobbered by WIN64_SPILL_XMM
    UNPACK_UNALIGNED m0, m1, [r3+2]
    UNPACK_UNALIGNED m0, m1, [r3+r4+2]
%define multy0 [rsp-8]
    movu   m1, [r3+mmsize/2]
    UNPACK_UNALIGNED m0, m2, [r3+2]
    UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2]
    movu   m1, [r3+mmsize/2]
    UNPACK_UNALIGNED m0, m2, [r3+2]
    UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2]
%ifdef ARCH_X86_64 ; too many regs for x86_32
RESET_MM_PERMUTATION
%if xmm_regs_used > 6
%assign stack_offset stack_offset-(xmm_regs_used-6)*16-16
%assign xmm_regs_used 6
    mov    r6d, r4d         ; pel_offset = dx ? 2 : src_stride
%endif ; ARCH_X86_64
%endmacro ; MC_CHROMA
%macro MC_CHROMA_SSSE3 0-1
cglobal mc_chroma_ssse3%1, 0,6,9
    imul   t2d, t0d         ; (x*255+8)*y
    imul   r5d, t0d         ; (x*255+8)*(8-y)
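; x*255+8 == (x<<8) + (8-x), i.e. the x and 8-x weights packed as a byte pair;
; multiplying by y and by 8-y therefore yields the four bilinear coefficients
; as two packed pairs, ready for a byte-wise multiply-add against the
; ch_shuf-interleaved samples.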
    lea    t1, [ch_shuf_adj]
    movddup m5, [t1 + t0*4]
    movddup m5, [ch_shuf_adj + t0*4]
    movu   m3, [r3+r4*2+8]
%define UNPACK_UNALIGNED UNPACK_UNALIGNED_MEM
MC_CHROMA sse2_misalign
%define UNPACK_UNALIGNED UNPACK_UNALIGNED_LOAD
MC_CHROMA_SSSE3 _cache64