;*****************************************************************************
;* mc-a.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003-2008 x264 project
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Min Chen <chenm001@163.com>
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************
ch_shuffle: db 0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,0,0
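; ch_shuffle appears to be a pshufb control table: a 16-byte window starting
; at entry 2*n yields the overlapping byte pairs (n,n+1),(n+1,n+2),... that
; pmaddubsw consumes in the cache64 chroma code below; the trailing 0,0 is
; padding.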
;=============================================================================
; implicit weighted prediction
;=============================================================================
; implicit bipred only:
; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
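; in scalar terms (a sketch of what the macros below compute, with weight2
; implied by weight1):
;   w1 = i_weight;
;   w2 = 64 - w1;
;   dst[x] = ( src1[x]*w1 + src2[x]*w2 + 32 ) >> 6;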
DECLARE_REG_TMP 0,1,2,3,4,5,10,11 ; x86_64 temporaries
%macro AVG_START 0-1 0
DECLARE_REG_TMP 1,2,3,4,5,6,1,2 ; x86_32 temporaries
%macro AVG_START 0-1 0
%macro BIWEIGHT_START_MMX 0
    movd    m2, r6m       ; i_weight
    SPLATW  m2, m2        ; weight_dst
    mova    m3, [pw_64 GLOBAL]
    psubw   m3, m2        ; weight_src = 64 - weight_dst
    mova    m4, [pw_32 GLOBAL] ; rounding
%macro BIWEIGHT_SSSE3 2
%macro BIWEIGHT_START_SSSE3 0
    movzx   t6d, byte r6m ; FIXME x86_64
    mova    m4, [pw_32 GLOBAL]
    SPLATW  m3, m3        ; weight_dst,src
%macro BIWEIGHT_ROW 4
    BIWEIGHT [%2+mmsize/2], [%3+mmsize/2]
;-----------------------------------------------------------------------------
; void x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src1, int,
;                                        uint8_t *src2, int, int i_weight )
;-----------------------------------------------------------------------------
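; a rough C reference for the whole AVG_WEIGHT family (helper name and the
; use of x264_clip_uint8 are illustrative, not taken from this file):
;
;   static void pixel_avg_weight_ref( uint8_t *dst,  int i_dst_stride,
;                                     uint8_t *src1, int i_src1_stride,
;                                     uint8_t *src2, int i_src2_stride,
;                                     int i_weight, int width, int height )
;   {
;       for( int y = 0; y < height; y++ )
;       {
;           for( int x = 0; x < width; x++ )
;               dst[x] = x264_clip_uint8( ( src1[x]*i_weight +
;                         src2[x]*(64-i_weight) + 32 ) >> 6 );
;           dst  += i_dst_stride;
;           src1 += i_src1_stride;
;           src2 += i_src2_stride;
;       }
;   }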
%macro AVG_WEIGHT 2-3 0
cglobal x264_pixel_avg_weight_w%2_%1
%if %2==8 && mmsize==16
    BIWEIGHT [t2+t3], [t4+t5]
    BIWEIGHT_ROW t0+x,    t2+x,    t4+x,    %2
    BIWEIGHT_ROW t0+x+t1, t2+x+t3, t4+x+t5, %2
%define BIWEIGHT BIWEIGHT_MMX
%define BIWEIGHT_START BIWEIGHT_START_MMX
AVG_WEIGHT mmxext, 16
%define x264_pixel_avg_weight_w4_sse2 x264_pixel_avg_weight_w4_mmxext
AVG_WEIGHT sse2, 8,  7
AVG_WEIGHT sse2, 16, 7
%define BIWEIGHT BIWEIGHT_SSSE3
%define BIWEIGHT_START BIWEIGHT_START_SSSE3
AVG_WEIGHT ssse3, 8,  7
AVG_WEIGHT ssse3, 16, 7
;=============================================================================
; pixel avg
;=============================================================================
;-----------------------------------------------------------------------------
; void x264_pixel_avg_4x4_mmxext( uint8_t *dst, int dst_stride,
;                                 uint8_t *src1, int src1_stride,
;                                 uint8_t *src2, int src2_stride, int weight );
;-----------------------------------------------------------------------------
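; the body below only dispatches: weight 32 is an even blend, so it can take
; the plain pavgb path (the SSE2 one where the 16-wide version applies);
; anything else tail-jumps to the weighted version.  roughly:
;   if( weight != 32 ) pixel_avg_weight_w<w>( ... );
;   else               pixel_avg_w<w>( ... );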
cglobal x264_pixel_avg_%1x%2_%3
    jne  x264_pixel_avg_weight_w%1_%3
%if mmsize == 16 && %1 == 16
    jz   x264_pixel_avg_w%1_sse2
    jmp  x264_pixel_avg_w%1_mmxext
;-----------------------------------------------------------------------------
; void x264_pixel_avg_w4_mmxext( uint8_t *dst, int dst_stride,
;                                uint8_t *src1, int src1_stride,
;                                uint8_t *src2, int src2_stride,
;                                int height, int weight );
;-----------------------------------------------------------------------------
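; these use pavgb semantics, i.e. a rounding average per byte:
;   dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;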
AVG_FUNC x264_pixel_avg_w4_mmxext, movd, movd
AVG_FUNC x264_pixel_avg_w8_mmxext, movq, movq
cglobal x264_pixel_avg_w16_mmxext
AVG_FUNC x264_pixel_avg_w16_sse2, movdqu, movdqa
;=============================================================================
; pixel avg2
;=============================================================================
;-----------------------------------------------------------------------------
; void x264_pixel_avg2_w4_mmxext( uint8_t *dst, int dst_stride,
;                                 uint8_t *src1, int src_stride,
;                                 uint8_t *src2, int height );
;-----------------------------------------------------------------------------
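; the same rounding average as above, but src1 and src2 share one stride;
; a sketch:
;   for( int y = 0; y < height; y++, dst += dst_stride,
;        src1 += src_stride, src2 += src_stride )
;       for( int x = 0; x < width; x++ )
;           dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;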
cglobal x264_pixel_avg2_w%1_mmxext, 6,7
cglobal x264_pixel_avg2_w%1_mmxext, 6,7
cglobal x264_pixel_avg2_w20_mmxext, 6,7
    pavgb  mm2, [r2+r4+16]
    pavgb  mm5, [r2+r6+16]
cglobal x264_pixel_avg2_w16_sse2, 6,7
cglobal x264_pixel_avg2_w20_%1, 6,7
%ifidn %1, sse2_misalign
    pavgb  mm4, [r2+r4+16]
    pavgb  mm5, [r2+r6+16]
AVG2_W20 sse2_misalign
; Cacheline split code for processors with high latencies for loads
; split over cache lines.  See sad-a.asm for a more detailed explanation.
; This particular instance is complicated by the fact that src1 and src2
; can have different alignments.  For simplicity and code size, only the
; MMX cacheline workaround is used.  As a result, in the case of SSE2
; pixel_avg, the cacheline check function calls the SSE2 version if there
; is no cacheline split, and the MMX workaround if there is.
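; in C terms, AVG_CACHELINE_CHECK below does approximately this (keying off
; src1's alignment; the real code folds the test into one and/cmp pair via
; the |(cacheline>>1) trick):
;   if( ((intptr_t)src1 & (cacheline-1)) <= cacheline - width )
;       x264_pixel_avg2_w<w>_<isa>( ... );        /* no split: fast path */
;   else
;       x264_pixel_avg2_w<w>_cache_mmxext( ... ); /* split: MMX workaround */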
    movd  %1, [sw_64 GLOBAL]
%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
cglobal x264_pixel_avg2_w%1_cache%2_%3
    and  eax, 0x1f|(%2>>1)       ; offset of src1 within its cacheline
    cmp  eax, (32-%1)|(%2>>1)    ; does a %1-byte row cross the boundary?
    jle  x264_pixel_avg2_w%1_%3  ; no split: take the fast path
; w12 isn't needed because w16 is just as fast if there's no cacheline split
    jmp  x264_pixel_avg2_w16_cache_mmxext
    jmp  x264_pixel_avg2_w%1_cache_mmxext
%macro AVG_CACHELINE_START 0
%assign stack_offset 0
%macro AVG_CACHELINE_LOOP 2
    movq  mm2, [r2+r4+8+%1]
x264_pixel_avg2_w8_cache_mmxext:
    AVG_CACHELINE_LOOP 0, movq
x264_pixel_avg2_w16_cache_mmxext:
    AVG_CACHELINE_LOOP 0, movq
    AVG_CACHELINE_LOOP 8, movq
x264_pixel_avg2_w20_cache_mmxext:
    AVG_CACHELINE_LOOP 0,  movq
    AVG_CACHELINE_LOOP 8,  movq
    AVG_CACHELINE_LOOP 16, movd
AVG_CACHELINE_CHECK  8, 32, mmxext
AVG_CACHELINE_CHECK 12, 32, mmxext
AVG_CACHELINE_CHECK 16, 32, mmxext
AVG_CACHELINE_CHECK 20, 32, mmxext
AVG_CACHELINE_CHECK 16, 64, mmxext
AVG_CACHELINE_CHECK 20, 64, mmxext
AVG_CACHELINE_CHECK  8, 64, mmxext
AVG_CACHELINE_CHECK 12, 64, mmxext
AVG_CACHELINE_CHECK 16, 64, sse2
AVG_CACHELINE_CHECK 20, 64, sse2
;=============================================================================
; pixel copy
;=============================================================================
;-----------------------------------------------------------------------------
; void x264_mc_copy_w4_mmx( uint8_t *dst, int i_dst_stride,
;                           uint8_t *src, int i_src_stride, int i_height )
;-----------------------------------------------------------------------------
cglobal x264_mc_copy_w4_mmx, 4,6
    COPY4 movd, movd, r4, r5
    COPY4 movd, movd, r4, r5
cglobal x264_mc_copy_w8_mmx, 5,7
    COPY4 movq, movq, r5, r6
cglobal x264_mc_copy_w16_mmx, 5,7
    movq  mm5, [r2+r3*2+8]
    movq  [r0+r1*2+8], mm5
%macro COPY_W16_SSE2 2
    COPY4 movdqa, %2, r5, r6
COPY_W16_SSE2 x264_mc_copy_w16_sse2, movdqu
; cacheline split with mmx has too much overhead; the speed benefit is near-zero.
; but with SSE3's lddqu the overhead is zero, so there's no reason not to include it.
COPY_W16_SSE2 x264_mc_copy_w16_sse3, lddqu
COPY_W16_SSE2 x264_mc_copy_w16_aligned_sse2, movdqa
;=============================================================================
; prefetch
;=============================================================================
; FIXME assumes 64 byte cachelines
;-----------------------------------------------------------------------------
; void x264_prefetch_fenc_mmxext( uint8_t *pix_y,  int stride_y,
;                                 uint8_t *pix_uv, int stride_uv, int mb_x )
;-----------------------------------------------------------------------------
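; in C terms, roughly (gcc's __builtin_prefetch stands in for prefetcht0;
; the row count here is a sketch, the address math follows the code below):
;   uint8_t *p = pix_y + (mb_x&3)*4*stride_y + 64;
;   __builtin_prefetch( p );
;   __builtin_prefetch( p + stride_y );
;   /* ...more luma rows, then a couple of rows of pix_uv */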
cglobal x264_prefetch_fenc_mmxext, 5,5
    lea  r0, [r0+rax*4+64]
cglobal x264_prefetch_fenc_mmxext ; x86_32 variant
;-----------------------------------------------------------------------------
; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity )
;-----------------------------------------------------------------------------
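; analogous sketch: parity staggers the starting offset so alternating
; top/bottom reference accesses don't all hit the same lines, then a run of
; consecutive rows is warmed:
;   uint8_t *p = pix + offset( parity ) + 64;  /* offset() is schematic */
;   __builtin_prefetch( p );
;   __builtin_prefetch( p + stride );
;   /* ...and so on down the block */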
cglobal x264_prefetch_ref_mmxext, 3,3
;=============================================================================
; chroma MC
;=============================================================================
%macro MC_CHROMA_START 0
    add  r2, t0 ; src += (dx>>3) + (dy>>3) * src_stride
;-----------------------------------------------------------------------------
; void x264_mc_chroma_mmxext( uint8_t *dst, int dst_stride,
;                             uint8_t *src, int src_stride,
;                             int dx, int dy,
;                             int width, int height )
;-----------------------------------------------------------------------------
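; C reference for the bilinear filter implemented here (this is the standard
; H.264 chroma interpolation; the helper name is illustrative):
;
;   static void mc_chroma_ref( uint8_t *dst, int i_dst, uint8_t *src, int i_src,
;                              int dx, int dy, int width, int height )
;   {
;       src += (dy >> 3) * i_src + (dx >> 3);
;       dx &= 7;
;       dy &= 7;
;       int cA = (8-dx)*(8-dy), cB = dx*(8-dy), cC = (8-dx)*dy, cD = dx*dy;
;       for( int y = 0; y < height; y++, dst += i_dst, src += i_src )
;           for( int x = 0; x < width; x++ )
;               dst[x] = ( cA*src[x]       + cB*src[x+1] +
;                          cC*src[x+i_src] + cD*src[x+i_src+1] + 32 ) >> 6;
;   }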
%macro MC_CHROMA 1-2 0
cglobal x264_mc_chroma_%1
    jle  x264_mc_chroma_mmxext
    SPLATW m5, m5        ; m5 = dx
    SPLATW m6, m6        ; m6 = dy
    mova   m4, [pw_8 GLOBAL]
    psubw  m4, m5        ; m4 = 8-dx
    psubw  m0, m6        ; m0 = 8-dy
    pmullw m5, m0        ; m5 = dx*(8-dy) = cB
    pmullw m7, m6        ; m7 = dx*dy     = cD
    pmullw m6, m4        ; m6 = (8-dx)*dy = cC
    pmullw m4, m0        ; m4 = (8-dx)*(8-dy) = cA
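; so each output pixel is the usual bilinear blend
;   dst = ( cA*a + cB*b + cC*c + cD*d + 32 ) >> 6
; where a,b are the two pixels on the current source line and c,d the two
; directly below them.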
    punpcklbw m1, m3     ; 00 px1 | 00 px2 | 00 px3 | 00 px4
    pmullw m1, m6        ; 2nd line * cC
    pmullw m0, m4        ; 1st line * cA
    paddw  m0, m1        ; m0 <- result
    paddw  m0, [pw_32 GLOBAL]
    pmullw m2, m5        ; line * cB
    pmullw m1, m7        ; line * cD
    packuswb m0, m3      ; 00 00 00 00 px1 px2 px3 px4
    add  r0, r1          ; dst += dst_stride
    jnz  .finish         ; width != 8 so assume 4
    lea  r0, [r10+4]     ; dst
    lea  r2, [r11+4]     ; src
    mov  r4d, r7m        ; height
    mov  r5, r3          ; pel_offset = dx ? 1 : src_stride
    mova m5, [pw_8 GLOBAL]
    mova m7, [pw_4 GLOBAL]
%endmacro ; MC_CHROMA
%macro MC_CHROMA_SSSE3 2
cglobal x264_mc_chroma_ssse3%1, 0,6,%2
    imul r5d, t0d ; (x*255+8)*y
    imul r4d, t0d ; (x*255+8)*(8-y)
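; the multiplier trick: x*255+8 == (x<<8) + (8-x) for 0 <= x < 8, so the
; 16-bit value already holds the byte pair (8-x, x).  scaling it by y and by
; (8-y) produces the packed (cC,cD) and (cA,cB) coefficient pairs that
; pmaddubsw applies to interleaved pixel pairs.  e.g. x=3:
;   3*255+8 = 773 = 0x0305 -> low byte 5 = 8-x, high byte 3 = x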
    mova m5, [pw_32 GLOBAL]
    punpcklbw m3, [r2+r3+1]
    mova m5, [pw_32 GLOBAL]
    lea  r11, [ch_shuffle GLOBAL] ; x86_64: rip-relative base needs a register
    movu m5, [r11 + r5*2]
    movu m5, [ch_shuffle + r5*2 GLOBAL] ; x86_32
    mova m8, [pw_32 GLOBAL]
%define round [pw_32 GLOBAL]
; mc_chroma 1d ssse3 is negligibly faster, and definitely not worth the extra code size
MC_CHROMA_SSSE3 _cache64, 9