1 ;*****************************************************************************
2 ;* mc-a.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2003-2008 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Laurent Aimar <fenrir@via.ecp.fr>
8 ;* Fiona Glaser <fiona@x264.com>
9 ;* Min Chen <chenm001@163.com>
11 ;* This program is free software; you can redistribute it and/or modify
12 ;* it under the terms of the GNU General Public License as published by
13 ;* the Free Software Foundation; either version 2 of the License, or
14 ;* (at your option) any later version.
16 ;* This program is distributed in the hope that it will be useful,
17 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 ;* GNU General Public License for more details.
21 ;* You should have received a copy of the GNU General Public License
22 ;* along with this program; if not, write to the Free Software
23 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24 ;*****************************************************************************
38 ;=============================================================================
40 ;=============================================================================
41 ; implicit bipred only:
42 ; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
95 %macro BIWEIGHT_START 0
; Set up the constant registers for implicit bipred weighting.
; Per the note above: log2_denom = 5, offset = 0, weight1 + weight2 = 64,
; so weight_src can be derived as 64 - weight_dst.
; NOTE(review): this excerpt elides original lines (e.g. 96, 101-106);
; the macro body shown here is incomplete — presumably the missing lines
; load the weight into m4 and close the macro. TODO confirm against the
; full file.
97     SPLATW m4, m4 ; weight_dst
98     mova m5, [pw_64 GLOBAL]
99     psubw m5, m4 ; weight_src
100     mova m6, [pw_32 GLOBAL] ; rounding
107 ;-----------------------------------------------------------------------------
108 ; void x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight )
109 ;-----------------------------------------------------------------------------
111 AVG_START x264_pixel_avg_weight_w%2_%1
115 BIWEIGHT [t0+x], [t2+x], [t4+x]
116 BIWEIGHT [t0+x+t1], [t2+x+t3], [t4+x+t5]
129 AVG_WEIGHT mmxext, 16
136 ;=============================================================================
138 ;=============================================================================
140 ;-----------------------------------------------------------------------------
141 ; void x264_pixel_avg_4x4_mmxext( uint8_t *dst, int dst_stride,
142 ; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride, int weight );
143 ;-----------------------------------------------------------------------------
145 cglobal x264_pixel_avg_%1x%2_%3,0,0
148 jne x264_pixel_avg_weight_w%1_mmxext
149 %if mmsize == 16 && %1 == 16
151 jz x264_pixel_avg_w%1_sse2
153 jmp x264_pixel_avg_w%1_mmxext
156 ;-----------------------------------------------------------------------------
157 ; void x264_pixel_avg_w4_mmxext( uint8_t *dst, int dst_stride,
158 ; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride,
159 ; int height, int weight );
160 ;-----------------------------------------------------------------------------
173 AVG_START x264_pixel_avg_w4_mmxext
187 AVG_START x264_pixel_avg_w8_mmxext
201 AVG_START x264_pixel_avg_w16_mmxext
220 AVG_START x264_pixel_avg_w16_sse2
239 ;=============================================================================
241 ;=============================================================================
243 ;-----------------------------------------------------------------------------
244 ; void x264_pixel_avg2_w4_mmxext( uint8_t *dst, int dst_stride,
245 ; uint8_t *src1, int src_stride,
246 ; uint8_t *src2, int height );
247 ;-----------------------------------------------------------------------------
249 cglobal x264_pixel_avg2_w%1_mmxext, 6,7
270 cglobal x264_pixel_avg2_w%1_mmxext, 6,7
296 cglobal x264_pixel_avg2_w20_mmxext, 6,7
308 pavgb mm2, [r2+r4+16]
311 pavgb mm5, [r2+r6+16]
324 cglobal x264_pixel_avg2_w16_sse2, 6,7
342 cglobal x264_pixel_avg2_w20_sse2, 6,7
354 pavgb mm4, [r2+r4+16]
355 pavgb mm5, [r2+r6+16]
366 ; Cacheline split code for processors with high latencies for loads
367 ; split over cache lines. See sad-a.asm for a more detailed explanation.
368 ; This particular instance is complicated by the fact that src1 and src2
369 ; can have different alignments. For simplicity and code size, only the
370 ; MMX cacheline workaround is used. As a result, in the case of SSE2
371 ; pixel_avg, the cacheline check function calls the SSE2 version if there
372 ; is no cacheline split, and the MMX workaround if there is.
377 movd %1, [sw_64 GLOBAL]
382 %macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
383 cglobal x264_pixel_avg2_w%1_cache%2_%3, 0,0
; eax = src offset within a %2-byte cacheline (the |(%2>>1) bit folds the
; 64-byte-line case into the same 0x1f mask — see the comment block above).
; NOTE(review): the line loading the src address into eax (original 384) is
; elided from this excerpt — TODO confirm against the full file.
385     and     eax, 0x1f|(%2>>1)
386     cmp     eax, (32-%1)|(%2>>1)
; No cacheline split: tail-jump to the plain (fast) version for this isa.
387     jle x264_pixel_avg2_w%1_%3
388 ;w12 isn't needed because w16 is just as fast if there's no cacheline split
; Split detected: fall back to the MMX cacheline workaround. The two jmps
; below are presumably selected by conditional assembly on the width
; (original lines 389/391 are elided) — TODO confirm.
390     jmp x264_pixel_avg2_w16_cache_mmxext
392     jmp x264_pixel_avg2_w%1_cache_mmxext
396 %macro AVG_CACHELINE_START 0
397 %assign stack_offset 0
408 %macro AVG_CACHELINE_LOOP 2
411 movq mm2, [r2+r4+8+%1]
423 x264_pixel_avg2_w8_cache_mmxext:
425 AVG_CACHELINE_LOOP 0, movq
432 x264_pixel_avg2_w16_cache_mmxext:
434 AVG_CACHELINE_LOOP 0, movq
435 AVG_CACHELINE_LOOP 8, movq
442 x264_pixel_avg2_w20_cache_mmxext:
444 AVG_CACHELINE_LOOP 0, movq
445 AVG_CACHELINE_LOOP 8, movq
446 AVG_CACHELINE_LOOP 16, movd
454 AVG_CACHELINE_CHECK 8, 32, mmxext
455 AVG_CACHELINE_CHECK 12, 32, mmxext
456 AVG_CACHELINE_CHECK 16, 32, mmxext
457 AVG_CACHELINE_CHECK 20, 32, mmxext
458 AVG_CACHELINE_CHECK 16, 64, mmxext
459 AVG_CACHELINE_CHECK 20, 64, mmxext
462 AVG_CACHELINE_CHECK 8, 64, mmxext
463 AVG_CACHELINE_CHECK 12, 64, mmxext
464 AVG_CACHELINE_CHECK 16, 64, sse2
465 AVG_CACHELINE_CHECK 20, 64, sse2
467 ;=============================================================================
469 ;=============================================================================
483 ;-----------------------------------------------------------------------------
484 ; void x264_mc_copy_w4_mmx( uint8_t *dst, int i_dst_stride,
485 ; uint8_t *src, int i_src_stride, int i_height )
486 ;-----------------------------------------------------------------------------
487 cglobal x264_mc_copy_w4_mmx, 4,6
492 COPY4 movd, movd, r4, r5
496 COPY4 movd, movd, r4, r5
499 cglobal x264_mc_copy_w8_mmx, 5,7
503 COPY4 movq, movq, r5, r6
510 cglobal x264_mc_copy_w16_mmx, 5,7
519 movq mm5, [r2+r3*2+8]
527 movq [r0+r1*2+8], mm5
537 %macro COPY_W16_SSE2 2
542 COPY4 movdqa, %2, r5, r6
550 COPY_W16_SSE2 x264_mc_copy_w16_sse2, movdqu
551 ; cacheline split with mmx has too much overhead; the speed benefit is near-zero.
552 ; but with SSE3 the overhead is zero, so there's no reason not to include it.
553 COPY_W16_SSE2 x264_mc_copy_w16_sse3, lddqu
554 COPY_W16_SSE2 x264_mc_copy_w16_aligned_sse2, movdqa
558 ;=============================================================================
560 ;=============================================================================
561 ; FIXME assumes 64 byte cachelines
563 ;-----------------------------------------------------------------------------
564 ; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y,
565 ; uint8_t *pix_uv, int stride_uv, int mb_x )
566 ;-----------------------------------------------------------------------------
568 cglobal x264_prefetch_fenc_mmxext, 5,5
572 lea r0, [r0+rax*4+64]
587 cglobal x264_prefetch_fenc_mmxext
611 ;-----------------------------------------------------------------------------
612 ; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity )
613 ;-----------------------------------------------------------------------------
614 cglobal x264_prefetch_ref_mmxext, 3,3
632 ;=============================================================================
634 ;=============================================================================
644 %macro MC_CHROMA_START 0
656 add r2, t0 ; src += (dx>>3) + (dy>>3) * src_stride
659 ;-----------------------------------------------------------------------------
660 ; void x264_mc_chroma_mmxext( uint8_t *dst, int dst_stride,
661 ; uint8_t *src, int src_stride,
663 ; int width, int height )
664 ;-----------------------------------------------------------------------------
666 cglobal x264_mc_chroma_%1, 0,6
669 jle x264_mc_chroma_mmxext %+ .skip_prologue
681 SPLATW m5, m5 ; m5 = dx
682 SPLATW m6, m6 ; m6 = dy
684 mova m4, [pw_8 GLOBAL]
686 psubw m4, m5 ; m4 = 8-dx
687 psubw m0, m6 ; m0 = 8-dy
690 pmullw m5, m0 ; m5 = dx*(8-dy) = cB
691 pmullw m7, m6 ; m7 = dx*dy = cD
692 pmullw m6, m4 ; m6 = (8-dx)*dy = cC
693 pmullw m4, m0 ; m4 = (8-dx)*(8-dy) = cA
708 punpcklbw m1, m3 ; 00 px1 | 00 px2 | 00 px3 | 00 px4
710 pmullw m1, m6 ; 2nd line * cC
711 pmullw m0, m4 ; 1st line * cA
712 paddw m0, m1 ; m0 <- result
719 paddw m0, [pw_32 GLOBAL]
721 pmullw m2, m5 ; line * cB
722 pmullw m1, m7 ; line * cD
727 packuswb m0, m3 ; 00 00 00 00 px1 px2 px3 px4
731 add r0, r1 ; dst_stride
737 jnz .finish ; width != 8 so assume 4
739 lea r0, [r10+4] ; dst
740 lea r2, [r11+4] ; src
746 mov r4d, r7m ; height
755 mov r5, r3 ; pel_offset = dx ? 1 : src_stride
761 mova m5, [pw_8 GLOBAL]
763 mova m7, [pw_4 GLOBAL]
820 %endmacro ; MC_CHROMA
828 cglobal x264_mc_chroma_ssse3, 0,6
838 imul r5d, t0d ; (x*255+8)*y
839 imul r4d, t0d ; (x*255+8)*(8-y)
842 mova m5, [pw_32 GLOBAL]
857 punpcklbw m3, [r2+r3+1]
883 mova m5, [pw_32 GLOBAL]
924 ; mc_chroma 1d ssse3 is negligibly faster, and definitely not worth the extra code size