;*****************************************************************************
;* mc-a.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Dylan Yudaken <dyudaken@gmail.com>
;*          Min Chen <chenm001@163.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************
ch_shuffle: db 0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,0,0
;=============================================================================
; implicit weighted biprediction
;=============================================================================
; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
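; C reference for one pixel (a sketch following directly from the assumptions
; above; pw_32 supplies the rounding term and the final shift is 6):
;   dst[x] = ( src1[x]*weight1 + src2[x]*weight2 + 32 ) >> 6;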
DECLARE_REG_TMP 0,1,2,3,4,5,10,11
%macro AVG_START 0-1 0
DECLARE_REG_TMP 1,2,3,4,5,6,1,2
%macro AVG_START 0-1 0
    pshuflw    %1, %2, %3*0x55
    pshufw     %1, %2, %3*0x55
%macro BIWEIGHT_START_MMX 0
    SPLATW     m2, m2         ; weight_dst
    psubw      m3, m2         ; weight_src
    mova       m4, [pw_32]    ; rounding
%macro BIWEIGHT_SSSE3 2
%macro BIWEIGHT_START_SSSE3 0
    movzx      t6d, byte r6m  ; FIXME x86_64
    SPLATW     m3, m3         ; weight_dst,src
%macro BIWEIGHT_ROW 4
    BIWEIGHT   [%2+mmsize/2], [%3+mmsize/2]
;-----------------------------------------------------------------------------
; void x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight )
;-----------------------------------------------------------------------------
%macro AVG_WEIGHT 2-3 0
cglobal x264_pixel_avg_weight_w%2_%1
%if %2==8 && mmsize==16
    BIWEIGHT   [t2+t3], [t4+t5]
    BIWEIGHT_ROW t0+x,    t2+x,    t4+x,    %2
    BIWEIGHT_ROW t0+x+t1, t2+x+t3, t4+x+t5, %2
%define BIWEIGHT BIWEIGHT_MMX
%define BIWEIGHT_START BIWEIGHT_START_MMX
AVG_WEIGHT mmxext, 16
%define x264_pixel_avg_weight_w4_sse2 x264_pixel_avg_weight_w4_mmxext
AVG_WEIGHT sse2, 8, 7
AVG_WEIGHT sse2, 16, 7
%define BIWEIGHT BIWEIGHT_SSSE3
%define BIWEIGHT_START BIWEIGHT_START_SSSE3
AVG_WEIGHT ssse3, 8, 7
AVG_WEIGHT ssse3, 16, 7
;=============================================================================
; P frame explicit weighted prediction
;=============================================================================
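; C reference for one pixel (a sketch; as the m6 comments below show, the
; offset is folded into the rounding constant before the shift, and
; clip_uint8() stands for the clamp to [0,255] done via packuswb):
;   dst[x] = clip_uint8( ( src[x]*scale + (1<<(denom-1)) + (offset<<denom) )
;                        >> denom );
; where scale/offset/denom come from the x264_weight_t argument.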
%macro WEIGHT_START 1
%if (%1 == 20 || %1 == 12) && mmsize == 16
%macro WEIGHT_START_SSSE3 1
%if %1 == 20 || %1 == 12
;; macro to weight mmsize bytes taking half from %1 and half from %2
%macro WEIGHT 2 ; (src1,src2)
    punpcklbw  m0, m2  ; setup
    punpcklbw  m1, m2  ; setup
    paddsw     m0, m6  ; 1<<(denom-1)+(offset<<denom)
    paddsw     m1, m6  ; 1<<(denom-1)+(offset<<denom)
%macro WEIGHT_SSSE3 2
%macro WEIGHT_SAVE_ROW 3 ; (src,dst,width)
    movd       [%2], %1  ; width 2 can write garbage for last 2 bytes
%macro WEIGHT_ROW 3 ; (src,dst,width)
    WEIGHT     %1, (%1+(mmsize/2))
    packuswb   m0, m1  ; put bytes into m0
    WEIGHT_SAVE_ROW m0, %2, %3
%macro WEIGHT_SAVE_COL 2 ; (dst,size)
    movd       [%1], m0  ; width 2 can write garbage for last 2 bytes
%macro WEIGHT_COL 3 ; (src,dst,width)
%if %3 <= 4 && mmsize == 16
    WEIGHT_SAVE_COL %2, %3
    WEIGHT_SAVE_COL %2, %3
%macro WEIGHT_TWO_ROW 3 ; (src,dst,width)
    WEIGHT_ROW (%1+x),    (%2+x),    mmsize  ; weight 1 mmsize
    WEIGHT_ROW (%1+r3+x), (%2+r1+x), mmsize  ; weight 1 mmsize
    WEIGHT_COL (%1+x), (%2+x), (%3-x)
;void x264_mc_weight_wX( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride, x264_weight_t *weight, int h )
%define HEIGHT_REG r5d
%define LOAD_HEIGHT mov r4d, r5m
%define HEIGHT_REG r4d
cglobal x264_mc_weight_w%1_%2, NUMREGS, NUMREGS, 7
    WEIGHT_TWO_ROW r2, r0, %1
%define WEIGHT WEIGHT_SSSE3
%define WEIGHT_START WEIGHT_START_SSSE3
%macro OFFSET_TWO_ROW 4
    OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, u, a
    OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, d, d
;void x264_mc_offset_wX( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride, x264_weight_t *w, int h )
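; C reference (a sketch): the offset-only special case of weighted
; prediction, done with saturating byte adds/subtracts (the u,a / d,d
; OFFSET_OP variants above select the add and subtract flavors):
;   dst[x] = clip_uint8( src[x] + offset );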
cglobal x264_mc_offset%3_w%1_%2, NUMREGS, NUMREGS
    OFFSET_TWO_ROW r2, r0, %1, %3
;=============================================================================
; pixel avg
;=============================================================================
;-----------------------------------------------------------------------------
; void x264_pixel_avg_4x4_mmxext( uint8_t *dst, int dst_stride,
;                                 uint8_t *src1, int src1_stride,
;                                 uint8_t *src2, int src2_stride, int weight );
;-----------------------------------------------------------------------------
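; Dispatch sketch: with weight1+weight2 == 64, a weight of 32 is an even
; blend, so the unweighted pavgb path suffices; any other weight tail-calls
; the weighted version above. Roughly:
;   if( weight != 32 ) goto x264_pixel_avg_weight_wN;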
cglobal x264_pixel_avg_%1x%2_%3
    jne x264_pixel_avg_weight_w%1_%3
%if mmsize == 16 && %1 == 16
    jz x264_pixel_avg_w%1_sse2
    jmp x264_pixel_avg_w%1_mmxext
;-----------------------------------------------------------------------------
; void x264_pixel_avg_w4_mmxext( uint8_t *dst, int dst_stride,
;                                uint8_t *src1, int src1_stride,
;                                uint8_t *src2, int src2_stride,
;                                int height, int weight );
;-----------------------------------------------------------------------------
AVG_FUNC x264_pixel_avg_w4_mmxext, movd, movd
AVG_FUNC x264_pixel_avg_w8_mmxext, movq, movq
cglobal x264_pixel_avg_w16_mmxext
AVG_FUNC x264_pixel_avg_w16_sse2, movdqu, movdqa
;=============================================================================
; pixel avg2
;=============================================================================
;-----------------------------------------------------------------------------
; void x264_pixel_avg2_w4_mmxext( uint8_t *dst, int dst_stride,
;                                 uint8_t *src1, int src_stride,
;                                 uint8_t *src2, int height );
;-----------------------------------------------------------------------------
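; C reference (a sketch; pavgb rounds up, and src1/src2 share one stride):
;   dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;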
cglobal x264_pixel_avg2_w%1_mmxext, 6,7
cglobal x264_pixel_avg2_w%1_mmxext, 6,7
cglobal x264_pixel_avg2_w20_mmxext, 6,7
    pavgb  mm2, [r2+r4+16]
    pavgb  mm5, [r2+r6+16]
cglobal x264_pixel_avg2_w16_sse2, 6,7
cglobal x264_pixel_avg2_w20_%1, 6,7
%ifidn %1, sse2_misalign
    pavgb  mm4, [r2+r4+16]
    pavgb  mm5, [r2+r6+16]
AVG2_W20 sse2_misalign
; Cacheline split code for processors with high latencies for loads
; split over cache lines. See sad-a.asm for a more detailed explanation.
; This particular instance is complicated by the fact that src1 and src2
; can have different alignments. For simplicity and code size, only the
; MMX cacheline workaround is used. As a result, in the case of SSE2
; pixel_avg, the cacheline check function calls the SSE2 version if there
; is no cacheline split, and the MMX workaround if there is.
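; In rough C terms the check below reads (a sketch; the asm packs the
; cacheline size into the mask and compare constants so one sequence
; serves both 32- and 64-byte lines):
;   if( (src & (cacheline-1)) <= cacheline - width )
;       goto plain avg2;                  /* load can't straddle a line */
;   /* else fall through to the MMX cacheline workaround */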
%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
cglobal x264_pixel_avg2_w%1_cache%2_%3
    and    eax, 0x1f|(%2>>1)
    cmp    eax, (32-%1)|(%2>>1)
    jle x264_pixel_avg2_w%1_%3
; w12 isn't needed because w16 is just as fast if there's no cacheline split
    jmp x264_pixel_avg2_w16_cache_mmxext
    jmp x264_pixel_avg2_w%1_cache_mmxext
%macro AVG_CACHELINE_START 0
%assign stack_offset 0
%macro AVG_CACHELINE_LOOP 2
    movq   mm2, [r2+r4+8+%1]
x264_pixel_avg2_w8_cache_mmxext:
    AVG_CACHELINE_LOOP 0, movq
x264_pixel_avg2_w16_cache_mmxext:
    AVG_CACHELINE_LOOP 0, movq
    AVG_CACHELINE_LOOP 8, movq
x264_pixel_avg2_w20_cache_mmxext:
    AVG_CACHELINE_LOOP 0, movq
    AVG_CACHELINE_LOOP 8, movq
    AVG_CACHELINE_LOOP 16, movd
AVG_CACHELINE_CHECK  8, 32, mmxext
AVG_CACHELINE_CHECK 12, 32, mmxext
AVG_CACHELINE_CHECK 16, 32, mmxext
AVG_CACHELINE_CHECK 20, 32, mmxext
AVG_CACHELINE_CHECK 16, 64, mmxext
AVG_CACHELINE_CHECK 20, 64, mmxext
AVG_CACHELINE_CHECK  8, 64, mmxext
AVG_CACHELINE_CHECK 12, 64, mmxext
AVG_CACHELINE_CHECK 16, 64, sse2
AVG_CACHELINE_CHECK 20, 64, sse2
; computed jump assumes this loop is exactly 48 bytes
%macro AVG16_CACHELINE_LOOP_SSSE3 2 ; (src1 alignment, src2 alignment)
avg_w16_align%1_%2_ssse3:
    palignr xmm1, [r2], %1
    movdqa  xmm2, [r2+r4+16]
    palignr xmm1, [r2], %1
    palignr xmm2, [r2+r4], %2
    jg avg_w16_align%1_%2_ssse3
AVG16_CACHELINE_LOOP_SSSE3 j, j
AVG16_CACHELINE_LOOP_SSSE3 j, k
cglobal x264_pixel_avg2_w16_cache64_ssse3
    jle x264_pixel_avg2_w16_sse2
    lea    r6, [r6*3]  ; (offset + align*2)*3
    shl    r6, 4       ; jump = (offset + align*2)*48
%define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3)
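; avg_w16_align1_1 is entry 1+1*2 = 3 of the 48-byte-per-entry jump table,
; and avg_w16_align2_2 (entry 2+2*2 = 6) sits 3*48 bytes later, so
; subtracting their difference rebases the label to where entry 0 would be.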
    lea   r11, [avg_w16_addr]
    lea    r6, [avg_w16_addr + r6]
;=============================================================================
; pixel copy
;=============================================================================
;-----------------------------------------------------------------------------
; void x264_mc_copy_w4_mmx( uint8_t *dst, int i_dst_stride,
;                           uint8_t *src, int i_src_stride, int i_height )
;-----------------------------------------------------------------------------
cglobal x264_mc_copy_w4_mmx, 4,6
    COPY4 movd, movd, r4, r5
    COPY4 movd, movd, r4, r5
cglobal x264_mc_copy_w8_mmx, 5,7
    COPY4 movq, movq, r5, r6
cglobal x264_mc_copy_w16_mmx, 5,7
    movq   mm5, [r2+r3*2+8]
    movq   [r0+r1*2+8], mm5
%macro COPY_W16_SSE2 2
    COPY4 movdqa, %2, r5, r6
COPY_W16_SSE2 x264_mc_copy_w16_sse2, movdqu
; cacheline split with mmx has too much overhead; the speed benefit is near-zero.
; but with SSE3 the overhead is zero, so there's no reason not to include it.
COPY_W16_SSE2 x264_mc_copy_w16_sse3, lddqu
COPY_W16_SSE2 x264_mc_copy_w16_aligned_sse2, movdqa
;=============================================================================
; prefetch
;=============================================================================
; FIXME assumes 64 byte cachelines
;-----------------------------------------------------------------------------
; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y,
;                                 uint8_t *pix_uv, int stride_uv, int mb_x )
;-----------------------------------------------------------------------------
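; Rough C equivalent of the luma half (a sketch; the chroma half is
; analogous, and the hardcoded +64 is the cacheline assumption flagged in
; the FIXME above):
;   uint8_t *p = pix_y + (mb_x&3)*4*stride_y + 64;
;   for( int i = 0; i < 4; i++ )
;       __builtin_prefetch( p + i*stride_y );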
cglobal x264_prefetch_fenc_mmxext, 5,5
    lea    r0, [r0+rax*4+64]
cglobal x264_prefetch_fenc_mmxext
;-----------------------------------------------------------------------------
; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity )
;-----------------------------------------------------------------------------
cglobal x264_prefetch_ref_mmxext, 3,3
;=============================================================================
; chroma MC
;=============================================================================
%macro MC_CHROMA_START 0
    add    r2, t0      ; src += (dx>>3) + (dy>>3) * src_stride
;-----------------------------------------------------------------------------
; void x264_mc_chroma_mmxext( uint8_t *dst, int dst_stride,
;                             uint8_t *src, int src_stride,
;                             int dx, int dy,
;                             int width, int height )
;-----------------------------------------------------------------------------
%macro MC_CHROMA 1-2 0
cglobal x264_mc_chroma_%1
    jle x264_mc_chroma_mmxext
    and    r4d, 7      ; dx &= 7
    and    r5d, 7      ; dy &= 7
    SPLATW m5, m5      ; m5 = dx
    SPLATW m6, m6      ; m6 = dy
    psubw  m4, m5      ; m4 = 8-dx
    psubw  m0, m6      ; m0 = 8-dy
    pmullw m5, m0      ; m5 = dx*(8-dy)     = cB
    pmullw m7, m6      ; m7 = dx*dy         = cD
    pmullw m6, m4      ; m6 = (8-dx)*dy     = cC
    pmullw m4, m0      ; m4 = (8-dx)*(8-dy) = cA
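; i.e. the standard H.264 chroma bilinear filter, in C (a sketch; pw_32 and
; a final shift by 6 supply the rounding before the packuswb below):
;   dst[x] = ( cA*src[x]        + cB*src[x+1]
;            + cC*src[x+stride] + cD*src[x+stride+1] + 32 ) >> 6;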
    punpcklbw m1, m3   ; 00 px1 | 00 px2 | 00 px3 | 00 px4
    pmullw m1, m6      ; 2nd line * cC
    pmullw m0, m4      ; 1st line * cA
    paddw  m0, m1      ; m0 <- result
    pmullw m2, m5      ; line * cB
    pmullw m1, m7      ; line * cD
    packuswb m0, m3    ; 00 00 00 00 px1 px2 px3 px4
    add    r0, r1      ; dst_stride
    jnz .finish        ; width != 8 so assume 4
    lea    r0, [r10+4] ; dst
    lea    r2, [r11+4] ; src
    mov    r4d, r7m    ; height
    mov    r5, r3      ; pel_offset = dx ? 1 : src_stride
%endmacro ; MC_CHROMA
%macro MC_CHROMA_SSSE3 2
cglobal x264_mc_chroma_ssse3%1, 0,6,%2
    imul   r5d, t0d    ; (x*255+8)*y
    imul   r4d, t0d    ; (x*255+8)*(8-y)
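; This factorization packs two bilinear weights into each product:
;   (x*255+8)*(8-y) = (8-x)*(8-y) + (x*(8-y) << 8) = cA + (cB<<8)
;   (x*255+8)*y     = (8-x)*y     + (x*y     << 8) = cC + (cD<<8)
; i.e. byte-interleaved coefficients ready for pmaddubsw.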
    lea   r11, [ch_shuffle]
    movu   m5, [r11 + r5*2]
    movu   m5, [ch_shuffle + r5*2]
    movh   m2, [r2+1*r3+1]
    movh   m4, [r2+2*r3+1]
    lea   r11, [ch_shuffle]
    movu   m5, [r11 + r5*2]
    movu   m5, [ch_shuffle + r5*2]
%define round [pw_32]
; mc_chroma 1d ssse3 is negligibly faster, and definitely not worth the extra code size
MC_CHROMA_SSSE3 _cache64, 9