1 ;*****************************************************************************
2 ;* mc.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2003 x264 project
5 ;* $Id: mc.asm,v 1.3 2004/06/18 01:59:58 chenm001 Exp $
7 ;* Authors: Min Chen <chenm001@163.com> (converted to nasm)
8 ;* Laurent Aimar <fenrir@via.ecp.fr> (init algorithm)
10 ;* This program is free software; you can redistribute it and/or modify
11 ;* it under the terms of the GNU General Public License as published by
12 ;* the Free Software Foundation; either version 2 of the License, or
13 ;* (at your option) any later version.
15 ;* This program is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;* GNU General Public License for more details.
20 ;* You should have received a copy of the GNU General Public License
21 ;* along with this program; if not, write to the Free Software
22 ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
23 ;*****************************************************************************
25 ;*****************************************************************************
27 ;* Revision history: *
29 ;* 2004.05.17 portab mc_copy_w4/8/16 (CM) *
31 ;*****************************************************************************
35 ;=============================================================================
36 ; Macros and other preprocessor constants
37 ;=============================================================================
39 %include "amd64inc.asm"
41 ;=============================================================================
43 ;=============================================================================
53 ;=============================================================================
55 ;=============================================================================
59 cglobal x264_pixel_avg_w4_mmxext
60 cglobal x264_pixel_avg_w8_mmxext
61 cglobal x264_pixel_avg_w16_mmxext
62 cglobal x264_pixel_avg_w16_sse2
64 cglobal x264_pixel_avg_weight_4x4_mmxext
65 cglobal x264_pixel_avg_weight_w8_mmxext
66 cglobal x264_pixel_avg_weight_w16_mmxext
68 cglobal x264_mc_copy_w4_mmxext
69 cglobal x264_mc_copy_w8_mmxext
70 cglobal x264_mc_copy_w16_mmxext
71 cglobal x264_mc_copy_w16_sse2
73 cglobal x264_mc_chroma_mmxext
75 ;=============================================================================
77 ;=============================================================================
80 ;-----------------------------------------------------------------------------
81 ; void x264_pixel_avg_w4_mmxext( uint8_t *dst, int i_dst_stride,
82 ; uint8_t *src1, int i_src1_stride,
83 ; uint8_t *src2, int i_src2_stride,
85 ;-----------------------------------------------------------------------------
86 x264_pixel_avg_w4_mmxext:
87 mov r10, parm5q ; src2
88 movsxd r11, parm6d ; i_src2_stride
89 movsxd rax, parm7d ; i_height
95 movd mm1, [parm3q+parm4q]
98 movd [parm1q+parm2q], mm1
101 lea parm3q, [parm3q+parm4q*2]
103 lea parm1q, [parm1q+parm2q*2]
111 ;-----------------------------------------------------------------------------
112 ; void x264_pixel_avg_w8_mmxext( uint8_t *dst, int i_dst_stride,
113 ; uint8_t *src1, int i_src1_stride,
114 ; uint8_t *src2, int i_src2_stride,
116 ;-----------------------------------------------------------------------------
117 x264_pixel_avg_w8_mmxext:
119 mov r10, parm5q ; src2
120 movsxd r11, parm6d ; i_src2_stride
121 movsxd rax, parm7d ; i_height
129 lea parm3q, [parm3q+parm4q]
131 lea parm1q, [parm1q+parm2q]
137 ;-----------------------------------------------------------------------------
138 ; void x264_pixel_avg_w16_mmxext( uint8_t *dst, int i_dst_stride,
139 ; uint8_t *src1, int i_src1_stride,
140 ; uint8_t *src2, int i_src2_stride,
142 ;-----------------------------------------------------------------------------
143 x264_pixel_avg_w16_mmxext:
144 mov r10, parm5q ; src2
145 movsxd r11, parm6d ; i_src2_stride
146 movsxd rax, parm7d ; i_height
157 lea parm3q, [parm3q+parm4q]
159 lea parm1q, [parm1q+parm2q]
165 ;-----------------------------------------------------------------------------
166 ; void x264_pixel_avg_w16_sse2( uint8_t *dst, int i_dst_stride,
167 ; uint8_t *src1, int i_src1_stride,
168 ; uint8_t *src2, int i_src2_stride,
170 ;-----------------------------------------------------------------------------
171 x264_pixel_avg_w16_sse2:
172 mov r10, parm5q ; src2
173 movsxd r11, parm6d ; i_src2_stride
174 movsxd rax, parm7d ; i_height
178 movdqu xmm0, [parm3q]
180 movdqu [parm1q], xmm0
183 lea parm3q, [parm3q+parm4q]
185 lea parm1q, [parm1q+parm2q]
192 ;=============================================================================
193 ; weighted prediction
194 ;=============================================================================
195 ; implicit bipred only:
196 ; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
198 %macro BIWEIGHT_4P_MMX 2
213 %macro BIWEIGHT_START_MMX 0
215 ; movsxd rsi, esi ; i_dst
217 ; movsxd rcx, ecx ; i_src
218 ; movsxd r8, r8d ; i_weight_dst
219 ; movsxd r9, r9d ; i_height
220 mov r11d, parm6d ; i_height
223 pshufw mm4, mm4, 0 ; weight_dst
224 movq mm5, [pw_64 GLOBAL]
225 psubw mm5, mm4 ; weight_src
226 movq mm6, [pw_32 GLOBAL] ; rounding
234 ;-----------------------------------------------------------------------------
235 ; int x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src, int, int i_weight, int )
236 ;-----------------------------------------------------------------------------
237 x264_pixel_avg_weight_w16_mmxext:
240 BIWEIGHT_4P_MMX [parm1q ], [parm3q ]
241 BIWEIGHT_4P_MMX [parm1q+ 4], [parm3q+ 4]
242 BIWEIGHT_4P_MMX [parm1q+ 8], [parm3q+ 8]
243 BIWEIGHT_4P_MMX [parm1q+12], [parm3q+12]
252 ;-----------------------------------------------------------------------------
253 ; int x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int )
254 ;-----------------------------------------------------------------------------
255 x264_pixel_avg_weight_w8_mmxext:
258 BIWEIGHT_4P_MMX [parm1q ], [parm3q ]
259 BIWEIGHT_4P_MMX [parm1q+4 ], [parm3q+4 ]
260 BIWEIGHT_4P_MMX [parm1q+parm2q ], [parm3q+parm4q ]
261 BIWEIGHT_4P_MMX [parm1q+parm2q+4], [parm3q+parm4q+4]
263 lea parm1q, [parm1q+parm2q*2]
264 lea parm3q, [parm3q+parm4q*2]
270 ;-----------------------------------------------------------------------------
271 ; int x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int )
272 ;-----------------------------------------------------------------------------
273 x264_pixel_avg_weight_4x4_mmxext:
275 BIWEIGHT_4P_MMX [parm1q ], [parm3q ]
276 BIWEIGHT_4P_MMX [parm1q+parm2q ], [parm3q+parm4q ]
277 BIWEIGHT_4P_MMX [parm1q+parm2q*2], [parm3q+parm4q*2]
280 BIWEIGHT_4P_MMX [parm1q+parm2q*2], [parm3q+parm4q*2]
285 ;=============================================================================
287 ;=============================================================================
290 ;-----------------------------------------------------------------------------
291 ; void x264_mc_copy_w4_mmxext( uint8_t *src, int i_src_stride,
292 ; uint8_t *dst, int i_dst_stride, int i_height )
293 ;-----------------------------------------------------------------------------
294 x264_mc_copy_w4_mmxext:
295 mov eax, parm5d ; i_height
300 mov r11d, [parm1q+parm2q]
302 mov [parm3q+parm4q], r11d
303 lea parm1q, [parm1q+parm2q*2]
304 lea parm3q, [parm3q+parm4q*2]
312 ;-----------------------------------------------------------------------------
313 ; void x264_mc_copy_w8_mmxext( uint8_t *src, int i_src_stride,
314 ; uint8_t *dst, int i_dst_stride, int i_height )
315 ;-----------------------------------------------------------------------------
316 x264_mc_copy_w8_mmxext:
317 mov eax, parm5d ; i_height
319 lea r10, [parm2q+parm2q*2] ; 3 * i_src_stride
320 lea r11, [parm4q+parm4q*2] ; 3 * i_dst_stride
325 movq mm1, [parm1q+parm2q]
326 movq mm2, [parm1q+parm2q*2]
327 movq mm3, [parm1q+r10]
329 movq [parm3q+parm4q], mm1
330 movq [parm3q+parm4q*2], mm2
331 movq [parm3q+r11], mm3
332 lea parm1q, [parm1q+parm2q*4]
333 lea parm3q, [parm3q+parm4q*4]
341 ;-----------------------------------------------------------------------------
342 ; void x264_mc_copy_w16_mmxext( uint8_t *src, int i_src_stride,
343 ; uint8_t *dst, int i_dst_stride, int i_height )
344 ;-----------------------------------------------------------------------------
345 x264_mc_copy_w16_mmxext:
346 mov eax, parm5d ; i_height
348 lea r10, [parm2q+parm2q*2] ; 3 * i_src_stride
349 lea r11, [parm4q+parm4q*2] ; 3 * i_dst_stride
355 movq mm2, [parm1q+parm2q]
356 movq mm3, [parm1q+parm2q+8]
357 movq mm4, [parm1q+parm2q*2]
358 movq mm5, [parm1q+parm2q*2+8]
359 movq mm6, [parm1q+r10]
360 movq mm7, [parm1q+r10+8]
363 movq [parm3q+parm4q], mm2
364 movq [parm3q+parm4q+8], mm3
365 movq [parm3q+parm4q*2], mm4
366 movq [parm3q+parm4q*2+8], mm5
367 movq [parm3q+r11], mm6
368 movq [parm3q+r11+8], mm7
369 lea parm1q, [parm1q+parm2q*4]
370 lea parm3q, [parm3q+parm4q*4]
378 ;-----------------------------------------------------------------------------
379 ; void x264_mc_copy_w16_sse2( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
380 ;-----------------------------------------------------------------------------
381 x264_mc_copy_w16_sse2:
382 mov eax, parm5d ; i_height
386 movdqu xmm0, [parm1q]
387 movdqu xmm1, [parm1q+parm2q]
388 movdqu [parm3q], xmm0
389 movdqu [parm3q+parm4q], xmm1
392 lea parm1q, [parm1q+parm2q*2]
393 lea parm3q, [parm3q+parm4q*2]
400 ;=============================================================================
402 ;=============================================================================
405 ;-----------------------------------------------------------------------------
406 ; void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride,
407 ; uint8_t *dst, int i_dst_stride,
409 ; int i_width, int i_height )
410 ;-----------------------------------------------------------------------------
412 x264_mc_chroma_mmxext:
418 pshufw mm5, mm0, 0 ; mm5 - dx
419 pshufw mm6, mm1, 0 ; mm6 - dy
421 movq mm4, [pw_8 GLOBAL]
424 psubw mm4, mm5 ; mm4 - 8-dx
425 psubw mm0, mm6 ; mm0 - 8-dy
428 pmullw mm5, mm0 ; mm5 = dx*(8-dy) = cB
429 pmullw mm7, mm6 ; mm7 = dx*dy = cD
430 pmullw mm6, mm4 ; mm6 = (8-dx)*dy = cC
431 pmullw mm4, mm0 ; mm4 = (8-dx)*(8-dy) = cA
440 movd mm1, [rax+parm2q]
442 punpcklbw mm1, mm3 ; 00 px1 | 00 px2 | 00 px3 | 00 px4
444 pmullw mm1, mm6 ; 2nd line * cC
445 pmullw mm0, mm4 ; 1st line * cA
447 paddw mm0, mm1 ; mm0 <- result
450 movd mm1, [rax+parm2q+1]
454 paddw mm0, [pw_32 GLOBAL]
456 pmullw mm2, mm5 ; line * cB
457 pmullw mm1, mm7 ; line * cD
462 packuswb mm0, mm3 ; 00 00 00 00 px1 px2 px3 px4
466 add r10, parm4q ; i_dst_stride
472 jnz .finish ; width != 8 so assume 4
474 mov r10, parm3q ; dst
475 mov rax, parm1q ; src
476 mov r11d, parm8d ; i_height