1 ;******************************************************************************
3 ;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice@bellard.org>
4 ;* Copyright (c) Nick Kurshev <nickols_k@mail.ru>
5 ;* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
6 ;* Copyright (c) 2002 Zdenek Kabelac <kabi@informatics.muni.cz>
7 ;* Copyright (c) 2013 Daniel Kang
9 ;* SIMD-optimized halfpel functions
11 ;* This file is part of FFmpeg.
13 ;* FFmpeg is free software; you can redistribute it and/or
14 ;* modify it under the terms of the GNU Lesser General Public
15 ;* License as published by the Free Software Foundation; either
16 ;* version 2.1 of the License, or (at your option) any later version.
18 ;* FFmpeg is distributed in the hope that it will be useful,
19 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
20 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 ;* Lesser General Public License for more details.
23 ;* You should have received a copy of the GNU Lesser General Public
24 ;* License along with FFmpeg; if not, write to the Free Software
25 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
26 ;******************************************************************************
28 %include "libavutil/x86/x86util.asm"
33 pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 ; byte indices i, i+8 alternating for i = 0..7; loaded into m4 by %1_pixels16_xy2 in SSSE3_PIXELS_XY2 (presumably a byte-shuffle mask -- the shuffle itself is not visible here, confirm)
34 pb_interleave8: db 0, 4, 1, 5, 2, 6, 3, 7 ; byte indices i, i+4 alternating for i = 0..3; loaded into m4 by %1_pixels8_xy2 in SSSE3_PIXELS_XY2 (presumably a byte-shuffle mask -- confirm)
40 ; void ff_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
41 %macro PUT_PIXELS8_X2 0
43 cglobal put_pixels16_x2, 4,5,4
45 cglobal put_pixels8_x2, 4,5
90 ; void ff_put_pixels16_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
91 %macro PUT_PIXELS_16 0
92 cglobal put_pixels16_x2, 4,5
132 ; The 8_X2 macro can easily be used here
137 ; void ff_put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
138 %macro PUT_NO_RND_PIXELS8_X2 0
139 cglobal put_no_rnd_pixels8_x2, 4,5
173 PUT_NO_RND_PIXELS8_X2
175 PUT_NO_RND_PIXELS8_X2
178 ; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
179 %macro PUT_PIXELS8_Y2 0
181 cglobal put_pixels16_y2, 4,5,3
183 cglobal put_pixels8_y2, 4,5
214 ; actually, put_pixels16_y2_sse2
219 ; void ff_put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
220 %macro PUT_NO_RND_PIXELS8_Y2 0
221 cglobal put_no_rnd_pixels8_y2, 4,5
251 PUT_NO_RND_PIXELS8_Y2
253 PUT_NO_RND_PIXELS8_Y2
256 ; void ff_avg_pixels8(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
258 cglobal avg_pixels8, 4,5
286 ; void ff_avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
287 %macro AVG_PIXELS8_X2 0
289 cglobal avg_pixels16_x2, 4,5,4
291 cglobal avg_pixels8_x2, 4,5
294 %if notcpuflag(mmxext)
307 PAVGB m0, [r1+1], m3, m5
308 PAVGB m2, [r1+r2+1], m4, m5
310 PAVGB m0, [r0], m3, m5
311 PAVGB m2, [r0+r2], m4, m5
323 PAVGB m0, [r1+1], m3, m5
324 PAVGB m2, [r1+r2+1], m4, m5
328 PAVGB m0, [r0], m3, m5
329 PAVGB m2, [r0+r2], m4, m5
344 ; actually avg_pixels16_x2
349 ; void ff_avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
350 %macro AVG_PIXELS8_Y2 0
352 cglobal avg_pixels16_y2, 4,5,3
354 cglobal avg_pixels8_y2, 4,5
389 ; actually avg_pixels16_y2
394 ; void ff_avg_approx_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
395 ; Note: the result is not correctly rounded, so this function is used
396 ; for non-bitexact output.
397 %macro AVG_APPROX_PIXELS8_XY2 0
398 cglobal avg_approx_pixels8_xy2, 4,5
435 AVG_APPROX_PIXELS8_XY2
437 AVG_APPROX_PIXELS8_XY2
440 ; void ff_%1_pixels{8,16}_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h), where %1 is the macro argument
441 %macro SET_PIXELS_XY2 1
443 cglobal %1_pixels16_xy2, 4,5,8
445 cglobal %1_pixels8_xy2, 4,5
526 %macro SSSE3_PIXELS_XY2 1-2
528 cglobal %1_pixels16_xy2, 4,5,%2
529 mova m4, [pb_interleave16]
531 cglobal %1_pixels8_xy2, 4,5
532 mova m4, [pb_interleave8]
548 pmulhrsw m0, [pw_8192]
549 pmulhrsw m1, [pw_8192]
568 pmulhrsw m2, [pw_8192]
569 pmulhrsw m3, [pw_8192]
590 SSSE3_PIXELS_XY2 put, 6 ; %1 = put; %2 = 6 is forwarded as the last cglobal operand of put_pixels16_xy2 (presumably the XMM register count per x86inc convention -- confirm)
591 SSSE3_PIXELS_XY2 avg, 7 ; %1 = avg; %2 = 7 is forwarded as the last cglobal operand of avg_pixels16_xy2 (presumably the XMM register count per x86inc convention -- confirm)