1 ;******************************************************************************
3 ;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice@bellard.org>
4 ;* Copyright (c) Nick Kurshev <nickols_k@mail.ru>
5 ;* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
6 ;* Copyright (c) 2002 Zdenek Kabelac <kabi@informatics.muni.cz>
7 ;* Copyright (c) 2013 Daniel Kang
9 ;* SIMD-optimized halfpel functions
11 ;* This file is part of FFmpeg.
13 ;* FFmpeg is free software; you can redistribute it and/or
14 ;* modify it under the terms of the GNU Lesser General Public
15 ;* License as published by the Free Software Foundation; either
16 ;* version 2.1 of the License, or (at your option) any later version.
18 ;* FFmpeg is distributed in the hope that it will be useful,
19 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
20 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 ;* Lesser General Public License for more details.
23 ;* You should have received a copy of the GNU Lesser General Public
24 ;* License along with FFmpeg; if not, write to the Free Software
25 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
26 ;******************************************************************************
28 %include "libavutil/x86/x86util.asm"
33 pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 ; 16 byte indices pairing i with i+8; loaded into m4 by the pixels16 variant of SSSE3_PIXELS_XY2 (presumably a byte-shuffle control; the shuffle itself is not visible here)
34 pb_interleave8: db 0, 4, 1, 5, 2, 6, 3, 7 ; 8 byte indices pairing i with i+4; loaded into m4 by the pixels8 variant of SSSE3_PIXELS_XY2
40 ; void ff_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
41 %macro PUT_PIXELS8_X2 0
43 cglobal put_pixels16_x2, 4,5,4 ; pixels16 entry point declared by this same macro: 4 args, 5 GPRs, 4 vector regs (x86inc cglobal operands). NOTE(review): what selects between this and the pixels8 declaration is not visible in this excerpt
45 cglobal put_pixels8_x2, 4,5 ; pixels8 entry point: 4 args (block, pixels, line_size, h), 5 GPRs
90 ; void ff_put_pixels16_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
91 %macro PUT_PIXELS_16 0
92 cglobal put_pixels16_x2, 4,5 ; 4 args (block, pixels, line_size, h), 5 GPRs; no vector register count is passed (x86inc cglobal operands)
132 ; The PUT_PIXELS8_X2 macro can easily be used here (it also declares put_pixels16_x2)
137 ; void ff_put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
138 %macro PUT_NO_RND_PIXELS8_X2 0
139 cglobal put_no_rnd_pixels8_x2, 4,5 ; 4 args (block, pixels, line_size, h), 5 GPRs (x86inc cglobal operands)
173 PUT_NO_RND_PIXELS8_X2 ; first instantiation of the macro
175 PUT_NO_RND_PIXELS8_X2 ; second instantiation. NOTE(review): presumably under a different CPU setup than the first -- the setup lines are not visible here, confirm
178 ; void ff_put_no_rnd_pixels8_x2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
179 %macro PUT_NO_RND_PIXELS8_X2_EXACT 0
180 cglobal put_no_rnd_pixels8_x2_exact, 4,5 ; 4 args (block, pixels, line_size, h), 5 GPRs (x86inc cglobal operands)
220 PUT_NO_RND_PIXELS8_X2_EXACT ; first instantiation of the macro
222 PUT_NO_RND_PIXELS8_X2_EXACT ; second instantiation. NOTE(review): presumably under a different CPU setup than the first -- the setup lines are not visible here, confirm
225 ; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
226 %macro PUT_PIXELS8_Y2 0
228 cglobal put_pixels16_y2, 4,5,3 ; pixels16 entry point declared by this same macro: 4 args, 5 GPRs, 3 vector regs (x86inc cglobal operands). NOTE(review): what selects between this and the pixels8 declaration is not visible in this excerpt
230 cglobal put_pixels8_y2, 4,5 ; pixels8 entry point: 4 args (block, pixels, line_size, h), 5 GPRs
261 ; actually, put_pixels16_y2_sse2
266 ; void ff_put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
267 %macro PUT_NO_RND_PIXELS8_Y2 0
268 cglobal put_no_rnd_pixels8_y2, 4,5 ; 4 args (block, pixels, line_size, h), 5 GPRs (x86inc cglobal operands)
298 PUT_NO_RND_PIXELS8_Y2 ; first instantiation of the macro
300 PUT_NO_RND_PIXELS8_Y2 ; second instantiation. NOTE(review): presumably under a different CPU setup than the first -- the setup lines are not visible here, confirm
303 ; void ff_put_no_rnd_pixels8_y2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
304 %macro PUT_NO_RND_PIXELS8_Y2_EXACT 0
305 cglobal put_no_rnd_pixels8_y2_exact, 4,5 ; 4 args (block, pixels, line_size, h), 5 GPRs (x86inc cglobal operands)
340 PUT_NO_RND_PIXELS8_Y2_EXACT ; first instantiation of the macro
342 PUT_NO_RND_PIXELS8_Y2_EXACT ; second instantiation. NOTE(review): presumably under a different CPU setup than the first -- the setup lines are not visible here, confirm
345 ; void ff_avg_pixels8(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
347 cglobal avg_pixels8, 4,5 ; declared directly rather than through a macro: 4 args (block, pixels, line_size, h), 5 GPRs (x86inc cglobal operands)
375 ; void ff_avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
376 %macro AVG_PIXELS8_X2 0
378 cglobal avg_pixels16_x2, 4,5,4 ; pixels16 entry point declared by this same macro: 4 args, 5 GPRs, 4 vector regs (x86inc cglobal operands). NOTE(review): what selects between this and the pixels8 declaration is not visible in this excerpt
380 cglobal avg_pixels8_x2, 4,5 ; pixels8 entry point: 4 args, 5 GPRs; x86inc names the args r0 = block, r1 = pixels, r2 = line_size, r3 = h
383 %if notcpuflag(mmxext)
396 PAVGB m0, [r1+1], m3, m5 ; PAVGB (x86util macro): average m0 with the source row read one byte to the right. m3 and m5 are the extra operands of the 4-operand form -- presumably scratch and a constant for CPUs without a native byte average; see x86util.asm
397 PAVGB m2, [r1+r2+1], m4, m5 ; same for the following source row (one line_size further), using m4 as the extra register
399 PAVGB m0, [r0], m3, m5 ; then average with the current contents of the destination row
400 PAVGB m2, [r0+r2], m4, m5 ; ...and with the following destination row
412 PAVGB m0, [r1+1], m3, m5 ; same four-step sequence repeated (any pointer updates between the two groups are not visible in this excerpt)
413 PAVGB m2, [r1+r2+1], m4, m5 ; following source row, shifted one byte right
417 PAVGB m0, [r0], m3, m5 ; average with the destination row
418 PAVGB m2, [r0+r2], m4, m5 ; average with the following destination row
433 ; actually avg_pixels16_x2
438 ; void ff_avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
439 %macro AVG_PIXELS8_Y2 0
441 cglobal avg_pixels16_y2, 4,5,3 ; pixels16 entry point declared by this same macro: 4 args, 5 GPRs, 3 vector regs (x86inc cglobal operands). NOTE(review): what selects between this and the pixels8 declaration is not visible in this excerpt
443 cglobal avg_pixels8_y2, 4,5 ; pixels8 entry point: 4 args (block, pixels, line_size, h), 5 GPRs
478 ; actually avg_pixels16_y2
483 ; void ff_avg_approx_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
484 ; Note: this is not correctly rounded, and is therefore used for
485 ; non-bitexact output
486 %macro AVG_APPROX_PIXELS8_XY2 0
487 cglobal avg_approx_pixels8_xy2, 4,5 ; 4 args (block, pixels, line_size, h), 5 GPRs (x86inc cglobal operands)
524 AVG_APPROX_PIXELS8_XY2 ; first instantiation of the macro
526 AVG_APPROX_PIXELS8_XY2 ; second instantiation. NOTE(review): presumably under a different CPU setup than the first -- the setup lines are not visible here, confirm
529 ; void ff_<op>_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h), where <op> is the macro argument %1; the same macro also declares <op>_pixels8_xy2
530 %macro SET_PIXELS_XY2 1
532 cglobal %1_pixels16_xy2, 4,5,8 ; pixels16 entry point: 4 args, 5 GPRs, 8 vector regs (x86inc cglobal operands). NOTE(review): what selects between this and the pixels8 declaration is not visible in this excerpt
534 cglobal %1_pixels8_xy2, 4,5 ; pixels8 entry point: 4 args (block, pixels, line_size, h), 5 GPRs
615 %macro SSSE3_PIXELS_XY2 1-2
617 cglobal %1_pixels16_xy2, 4,5,%2 ; pixels16 entry point: %1 is the name prefix (put / avg in the instantiations), %2 is the optional second macro argument, forwarded as the vector register count
618 mova m4, [pb_interleave16] ; m4 = the 16-entry byte index table
620 cglobal %1_pixels8_xy2, 4,5 ; pixels8 entry point: 4 args, 5 GPRs; %2 is not used here
621 mova m4, [pb_interleave8] ; m4 = the 8-entry byte index table
637 pmulhrsw m0, [pw_8192] ; pmulhrsw by 2^13 gives (x*2^13 + 2^14) >> 15 = (x + 2) >> 2 per word, i.e. a rounded divide by 4 (assumes pw_8192 holds words of 8192; it is defined outside this excerpt)
638 pmulhrsw m1, [pw_8192] ; same scaling applied to m1
657 pmulhrsw m2, [pw_8192] ; same scaling applied to m2
658 pmulhrsw m3, [pw_8192] ; same scaling applied to m3
679 SSSE3_PIXELS_XY2 put, 6 ; put variant: 6 vector regs for the pixels16 entry point
680 SSSE3_PIXELS_XY2 avg, 7 ; avg variant: 7 vector regs for the pixels16 entry point