1 ;******************************************************************************
3 ;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice@bellard.org>
4 ;* Copyright (c) Nick Kurshev <nickols_k@mail.ru>
5 ;* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
6 ;* Copyright (c) 2002 Zdenek Kabelac <kabi@informatics.muni.cz>
7 ;* Copyright (c) 2013 Daniel Kang
9 ;* SIMD-optimized halfpel functions
11 ;* This file is part of FFmpeg.
13 ;* FFmpeg is free software; you can redistribute it and/or
14 ;* modify it under the terms of the GNU Lesser General Public
15 ;* License as published by the Free Software Foundation; either
16 ;* version 2.1 of the License, or (at your option) any later version.
18 ;* FFmpeg is distributed in the hope that it will be useful,
19 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
20 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 ;* Lesser General Public License for more details.
23 ;* You should have received a copy of the GNU Lesser General Public
24 ;* License along with FFmpeg; if not, write to the Free Software
25 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
26 ;******************************************************************************
28 %include "libavutil/x86/x86util.asm"
; pw_8192: eight 16-bit words, each holding 1<<13 (8192). It is the memory
; operand of the pmulhrsw instructions in SSSE3_PIXELS_XY2; pmulhrsw by 1<<13
; evaluates to (x + 2) >> 2 for each word.
33 pw_8192: times 8 dw (1<<13)
; pb_interleave16: 16 byte indices alternating between the low half (0..7) and
; the high half (8..15). Loaded into m4 by the 16-pixel SSSE3_PIXELS_XY2 variant.
; NOTE(review): presumably a pshufb control mask -- the shuffle itself is not
; visible in this excerpt; confirm against the macro body.
34 pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
; pb_interleave8: 8-byte counterpart of the table above (indices 0..3 alternating
; with 4..7). Loaded into m4 by the 8-pixel SSSE3_PIXELS_XY2 variant.
35 pb_interleave8: db 0, 4, 1, 5, 2, 6, 3, 7
39 ; void ff_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
40 %macro PUT_PIXELS8_X2 0
42 cglobal put_pixels16_x2, 4,5,4
44 cglobal put_pixels8_x2, 4,5
89 ; void ff_put_pixels16_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
90 %macro PUT_PIXELS_16 0
91 cglobal put_pixels16_x2, 4,5
131 ; The PUT_PIXELS8_X2 macro can easily be reused here
136 ; void ff_put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
137 %macro PUT_NO_RND_PIXELS8_X2 0
138 cglobal put_no_rnd_pixels8_x2, 4,5
172 PUT_NO_RND_PIXELS8_X2
174 PUT_NO_RND_PIXELS8_X2
177 ; void ff_put_no_rnd_pixels8_x2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
178 %macro PUT_NO_RND_PIXELS8_X2_EXACT 0
179 cglobal put_no_rnd_pixels8_x2_exact, 4,5
219 PUT_NO_RND_PIXELS8_X2_EXACT
221 PUT_NO_RND_PIXELS8_X2_EXACT
224 ; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
225 %macro PUT_PIXELS8_Y2 0
227 cglobal put_pixels16_y2, 4,5,3
229 cglobal put_pixels8_y2, 4,5
260 ; actually, put_pixels16_y2_sse2
265 ; void ff_put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
266 %macro PUT_NO_RND_PIXELS8_Y2 0
267 cglobal put_no_rnd_pixels8_y2, 4,5
297 PUT_NO_RND_PIXELS8_Y2
299 PUT_NO_RND_PIXELS8_Y2
302 ; void ff_put_no_rnd_pixels8_y2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
303 %macro PUT_NO_RND_PIXELS8_Y2_EXACT 0
304 cglobal put_no_rnd_pixels8_y2_exact, 4,5
339 PUT_NO_RND_PIXELS8_Y2_EXACT
341 PUT_NO_RND_PIXELS8_Y2_EXACT
344 ; void ff_avg_pixels8(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
346 cglobal avg_pixels8, 4,5
374 ; void ff_avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
375 %macro AVG_PIXELS8_X2 0
377 cglobal avg_pixels16_x2, 4,5,4
379 cglobal avg_pixels8_x2, 4,5
382 %if notcpuflag(mmxext)
395 PAVGB m0, [r1+1], m3, m5
396 PAVGB m2, [r1+r2+1], m4, m5
398 PAVGB m0, [r0], m3, m5
399 PAVGB m2, [r0+r2], m4, m5
411 PAVGB m0, [r1+1], m3, m5
412 PAVGB m2, [r1+r2+1], m4, m5
416 PAVGB m0, [r0], m3, m5
417 PAVGB m2, [r0+r2], m4, m5
432 ; actually avg_pixels16_x2
437 ; void ff_avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
438 %macro AVG_PIXELS8_Y2 0
440 cglobal avg_pixels16_y2, 4,5,3
442 cglobal avg_pixels8_y2, 4,5
477 ; actually avg_pixels16_y2
482 ; void ff_avg_approx_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
483 ; Note: the result is not correctly rounded, so this version is used
484 ; when bit-exact output is not required.
485 %macro AVG_APPROX_PIXELS8_XY2 0
486 cglobal avg_approx_pixels8_xy2, 4,5
523 AVG_APPROX_PIXELS8_XY2
525 AVG_APPROX_PIXELS8_XY2
528 ; void ff_<op>_pixels{8,16}_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h), where <op> is the macro argument %1
529 %macro SET_PIXELS_XY2 1
531 cglobal %1_pixels16_xy2, 4,5,8
533 cglobal %1_pixels8_xy2, 4,5
614 %macro SSSE3_PIXELS_XY2 1-2
616 cglobal %1_pixels16_xy2, 4,5,%2
617 mova m4, [pb_interleave16]
619 cglobal %1_pixels8_xy2, 4,5
620 mova m4, [pb_interleave8]
636 pmulhrsw m0, [pw_8192]
637 pmulhrsw m1, [pw_8192]
656 pmulhrsw m2, [pw_8192]
657 pmulhrsw m3, [pw_8192]
678 SSSE3_PIXELS_XY2 put, 6
679 SSSE3_PIXELS_XY2 avg, 7