1 ;*****************************************************************************
2 ;* MMX/SSE2/AVX-optimized 10-bit H.264 weighted prediction code
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2011 x264 project
6 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
8 ;* This file is part of Libav.
10 ;* Libav is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* Libav is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with Libav; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
26 %include "x86util.asm"
30 pw_pixel_max: times 8 dw ((1 << 10)-1)
38 ;-----------------------------------------------------------------------------
39 ; void h264_weight(uint8_t *dst, int stride, int log2_denom,
40 ; int weight, int offset);
41 ;-----------------------------------------------------------------------------
48 %macro WEIGHT_PROLOGUE 1
61 pslld m0, m2 ; 1<<log2_denom
63 shl r4, 19 ; *8, move to upper half of dword
64 lea r4, [r4+r3*2+0x10000]
65 movd m3, r4d ; weight<<1 | 1+(offset<<(3))
67 mova m4, [pw_pixel_max]
68 paddw m2, [sq_1] ; log2_denom+1
98 %macro WEIGHT_FUNC_DBL 1
99 cglobal h264_weight_16x16_10_%1
112 cglobal h264_weight_16x8_10_%1
114 jmp mangle(ff_h264_weight_16x16_10_%1.prologue)
122 %macro WEIGHT_FUNC_MM 1
123 cglobal h264_weight_8x16_10_%1
134 cglobal h264_weight_8x8_10_%1
136 jmp mangle(ff_h264_weight_8x16_10_%1.prologue)
138 cglobal h264_weight_8x4_10_%1
140 jmp mangle(ff_h264_weight_8x16_10_%1.prologue)
148 %macro WEIGHT_FUNC_HALF_MM 1
149 cglobal h264_weight_4x8_10_%1
162 cglobal h264_weight_4x4_10_%1
164 jmp mangle(ff_h264_weight_4x8_10_%1.prologue)
166 cglobal h264_weight_4x2_10_%1
168 jmp mangle(ff_h264_weight_4x8_10_%1.prologue)
172 WEIGHT_FUNC_HALF_MM sse2
173 WEIGHT_FUNC_HALF_MM sse4
176 ;-----------------------------------------------------------------------------
177 ; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int log2_denom,
178 ; int weightd, int weights, int offset);
179 ;-----------------------------------------------------------------------------
186 %macro BIWEIGHT_PROLOGUE 1
198 %macro BIWEIGHT_SETUP 1
199 lea r6, [r6*4+1] ; (offset<<2)+1
203 movd m4, r4d ; weightd | weights
204 movd m5, r6d ; ((offset<<2)+1)|1
205 movd m6, r3m ; log2_denom
206 pslld m5, m6 ; (((offset<<2)+1)|1)<<log2_denom
210 mova m3, [pw_pixel_max]
245 %macro BIWEIGHT_FUNC_DBL 1
246 cglobal h264_biweight_16x16_10_%1
260 cglobal h264_biweight_16x8_10_%1
262 jmp mangle(ff_h264_biweight_16x16_10_%1.prologue)
266 BIWEIGHT_FUNC_DBL sse2
267 BIWEIGHT_FUNC_DBL sse4
269 %macro BIWEIGHT_FUNC 1
270 cglobal h264_biweight_8x16_10_%1
282 cglobal h264_biweight_8x8_10_%1
284 jmp mangle(ff_h264_biweight_8x16_10_%1.prologue)
286 cglobal h264_biweight_8x4_10_%1
288 jmp mangle(ff_h264_biweight_8x16_10_%1.prologue)
295 %macro BIWEIGHT_FUNC_HALF 1
296 cglobal h264_biweight_4x8_10_%1
310 cglobal h264_biweight_4x4_10_%1
312 jmp mangle(ff_h264_biweight_4x8_10_%1.prologue)
314 cglobal h264_biweight_4x2_10_%1
316 jmp mangle(ff_h264_biweight_4x8_10_%1.prologue)
320 BIWEIGHT_FUNC_HALF sse2
321 BIWEIGHT_FUNC_HALF sse4