1 ;******************************************************************************
2 ;* SIMD optimized SAO functions for HEVC 10/12bit decoding
4 ;* Copyright (c) 2013 Pierre-Edouard LEPERE
5 ;* Copyright (c) 2014 James Almer
7 ;* This file is part of FFmpeg.
9 ;* FFmpeg is free software; you can redistribute it and/or
10 ;* modify it under the terms of the GNU Lesser General Public
11 ;* License as published by the Free Software Foundation; either
12 ;* version 2.1 of the License, or (at your option) any later version.
14 ;* FFmpeg is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 ;* Lesser General Public License for more details.
19 ;* You should have received a copy of the GNU Lesser General Public
20 ;* License along with FFmpeg; if not, write to the Free Software
21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 ;******************************************************************************
24 %include "libavutil/x86/x86util.asm"
; Word constants with the low 10 bits (0x03FF) or low 12 bits (0x0FFF) set,
; 16 copies each. The filter macros address them as [pw_mask %+ %1], where %1
; is the bit depth, and use them as the last operand of CLIPW.
29 pw_mask10: times 16 dw 0x03FF
30 pw_mask12: times 16 dw 0x0FFF
; Sixteen signed bytes.
; NOTE(review): HEVC_SAO_EDGE_FILTER_INIT reads four consecutive bytes at
; [tmp2q + eo*4 + 0..3]; this is presumably the table it indexes (the load of
; the table address into tmp2q is not visible in this excerpt) - confirm.
31 pb_eo: db -1, 0, 1, 0, 0, -1, 0, 1, -1, -1, 1, 1, 1, -1, -1, 1
38 ;******************************************************************************
40 ;******************************************************************************
; HEVC_SAO_BAND_FILTER_INIT depth
; Register and stack set-up run once at the top of every band filter function.
;   %1 - sample bit depth (instantiated with 10 and 12); in the lines shown it
;        is only used to pick pw_mask10 or pw_mask12.
; NOTE(review): the conditional-assembly directives that separate the
; alternative paths below are not visible in this excerpt, so the grouping
; described in these comments is inferred from register usage - confirm.
42 %macro HEVC_SAO_BAND_FILTER_INIT 1
; SPLATW (from x86util.asm) is applied to the 16-bit entries at byte offsets
; 2, 4, 6 and 8 of the offset table, one entry per register m4..m7.
60 SPLATW m4, [offsetq + 2]
61 SPLATW m5, [offsetq + 4]
62 SPLATW m6, [offsetq + 6]
63 SPLATW m7, [offsetq + 8]
; Loads the 8 bytes starting at offset + 2 into m7 in one go.
; NOTE(review): this also targets m7, so it is presumably an alternative to
; the four SPLATW lines above rather than a continuation of them.
65 movq m7, [offsetq + 2]
; Keep the depth mask in m13; the filter body passes m13 to CLIPW.
73 mova m13, [pw_mask %+ %1]
; Spill m0..m6 to seven consecutive mmsize-wide stack slots. The filter body
; later uses slots 0-3 as pcmpeqw operands and slots 4-6 as pand operands.
; cglobal requests exactly 7*mmsize bytes of stack when ARCH_X86_32 is set.
77 mova [rsp+mmsize*0], m0
78 mova [rsp+mmsize*1], m1
79 mova [rsp+mmsize*2], m2
80 mova [rsp+mmsize*3], m3
81 mova [rsp+mmsize*4], m4
82 mova [rsp+mmsize*5], m5
83 mova [rsp+mmsize*6], m6
; m1 has been saved to slot 1 above, so it can now be reused for the mask.
84 mova m1, [pw_mask %+ %1]
; Re-declare the argument names: the sixth name changes from "left" (as given
; to cglobal) to "height".
91 DEFINE_ARGS dst, src, dststride, srcstride, offset, height
; HEVC_SAO_BAND_FILTER depth, width, count
;   %1 - sample bit depth; selects the pw_mask constant and the shift (%1-5)
;   %2 - block width; only appears in the generated function name here
;   %3 - NOTE(review): not referenced in any line shown in this excerpt; at
;        the instantiation sites it scales with the width - confirm its use.
; Generated symbol: hevc_sao_band_filter_<width>_<depth>.
95 ;void ff_hevc_sao_band_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
96 ; int16_t *sao_offset_val, int sao_left_class, int width, int height);
97 %macro HEVC_SAO_BAND_FILTER 3
; cglobal (x86inc): 6 arguments, 6 general-purpose registers, 15 vector
; registers. The stack request is 7*mmsize multiplied by ARCH_X86_32, i.e. it
; is zero unless ARCH_X86_32 is non-zero.
98 cglobal hevc_sao_band_filter_%2_%1, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left
99 HEVC_SAO_BAND_FILTER_INIT %1
; i, k and l are assembly-time symbols assigned in lines not shown here;
; "m %+ k" pastes k onto "m" to form a vector register name.
; Aligned load of one vector of source samples.
109 mova m %+ k, [srcq + i]
; Arithmetic right shift of each word by (depth - 5): a depth-bit sample is
; reduced to its five most significant bits.
110 psraw m %+ l, m %+ k, %1-5
; Register form: compare the shifted samples with m0..m2; the per-word
; equality masks land in m10..m12.
112 pcmpeqw m10, m %+ l, m0
113 pcmpeqw m11, m %+ l, m1
114 pcmpeqw m12, m %+ l, m2
; Stack form: compare with the values INIT spilled to slots 0-3 (m0..m3) ...
125 pcmpeqw m4, m %+ l, [rsp+mmsize*0]
126 pcmpeqw m5, m %+ l, [rsp+mmsize*1]
127 pcmpeqw m6, m %+ l, [rsp+mmsize*2]
128 pcmpeqw m %+ l, [rsp+mmsize*3]
; ... then AND each equality mask with the value spilled to slots 4-6
; (m4..m6), leaving that value only in the lanes that matched.
129 pand m4, [rsp+mmsize*4]
130 pand m5, [rsp+mmsize*5]
131 pand m6, [rsp+mmsize*6]
; Clip the result with m14 and m13. m13 holds pw_mask<depth> (loaded in INIT);
; m14 is set up in lines not shown in this excerpt.
138 CLIPW m %+ k, m14, m13
; Aligned store of the filtered vector to the destination row.
139 mova [dstq + i], m %+ k
; HEVC_SAO_BAND_FILTER_FUNCS
; Expands HEVC_SAO_BAND_FILTER for both bit depths (10 and 12) and for the
; widths 8, 16, 32, 48 and 64. The third argument equals width / 8 in every
; line.
; NOTE(review): presumably the number of 8-word vectors per row when 128-bit
; registers are in use - confirm against the loop in HEVC_SAO_BAND_FILTER.
151 %macro HEVC_SAO_BAND_FILTER_FUNCS 0
152 HEVC_SAO_BAND_FILTER 10, 8, 1
153 HEVC_SAO_BAND_FILTER 10, 16, 2
154 HEVC_SAO_BAND_FILTER 10, 32, 4
155 HEVC_SAO_BAND_FILTER 10, 48, 6
156 HEVC_SAO_BAND_FILTER 10, 64, 8
158 HEVC_SAO_BAND_FILTER 12, 8, 1
159 HEVC_SAO_BAND_FILTER 12, 16, 2
160 HEVC_SAO_BAND_FILTER 12, 32, 4
161 HEVC_SAO_BAND_FILTER 12, 48, 6
162 HEVC_SAO_BAND_FILTER 12, 64, 8
; The full function set is expanded twice.
; NOTE(review): the directives that select the instruction set before each
; expansion are not visible in this excerpt - confirm which targets these are.
166 HEVC_SAO_BAND_FILTER_FUNCS
168 HEVC_SAO_BAND_FILTER_FUNCS
; Variants assembled only when HAVE_AVX2_EXTERNAL is non-zero. For widths
; 16..64 the third argument is width / 16; the 8-wide functions pass 1.
; NOTE(review): the register-width selection around the 8-wide and the wider
; instantiations, and the closing %endif, are not visible in this excerpt.
170 %if HAVE_AVX2_EXTERNAL
172 HEVC_SAO_BAND_FILTER 10, 8, 1
174 HEVC_SAO_BAND_FILTER 10, 16, 1
175 HEVC_SAO_BAND_FILTER 10, 32, 2
176 HEVC_SAO_BAND_FILTER 10, 48, 3
177 HEVC_SAO_BAND_FILTER 10, 64, 4
180 HEVC_SAO_BAND_FILTER 12, 8, 1
182 HEVC_SAO_BAND_FILTER 12, 16, 1
183 HEVC_SAO_BAND_FILTER 12, 32, 2
184 HEVC_SAO_BAND_FILTER 12, 48, 3
185 HEVC_SAO_BAND_FILTER 12, 64, 4
188 ;******************************************************************************
190 ;******************************************************************************
; Layout constants for the source buffer read by the edge filter.
; EDGE_SRCSTRIDE is the amount added to srcq to step to the next source row,
; and EDGE_SRCSTRIDE >> 1 is used to scale the vertical neighbour steps.
192 %define MAX_PB_SIZE 64
193 %define PADDING_SIZE 32 ; AV_INPUT_BUFFER_PADDING_SIZE
; The expansion is parenthesised so that it stays a single value wherever it
; is used. Without the parentheses "EDGE_SRCSTRIDE >> 1" is only correct
; because NASM gives shifts a lower precedence than "+"; an expression such as
; "EDGE_SRCSTRIDE * 2" would bind to PADDING_SIZE alone.
194 %define EDGE_SRCSTRIDE (2 * MAX_PB_SIZE + PADDING_SIZE)
; HEVC_SAO_EDGE_FILTER_INIT
; Turns the edge-offset class "eo" into the two neighbour offsets a_stride and
; b_stride. Expects tmp2q to hold the address of a byte table with four
; entries per class (the load of tmp2q is not visible in this excerpt).
; NOTE(review): conditional directives and the instructions that consume tmpq
; are elided from this view - confirm against the full file.
205 %macro HEVC_SAO_EDGE_FILTER_INIT 0
; Sign-extend the 32-bit eo argument, read through its "m" operand form.
207 movsxd eoq, dword eom
; Table bytes 1 and 3 of the selected class, sign-extended, become the initial
; values of a_stride and b_stride.
214 movsx a_strideq, byte [tmp2q+eoq*4+1]
215 movsx b_strideq, byte [tmp2q+eoq*4+3]
; Scale both by half the source row stride. The callers double a_stride and
; b_stride afterwards (add x, x).
216 imul a_strideq, EDGE_SRCSTRIDE >> 1
217 imul b_strideq, EDGE_SRCSTRIDE >> 1
; Table bytes 0 and 2 of the selected class are loaded into tmpq.
; NOTE(review): presumably each is then added to a_stride / b_stride
; respectively; those instructions are not visible here.
218 movsx tmpq, byte [tmp2q+eoq*4]
220 movsx tmpq, byte [tmp2q+eoq*4+2]
; HEVC_SAO_EDGE_FILTER depth, width, count
;   %1 - sample bit depth; selects the pw_mask constant used for clipping
;   %2 - block width; only appears in the generated function name here
;   %3 - NOTE(review): not referenced in any line shown in this excerpt.
; Generated symbol: hevc_sao_edge_filter_<width>_<depth>.
; NOTE(review): the macro declares the same symbol with two different cglobal
; lines; these are presumably alternative branches of a conditional whose
; directives are not visible in this excerpt.
224 ;void ff_hevc_sao_edge_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
225 ; int eo, int width, int height);
226 %macro HEVC_SAO_EDGE_FILTER 3
; Variant 1: 4 arguments loaded, 9 general-purpose registers, 16 vector
; registers, no stack request.
228 cglobal hevc_sao_edge_filter_%2_%1, 4, 9, 16, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp
; The height register doubles as the table pointer used by INIT.
229 %define tmp2q heightq
230 HEVC_SAO_EDGE_FILTER_INIT
; Double both neighbour offsets (INIT scaled them by half the row stride).
232 add a_strideq, a_strideq
233 add b_strideq, b_strideq
; Variant 2: 1 argument loaded, 6 general-purpose registers, 8 vector
; registers and 5*mmsize bytes of stack (five slots are written below).
236 cglobal hevc_sao_edge_filter_%2_%1, 1, 6, 8, 5*mmsize, dst, src, dststride, a_stride, b_stride, height
; With fewer registers, dststride serves as the table pointer during INIT and
; the height register serves as the offset pointer.
239 %define tmp2q dststrideq
240 %define offsetq heightq
246 HEVC_SAO_EDGE_FILTER_INIT
; dststrideq was used as tmp2q, so reload the real stride from its "m" form.
249 mov dststrideq, dststridem
250 add a_strideq, a_strideq
251 add b_strideq, b_strideq
; SPLATW (x86util.asm) is applied to the five 16-bit offset-table entries at
; byte offsets 2, 4, 0, 6 and 8, targeting m8, m9, m10, m11 and m12.
256 SPLATW m8, [offsetq+2]
257 SPLATW m9, [offsetq+4]
258 SPLATW m10, [offsetq+0]
259 SPLATW m11, [offsetq+6]
260 SPLATW m12, [offsetq+8]
; Alternative loads: 8 bytes from offset+0 into m10, 4 bytes from offset+6
; into m12.
; NOTE(review): these target registers the SPLATW lines also write, so they
; presumably belong to a different branch; the follow-up shuffles are elided.
262 movq m10, [offsetq+0]
263 movd m12, [offsetq+6]
; Spill m8..m12 to stack slots 0..4; the filter body reads them back as pand
; operands.
277 mova [rsp+mmsize*0], m8
278 mova [rsp+mmsize*1], m9
279 mova [rsp+mmsize*2], m10
280 mova [rsp+mmsize*3], m11
281 mova [rsp+mmsize*4], m12
; Unaligned loads of the two neighbour vectors at src + a_stride + i and
; src + b_stride + i (i is assigned in lines not shown).
290 movu m2, [srcq+a_strideq + i]
291 movu m3, [srcq+b_strideq + i]
; Unsigned word minimum of m1 with each neighbour (m6 / m7 are passed as the
; macro's fourth operand). m1 is loaded in lines not shown - presumably the
; current samples.
292 PMINUW m4, m1, m2, m6
293 PMINUW m5, m1, m3, m7
; Build one equality mask per value of m4: compare against the constants
; pw_m2, pw_m1, pw_1 and pw_2 (defined outside this excerpt).
302 pcmpeqw m2, m4, [pw_m2]
314 pcmpeqw m3, m4, [pw_m1]
316 pcmpeqw m6, m4, [pw_1]
317 pcmpeqw m7, m4, [pw_2]
; AND each mask with the value spilled to the matching stack slot. m5 is
; produced in lines not shown.
318 pand m2, [rsp+mmsize*0]
319 pand m3, [rsp+mmsize*1]
320 pand m5, [rsp+mmsize*2]
321 pand m6, [rsp+mmsize*3]
322 pand m7, [rsp+mmsize*4]
; Clip the result with m0 and the depth mask. m0 is set up in lines not shown.
329 CLIPW m2, m0, [pw_mask %+ %1]
; Step the source pointer to the next row.
335 add srcq, EDGE_SRCSTRIDE
; Edge filter instantiations for both bit depths (10 and 12) and the widths
; 8, 16, 32, 48 and 64. The third argument equals width / 8 in this group.
; NOTE(review): the instruction-set selection preceding this group is not
; visible in this excerpt.
342 HEVC_SAO_EDGE_FILTER 10, 8, 1
343 HEVC_SAO_EDGE_FILTER 10, 16, 2
344 HEVC_SAO_EDGE_FILTER 10, 32, 4
345 HEVC_SAO_EDGE_FILTER 10, 48, 6
346 HEVC_SAO_EDGE_FILTER 10, 64, 8
348 HEVC_SAO_EDGE_FILTER 12, 8, 1
349 HEVC_SAO_EDGE_FILTER 12, 16, 2
350 HEVC_SAO_EDGE_FILTER 12, 32, 4
351 HEVC_SAO_EDGE_FILTER 12, 48, 6
352 HEVC_SAO_EDGE_FILTER 12, 64, 8
; Variants assembled only when HAVE_AVX2_EXTERNAL is non-zero. For widths
; 16..64 the third argument is width / 16; the 8-wide functions pass 1.
; NOTE(review): the register-width selection around these lines and the
; closing %endif are not visible in this excerpt.
354 %if HAVE_AVX2_EXTERNAL
356 HEVC_SAO_EDGE_FILTER 10, 8, 1
358 HEVC_SAO_EDGE_FILTER 10, 16, 1
359 HEVC_SAO_EDGE_FILTER 10, 32, 2
360 HEVC_SAO_EDGE_FILTER 10, 48, 3
361 HEVC_SAO_EDGE_FILTER 10, 64, 4
364 HEVC_SAO_EDGE_FILTER 12, 8, 1
366 HEVC_SAO_EDGE_FILTER 12, 16, 1
367 HEVC_SAO_EDGE_FILTER 12, 32, 2
368 HEVC_SAO_EDGE_FILTER 12, 48, 3
369 HEVC_SAO_EDGE_FILTER 12, 64, 4