1 ;******************************************************************************
2 ;* SIMD optimized SAO functions for HEVC 10/12bit decoding
4 ;* Copyright (c) 2013 Pierre-Edouard LEPERE
5 ;* Copyright (c) 2014 James Almer
7 ;* This file is part of FFmpeg.
9 ;* FFmpeg is free software; you can redistribute it and/or
10 ;* modify it under the terms of the GNU Lesser General Public
11 ;* License as published by the Free Software Foundation; either
12 ;* version 2.1 of the License, or (at your option) any later version.
14 ;* FFmpeg is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 ;* Lesser General Public License for more details.
19 ;* You should have received a copy of the GNU Lesser General Public
20 ;* License along with FFmpeg; if not, write to the Free Software
21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 ;******************************************************************************
24 %include "libavutil/x86/x86util.asm"
; Per-bit-depth word constants, selected in the macros below through
; `pw_mask %+ <depth>`. 0x03FF = 2^10 - 1 and 0x0FFF = 2^12 - 1, each
; replicated into 16 words.
29 pw_mask10: times 16 dw 0x03FF
30 pw_mask12: times 16 dw 0x0FFF
; 16 signed bytes, i.e. four 4-byte entries. HEVC_SAO_EDGE_FILTER_INIT reads
; four consecutive bytes at [tmp2q+eoq*4+0..3]; presumably tmp2q points at this
; table (the instruction loading its address is not visible in this view).
31 pb_eo: db -1, 0, 1, 0, 0, -1, 0, 1, -1, -1, 1, 1, 1, -1, -1, 1
38 ;******************************************************************************
40 ;******************************************************************************
; Set-up code for the band filter. %1 is the bit depth; in the lines visible
; here it only selects the constant pw_mask<%1>.
; NOTE(review): the embedded numbering skips lines, so the conditionals, the
; macro terminator and some instructions are not visible in this view.
42 %macro HEVC_SAO_BAND_FILTER_INIT 1
; m4..m7 <- the four 16-bit values stored at offsetq+2, +4, +6 and +8.
; SPLATW comes from x86util.asm; presumably it replicates one word across the
; whole register -- confirm against that file.
60 SPLATW m4, [offsetq + 2]
61 SPLATW m5, [offsetq + 4]
62 SPLATW m6, [offsetq + 6]
63 SPLATW m7, [offsetq + 8]
; Alternative form: a single 8-byte load covering the same four words
; (offsetq+2 .. offsetq+9). The condition choosing between the two forms is
; not visible here.
65 movq m7, [offsetq + 2]
; m13 <- the pw_mask constant for this bit depth.
73 mova m13, [pw_mask %+ %1]
; Spill m0..m6 to seven consecutive mmsize-wide stack slots. This matches the
; 7*mmsize stack area that HEVC_SAO_BAND_FILTER requests when ARCH_X86_32 is 1,
; and HEVC_SAO_BAND_FILTER_COMPUTE reads the slots back as [rsp+MMSIZE*k].
77 mova [rsp+mmsize*0], m0
78 mova [rsp+mmsize*1], m1
79 mova [rsp+mmsize*2], m2
80 mova [rsp+mmsize*3], m3
81 mova [rsp+mmsize*4], m4
82 mova [rsp+mmsize*5], m5
83 mova [rsp+mmsize*6], m6
; Same pw_mask constant, held in m1 in this variant (m1 was saved to slot 1
; just above, so it is free to be reused).
84 mova m1, [pw_mask %+ %1]
; Rename the arguments: the sixth one, declared as `left` by the cglobal line
; of HEVC_SAO_BAND_FILTER, is called `height` from here on.
92 DEFINE_ARGS dst, src, dststride, srcstride, offset, height
; Per-register band-filter step. Takes three parameters; only %2 is referenced
; in the lines visible here (the embedded numbering shows that lines before and
; after this excerpt are missing from the view).
; NOTE(review): the slots are addressed with MMSIZE (upper case) while the INIT
; macro stores them with mmsize; the assignment of MMSIZE for the band filter
; is not visible here -- confirm it equals mmsize.
96 %macro HEVC_SAO_BAND_FILTER_COMPUTE 3
; Word-wise equality of %2 against stack slots 0..3 (where INIT saved m0..m3).
; pcmpeqw yields an all-ones word where the lanes match and zero elsewhere.
; The first three results go to m4..m6; the fourth overwrites %2 itself.
112 pcmpeqw m4, %2, [rsp+MMSIZE*0]
113 pcmpeqw m5, %2, [rsp+MMSIZE*1]
114 pcmpeqw m6, %2, [rsp+MMSIZE*2]
115 pcmpeqw %2, [rsp+MMSIZE*3]
; Use the compare results as lane masks on slots 4..6 (where INIT saved
; m4..m6, the values loaded from offsetq+2/+4/+6): matching lanes keep the
; stored value, all other lanes become zero.
116 pand m4, [rsp+MMSIZE*4]
117 pand m5, [rsp+MMSIZE*5]
118 pand m6, [rsp+MMSIZE*6]
; Emits one band-filter function named hevc_sao_band_filter_<%2>_<%1>.
;   %1  bit depth: name suffix, and forwarded to the INIT and COMPUTE macros
;   %2  block width: only used in the function name in the lines visible here
;   %3  not referenced in the lines visible here
;       (NOTE(review): presumably a repeat count for the load/compute/store
;       groups -- confirm against the complete macro body)
127 ;void ff_hevc_sao_band_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
128 ; int16_t *sao_offset_val, int sao_left_class, int width, int height);
129 %macro HEVC_SAO_BAND_FILTER 3
; cglobal (x86inc): 6 arguments loaded, 6 general-purpose registers, 15 vector
; registers. The stack request 7*mmsize*ARCH_X86_32 is zero when ARCH_X86_32
; is 0; otherwise it provides the 7 slots that HEVC_SAO_BAND_FILTER_INIT fills.
130 cglobal hevc_sao_band_filter_%2_%1, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left
131 HEVC_SAO_BAND_FILTER_INIT %1
; The COMPUTE invocations below alternate between the register pairs (m9, m8)
; and (m8, m9). In the groups that are fully visible, the third argument is the
; register just loaded from src and later stored to dst; the second argument is
; the other register of the pair.
137 HEVC_SAO_BAND_FILTER_COMPUTE %1, m9, m8
145 HEVC_SAO_BAND_FILTER_COMPUTE %1, m9, m8
; Second register of a group: aligned load from src at byte offset i + mmsize,
; compute, aligned store to dst at the same offset. `i` is an assembly-time
; value whose definition is not visible in this view.
149 mova m9, [srcq + i + mmsize]
150 HEVC_SAO_BAND_FILTER_COMPUTE %1, m8, m9
152 mova [dstq + i + mmsize], m9
; Same pair of steps again; the condition or repetition that separates this
; copy from the one above is not visible here.
159 HEVC_SAO_BAND_FILTER_COMPUTE %1, m9, m8
163 mova m9, [srcq + i + mmsize]
164 HEVC_SAO_BAND_FILTER_COMPUTE %1, m8, m9
166 mova [dstq + i + mmsize], m9
; Instantiation list shared by the two expansions that follow it: ten
; band-filter functions, i.e. widths 8/16/32/48/64 for each of the bit depths
; 10 and 12. Arguments are (bit depth, width, third macro parameter); the third
; parameter is 0, 1, 2, 2, 4 for the five widths.
179 %macro HEVC_SAO_BAND_FILTER_FUNCS 0
; 10-bit variants
180 HEVC_SAO_BAND_FILTER 10, 8, 0
181 HEVC_SAO_BAND_FILTER 10, 16, 1
182 HEVC_SAO_BAND_FILTER 10, 32, 2
183 HEVC_SAO_BAND_FILTER 10, 48, 2
184 HEVC_SAO_BAND_FILTER 10, 64, 4
; 12-bit variants, same widths and third parameter
186 HEVC_SAO_BAND_FILTER 12, 8, 0
187 HEVC_SAO_BAND_FILTER 12, 16, 1
188 HEVC_SAO_BAND_FILTER 12, 32, 2
189 HEVC_SAO_BAND_FILTER 12, 48, 2
190 HEVC_SAO_BAND_FILTER 12, 64, 4
; Two expansions of the shared instantiation list.
; NOTE(review): the directives that select the instruction set before each
; expansion are not visible in this view.
194 HEVC_SAO_BAND_FILTER_FUNCS
196 HEVC_SAO_BAND_FILTER_FUNCS
; Only assembled when HAVE_AVX2_EXTERNAL is non-zero (the matching %endif is
; not visible in this view). Compared with the shared list, widths 32/48/64
; pass 1, 1, 2 as the third parameter instead of 2, 2, 4; widths 8 and 16 are
; unchanged.
198 %if HAVE_AVX2_EXTERNAL
; 10-bit variants
200 HEVC_SAO_BAND_FILTER 10, 8, 0
201 HEVC_SAO_BAND_FILTER 10, 16, 1
203 HEVC_SAO_BAND_FILTER 10, 32, 1
204 HEVC_SAO_BAND_FILTER 10, 48, 1
205 HEVC_SAO_BAND_FILTER 10, 64, 2
; 12-bit variants
208 HEVC_SAO_BAND_FILTER 12, 8, 0
209 HEVC_SAO_BAND_FILTER 12, 16, 1
211 HEVC_SAO_BAND_FILTER 12, 32, 1
212 HEVC_SAO_BAND_FILTER 12, 48, 1
213 HEVC_SAO_BAND_FILTER 12, 64, 2
216 ;******************************************************************************
218 ;******************************************************************************
; Geometry constants for the edge filter's source buffer.
; EDGE_SRCSTRIDE (160 with the values below) is added to srcq by the edge
; filter, and EDGE_SRCSTRIDE >> 1 (80) scales the neighbour offsets in
; HEVC_SAO_EDGE_FILTER_INIT.
220 %define MAX_PB_SIZE 64
221 %define PADDING_SIZE 32 ; AV_INPUT_BUFFER_PADDING_SIZE
; The expansion is parenthesised so that it behaves as a single term in any
; expression. Without the parentheses `EDGE_SRCSTRIDE >> 1` is only correct
; because NASM binds `>>` looser than `+`, and a use such as
; `EDGE_SRCSTRIDE * 2` would expand to 2 * 64 + 32 * 2.
222 %define EDGE_SRCSTRIDE (2 * MAX_PB_SIZE + PADDING_SIZE)
; Derives the two neighbour offsets (a_stride, b_stride) for the edge filter
; from the `eo` argument. Takes no macro parameters.
; NOTE(review): the embedded numbering skips lines; the conditionals, the load
; of the table address into tmp2q and the instructions consuming tmpq are not
; visible in this view.
233 %macro HEVC_SAO_EDGE_FILTER_INIT 0
; Sign-extend the 32-bit eo argument, read through its `eom` alias, into eoq.
235 movsxd eoq, dword eom
; Each eo value selects four signed bytes at tmp2q + eoq*4. Bytes +1 and +3
; are sign-extended and multiplied by EDGE_SRCSTRIDE >> 1 (80 with
; MAX_PB_SIZE = 64 and PADDING_SIZE = 32). Both edge-filter variants double
; a_stride/b_stride after this macro, which brings the factor back to the full
; EDGE_SRCSTRIDE.
242 movsx a_strideq, byte [tmp2q+eoq*4+1]
243 movsx b_strideq, byte [tmp2q+eoq*4+3]
244 imul a_strideq, EDGE_SRCSTRIDE >> 1
245 imul b_strideq, EDGE_SRCSTRIDE >> 1
; Bytes +0 and +2 are sign-extended into tmpq one after the other.
; NOTE(review): presumably each is added to a_stride / b_stride respectively by
; the lines missing from this view -- confirm.
246 movsx tmpq, byte [tmp2q+eoq*4]
248 movsx tmpq, byte [tmp2q+eoq*4+2]
; Per-register edge-filter step; takes no macro parameters and works on fixed
; registers. The call sites load m2 and m3 from src+a_stride and src+b_stride
; (and m1 from src, where that load is visible) before invoking it.
; NOTE(review): the embedded numbering skips lines; the instructions between
; the PMINUW pair and the compares, and everything after the pand group, are
; not visible in this view.
252 %macro HEVC_SAO_EDGE_FILTER_COMPUTE 0
; PMINUW comes from x86util.asm; presumably an unsigned word minimum of the
; 2nd and 3rd operands written to the 1st, with the 4th as scratch -- confirm.
253 PMINUW m4, m1, m2, m6
254 PMINUW m5, m1, m3, m7
; Compare m4 word-wise against the constants pw_m2, pw_m1, pw_1 and pw_2
; (declared outside this view). Each result is all-ones in matching lanes.
263 pcmpeqw m2, m4, [pw_m2]
275 pcmpeqw m3, m4, [pw_m1]
277 pcmpeqw m6, m4, [pw_1]
278 pcmpeqw m7, m4, [pw_2]
; Mask stack slots 0..4 with the five lane masks. HEVC_SAO_EDGE_FILTER fills
; those slots from m8..m12. The compare that produces the m5 mask is not
; visible in this view.
279 pand m2, [rsp+MMSIZE*0]
280 pand m3, [rsp+MMSIZE*1]
281 pand m5, [rsp+MMSIZE*2]
282 pand m6, [rsp+MMSIZE*3]
283 pand m7, [rsp+MMSIZE*4]
; Emits one edge-filter function named hevc_sao_edge_filter_<%2>_<%1>.
;   %1  bit depth: name suffix, and selects pw_mask<%1> as a CLIPW operand
;   %2  block width: only used in the function name in the lines visible here
;   %3  not referenced in the lines visible here
;       (NOTE(review): presumably a repeat count for the load/compute/store
;       groups -- confirm against the complete macro body)
; NOTE(review): the embedded numbering skips lines; the conditionals choosing
; between the two cglobal variants, the loop labels and several loads/stores
; are not visible in this view.
292 ;void ff_hevc_sao_edge_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
293 ; int eo, int width, int height);
294 %macro HEVC_SAO_EDGE_FILTER 3
; Variant A (x86inc cglobal): 4 arguments loaded, 9 general-purpose registers,
; 16 vector registers, no stack area requested.
296 cglobal hevc_sao_edge_filter_%2_%1, 4, 9, 16, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp
; HEVC_SAO_EDGE_FILTER_INIT addresses its table through tmp2q; heightq is
; borrowed for that name here.
297 %define tmp2q heightq
298 HEVC_SAO_EDGE_FILTER_INIT
; Double both neighbour offsets. INIT scaled them by EDGE_SRCSTRIDE >> 1, so
; this restores the full EDGE_SRCSTRIDE factor.
300 add a_strideq, a_strideq
301 add b_strideq, b_strideq
; Variant B: 1 argument loaded, 6 general-purpose registers, 8 vector
; registers and a 5*mmsize stack area (five slots, matching the five spills
; further down).
304 cglobal hevc_sao_edge_filter_%2_%1, 1, 6, 8, 5*mmsize, dst, src, dststride, a_stride, b_stride, height
; MMSIZE captures the current mmsize; HEVC_SAO_EDGE_FILTER_COMPUTE addresses
; the stack slots as [rsp+MMSIZE*k].
305 %assign MMSIZE mmsize
; With only six general-purpose registers, two names are aliased onto existing
; ones: tmp2q shares dststrideq and offsetq shares heightq.
308 %define tmp2q dststrideq
309 %define offsetq heightq
315 HEVC_SAO_EDGE_FILTER_INIT
; dststrideq served as tmp2q inside INIT, so reload the real destination
; stride from its argument slot before doubling the neighbour offsets.
318 mov dststrideq, dststridem
319 add a_strideq, a_strideq
320 add b_strideq, b_strideq
; m8..m12 <- the 16-bit values at offsetq+2, +4, +0, +6 and +8 (SPLATW is
; from x86util.asm; presumably it replicates one word across the register).
325 SPLATW m8, [offsetq+2]
326 SPLATW m9, [offsetq+4]
327 SPLATW m10, [offsetq+0]
328 SPLATW m11, [offsetq+6]
329 SPLATW m12, [offsetq+8]
; Alternative form: one 8-byte load at offsetq+0 and one 4-byte load at
; offsetq+6, which together cover the same five words. The condition choosing
; between the two forms, and the follow-up splats, are not visible here.
331 movq m10, [offsetq+0]
332 movd m12, [offsetq+6]
; Spill m8..m12 to stack slots 0..4, where HEVC_SAO_EDGE_FILTER_COMPUTE reads
; them back.
; NOTE(review): variant B declares only 8 vector registers, so m8..m12 must
; be remapped by defines that are not visible in this view -- confirm.
346 mova [rsp+mmsize*0], m8
347 mova [rsp+mmsize*1], m9
348 mova [rsp+mmsize*2], m10
349 mova [rsp+mmsize*3], m11
350 mova [rsp+mmsize*4], m12
; Filter groups. Each one loads the two neighbours at src+a_stride and
; src+b_stride with the unaligned move (movu), runs COMPUTE, then applies
; CLIPW to m2 with m0 and pw_mask<%1> as the other operands (CLIPW is from
; x86util.asm; presumably it clamps m2 between those two bounds -- the
; contents of m0 are set outside this view).
358 movu m2, [srcq+a_strideq]
359 movu m3, [srcq+b_strideq]
361 HEVC_SAO_EDGE_FILTER_COMPUTE
362 CLIPW m2, m0, [pw_mask %+ %1]
; Same group at byte offset i (`i` is an assembly-time value defined outside
; this view).
369 movu m2, [srcq+a_strideq + i]
370 movu m3, [srcq+b_strideq + i]
371 HEVC_SAO_EDGE_FILTER_COMPUTE
372 CLIPW m2, m0, [pw_mask %+ %1]
; Second register of the group: the centre samples are read with the aligned
; move (mova) at offset i + mmsize and the result is stored, aligned, to dst
; at the same offset.
375 mova m1, [srcq + i + mmsize]
376 movu m2, [srcq+a_strideq + i + mmsize]
377 movu m3, [srcq+b_strideq + i + mmsize]
378 HEVC_SAO_EDGE_FILTER_COMPUTE
379 CLIPW m2, m0, [pw_mask %+ %1]
380 mova [dstq + i + mmsize], m2
; Another copy of the same two-register sequence; the condition or repetition
; separating it from the one above is not visible here.
387 movu m2, [srcq+a_strideq + i]
388 movu m3, [srcq+b_strideq + i]
389 HEVC_SAO_EDGE_FILTER_COMPUTE
390 CLIPW m2, m0, [pw_mask %+ %1]
393 mova m1, [srcq + i + mmsize]
394 movu m2, [srcq+a_strideq + i + mmsize]
395 movu m3, [srcq+b_strideq + i + mmsize]
396 HEVC_SAO_EDGE_FILTER_COMPUTE
397 CLIPW m2, m0, [pw_mask %+ %1]
398 mova [dstq + i + mmsize], m2
; Advance the source pointer by the fixed EDGE_SRCSTRIDE; the source stride is
; a constant here, not an argument.
405 add srcq, EDGE_SRCSTRIDE
; Edge-filter instantiations: widths 8/16/32/48/64 for bit depths 10 and 12.
; Arguments are (bit depth, width, third macro parameter); the third parameter
; is 0, 1, 2, 2, 4 for the five widths.
; NOTE(review): the directive selecting the instruction set for this group is
; not visible in this view.
412 HEVC_SAO_EDGE_FILTER 10, 8, 0
413 HEVC_SAO_EDGE_FILTER 10, 16, 1
414 HEVC_SAO_EDGE_FILTER 10, 32, 2
415 HEVC_SAO_EDGE_FILTER 10, 48, 2
416 HEVC_SAO_EDGE_FILTER 10, 64, 4
; 12-bit variants, same widths and third parameter
418 HEVC_SAO_EDGE_FILTER 12, 8, 0
419 HEVC_SAO_EDGE_FILTER 12, 16, 1
420 HEVC_SAO_EDGE_FILTER 12, 32, 2
421 HEVC_SAO_EDGE_FILTER 12, 48, 2
422 HEVC_SAO_EDGE_FILTER 12, 64, 4
; Only assembled when HAVE_AVX2_EXTERNAL is non-zero (the matching %endif is
; not visible in this view). Only widths 32/48/64 are instantiated here, with
; 1, 1, 2 as the third parameter instead of 2, 2, 4.
424 %if HAVE_AVX2_EXTERNAL
426 HEVC_SAO_EDGE_FILTER 10, 32, 1
427 HEVC_SAO_EDGE_FILTER 10, 48, 1
428 HEVC_SAO_EDGE_FILTER 10, 64, 2
430 HEVC_SAO_EDGE_FILTER 12, 32, 1
431 HEVC_SAO_EDGE_FILTER 12, 48, 1
432 HEVC_SAO_EDGE_FILTER 12, 64, 2