1 ;******************************************************************************
2 ;* SIMD optimized SAO functions for HEVC decoding
4 ;* Copyright (c) 2013 Pierre-Edouard LEPERE
5 ;* Copyright (c) 2014 James Almer
7 ;* This file is part of FFmpeg.
9 ;* FFmpeg is free software; you can redistribute it and/or
10 ;* modify it under the terms of the GNU Lesser General Public
11 ;* License as published by the Free Software Foundation; either
12 ;* version 2.1 of the License, or (at your option) any later version.
14 ;* FFmpeg is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 ;* Lesser General Public License for more details.
19 ;* You should have received a copy of the GNU Lesser General Public
20 ;* License along with FFmpeg; if not, write to the Free Software
21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 ;******************************************************************************
24 %include "libavutil/x86/x86util.asm"
; Read-only constants shared by the band and edge filters in this file.
;
; pw_mask10 / pw_mask12: sixteen words holding the largest 10-bit / 12-bit
; value. They are referenced as "pw_mask %+ %1", i.e. the symbol name is
; built by pasting the bit depth passed to the macro onto "pw_mask".
28 pw_mask10: times 16 dw 0x03FF
29 pw_mask12: times 16 dw 0x0FFF
; pb_edge_shuffle: a 16-byte pattern emitted twice (32 bytes). It is loaded
; into m1 by HEVC_SAO_EDGE_FILTER_8.
; NOTE(review): presumably a byte-shuffle control - the instruction that
; consumes m1 is not visible in this excerpt.
31 pb_edge_shuffle: times 2 db 1, 2, 0, 3, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
; pb_eo: signed bytes, four per edge-offset class.
; HEVC_SAO_EDGE_FILTER_INIT reads a table as [base + eo*4 + 0..3]: bytes +1
; and +3 are multiplied by the source row stride, bytes +0 and +2 are read
; unscaled.
; NOTE(review): assumes the base register of that macro points at this
; table - the instruction loading the base is not visible here.
32 pb_eo: db -1, 0, 1, 0, 0, -1, 0, 1, -1, -1, 1, 1, 1, -1, -1, 1
; Sizes from which EDGE_SRCSTRIDE (the fixed source row stride of the edge
; filters) is derived.
41 %define MAX_PB_SIZE 64
42 %define PADDING_SIZE 32 ; FF_INPUT_BUFFER_PADDING_SIZE
44 ;******************************************************************************
46 ;******************************************************************************
; HEVC_SAO_BAND_FILTER_INIT <depth>
; Set-up shared by every band filter entry point. %1 is the bit depth
; (callers pass 8, 10 or 12) and is pasted onto "pw_mask" to select the mask.
; NOTE(review): several lines of this macro, including the conditionals that
; separate its code paths, are not visible in this excerpt. The grouping
; described below is inferred from the register numbers and the stack use.
48 %macro HEVC_SAO_BAND_FILTER_INIT 1
; Load the 16-bit entries at byte offsets 2, 4, 6 and 8 of the offset array
; (sao_offset_val[1..4]) into m4..m7, one entry per register.
; NOTE(review): SPLATW is defined outside this file; it is expected to
; replicate the word across the whole register.
66 SPLATW m4, [offsetq + 2]
67 SPLATW m5, [offsetq + 4]
68 SPLATW m6, [offsetq + 6]
69 SPLATW m7, [offsetq + 8]
; Alternative load: fetch the same four entries with a single 8-byte load.
; NOTE(review): presumably belongs to a different code path than the four
; SPLATW lines - the selecting conditional is not visible.
71 movq m7, [offsetq + 2]
; Depth mask kept in m13 (a register number only available on x86-64).
80 mova m13, [pw_mask %+ %1]
; Spill m0..m6 to seven stack slots. The cglobal lines reserve exactly
; 7*mmsize bytes, and only when ARCH_X86_32 is non-zero.
; HEVC_SAO_BAND_FILTER_COMPUTE reads these slots back via [rsp+MMSIZE*n].
85 mova [rsp+mmsize*0], m0
86 mova [rsp+mmsize*1], m1
87 mova [rsp+mmsize*2], m2
88 mova [rsp+mmsize*3], m3
89 mova [rsp+mmsize*4], m4
90 mova [rsp+mmsize*5], m5
91 mova [rsp+mmsize*6], m6
; Depth mask kept in m1 on this path (m1 is free again after the spill).
94 mova m1, [pw_mask %+ %1]
; Rename the arguments: the sixth register, called "left" on entry, is
; referred to as "height" from here on.
102 DEFINE_ARGS dst, src, dststride, srcstride, offset, height
; HEVC_SAO_BAND_FILTER_COMPUTE <depth>, <reg>, <reg>
; Takes three parameters; only %2 is referenced in the lines visible here.
; NOTE(review): callers pass the bit depth as %1 and two vector registers as
; %2 / %3 - the use of %1 and %3 lies outside this excerpt.
106 %macro HEVC_SAO_BAND_FILTER_COMPUTE 3
; Compare every word of %2 against stack slots 0..3 (the values
; HEVC_SAO_BAND_FILTER_INIT spilled from m0..m3). The first three lane masks
; land in m4..m6; the fourth comparison overwrites %2 itself.
122 pcmpeqw m4, %2, [rsp+MMSIZE*0]
123 pcmpeqw m5, %2, [rsp+MMSIZE*1]
124 pcmpeqw m6, %2, [rsp+MMSIZE*2]
125 pcmpeqw %2, [rsp+MMSIZE*3]
; Keep the contents of slots 4..6 (spilled from m4..m6, which held the
; offsets) only in the lanes whose comparison matched.
126 pand m4, [rsp+MMSIZE*4]
127 pand m5, [rsp+MMSIZE*5]
128 pand m6, [rsp+MMSIZE*6]
137 ;void ff_hevc_sao_band_filter_<width>_8_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
138 ; int16_t *sao_offset_val, int sao_left_class, int width, int height);
; Macro parameters: %1 = block width, used in the function name.
; NOTE(review): %2 is not referenced in the lines visible here. The
; instantiations pass a small count that grows with the width - presumably
; the number of full-register iterations per row. Confirm against the full
; macro body.
; cglobal operands: 6 arguments loaded, 6 general-purpose registers,
; 15 vector registers, and a stack area of 7*mmsize bytes that collapses to 0
; when ARCH_X86_32 is 0.
139 %macro HEVC_SAO_BAND_FILTER_8 2
140 cglobal hevc_sao_band_filter_%1_8, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left
; Common set-up with the 8-bit depth selector.
141 HEVC_SAO_BAND_FILTER_INIT 8
148 HEVC_SAO_BAND_FILTER_COMPUTE 8, m9, m8
; Interleave the low bytes of m13 with the low bytes of m14 into m8.
; NOTE(review): this widens bytes to words only if m14 is zero - the
; instruction that sets m14 is not visible in this excerpt.
156 punpcklbw m8, m13, m14
157 HEVC_SAO_BAND_FILTER_COMPUTE 8, m9, m8
159 HEVC_SAO_BAND_FILTER_COMPUTE 8, m9, m13
; Same interleave-and-compute sequence as above, repeated for another
; variant of the row loop (the conditional separating them is not visible).
169 punpcklbw m8, m13, m14
170 HEVC_SAO_BAND_FILTER_COMPUTE 8, m9, m8
172 HEVC_SAO_BAND_FILTER_COMPUTE 8, m9, m13
; End of one row: advance both pointers by their strides and repeat until
; the row counter reaches zero.
180 add dstq, dststrideq ; dst += dststride
181 add srcq, srcstrideq ; src += srcstride
182 dec heightd ; height-- (sets ZF when it reaches 0)
183 jnz .loop ; height loop
187 ;void ff_hevc_sao_band_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
188 ; int16_t *sao_offset_val, int sao_left_class, int width, int height);
; Macro parameters: %1 = bit depth (10 or 12 in the instantiations),
; %2 = block width. Both appear in the function name, width first.
; NOTE(review): %3 is not referenced in the lines visible here; it is
; presumably an iteration count per row. Confirm against the full body.
; cglobal operands match the 8-bit variant: 6 arguments, 6 general-purpose
; registers, 15 vector registers, 7*mmsize bytes of stack on x86-32 only.
189 %macro HEVC_SAO_BAND_FILTER_16 3
190 cglobal hevc_sao_band_filter_%2_%1, 6, 6, 15, 7*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left
; Common set-up; the depth selects pw_mask10 or pw_mask12.
191 HEVC_SAO_BAND_FILTER_INIT %1
197 HEVC_SAO_BAND_FILTER_COMPUTE %1, m9, m8
205 HEVC_SAO_BAND_FILTER_COMPUTE %1, m9, m8
; Second register of the same iteration: load the samples one register width
; further along the row, filter them (note the swapped register roles) and
; store the result at the matching destination offset.
; NOTE(review): "i" is an assembly-time offset defined outside the visible
; lines.
209 mova m9, [srcq + i + mmsize]
210 HEVC_SAO_BAND_FILTER_COMPUTE %1, m8, m9
212 mova [dstq + i + mmsize], m9
219 HEVC_SAO_BAND_FILTER_COMPUTE %1, m9, m8
; Same second-register sequence for another variant of the row loop.
223 mova m9, [srcq + i + mmsize]
224 HEVC_SAO_BAND_FILTER_COMPUTE %1, m8, m9
226 mova [dstq + i + mmsize], m9
; HEVC_SAO_BAND_FILTER_FUNCS (no parameters)
; Emits the full set of band filter functions for the instruction set that
; is active at the point of expansion: widths 8, 16, 32, 48 and 64 for 8-bit
; samples, then the same widths for 10-bit and 12-bit samples.
; The trailing argument of each line is the per-width count handed to the
; function macros (0, 1, 2, 2, 4).
239 %macro HEVC_SAO_BAND_FILTER_FUNCS 0
; 8-bit: <width>, <count>
240 HEVC_SAO_BAND_FILTER_8 8, 0
241 HEVC_SAO_BAND_FILTER_8 16, 1
242 HEVC_SAO_BAND_FILTER_8 32, 2
243 HEVC_SAO_BAND_FILTER_8 48, 2
244 HEVC_SAO_BAND_FILTER_8 64, 4
; 10-bit: <depth>, <width>, <count>
246 HEVC_SAO_BAND_FILTER_16 10, 8, 0
247 HEVC_SAO_BAND_FILTER_16 10, 16, 1
248 HEVC_SAO_BAND_FILTER_16 10, 32, 2
249 HEVC_SAO_BAND_FILTER_16 10, 48, 2
250 HEVC_SAO_BAND_FILTER_16 10, 64, 4
; 12-bit: <depth>, <width>, <count>
252 HEVC_SAO_BAND_FILTER_16 12, 8, 0
253 HEVC_SAO_BAND_FILTER_16 12, 16, 1
254 HEVC_SAO_BAND_FILTER_16 12, 32, 2
255 HEVC_SAO_BAND_FILTER_16 12, 48, 2
256 HEVC_SAO_BAND_FILTER_16 12, 64, 4
; Band filter instantiations.
; The complete function set is expanded twice.
; NOTE(review): each expansion presumably follows its own instruction-set
; selection directive (not visible in this excerpt); without that the two
; expansions would define the same symbols.
260 HEVC_SAO_BAND_FILTER_FUNCS
262 HEVC_SAO_BAND_FILTER_FUNCS
; AVX2 versions, built only when the assembler supports AVX2. The counts for
; widths 32, 48 and 64 are half those of the generic set above (1, 1, 2
; instead of 2, 2, 4).
; NOTE(review): this suggests those widths run with registers twice as wide;
; the directive switching the register width is not visible here.
264 %if HAVE_AVX2_EXTERNAL
; 8-bit
266 HEVC_SAO_BAND_FILTER_8 8, 0
267 HEVC_SAO_BAND_FILTER_8 16, 1
269 HEVC_SAO_BAND_FILTER_8 32, 1
270 HEVC_SAO_BAND_FILTER_8 48, 1
271 HEVC_SAO_BAND_FILTER_8 64, 2
; 10-bit
274 HEVC_SAO_BAND_FILTER_16 10, 8, 0
275 HEVC_SAO_BAND_FILTER_16 10, 16, 1
277 HEVC_SAO_BAND_FILTER_16 10, 32, 1
278 HEVC_SAO_BAND_FILTER_16 10, 48, 1
279 HEVC_SAO_BAND_FILTER_16 10, 64, 2
; 12-bit
282 HEVC_SAO_BAND_FILTER_16 12, 8, 0
283 HEVC_SAO_BAND_FILTER_16 12, 16, 1
285 HEVC_SAO_BAND_FILTER_16 12, 32, 1
286 HEVC_SAO_BAND_FILTER_16 12, 48, 1
287 HEVC_SAO_BAND_FILTER_16 12, 64, 2
290 ;******************************************************************************
292 ;******************************************************************************
; Fixed amount, in bytes, by which the edge filters advance srcq per row.
; The expression is parenthesized so that it stays a single term wherever the
; macro is expanded inside a larger expression (e.g. "EDGE_SRCSTRIDE>>%1"),
; instead of relying on the precedence of the surrounding operators.
294 %define EDGE_SRCSTRIDE (2 * MAX_PB_SIZE + PADDING_SIZE)
; HEVC_SAO_EDGE_FILTER_INIT <shift>
; Derives the offsets of the two neighbour samples (a_stride, b_stride) from
; the edge-offset class "eo". %1 is a right shift applied to the row stride:
; the 8-bit functions pass 0, the 16-bit functions pass 1.
; NOTE(review): some lines of this macro are not visible in this excerpt,
; including the one that loads the table base into tmp2q and whatever
; consumes tmpq after each of the last two loads.
296 %macro HEVC_SAO_EDGE_FILTER_INIT 1
; Sign-extend the 32-bit eo argument from its memory slot to a full register.
298 movsxd eoq, dword eom
; Each class owns four signed bytes in the table. Bytes +1 and +3 are the
; row components of neighbours a and b; scale them by the row stride.
305 movsx a_strideq, byte [tmp2q+eoq*4+1]
306 movsx b_strideq, byte [tmp2q+eoq*4+3]
307 imul a_strideq, EDGE_SRCSTRIDE>>%1
308 imul b_strideq, EDGE_SRCSTRIDE>>%1
; Bytes +0 and +2 are read unscaled into tmpq.
; NOTE(review): presumably the column components, added to a_stride and
; b_stride respectively - the additions are not visible here.
309 movsx tmpq, byte [tmp2q+eoq*4]
311 movsx tmpq, byte [tmp2q+eoq*4+2]
; HEVC_SAO_EDGE_FILTER_COMPUTE_8 <width>
; Invoked by HEVC_SAO_EDGE_FILTER_8 right after the two neighbour rows have
; been loaded into m2 (from src + a_stride) and m3 (from src + b_stride);
; the caller forwards its own width parameter as %1.
; NOTE(review): the body of this macro is not visible in this excerpt.
315 %macro HEVC_SAO_EDGE_FILTER_COMPUTE_8 1
344 ;void ff_hevc_sao_edge_filter_<width>_8_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
345 ; int eo, int width, int height);
; Macro parameters (two mandatory, one optional): %1 = block width, used in
; the function name and forwarded to HEVC_SAO_EDGE_FILTER_COMPUTE_8.
; NOTE(review): %2 and %3 are not referenced in the lines visible here. The
; instantiations pass a count as %2 and "a" or "u" as %3 - presumably an
; iteration count and an aligned/unaligned move selector. Confirm against the
; full macro body.
; Two prologues are visible; the conditional choosing between them is not.
346 %macro HEVC_SAO_EDGE_FILTER_8 2-3
; Variant with 9 general-purpose registers: 4 arguments loaded, 8 vector
; registers. heightq doubles as tmp2q, the table base used by the init macro.
348 cglobal hevc_sao_edge_filter_%1_8, 4, 9, 8, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp
349 %define tmp2q heightq
350 HEVC_SAO_EDGE_FILTER_INIT 0
; Variant with 6 general-purpose registers: only 1 argument loaded. There is
; no dedicated offset or tmp2 register, so dststrideq serves as tmp2q and
; heightq as offsetq.
354 cglobal hevc_sao_edge_filter_%1_8, 1, 6, 8, dst, src, dststride, a_stride, b_stride, height
357 %define tmp2q dststrideq
358 %define offsetq heightq
359 HEVC_SAO_EDGE_FILTER_INIT 0
; dststrideq was used as scratch by the init macro (via the tmp2q alias);
; load the real destination stride from its argument slot.
362 mov dststrideq, dststridem
; Copy the 16 bytes at offsetq into both 128-bit halves of m0.
366 vbroadcasti128 m0, [offsetq]
370 mova m1, [pb_edge_shuffle]
; Neighbour rows a and b, 8 bytes each.
384 movq m2, [srcq + a_strideq]
385 movq m3, [srcq + b_strideq]
386 HEVC_SAO_EDGE_FILTER_COMPUTE_8 %1
; Neighbour rows a and b, one full register each, at column offset i
; (an assembly-time value defined outside the visible lines). Unaligned loads
; are used because a_stride / b_stride can move the address off alignment.
393 movu m2, [srcq + a_strideq + i]
394 movu m3, [srcq + b_strideq + i]
395 HEVC_SAO_EDGE_FILTER_COMPUTE_8 %1
; Same load-and-compute sequence for another variant of the row loop.
404 movu m2, [srcq + a_strideq + i]
405 movu m3, [srcq + b_strideq + i]
406 HEVC_SAO_EDGE_FILTER_COMPUTE_8 %1
; The source advances by the fixed EDGE_SRCSTRIDE, not by a stride argument.
414 add srcq, EDGE_SRCSTRIDE
; HEVC_SAO_EDGE_FILTER_COMPUTE_10 (no parameters)
; Used by HEVC_SAO_EDGE_FILTER_16 for both the 10-bit and the 12-bit
; functions. The caller loads the two neighbour rows into m2 and m3 before
; each invocation.
; NOTE(review): lines between the visible ones are missing from this
; excerpt, so how m4 and m5 are transformed between the minimum step and the
; comparisons is not shown.
429 %macro HEVC_SAO_EDGE_FILTER_COMPUTE_10 0
; NOTE(review): PMINUW is a helper defined outside the visible lines; by its
; name it yields the unsigned word minimum, i.e. m4 = min(m1, m2) and
; m5 = min(m1, m3), with the fourth operand presumably a scratch register.
430 PMINUW m4, m1, m2, m6
431 PMINUW m5, m1, m3, m7
; Lane masks for m4 == pw_m2, pw_m1, pw_1 and pw_2, written to m2, m3, m6
; and m7 (the pw_* constants are defined outside this excerpt).
440 pcmpeqw m2, m4, [pw_m2]
452 pcmpeqw m3, m4, [pw_m1]
454 pcmpeqw m6, m4, [pw_1]
455 pcmpeqw m7, m4, [pw_2]
; AND each mask with the value kept in the matching stack slot; slots 0..4
; are filled by HEVC_SAO_EDGE_FILTER_16 from m8..m12.
; NOTE(review): m5 pairs with slot 2 - presumably the "equal to zero" mask;
; the instruction producing it is not visible here.
456 pand m2, [rsp+MMSIZE*0]
457 pand m3, [rsp+MMSIZE*1]
458 pand m5, [rsp+MMSIZE*2]
459 pand m6, [rsp+MMSIZE*3]
460 pand m7, [rsp+MMSIZE*4]
469 ;void ff_hevc_sao_edge_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
470 ; int eo, int width, int height);
; Macro parameters: %1 = bit depth (10 or 12 in the instantiations), also
; pasted onto "pw_mask" for the final clip; %2 = block width.
; NOTE(review): %3 is not referenced in the lines visible here; it is
; presumably an iteration count per row. Confirm against the full body.
; Two prologues are visible; the conditional choosing between them is not.
471 %macro HEVC_SAO_EDGE_FILTER_16 3
; Variant with 9 general-purpose registers and 16 vector registers, no stack.
; heightq doubles as tmp2q, the table base used by the init macro.
473 cglobal hevc_sao_edge_filter_%2_%1, 4, 9, 16, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp
474 %define tmp2q heightq
; The init macro is given a shift of 1, so the neighbour offsets are built
; from half the row stride; doubling them afterwards doubles the column
; component as well.
; NOTE(review): presumably this converts sample units to byte offsets for
; 2-byte samples - confirm.
475 HEVC_SAO_EDGE_FILTER_INIT 1
477 add a_strideq, a_strideq
478 add b_strideq, b_strideq
; Variant with 6 general-purpose registers, 8 vector registers and 5*mmsize
; bytes of stack for the five offset vectors. dststrideq serves as tmp2q and
; heightq as offsetq; MMSIZE records the register size for the [rsp+MMSIZE*n]
; addressing in HEVC_SAO_EDGE_FILTER_COMPUTE_10.
481 cglobal hevc_sao_edge_filter_%2_%1, 1, 6, 8, 5*mmsize, dst, src, dststride, a_stride, b_stride, height
482 %assign MMSIZE mmsize
485 %define tmp2q dststrideq
486 %define offsetq heightq
492 HEVC_SAO_EDGE_FILTER_INIT 1
; dststrideq was used as scratch (via the tmp2q alias); load the real
; destination stride from its argument slot, then double the neighbour
; offsets as in the other variant.
495 mov dststrideq, dststridem
496 add a_strideq, a_strideq
497 add b_strideq, b_strideq
; Load the five 16-bit offsets into m8..m12 in the order
; sao_offset_val[1], [2], [0], [3], [4]. That order matches the slot order in
; which HEVC_SAO_EDGE_FILTER_COMPUTE_10 pairs them with its masks.
; NOTE(review): SPLATW is defined outside this file; it is expected to
; replicate the word across the whole register.
502 SPLATW m8, [offsetq+2]
503 SPLATW m9, [offsetq+4]
504 SPLATW m10, [offsetq+0]
505 SPLATW m11, [offsetq+6]
506 SPLATW m12, [offsetq+8]
; Alternative load: 8 bytes from offset 0 and 4 bytes from offset 6.
; NOTE(review): presumably a different code path than the SPLATW lines above;
; the selecting conditional and the follow-up splats are not visible.
508 movq m10, [offsetq+0]
509 movd m12, [offsetq+6]
; Store the five offset vectors in stack slots 0..4 (the 5*mmsize area).
523 mova [rsp+mmsize*0], m8
524 mova [rsp+mmsize*1], m9
525 mova [rsp+mmsize*2], m10
526 mova [rsp+mmsize*3], m11
527 mova [rsp+mmsize*4], m12
; Narrowest case: load neighbours a and b, compute, then clip the result with
; m0 and pw_mask<depth> as bounds.
; NOTE(review): CLIPW is defined outside this file; m0 is presumably zero -
; the instruction setting it is not visible here.
535 movu m2, [srcq+a_strideq]
536 movu m3, [srcq+b_strideq]
538 HEVC_SAO_EDGE_FILTER_COMPUTE_10
539 CLIPW m2, m0, [pw_mask %+ %1]
; Same sequence at column offset i (an assembly-time value defined outside
; the visible lines).
546 movu m2, [srcq+a_strideq + i]
547 movu m3, [srcq+b_strideq + i]
548 HEVC_SAO_EDGE_FILTER_COMPUTE_10
549 CLIPW m2, m0, [pw_mask %+ %1]
; Second register of the same iteration: current samples (aligned load),
; both neighbours (unaligned loads), compute, clip, store.
552 mova m1, [srcq + i + mmsize]
553 movu m2, [srcq+a_strideq + i + mmsize]
554 movu m3, [srcq+b_strideq + i + mmsize]
555 HEVC_SAO_EDGE_FILTER_COMPUTE_10
556 CLIPW m2, m0, [pw_mask %+ %1]
557 mova [dstq + i + mmsize], m2
; Same two-register sequence for another variant of the row loop.
564 movu m2, [srcq+a_strideq + i]
565 movu m3, [srcq+b_strideq + i]
566 HEVC_SAO_EDGE_FILTER_COMPUTE_10
567 CLIPW m2, m0, [pw_mask %+ %1]
570 mova m1, [srcq + i + mmsize]
571 movu m2, [srcq+a_strideq + i + mmsize]
572 movu m3, [srcq+b_strideq + i + mmsize]
573 HEVC_SAO_EDGE_FILTER_COMPUTE_10
574 CLIPW m2, m0, [pw_mask %+ %1]
575 mova [dstq + i + mmsize], m2
; The source advances by the fixed EDGE_SRCSTRIDE, not by a stride argument.
582 add srcq, EDGE_SRCSTRIDE
; Edge filter instantiations.
; NOTE(review): the instruction-set selection directives that precede each
; group, and the %endif lines closing the conditionals, are not visible in
; this excerpt.
;
; 8-bit: <width>, <count>[, a|u]
; NOTE(review): the third argument is presumably an aligned ("a") or
; unaligned ("u") move selector - confirm in the macro body.
589 HEVC_SAO_EDGE_FILTER_8 8, 0
590 HEVC_SAO_EDGE_FILTER_8 16, 1, a
591 HEVC_SAO_EDGE_FILTER_8 32, 2, a
592 HEVC_SAO_EDGE_FILTER_8 48, 2, a
593 HEVC_SAO_EDGE_FILTER_8 64, 4, a
; 8-bit AVX2 versions for widths 32, 48 and 64, with half the count of the
; set above; width 48 is the only one using the "u" selector.
595 %if HAVE_AVX2_EXTERNAL
597 HEVC_SAO_EDGE_FILTER_8 32, 1, a
598 HEVC_SAO_EDGE_FILTER_8 48, 1, u
599 HEVC_SAO_EDGE_FILTER_8 64, 2, a
; 10-bit: <depth>, <width>, <count>
603 HEVC_SAO_EDGE_FILTER_16 10, 8, 0
604 HEVC_SAO_EDGE_FILTER_16 10, 16, 1
605 HEVC_SAO_EDGE_FILTER_16 10, 32, 2
606 HEVC_SAO_EDGE_FILTER_16 10, 48, 2
607 HEVC_SAO_EDGE_FILTER_16 10, 64, 4
; 12-bit: <depth>, <width>, <count>
609 HEVC_SAO_EDGE_FILTER_16 12, 8, 0
610 HEVC_SAO_EDGE_FILTER_16 12, 16, 1
611 HEVC_SAO_EDGE_FILTER_16 12, 32, 2
612 HEVC_SAO_EDGE_FILTER_16 12, 48, 2
613 HEVC_SAO_EDGE_FILTER_16 12, 64, 4
; 10-bit and 12-bit AVX2 versions for widths 32, 48 and 64, again with half
; the count.
615 %if HAVE_AVX2_EXTERNAL
617 HEVC_SAO_EDGE_FILTER_16 10, 32, 1
618 HEVC_SAO_EDGE_FILTER_16 10, 48, 1
619 HEVC_SAO_EDGE_FILTER_16 10, 64, 2
621 HEVC_SAO_EDGE_FILTER_16 12, 32, 1
622 HEVC_SAO_EDGE_FILTER_16 12, 48, 1
623 HEVC_SAO_EDGE_FILTER_16 12, 64, 2