1 ;*****************************************************************************
2 ;* x86-optimized functions for blend filter
4 ;* Copyright (C) 2015 Paul B Mahol
5 ;* Copyright (C) 2018 Henrik Gramner
6 ;* Copyright (C) 2018 Jokyo Images
8 ;* This file is part of FFmpeg.
10 ;* FFmpeg is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* FFmpeg is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with FFmpeg; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
25 %include "libavutil/x86/x86util.asm"
; Constant tables. Each `times N <dX> v` line emits N copies of v, so every
; label below covers 16 bytes. The integer tables are loaded into vector
; registers with VBROADCASTI128 by the blend macros in this file.
; NOTE(review): pw_1 is used by BLEND_MULTIPLY / BLEND_SCREEN but is not
; defined among these lines - presumably it comes from an include; confirm.
29 ps_255: times 4 dd 255.0 ; 4 x single-precision float 255.0
30 pd_32768 : times 4 dd 32768 ; 4 x dword; loaded by GRAINEXTRACT / GRAINMERGE
31 pd_65535 : times 4 dd 65535 ; 4 x dword; 65535 is the maximum 16-bit value
33 pw_128: times 8 dw 128 ; 8 x word; loaded by GRAINEXTRACT / GRAINMERGE
34 pw_255: times 8 dw 255 ; 8 x word; 255 is the maximum 8-bit value
35 pb_127: times 16 db 127 ; 16 x byte; loaded by the hardmix code
36 pb_128: times 16 db 128 ; 16 x byte; loaded by the hardmix code
37 pb_255: times 16 db 255 ; 16 x byte, all bits set
; Function prologue / row-advance fragment. The enclosing %macro headers and
; the conditional that selects between the two cglobal declarations are not
; part of the lines shown here.
; NOTE(review): the second declaration looks like the x86_32 variant (fewer
; registers, dst_linesize kept in memory) - confirm against the full file.
;
; cglobal (x86inc): name, number of arguments loaded into registers, number
; of general-purpose registers, number of vector registers (%2), followed by
; the names given to the registers.
43 cglobal blend_%1, 6, 9, %2, top, top_linesize, bottom, bottom_linesize, dst, dst_linesize, width, end, x
; width is the 7th named argument; only the first 6 are loaded by cglobal, so
; it is fetched explicitly from its argument slot.
44 mov widthd, dword widthm
; A third macro argument marks the 16-bit variants.
45 %if %0 == 3; is 16 bit
; Doubles width - presumably converting a sample count into a byte count for
; 2-byte samples; confirm.
46 add widthq, widthq ; doesn't compile on x86_32
; Alternative declaration: 5 arguments in registers, 7 general-purpose
; registers, and no registers named dst_linesize or width.
49 cglobal blend_%1, 5, 7, %2, top, top_linesize, bottom, bottom_linesize, dst, end, x
; dst_linesize is argument index 5 (0-based); refer to it through its
; argument slot instead of a dedicated register.
50 %define dst_linesizeq r5mp
; Advance the three row pointers by their respective line sizes.
61 add topq, top_linesizeq
62 add bottomq, bottom_linesizeq
63 add dstq, dst_linesizeq
; BLEND_SIMPLE takes 2 or 3 arguments. Judging by its invocations in this
; file: %1 = blend name, %2 = packed-instruction suffix (xor, and, addusb,
; minub, ...), %3 = 1 for the 16-bit variants.
; Only the macro header and one instruction of the body are visible here.
69 %macro BLEND_SIMPLE 2-3
; Unaligned vector load from the bottom row at byte offset x.
76 movu m1, [bottomq + xq]
; Only part of the macro body is visible here.
84 ; %1 name , %2 src (b or w), %3 inter (w or d), %4 (1 if 16bit, not set if 8 bit)
85 %macro GRAINEXTRACT 3-4
; Two alternative loads of m5; the conditional selecting between them is not
; shown. NOTE(review): presumably pd_32768 for the 16-bit variant (%4 set)
; and pw_128 for the 8-bit one - confirm.
89 VBROADCASTI128 m5, [pd_32768]
91 VBROADCASTI128 m5, [pw_128]
; Unaligned vector load from the bottom row at byte offset x.
97 movu m3, [bottomq + xq]
; punpckl%2%3 expands to punpcklbw (b, w) or punpcklwd (w, d): the low source
; elements of m1 / m3 are interleaved with m4 into m0 / m2.
; NOTE(review): widens the samples only if m4 is zero; its setup is not shown.
99 punpckl%2%3 m0, m1, m4
101 punpckl%2%3 m2, m3, m4
; Helper operating on word lanes; only the first and last visible
; instructions of the sequence are part of the lines shown here.
117 %macro MULTIPLY 3 ; a, b, pw_1
; Low 16 bits of each word-wise product a * b.
118 pmullw %1, %2 ; xxxxxxxx a * b
; Keep the upper byte of every word.
122 psrlw %1, 8 ; 00xx00xx a * b / 255
; Helper operating on word lanes; the instructions between the two pxor
; lines are not part of the lines shown here.
; For values in 0..255, XOR with 255 equals 255 - value, which is how %4
; (pw_255) is used for both inversions below.
125 %macro SCREEN 4 ; a, b, pw_1, pw_255
126 pxor %1, %4 ; 00xx00xx 255 - a
129 pxor %1, %4 ; 00xx00xx 255 - x / 255
; BLEND_MULTIPLY takes no arguments; only part of its body is visible here.
132 %macro BLEND_MULTIPLY 0
; Second argument is 6; presumably forwarded as the vector register count of
; the generated function - confirm against BLEND_INIT.
133 BLEND_INIT multiply, 6
; m5 = pw_1, broadcast to the full vector width.
135 VBROADCASTI128 m5, [pw_1]
; Unaligned vector load from the bottom row at byte offset x.
141 movu m3, [bottomq + xq]
; BLEND_SCREEN takes no arguments; only part of its body is visible here.
157 %macro BLEND_SCREEN 0
; m5 = pw_1 and m6 = pw_255; both are passed to SCREEN below.
161 VBROADCASTI128 m5, [pw_1]
162 VBROADCASTI128 m6, [pw_255]
; Unaligned vector load from the bottom row at byte offset x.
168 movu m3, [bottomq + xq]
; Apply SCREEN to the two register pairs (m0, m2) and (m1, m3).
; NOTE(review): presumably the widened low/high halves of the top and bottom
; vectors; the unpack steps are not shown.
174 SCREEN m0, m2, m5, m6
175 SCREEN m1, m3, m5, m6
; NOTE(review): the %macro header for this fragment is not part of the lines
; shown here; the argument list matches the AVERAGE invocation
; (AVERAGE average_16, w, 1) - confirm.
184 ;%1 name, %2 (b or w), %3 (set if 16 bit)
; Unaligned vector load from the bottom row at byte offset x.
194 movu m1, [bottomq + xq]
; Only part of the macro body is visible here.
205 ; %1 name , %2 src (b or w), %3 inter (w or d), %4 (1 if 16bit, not set if 8 bit)
206 %macro GRAINMERGE 3-4
; Two alternative loads of m5; the conditional selecting between them is not
; shown. NOTE(review): presumably pd_32768 for the 16-bit variant (%4 set)
; and pw_128 for the 8-bit one - confirm.
210 VBROADCASTI128 m5, [pd_32768]
212 VBROADCASTI128 m5, [pw_128]
; Unaligned vector load from the bottom row at byte offset x.
219 movu m3, [bottomq + xq]
; punpckl%2%3 expands to punpcklbw (b, w) or punpcklwd (w, d): the low source
; elements of m1 / m3 are interleaved with m4 into m0 / m2.
; NOTE(review): widens the samples only if m4 is zero; its setup is not shown.
221 punpckl%2%3 m0, m1, m4
223 punpckl%2%3 m2, m3, m4
; Setup fragment of the hardmix blend; the enclosing %macro header and the
; rest of the body are not part of the lines shown here.
240 BLEND_INIT hardmix, 5
; Byte constants broadcast to the full vector width:
; m2 = 255, m3 = 128, m4 = 127 in every byte.
241 VBROADCASTI128 m2, [pb_255]
242 VBROADCASTI128 m3, [pb_128]
243 VBROADCASTI128 m4, [pb_127]
; Unaligned vector load from the bottom row at byte offset x.
249 movu m1, [bottomq + xq]
; Fragment of a blend that goes through packed floats; the %macro header and
; the steps between the visible lines are not part of the lines shown here.
; NOTE(review): presumably the "divide" blend, with m2 = 0 and m3 = ps_255 -
; confirm against the full file.
; Load 4 bytes from each of the top and bottom rows.
268 movd m0, [topq + xq] ; 000000xx
269 movd m1, [bottomq + xq]
; Interleave with m2 twice: bytes -> words -> dwords.
270 punpcklbw m0, m2 ; 00000x0x
272 punpcklwd m0, m2 ; 000x000x
278 mulps m0, m3 ; a / b * 255
; Narrow back with saturation: dwords -> words (signed), then
; words -> bytes (unsigned).
282 packssdw m0, m0 ; 00000x0x
283 packuswb m0, m0 ; 000000xx
; NOTE(review): the %macro header for this fragment is not part of the lines
; shown here; the argument list matches the PHOENIX invocation
; (PHOENIX phoenix_16, w, 1) - confirm.
292 ; %1 name, %2 b or w, %3 (opt) 1 if 16 bit
; m3 = all bits set (255 in every byte), broadcast to the full vector width.
294 VBROADCASTI128 m3, [pb_255]
; Unaligned vector load from the bottom row at byte offset x.
300 movu m1, [bottomq + xq]
; Only part of the macro body is visible here.
313 ; %1 name , %2 src (b or w), %3 inter (w or d), %4 (1 if 16bit, not set if 8 bit)
314 %macro DIFFERENCE 3-4
; Unaligned vector load from the bottom row at byte offset x.
322 movu m1, [bottomq + xq]
; punpckh%2%3 expands to punpckhbw (b, w) or punpckhwd (w, d): the high source
; elements of m0 / m1 are interleaved with m2 into m3 / m4.
; NOTE(review): presumably m0 holds the top row and m2 is zero; neither setup
; is shown.
323 punpckh%2%3 m3, m0, m2
325 punpckh%2%3 m4, m1, m2
; NOTE(review): the %macro header for this fragment is not part of the lines
; shown here; presumably EXTREMITY, whose invocations match the argument
; list - confirm.
342 ; %1 name , %2 src (b or w), %3 inter (w or d), %4 (1 if 16bit, not set if 8 bit)
; Two alternative loads of m4; the conditional selecting between them is not
; shown. 65535 and 255 are the maximum 16-bit and 8-bit values, held in the
; widened (dword / word) lane size.
347 VBROADCASTI128 m4, [pd_65535]
349 VBROADCASTI128 m4, [pw_255]
; Unaligned vector load from the bottom row at byte offset x.
356 movu m1, [bottomq + xq]
; punpckh%2%3 expands to punpckhbw (b, w) or punpckhwd (w, d): the high source
; elements of m0 / m1 are interleaved with m2 into m5 / m6.
357 punpckh%2%3 m5, m0, m2
359 punpckh%2%3 m6, m1, m2
; NOTE(review): the %macro header for this fragment is not part of the lines
; shown here; presumably NEGATION - confirm.
; Two alternative loads of m4; the conditional selecting between them is not
; shown. 65535 and 255 are the maximum 16-bit and 8-bit values, held in the
; widened (dword / word) lane size.
382 VBROADCASTI128 m4, [pd_65535]
384 VBROADCASTI128 m4, [pw_255]
; Unaligned vector load from the bottom row at byte offset x.
391 movu m1, [bottomq + xq]
; punpckh%2%3 expands to punpckhbw (b, w) or punpckhwd (w, d): the high source
; elements of m0 / m1 are interleaved with m2 into m5 / m6.
392 punpckh%2%3 m5, m0, m2
394 punpckh%2%3 m6, m1, m2
; Macro instantiations: each line expands one blend macro into a blend_<name>
; function. Names ending in _16 pass the extra "16 bit" argument.
; NOTE(review): the directives that select the instruction set and any
; architecture guards are not part of the lines shown here.
416 BLEND_SIMPLE xor, xor
418 BLEND_SIMPLE and, and
419 BLEND_SIMPLE addition, addusb
420 BLEND_SIMPLE subtract, subusb
421 BLEND_SIMPLE darken, minub
422 BLEND_SIMPLE lighten, maxub
423 GRAINEXTRACT grainextract, b, w
427 GRAINMERGE grainmerge, b, w
430 DIFFERENCE difference, b, w
432 EXTREMITY extremity, b, w
433 NEGATION negation, b, w
; 16-bit variants.
436 BLEND_SIMPLE addition_16, addusw, 1
437 BLEND_SIMPLE and_16, and, 1
438 BLEND_SIMPLE or_16, or, 1
439 BLEND_SIMPLE subtract_16, subusw, 1
440 BLEND_SIMPLE xor_16, xor, 1
441 AVERAGE average_16, w, 1
; NOTE(review): the next three lines repeat the 8-bit difference / extremity /
; negation instantiations above with identical arguments - presumably under a
; different instruction-set selection that is not visible here; confirm.
445 DIFFERENCE difference, b, w
446 EXTREMITY extremity, b, w
447 NEGATION negation, b, w
; Further 16-bit variants.
451 BLEND_SIMPLE darken_16, minuw, 1
452 BLEND_SIMPLE lighten_16, maxuw, 1
453 GRAINEXTRACT grainextract_16, w, d, 1
454 GRAINMERGE grainmerge_16, w, d, 1
455 PHOENIX phoenix_16, w, 1
456 DIFFERENCE difference_16, w, d, 1
457 EXTREMITY extremity_16, w, d, 1
458 NEGATION negation_16, w, d, 1
; Instantiations assembled only when HAVE_AVX2_EXTERNAL is non-zero.
; NOTE(review): the register-width selection directive and the matching
; %endif are not part of the lines shown here.
461 %if HAVE_AVX2_EXTERNAL
; 8-bit variants.
463 BLEND_SIMPLE xor, xor
465 BLEND_SIMPLE and, and
466 BLEND_SIMPLE addition, addusb
467 BLEND_SIMPLE subtract, subusb
468 BLEND_SIMPLE darken, minub
469 BLEND_SIMPLE lighten, maxub
470 GRAINEXTRACT grainextract, b, w
474 GRAINMERGE grainmerge, b, w
478 DIFFERENCE difference, b, w
479 EXTREMITY extremity, b, w
480 NEGATION negation, b, w
; 16-bit variants (extra trailing argument set to 1).
483 BLEND_SIMPLE addition_16, addusw, 1
484 BLEND_SIMPLE and_16, and, 1
485 BLEND_SIMPLE darken_16, minuw, 1
486 BLEND_SIMPLE lighten_16, maxuw, 1
487 BLEND_SIMPLE or_16, or, 1
488 BLEND_SIMPLE subtract_16, subusw, 1
489 BLEND_SIMPLE xor_16, xor, 1
490 GRAINEXTRACT grainextract_16, w, d, 1
491 AVERAGE average_16, w, 1
492 GRAINMERGE grainmerge_16, w, d, 1
493 PHOENIX phoenix_16, w, 1
494 DIFFERENCE difference_16, w, d, 1
495 EXTREMITY extremity_16, w, d, 1
496 NEGATION negation_16, w, d, 1