1 ;*****************************************************************************
2 ;* x86-optimized functions for stereo3d filter
4 ;* Copyright (C) 2015 Paul B Mahol
6 ;* This file is part of FFmpeg.
8 ;* FFmpeg is free software; you can redistribute it and/or
9 ;* modify it under the terms of the GNU Lesser General Public
10 ;* License as published by the Free Software Foundation; either
11 ;* version 2.1 of the License, or (at your option) any later version.
13 ;* FFmpeg is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 ;* Lesser General Public License for more details.
18 ;* You should have received a copy of the GNU Lesser General Public
19 ;* License along with FFmpeg; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 ;*****************************************************************************
23 %include "libavutil/x86/x86util.asm"
; 16-byte tables of byte indices. ex_r and ex_g are used further down as
; pshufb control masks; pshufb writes zero to every destination byte whose
; control byte has its top bit set, which is the role of the -1 entries.
;
; ex_r / ex_g / ex_b: place source bytes {0,3,6,9}, {1,4,7,10} and {2,5,8,11}
;   respectively in the lowest byte of each of the four dwords and zero the
;   other three bytes, i.e. every third source byte becomes a zero-extended
;   32-bit lane (the results are later multiplied with pmulld).
;   NOTE(review): the _r/_g/_b names suggest the red/green/blue components of
;   packed 3-byte pixels - confirm against the caller.
;
; shuf: NOTE(review): its use is not visible in this section. As a pshufb mask
;   it would gather source bytes 0,4,8, then 1,5,9, then 2,6,10, then 3,7,11
;   into the low 12 bytes and zero the top 4 bytes - confirm at the use site.
30 shuf: db 0, 4, 8, 1,5, 9, 2, 6,10,3, 7,11,-1,-1,-1,-1
31 ex_r: db 0,-1,-1,-1,3,-1,-1,-1,6,-1,-1,-1, 9,-1,-1,-1
32 ex_g: db 1,-1,-1,-1,4,-1,-1,-1,7,-1,-1,-1,10,-1,-1,-1
33 ex_b: db 2,-1,-1,-1,5,-1,-1,-1,8,-1,-1,-1,11,-1,-1,-1
39 cglobal anaglyph, 6, 10, 14, 2*6*mmsize, dst, lsrc, rsrc, dst_linesize, l_linesize, r_linesize, width, height, o, cnt
40 %define ana_matrix_rq r6q
41 %define ana_matrix_gq r7q
42 %define ana_matrix_bq r8q
45 %if HAVE_ALIGNED_STACK
46 cglobal anaglyph, 3, 7, 8, 2*9*mmsize, dst, lsrc, rsrc, dst_linesize, l_linesize, o, cnt
48 cglobal anaglyph, 3, 6, 8, 2*9*mmsize, dst, lsrc, rsrc, dst_linesize, o, cnt
49 %define l_linesizeq r4mp
50 %endif ; HAVE_ALIGNED_STACK
51 %define ana_matrix_rq r3q
52 %define ana_matrix_gq r4q
53 %define ana_matrix_bq r5q
54 %define r_linesizeq r5mp
57 %define m8 [rsp+mmsize*12]
58 %define m9 [rsp+mmsize*13]
59 %define m10 [rsp+mmsize*14]
60 %define m11 [rsp+mmsize*15]
61 %define m12 [rsp+mmsize*16]
62 %define m13 [rsp+mmsize*17]
65 mov ana_matrix_rq, r8m
66 mov ana_matrix_gq, r9m
67 mov ana_matrix_bq, r10m
68 movu m3, [ana_matrix_rq+ 0]
69 movq m5, [ana_matrix_rq+16]
76 mova [rsp+mmsize*0], m0
77 mova [rsp+mmsize*1], m1
78 mova [rsp+mmsize*2], m2
79 mova [rsp+mmsize*3], m3
80 mova [rsp+mmsize*4], m4
81 mova [rsp+mmsize*5], m5
83 movu m3, [ana_matrix_gq+ 0]
84 movq m5, [ana_matrix_gq+16]
91 mova [rsp+mmsize*6 ], m0
92 mova [rsp+mmsize*7 ], m1
93 mova [rsp+mmsize*8 ], m2
94 mova [rsp+mmsize*9 ], m3
95 mova [rsp+mmsize*10], m4
96 mova [rsp+mmsize*11], m5
99 movu m11, [ana_matrix_bq+ 0]
100 movq m13, [ana_matrix_bq+16]
101 pshufd m8, m11, q0000
102 pshufd m9, m11, q1111
103 pshufd m10, m11, q2222
104 pshufd m11, m11, q3333
105 pshufd m12, m13, q0000
106 pshufd m13, m13, q1111
107 mov widthd, dword widthm
108 mov heightd, dword heightm
110 movu m3, [ana_matrix_bq+ 0]
111 movq m5, [ana_matrix_bq+16]
118 mova [rsp+mmsize*12], m0
119 mova [rsp+mmsize*13], m1
120 mova [rsp+mmsize*14], m2
121 mova [rsp+mmsize*15], m3
122 mova [rsp+mmsize*16], m4
123 mova [rsp+mmsize*17], m5
124 mov dst_linesizeq, r3m
125 %if HAVE_ALIGNED_STACK
135 movu m3, [lsrcq+cntq]
136 pshufb m1, m3, [ex_r]
137 pshufb m2, m3, [ex_g]
139 movu m0, [rsrcq+cntq]
140 pshufb m4, m0, [ex_r]
141 pshufb m5, m0, [ex_g]
143 pmulld m1, [rsp+mmsize*0]
144 pmulld m2, [rsp+mmsize*1]
145 pmulld m3, [rsp+mmsize*2]
146 pmulld m4, [rsp+mmsize*3]
147 pmulld m5, [rsp+mmsize*4]
148 pmulld m0, [rsp+mmsize*5]
155 movu m3, [lsrcq+cntq]
156 pshufb m7, m3, [ex_r]
157 pshufb m2, m3, [ex_g]
159 movu m0, [rsrcq+cntq]
160 pshufb m4, m0, [ex_r]
161 pshufb m5, m0, [ex_g]
163 pmulld m7, [rsp+mmsize*6]
164 pmulld m2, [rsp+mmsize*7]
165 pmulld m3, [rsp+mmsize*8]
166 pmulld m4, [rsp+mmsize*9]
167 pmulld m5, [rsp+mmsize*10]
168 pmulld m0, [rsp+mmsize*11]
175 movu m4, [lsrcq+cntq]
176 pshufb m2, m4, [ex_r]
177 pshufb m3, m4, [ex_g]
179 movu m0, [rsrcq+cntq]
180 pshufb m5, m0, [ex_r]
181 pshufb m6, m0, [ex_g]
204 movq [dstq+cntq+0], m1
206 movd [dstq+cntq+8], m1
211 add dstq, dst_linesizeq
212 add lsrcq, l_linesizeq
213 add rsrcq, r_linesizeq