]> git.sesse.net Git - mlt/blob - src/modules/motion_est/sad_sse.h
Merge ../mlt++
[mlt] / src / modules / motion_est / sad_sse.h
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 2 of the License, or
5  * (at your option) any later version.
6  *
7  * This program is distributed in the hope that it will be useful,
8  * but WITHOUT ANY WARRANTY; without even the implied warranty of
9  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10  * GNU General Public License for more details.
11  *
12  * You should have received a copy of the GNU General Public License
13  * along with this program; if not, write to the Free Software Foundation,
14  * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15  */
16
17
18
/*
 * Clear mm6, the MMX register the SAD macros in this file use as their
 * running total.
 *
 * Expands to a complete statement (trailing semicolon included), so it is
 * written without one at the point of use.  The reserved spellings
 * __asm__ / __volatile__ are used so the header also builds in strict
 * ISO C modes, where the plain `asm` keyword is not available.
 */
#define SAD_SSE_INIT \
	__asm__ __volatile__ ( "pxor %%mm6,%%mm6\n\t" :: );

// Add the sum of absolute differences of two 8x1 byte rows to mm6.
// The rows are read OFFSET bytes past the pointers in asm operands %0 and %1;
// mm0 and mm1 are used as scratch.
// psadbw leaves a zero-extended 16-bit sum in mm0, so the accumulation is done
// with paddd (32-bit lanes): with paddw the total would wrap past 65535, which
// a 32x32 block (up to 32*32*255) can exceed.
#define SAD_SSE_SUM_8(OFFSET) \
			"movq " #OFFSET "(%0),%%mm0		\n\t"\
			"movq " #OFFSET "(%1),%%mm1		\n\t"\
			"psadbw %%mm1,%%mm0			\n\t"\
			"paddd %%mm0,%%mm6			\n\t"

/*
 * Copy the low 32 bits of the mm6 accumulator into the integer lvalue RESULT.
 *
 * Expands to a complete statement (trailing semicolon included).  Written
 * with __asm__ / __volatile__ so it also compiles in strict ISO C modes.
 * No emms is issued; the MMX state is left live.
 */
#define SAD_SSE_FINISH(RESULT) \
	__asm__ __volatile__ ( "movd %%mm6,%0" : "=r" (RESULT) : );
31
// Move both row pointers (asm operands %0 and %1) forward by the stride
// held in asm operand %2.  Because %0 and %1 are written, the enclosing asm
// statement has to declare them as read-write operands.
#define SAD_SSE_NEXTROW \
			"add %2,%0\n\t" \
			"add %2,%1\n\t"

/*
 * Sum of absolute differences over a 4x4 block of bytes.
 *
 * block1, block2: top-left byte of each block.
 * ystride:        distance in bytes between consecutive rows.
 * xstride, w, h:  unused; kept so the signature matches the other SAD
 *                 functions in this file.
 * Returns the SAD of the 16 byte pairs.
 *
 * The earlier version loaded each row with movq (8 bytes) and was marked
 * BROKEN: it also counted the 4 bytes to the right of the block.  Rows are
 * now loaded with movd, which reads exactly 4 bytes and zero-fills the rest
 * of the register.  The whole computation is a single asm statement with the
 * pointers declared read-write (they are advanced inside the asm) and a
 * "memory" clobber because the asm reads through them.
 * No emms is issued; the MMX state is left live.
 */
static inline int sad_sse_4x4( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
{
	int result;
	long int row_bytes = ystride;

	(void) xstride;
	(void) w;
	(void) h;

	#define ROW \
		"movd (%1),%%mm0		\n\t"\
		"movd (%2),%%mm1		\n\t"\
		"psadbw %%mm1,%%mm0		\n\t"\
		"paddd %%mm0,%%mm6		\n\t"\
		"add %3,%1			\n\t"\
		"add %3,%2			\n\t"

	__asm__ __volatile__ (
		"pxor %%mm6,%%mm6		\n\t"
		ROW ROW ROW ROW
		"movd %%mm6,%0			\n\t"
		: "=&r" (result), "+r" (block1), "+r" (block2)
		: "r" (row_bytes)
		: "memory", "mm0", "mm1", "mm6" );
	#undef ROW

	return result;
}
51
52 inline static int sad_sse_8x8( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
53 {
54         int result; 
55         SAD_SSE_INIT
56         #define ROW     SAD_SSE_SUM_8(0) SAD_SSE_NEXTROW
57         asm volatile (  ROW ROW ROW ROW ROW ROW ROW ROW
58                         :: "r" (block1), "r" (block2), "r" ((long int)(ystride)));
59         
60         SAD_SSE_FINISH(result)
61         return result;
62         #undef ROW
63
64 }
65
66 inline static int sad_sse_16x16( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
67 {
68         int result; 
69         SAD_SSE_INIT
70         #define ROW     SAD_SSE_SUM_8(0) SAD_SSE_SUM_8(8) SAD_SSE_NEXTROW
71         asm volatile (  ROW ROW ROW ROW ROW ROW ROW ROW
72                         ROW ROW ROW ROW ROW ROW ROW ROW
73                         :: "r" (block1), "r" (block2), "r" ((long int)(ystride)));
74         
75         SAD_SSE_FINISH(result)
76         return result;
77         #undef ROW
78
79 }
80
81 inline static int sad_sse_32x32( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
82 {
83         int result; 
84         SAD_SSE_INIT
85         #define ROW     SAD_SSE_SUM_8(0) SAD_SSE_SUM_8(8) SAD_SSE_SUM_8(16) SAD_SSE_SUM_8(24)\
86                         SAD_SSE_NEXTROW
87
88         asm volatile (  ROW ROW ROW ROW ROW ROW ROW ROW
89                         ROW ROW ROW ROW ROW ROW ROW ROW
90                         ROW ROW ROW ROW ROW ROW ROW ROW
91                         ROW ROW ROW ROW ROW ROW ROW ROW
92                         :: "r" (block1), "r" (block2), "r" ((long int)(ystride)));
93         
94         SAD_SSE_FINISH(result)
95         return result;
96         #undef ROW
97
98 }
/*
 * Sum of absolute differences over a block 4 bytes wide and h rows high.
 *
 * block1, block2: top-left byte of each block.
 * ystride:        distance in bytes between consecutive rows.
 * h:              number of rows; h <= 0 yields 0.
 * xstride, w:     unused here.
 * Returns the SAD of the 4*h byte pairs.
 *
 * The earlier version loaded each row with movq (8 bytes) and was marked
 * BROKEN: it also counted the 4 bytes to the right of the block.  Rows are
 * now loaded with movd (exactly 4 bytes, upper half zero-filled).  Each
 * row's SAD is moved to a C variable and summed there, so the total is a
 * full int and nothing depends on MMX registers surviving between asm
 * statements.  No emms is issued; the MMX state is left live.
 */
static inline int sad_sse_4w( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
{
	int result = 0;

	(void) xstride;
	(void) w;

	for( ; h > 0; h-- ) {
		int row_sad;

		__asm__ __volatile__ (
			"movd (%1),%%mm0		\n\t"
			"movd (%2),%%mm1		\n\t"
			"psadbw %%mm1,%%mm0		\n\t"
			"movd %%mm0,%0			\n\t"
			: "=&r" (row_sad)
			: "r" (block1), "r" (block2)
			: "memory", "mm0", "mm1"
		);

		result += row_sad;
		block1 += ystride;
		block2 += ystride;
	}

	return result;
}
120
121 inline static int sad_sse_8w( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
122 {
123         int result; 
124
125         SAD_SSE_INIT
126
127         while( h != 0 ) {
128                 asm volatile (
129                         SAD_SSE_SUM_8(0)
130
131                         :: "r" (block1), "r" (block2)
132                 );
133         
134                 h--;
135                 block1 += ystride;
136                 block2 += ystride;
137         }
138         SAD_SSE_FINISH(result)
139         return result;
140
141 }
142
143 inline static int sad_sse_16w( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
144 {
145         int result; 
146
147         SAD_SSE_INIT
148
149         while( h != 0 ) {
150                 asm volatile (
151                         SAD_SSE_SUM_8(0)
152                         SAD_SSE_SUM_8(8)
153
154                         :: "r" (block1), "r" (block2)
155                 );
156         
157                 h--;
158                 block1 += ystride;
159                 block2 += ystride;
160         }
161         SAD_SSE_FINISH(result)
162         return result;
163
164 }
165
166 inline static int sad_sse_32w( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
167 {
168         int result; 
169
170         SAD_SSE_INIT
171
172         while( h != 0 ) {
173                 asm volatile (
174                         SAD_SSE_SUM_8(0)
175                         SAD_SSE_SUM_8(8)
176                         SAD_SSE_SUM_8(16)
177                         SAD_SSE_SUM_8(24)
178
179                         :: "r" (block1), "r" (block2)
180                 );
181         
182                 h--;
183                 block1 += ystride;
184                 block2 += ystride;
185         }
186         SAD_SSE_FINISH(result)
187         return result;
188
189 }
190
191 inline static int sad_sse_64w( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
192 {
193         int result; 
194
195         SAD_SSE_INIT
196
197         while( h != 0 ) {
198                 asm volatile (
199                         SAD_SSE_SUM_8(0)
200                         SAD_SSE_SUM_8(8)
201                         SAD_SSE_SUM_8(16)
202                         SAD_SSE_SUM_8(24)
203                         SAD_SSE_SUM_8(32)
204                         SAD_SSE_SUM_8(40)
205                         SAD_SSE_SUM_8(48)
206                         SAD_SSE_SUM_8(56)
207
208                         :: "r" (block1), "r" (block2)
209                 );
210         
211                 h--;
212                 block1 += ystride;
213                 block2 += ystride;
214         }
215         SAD_SSE_FINISH(result)
216         return result;
217
218 }
/*
 * 8-byte-aligned mask loaded into mm7 by SAD_SSE_422_LUMA_INIT.
 * On x86 (little-endian) it keeps the bytes at even offsets 0, 2, 4, 6 of a
 * quadword and zeroes the odd ones -- presumably the luma samples of packed
 * 4:2:2 data, going by the names used in this file (TODO confirm).
 * It is only ever read, so it is const; `used` keeps it from being dropped
 * or warned about when a translation unit does not reference it.
 */
static const uint64_t sad_sse_422_mask_chroma __attribute__((used, aligned(8))) = 0x00ff00ff00ff00ffULL;
220
/*
 * Prepare the MMX registers for the 4:2:2 luma SAD macros: load the byte mask
 * sad_sse_422_mask_chroma into mm7 and clear the mm6 accumulator.
 *
 * Expands to a complete statement (trailing semicolon included).  Written
 * with __asm__ / __volatile__ so it also compiles in strict ISO C modes.
 */
#define SAD_SSE_422_LUMA_INIT \
	__asm__ __volatile__ (	"movq %0,%%mm7\n\t"\
			"pxor %%mm6,%%mm6\n\t" :: "m" (sad_sse_422_mask_chroma) );

// Add the SAD of two masked 8-byte rows to mm6.
// The rows are read OFFSET bytes past the pointers in asm operands %0 and %1
// and ANDed with the mask in mm7 before differencing, so only the 4 bytes the
// mask keeps contribute ("4x1 pixels").  mm0 and mm1 are used as scratch.
// psadbw leaves a zero-extended 16-bit sum in mm0, so the accumulation is done
// with paddd (32-bit lanes): with paddw the total would wrap past 65535, which
// the 32x32 variant (up to 32*32*255) can exceed.
#define SAD_SSE_422_LUMA_SUM_4(OFFSET) \
			"movq " #OFFSET "(%0),%%mm0		\n\t"\
			"movq " #OFFSET "(%1),%%mm1		\n\t"\
			"pand %%mm7,%%mm0			\n\t"\
			"pand %%mm7,%%mm1			\n\t"\
			"psadbw %%mm1,%%mm0			\n\t"\
			"paddd %%mm0,%%mm6			\n\t"

234 static int sad_sse_422_luma_4x4( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
235 {
236         int result; 
237         SAD_SSE_422_LUMA_INIT
238         #define ROW     SAD_SSE_422_LUMA_SUM_4(0) SAD_SSE_NEXTROW
239         asm volatile (  ROW ROW ROW ROW
240                         :: "r" (block1), "r" (block2), "r" ((long int)(ystride)));
241         
242         SAD_SSE_FINISH(result)
243         return result;
244         #undef ROW
245
246 }
247
248 static int sad_sse_422_luma_8x8( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
249 {
250         int result; 
251         SAD_SSE_422_LUMA_INIT
252         #define ROW     SAD_SSE_422_LUMA_SUM_4(0) SAD_SSE_422_LUMA_SUM_4(8) SAD_SSE_NEXTROW
253         asm volatile (  ROW ROW ROW ROW ROW ROW ROW ROW
254                         :: "r" (block1), "r" (block2), "r" ((long int)(ystride)));
255         
256         SAD_SSE_FINISH(result)
257         return result;
258         #undef ROW
259
260 }
261
262 static int sad_sse_422_luma_16x16( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
263 {
264         int result; 
265         SAD_SSE_422_LUMA_INIT
266         #define ROW     SAD_SSE_422_LUMA_SUM_4(0) SAD_SSE_422_LUMA_SUM_4(8) SAD_SSE_422_LUMA_SUM_4(16) SAD_SSE_422_LUMA_SUM_4(24) SAD_SSE_NEXTROW
267         asm volatile (  ROW ROW ROW ROW ROW ROW ROW ROW
268                         ROW ROW ROW ROW ROW ROW ROW ROW
269                         :: "r" (block1), "r" (block2), "r" ((long int)(ystride)));
270         
271         SAD_SSE_FINISH(result)
272         return result;
273         #undef ROW
274
275 }
276
277 static int sad_sse_422_luma_32x32( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
278 {
279         int result; 
280         SAD_SSE_422_LUMA_INIT
281         #define ROW     SAD_SSE_422_LUMA_SUM_4(0) SAD_SSE_422_LUMA_SUM_4(8) SAD_SSE_422_LUMA_SUM_4(16) SAD_SSE_422_LUMA_SUM_4(24)\
282                         SAD_SSE_422_LUMA_SUM_4(32) SAD_SSE_422_LUMA_SUM_4(40) SAD_SSE_422_LUMA_SUM_4(48) SAD_SSE_422_LUMA_SUM_4(56)\
283                         SAD_SSE_NEXTROW
284
285         asm volatile (  ROW ROW ROW ROW ROW ROW ROW ROW
286                         ROW ROW ROW ROW ROW ROW ROW ROW
287                         ROW ROW ROW ROW ROW ROW ROW ROW
288                         ROW ROW ROW ROW ROW ROW ROW ROW
289                         :: "r" (block1), "r" (block2), "r" ((long int)(ystride)));
290         
291         SAD_SSE_FINISH(result)
292         return result;
293         #undef ROW
294
295 }
296
297 static int sad_sse_422_luma_4w( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
298 {
299         int result; 
300
301         SAD_SSE_422_LUMA_INIT
302
303         while( h != 0 ) {
304                 asm volatile (
305                         SAD_SSE_422_LUMA_SUM_4(0)
306                         :: "r" (block1), "r" (block2)
307                 );
308         
309                 h--;
310                 block1 += ystride;
311                 block2 += ystride;
312         }
313         SAD_SSE_FINISH(result)
314         return result;
315
316 }
317
318 static int sad_sse_422_luma_8w( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
319 {
320         int result; 
321
322         SAD_SSE_422_LUMA_INIT
323
324         while( h != 0 ) {
325                 asm volatile (
326                         SAD_SSE_422_LUMA_SUM_4(0)
327                         SAD_SSE_422_LUMA_SUM_4(8)
328
329                         :: "r" (block1), "r" (block2)
330                 );
331         
332                 h--;
333                 block1 += ystride;
334                 block2 += ystride;
335         }
336         SAD_SSE_FINISH(result)
337         return result;
338
339 }
340
341 static int sad_sse_422_luma_16w( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
342 {
343         int result; 
344
345         SAD_SSE_422_LUMA_INIT
346
347         while( h != 0 ) {
348                 asm volatile (
349                         SAD_SSE_422_LUMA_SUM_4(0)
350                         SAD_SSE_422_LUMA_SUM_4(8)
351                         SAD_SSE_422_LUMA_SUM_4(16)
352                         SAD_SSE_422_LUMA_SUM_4(24)
353
354                         :: "r" (block1), "r" (block2)
355                 );
356         
357                 h--;
358                 block1 += ystride;
359                 block2 += ystride;
360         }
361         SAD_SSE_FINISH(result)
362         return result;
363
364 }
365
366 static int sad_sse_422_luma_32w( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
367 {
368         int result; 
369
370         SAD_SSE_422_LUMA_INIT
371
372         while( h != 0 ) {
373                 asm volatile (
374                         SAD_SSE_422_LUMA_SUM_4(0)
375                         SAD_SSE_422_LUMA_SUM_4(8)
376                         SAD_SSE_422_LUMA_SUM_4(16)
377                         SAD_SSE_422_LUMA_SUM_4(24)
378                         SAD_SSE_422_LUMA_SUM_4(32)
379                         SAD_SSE_422_LUMA_SUM_4(40)
380                         SAD_SSE_422_LUMA_SUM_4(48)
381                         SAD_SSE_422_LUMA_SUM_4(56)
382
383                         :: "r" (block1), "r" (block2)
384                 );
385         
386                 h--;
387                 block1 += ystride;
388                 block2 += ystride;
389         }
390         SAD_SSE_FINISH(result)
391         return result;
392
393 }
394
395 static int sad_sse_422_luma_64w( uint8_t *block1, uint8_t *block2, int xstride, int ystride, int w, int h )
396 {
397         int result; 
398
399         SAD_SSE_422_LUMA_INIT
400
401         while( h != 0 ) {
402                 asm volatile (
403                         SAD_SSE_422_LUMA_SUM_4(0)
404                         SAD_SSE_422_LUMA_SUM_4(8)
405                         SAD_SSE_422_LUMA_SUM_4(16)
406                         SAD_SSE_422_LUMA_SUM_4(24)
407                         SAD_SSE_422_LUMA_SUM_4(32)
408                         SAD_SSE_422_LUMA_SUM_4(40)
409                         SAD_SSE_422_LUMA_SUM_4(48)
410                         SAD_SSE_422_LUMA_SUM_4(56)
411                         SAD_SSE_422_LUMA_SUM_4(64)
412                         SAD_SSE_422_LUMA_SUM_4(72)
413                         SAD_SSE_422_LUMA_SUM_4(80)
414                         SAD_SSE_422_LUMA_SUM_4(88)
415                         SAD_SSE_422_LUMA_SUM_4(96)
416                         SAD_SSE_422_LUMA_SUM_4(104)
417                         SAD_SSE_422_LUMA_SUM_4(112)
418                         SAD_SSE_422_LUMA_SUM_4(120)
419
420                         :: "r" (block1), "r" (block2)
421                 );
422         
423                 h--;
424                 block1 += ystride;
425                 block2 += ystride;
426         }
427         SAD_SSE_FINISH(result)
428         return result;
429 }