git.sesse.net Git - ffmpeg/blob - libavcodec/i386/snowdsp_mmx.c
replace <<1 by add
[ffmpeg] / libavcodec / i386 / snowdsp_mmx.c
1 /*
2  * MMX and SSE2 optimized snow DSP utils
3  * Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net>
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21
22 #include "avcodec.h"
23 #include "snow.h"
24 #include "x86_cpu.h"
25
26 void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width){
27     const int w2= (width+1)>>1;
28     // SSE2 code runs faster with pointers aligned on a 32-byte boundary.
29     DWTELEM temp_buf[(width>>1) + 4];
30     DWTELEM * const temp = temp_buf + 4 - (((int)temp_buf & 0xF) >> 2);
31     const int w_l= (width>>1);
32     const int w_r= w2 - 1;
33     int i;
34
35     { // Lift 0
36         DWTELEM * const ref = b + w2 - 1;
37         DWTELEM b_0 = b[0]; //By allowing the first entry in b[0] to be calculated twice
38         // (the first time erroneously), we allow the SSE2 code to run an extra pass.
39         // The savings in code and time are well worth having to store this value and
40         // calculate b[0] correctly afterwards.
41
42         i = 0;
43         asm volatile(
44             "pcmpeqd   %%xmm7, %%xmm7         \n\t"
45             "pslld        $31, %%xmm7         \n\t"
46             "psrld        $29, %%xmm7         \n\t"
47         ::);
48         for(; i<w_l-7; i+=8){
49             asm volatile(
50                 "movdqu   (%1), %%xmm1        \n\t"
51                 "movdqu 16(%1), %%xmm5        \n\t"
52                 "movdqu  4(%1), %%xmm2        \n\t"
53                 "movdqu 20(%1), %%xmm6        \n\t"
54                 "paddd  %%xmm1, %%xmm2        \n\t"
55                 "paddd  %%xmm5, %%xmm6        \n\t"
56                 "movdqa %%xmm2, %%xmm0        \n\t"
57                 "movdqa %%xmm6, %%xmm4        \n\t"
58                 "paddd  %%xmm2, %%xmm2        \n\t"
59                 "paddd  %%xmm6, %%xmm6        \n\t"
60                 "paddd  %%xmm0, %%xmm2        \n\t"
61                 "paddd  %%xmm4, %%xmm6        \n\t"
62                 "paddd  %%xmm7, %%xmm2        \n\t"
63                 "paddd  %%xmm7, %%xmm6        \n\t"
64                 "psrad      $3, %%xmm2        \n\t"
65                 "psrad      $3, %%xmm6        \n\t"
66                 "movdqa   (%0), %%xmm0        \n\t"
67                 "movdqa 16(%0), %%xmm4        \n\t"
68                 "psubd  %%xmm2, %%xmm0        \n\t"
69                 "psubd  %%xmm6, %%xmm4        \n\t"
70                 "movdqa %%xmm0, (%0)          \n\t"
71                 "movdqa %%xmm4, 16(%0)        \n\t"
72                 :: "r"(&b[i]), "r"(&ref[i])
73                 : "memory"
74             );
75         }
76         snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
77         b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
78     }
79
80     { // Lift 1
81         DWTELEM * const dst = b+w2;
82
83         i = 0;
84         for(; (((long)&dst[i]) & 0xF) && i<w_r; i++){
85             dst[i] = dst[i] - (b[i] + b[i + 1]);
86         }
87         for(; i<w_r-7; i+=8){
88             asm volatile(
89                 "movdqu   (%1), %%xmm1        \n\t"
90                 "movdqu 16(%1), %%xmm5        \n\t"
91                 "movdqu  4(%1), %%xmm2        \n\t"
92                 "movdqu 20(%1), %%xmm6        \n\t"
93                 "paddd  %%xmm1, %%xmm2        \n\t"
94                 "paddd  %%xmm5, %%xmm6        \n\t"
95                 "movdqa   (%0), %%xmm0        \n\t"
96                 "movdqa 16(%0), %%xmm4        \n\t"
97                 "psubd  %%xmm2, %%xmm0        \n\t"
98                 "psubd  %%xmm6, %%xmm4        \n\t"
99                 "movdqa %%xmm0, (%0)          \n\t"
100                 "movdqa %%xmm4, 16(%0)        \n\t"
101                 :: "r"(&dst[i]), "r"(&b[i])
102                 : "memory"
103             );
104         }
105         snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
106     }
107
108     { // Lift 2
109         DWTELEM * const ref = b+w2 - 1;
110         DWTELEM b_0 = b[0];
111
112         i = 0;
113         asm volatile(
114             "pcmpeqd    %%xmm7, %%xmm7        \n\t"
115             "psrad         $29, %%xmm7        \n\t"
116         ::);
117         for(; i<w_l-7; i+=8){
118             asm volatile(
119                 "movdqu   (%1), %%xmm1        \n\t"
120                 "movdqu 16(%1), %%xmm5        \n\t"
121                 "movdqu  4(%1), %%xmm0        \n\t"
122                 "movdqu 20(%1), %%xmm4        \n\t" //FIXME try aligned reads and shifts
123                 "paddd  %%xmm1, %%xmm0        \n\t"
124                 "paddd  %%xmm5, %%xmm4        \n\t"
125                 "paddd  %%xmm7, %%xmm0        \n\t"
126                 "paddd  %%xmm7, %%xmm4        \n\t"
127                 "movdqa   (%0), %%xmm1        \n\t"
128                 "movdqa 16(%0), %%xmm5        \n\t"
129                 "psrad      $2, %%xmm0        \n\t"
130                 "psrad      $2, %%xmm4        \n\t"
131                 "paddd  %%xmm1, %%xmm0        \n\t"
132                 "paddd  %%xmm5, %%xmm4        \n\t"
133                 "psrad      $2, %%xmm0        \n\t"
134                 "psrad      $2, %%xmm4        \n\t"
135                 "paddd  %%xmm1, %%xmm0        \n\t"
136                 "paddd  %%xmm5, %%xmm4        \n\t"
137                 "movdqa %%xmm0, (%0)          \n\t"
138                 "movdqa %%xmm4, 16(%0)        \n\t"
139                 :: "r"(&b[i]), "r"(&ref[i])
140                 : "memory"
141             );
142         }
143         snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
144         b[0] = b_0 + ((2 * ref[1] + W_BO-1 + 4 * b_0) >> W_BS);
145     }
146
147     { // Lift 3
148         DWTELEM * const src = b+w2;
149
150         i = 0;
151         for(; (((long)&temp[i]) & 0xF) && i<w_r; i++){
152             temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS);
153         }
154         for(; i<w_r-7; i+=8){
155             asm volatile(
156                 "movdqu  4(%1), %%xmm2        \n\t"
157                 "movdqu 20(%1), %%xmm6        \n\t"
158                 "paddd    (%1), %%xmm2        \n\t"
159                 "paddd  16(%1), %%xmm6        \n\t"
160                 "movdqa %%xmm2, %%xmm0        \n\t"
161                 "movdqa %%xmm6, %%xmm4        \n\t"
162                 "pslld      $2, %%xmm2        \n\t"
163                 "pslld      $2, %%xmm6        \n\t"
164                 "psubd  %%xmm2, %%xmm0        \n\t"
165                 "psubd  %%xmm6, %%xmm4        \n\t"
166                 "psrad      $1, %%xmm0        \n\t"
167                 "psrad      $1, %%xmm4        \n\t"
168                 "movdqu   (%0), %%xmm2        \n\t"
169                 "movdqu 16(%0), %%xmm6        \n\t"
170                 "psubd  %%xmm0, %%xmm2        \n\t"
171                 "psubd  %%xmm4, %%xmm6        \n\t"
172                 "movdqa %%xmm2, (%2)          \n\t"
173                 "movdqa %%xmm6, 16(%2)        \n\t"
174                 :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
175                  : "memory"
176                );
177         }
178         snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO, W_AS);
179     }
180
181     {
182         snow_interleave_line_header(&i, width, b, temp);
183
184         for (; (i & 0x1E) != 0x1E; i-=2){
185             b[i+1] = temp[i>>1];
186             b[i] = b[i>>1];
187         }
188         for (i-=30; i>=0; i-=32){
189             asm volatile(
190                 "movdqa      (%1), %%xmm0       \n\t"
191                 "movdqa    16(%1), %%xmm2       \n\t"
192                 "movdqa    32(%1), %%xmm4       \n\t"
193                 "movdqa    48(%1), %%xmm6       \n\t"
194                 "movdqa      (%1), %%xmm1       \n\t"
195                 "movdqa    16(%1), %%xmm3       \n\t"
196                 "movdqa    32(%1), %%xmm5       \n\t"
197                 "movdqa    48(%1), %%xmm7       \n\t"
198                 "punpckldq   (%2), %%xmm0       \n\t"
199                 "punpckldq 16(%2), %%xmm2       \n\t"
200                 "punpckldq 32(%2), %%xmm4       \n\t"
201                 "punpckldq 48(%2), %%xmm6       \n\t"
202                 "movdqa    %%xmm0, (%0)         \n\t"
203                 "movdqa    %%xmm2, 32(%0)       \n\t"
204                 "movdqa    %%xmm4, 64(%0)       \n\t"
205                 "movdqa    %%xmm6, 96(%0)       \n\t"
206                 "punpckhdq   (%2), %%xmm1       \n\t"
207                 "punpckhdq 16(%2), %%xmm3       \n\t"
208                 "punpckhdq 32(%2), %%xmm5       \n\t"
209                 "punpckhdq 48(%2), %%xmm7       \n\t"
210                 "movdqa    %%xmm1, 16(%0)       \n\t"
211                 "movdqa    %%xmm3, 48(%0)       \n\t"
212                 "movdqa    %%xmm5, 80(%0)       \n\t"
213                 "movdqa    %%xmm7, 112(%0)      \n\t"
214                 :: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1])
215                  : "memory"
216                );
217         }
218     }
219 }
220
/**
 * Horizontal 9/7 integer inverse wavelet lifting for one line, MMX version.
 *
 * Same structure as the SSE2 variant: four lifting steps over the two
 * halves of b[], then the first half of b[] and temp[] are interleaved
 * into even/odd positions of b[].  Processes 4 coefficients per asm
 * iteration; the snow_horizontal_compose_*_lead_out helpers (snow.h)
 * finish the remainders in scalar code.
 */
void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width){
    const int w2= (width+1)>>1;
    DWTELEM temp[width >> 1];
    const int w_l= (width>>1);
    const int w_r= w2 - 1;
    int i;

    { // Lift 0
        DWTELEM * const ref = b + w2 - 1;

        // b[0] has no valid left neighbour; compute it in scalar code and
        // start the vector loop at 1 (no redo trick needed here, unlike SSE2).
        i = 1;
        b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
        // mm7 = per-dword rounding constant: (all-ones << 31) >> 29 == 4,
        // presumably W_DO -- TODO confirm against snow.h.
        asm volatile(
            "pcmpeqd    %%mm7, %%mm7         \n\t"
            "pslld        $31, %%mm7         \n\t"
            "psrld        $29, %%mm7         \n\t"
           ::);
        for(; i<w_l-3; i+=4){
            // b[i] -= (3*(ref[i]+ref[i+1]) + 4) >> 3, 4 elements at a time.
            asm volatile(
                "movq     (%1), %%mm2        \n\t"
                "movq    8(%1), %%mm6        \n\t"
                "paddd   4(%1), %%mm2        \n\t"
                "paddd  12(%1), %%mm6        \n\t"
                "movq    %%mm2, %%mm0        \n\t"
                "movq    %%mm6, %%mm4        \n\t"
                "paddd   %%mm2, %%mm2        \n\t"
                "paddd   %%mm6, %%mm6        \n\t"
                "paddd   %%mm0, %%mm2        \n\t"
                "paddd   %%mm4, %%mm6        \n\t"
                "paddd   %%mm7, %%mm2        \n\t"
                "paddd   %%mm7, %%mm6        \n\t"
                "psrad      $3, %%mm2        \n\t"
                "psrad      $3, %%mm6        \n\t"
                "movq     (%0), %%mm0        \n\t"
                "movq    8(%0), %%mm4        \n\t"
                "psubd   %%mm2, %%mm0        \n\t"
                "psubd   %%mm6, %%mm4        \n\t"
                "movq    %%mm0, (%0)         \n\t"
                "movq    %%mm4, 8(%0)        \n\t"
                :: "r"(&b[i]), "r"(&ref[i])
                 : "memory"
               );
        }
        snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
    }

    { // Lift 1
        DWTELEM * const dst = b+w2;

        i = 0;
        for(; i<w_r-3; i+=4){
            // dst[i] -= b[i] + b[i+1], 4 elements at a time.
            asm volatile(
                "movq     (%1), %%mm2        \n\t"
                "movq    8(%1), %%mm6        \n\t"
                "paddd   4(%1), %%mm2        \n\t"
                "paddd  12(%1), %%mm6        \n\t"
                "movq     (%0), %%mm0        \n\t"
                "movq    8(%0), %%mm4        \n\t"
                "psubd   %%mm2, %%mm0        \n\t"
                "psubd   %%mm6, %%mm4        \n\t"
                "movq    %%mm0, (%0)         \n\t"
                "movq    %%mm4, 8(%0)        \n\t"
                :: "r"(&dst[i]), "r"(&b[i])
                 : "memory"
               );
        }
        snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
    }

    { // Lift 2
        DWTELEM * const ref = b+w2 - 1;

        i = 1;
        b[0] = b[0] + (((2 * ref[1] + W_BO-1) + 4 * b[0]) >> W_BS);
        // mm7 = all-ones >> 29 (logical) == 7 per dword.
        // NOTE(review): the SSE2 variant builds -1 with psrad at this point;
        // presumably both match the liftS reference rounding -- verify.
        asm volatile(
            "pcmpeqd     %%mm7, %%mm7        \n\t"
            "psrld         $29, %%mm7        \n\t"
           ::);
        for(; i<w_l-3; i+=4){
            asm volatile(
                "movq     (%1), %%mm0        \n\t"
                "movq    8(%1), %%mm4        \n\t"
                "paddd   4(%1), %%mm0        \n\t"
                "paddd  12(%1), %%mm4        \n\t"
                "paddd   %%mm7, %%mm0        \n\t"
                "paddd   %%mm7, %%mm4        \n\t"
                "psrad      $2, %%mm0        \n\t"
                "psrad      $2, %%mm4        \n\t"
                "movq     (%0), %%mm1        \n\t"
                "movq    8(%0), %%mm5        \n\t"
                "paddd   %%mm1, %%mm0        \n\t"
                "paddd   %%mm5, %%mm4        \n\t"
                "psrad      $2, %%mm0        \n\t"
                "psrad      $2, %%mm4        \n\t"
                "paddd   %%mm1, %%mm0        \n\t"
                "paddd   %%mm5, %%mm4        \n\t"
                "movq    %%mm0, (%0)         \n\t"
                "movq    %%mm4, 8(%0)        \n\t"
                :: "r"(&b[i]), "r"(&ref[i])
                 : "memory"
               );
        }
        snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
    }

    { // Lift 3
        DWTELEM * const src = b+w2;
        i = 0;

        for(; i<w_r-3; i+=4){
            // temp[i] = src[i] - ((-(b[i]+b[i+1])*3) >> 1)-style step;
            // the negate-then-shift implements round-toward-zero.
            asm volatile(
                "movq    4(%1), %%mm2        \n\t"
                "movq   12(%1), %%mm6        \n\t"
                "paddd    (%1), %%mm2        \n\t"
                "paddd   8(%1), %%mm6        \n\t"
                "pxor    %%mm0, %%mm0        \n\t" //note: the 2 xor could be avoided if we would flip the rounding direction
                "pxor    %%mm4, %%mm4        \n\t"
                "psubd   %%mm2, %%mm0        \n\t"
                "psubd   %%mm6, %%mm4        \n\t"
                "psrad      $1, %%mm0        \n\t"
                "psrad      $1, %%mm4        \n\t"
                "psubd   %%mm0, %%mm2        \n\t"
                "psubd   %%mm4, %%mm6        \n\t"
                "movq     (%0), %%mm0        \n\t"
                "movq    8(%0), %%mm4        \n\t"
                "paddd   %%mm0, %%mm2        \n\t"
                "paddd   %%mm4, %%mm6        \n\t"
                "movq    %%mm2, (%2)         \n\t"
                "movq    %%mm6, 8(%2)        \n\t"
                :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
                 : "memory"
               );
        }
        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO, W_AS);
    }

    { // Interleave the first half of b[] with temp[] into even/odd slots.
        snow_interleave_line_header(&i, width, b, temp);

        // Scalar tail until i reaches a 16-element boundary for the asm loop.
        for (; (i & 0xE) != 0xE; i-=2){
            b[i+1] = temp[i>>1];
            b[i] = b[i>>1];
        }
        for (i-=14; i>=0; i-=16){
            asm volatile(
                "movq        (%1), %%mm0       \n\t"
                "movq       8(%1), %%mm2       \n\t"
                "movq      16(%1), %%mm4       \n\t"
                "movq      24(%1), %%mm6       \n\t"
                "movq        (%1), %%mm1       \n\t"
                "movq       8(%1), %%mm3       \n\t"
                "movq      16(%1), %%mm5       \n\t"
                "movq      24(%1), %%mm7       \n\t"
                "punpckldq   (%2), %%mm0       \n\t"
                "punpckldq  8(%2), %%mm2       \n\t"
                "punpckldq 16(%2), %%mm4       \n\t"
                "punpckldq 24(%2), %%mm6       \n\t"
                "movq       %%mm0, (%0)        \n\t"
                "movq       %%mm2, 16(%0)      \n\t"
                "movq       %%mm4, 32(%0)      \n\t"
                "movq       %%mm6, 48(%0)      \n\t"
                "punpckhdq   (%2), %%mm1       \n\t"
                "punpckhdq  8(%2), %%mm3       \n\t"
                "punpckhdq 16(%2), %%mm5       \n\t"
                "punpckhdq 24(%2), %%mm7       \n\t"
                "movq       %%mm1, 8(%0)       \n\t"
                "movq       %%mm3, 24(%0)      \n\t"
                "movq       %%mm5, 40(%0)      \n\t"
                "movq       %%mm7, 56(%0)      \n\t"
                :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1])
                 : "memory"
               );
        }
    }
}
396
/* Building blocks for the SSE2 vertical-compose asm below.  Each macro
 * expands to a string fragment of inline asm; REG_d holds the current
 * element index, and all memory operands address (base + REG_d*4) since
 * DWTELEM is 4 bytes wide.  Each call works on 4 XMM registers, i.e.
 * 16 DWTELEMs per step. */

/* Apply `op` (movdqa or paddd) from 64 consecutive bytes at r[REG_d*4]. */
#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\
        ""op" (%%"r",%%"REG_d",4), %%"t0"      \n\t"\
        ""op" 16(%%"r",%%"REG_d",4), %%"t1"    \n\t"\
        ""op" 32(%%"r",%%"REG_d",4), %%"t2"    \n\t"\
        ""op" 48(%%"r",%%"REG_d",4), %%"t3"    \n\t"

/* Load t0..t3 from memory at r. */
#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\
        snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3)

/* Add memory at r into t0..t3. */
#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\
        snow_vertical_compose_sse2_load_add("paddd",r,t0,t1,t2,t3)

/* t -= s, register to register, packed dwords. */
#define snow_vertical_compose_sse2_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
        "psubd %%"s0", %%"t0" \n\t"\
        "psubd %%"s1", %%"t1" \n\t"\
        "psubd %%"s2", %%"t2" \n\t"\
        "psubd %%"s3", %%"t3" \n\t"

/* Store s0..s3 to memory at w (movdqa: w[REG_d*4] must be 16-byte aligned). */
#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\
        "movdqa %%"s0", (%%"w",%%"REG_d",4)      \n\t"\
        "movdqa %%"s1", 16(%%"w",%%"REG_d",4)    \n\t"\
        "movdqa %%"s2", 32(%%"w",%%"REG_d",4)    \n\t"\
        "movdqa %%"s3", 48(%%"w",%%"REG_d",4)    \n\t"

/* Arithmetic right shift of t0..t3 by immediate n. */
#define snow_vertical_compose_sse2_sra(n,t0,t1,t2,t3)\
        "psrad $"n", %%"t0" \n\t"\
        "psrad $"n", %%"t1" \n\t"\
        "psrad $"n", %%"t2" \n\t"\
        "psrad $"n", %%"t3" \n\t"

/* t += s, register to register, packed dwords. */
#define snow_vertical_compose_sse2_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
        "paddd %%"s0", %%"t0" \n\t"\
        "paddd %%"s1", %%"t1" \n\t"\
        "paddd %%"s2", %%"t2" \n\t"\
        "paddd %%"s3", %%"t3" \n\t"

/* Logical left shift of t0..t3 by immediate n. */
#define snow_vertical_compose_sse2_sll(n,t0,t1,t2,t3)\
        "pslld $"n", %%"t0" \n\t"\
        "pslld $"n", %%"t1" \n\t"\
        "pslld $"n", %%"t2" \n\t"\
        "pslld $"n", %%"t3" \n\t"

/* Register-to-register copy s0..s3 -> t0..t3. */
#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\
        "movdqa %%"s0", %%"t0" \n\t"\
        "movdqa %%"s1", %%"t1" \n\t"\
        "movdqa %%"s2", %%"t2" \n\t"\
        "movdqa %%"s3", %%"t3" \n\t"
444
/**
 * Vertical 9/7 integer inverse wavelet lifting, SSE2 version.
 *
 * Applies all four lifting steps in one pass across six consecutive line
 * pointers b0..b5, updating b1..b4 for `width` coefficients.  The scalar
 * while-loop first peels elements from the top until the remaining count
 * is a multiple of 16; the asm loop then handles 16 DWTELEMs per
 * iteration, counting i (kept in REG_d via the "+d" constraint) down
 * to 0.  All six line pointers are assumed 16-byte aligned (movdqa).
 */
void ff_snow_vertical_compose97i_sse2(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
    long i = width;

    // Scalar peel until the remaining width is a multiple of 16.
    while(i & 0xF)
    {
        i--;
        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
    }

         // Main loop: the "jmp 2f" enters at the decrement/test so the
         // loop body only runs for full 16-element groups.
         asm volatile (
        "jmp 2f                                      \n\t"
        "1:                                          \n\t"

        "mov %6, %%"REG_a"                           \n\t"
        "mov %4, %%"REG_S"                           \n\t"

        snow_vertical_compose_sse2_load(REG_S,"xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_sll("1","xmm0","xmm2","xmm4","xmm6")\
        snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")

        "pcmpeqd %%xmm1, %%xmm1                      \n\t"
        "pslld $31, %%xmm1                           \n\t"
        "psrld $29, %%xmm1                           \n\t"
        "mov %5, %%"REG_a"                           \n\t"

        snow_vertical_compose_sse2_r2r_add("xmm1","xmm1","xmm1","xmm1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_sra("3","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_load(REG_a,"xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_sub("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_store(REG_a,"xmm1","xmm3","xmm5","xmm7")
        "mov %3, %%"REG_c"                           \n\t"
        snow_vertical_compose_sse2_load(REG_S,"xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add(REG_c,"xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_store(REG_S,"xmm0","xmm2","xmm4","xmm6")
        "mov %2, %%"REG_a"                           \n\t"
        snow_vertical_compose_sse2_load(REG_c,"xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_sll("2","xmm1","xmm3","xmm5","xmm7")\
        snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")

        "pcmpeqd %%xmm1, %%xmm1                      \n\t"
        "pslld $31, %%xmm1                           \n\t"
        "psrld $28, %%xmm1                           \n\t"
        "mov %1, %%"REG_S"                           \n\t"

        snow_vertical_compose_sse2_r2r_add("xmm1","xmm1","xmm1","xmm1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_sra("4","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add(REG_c,"xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_store(REG_c,"xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add(REG_S,"xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_sll("1","xmm0","xmm2","xmm4","xmm6")\
        snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_sra("1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_store(REG_a,"xmm0","xmm2","xmm4","xmm6")

        "2:                                          \n\t"
        "sub $16, %%"REG_d"                          \n\t"
        "jge 1b                                      \n\t"
        :"+d"(i)
        :
        "m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5):
        "%"REG_a"","%"REG_S"","%"REG_c"");
}
516
/* MMX counterparts of the snow_vertical_compose_sse2_* fragments above:
 * same roles, but 8-byte MMX registers (8 DWTELEMs per 4-register step)
 * and movq instead of movdqa.  Macros whose expansion is identical at
 * the instruction level simply forward to the sse2 versions. */

/* Apply `op` (movq or paddd) from 32 consecutive bytes at r[REG_d*4]. */
#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
        ""op" (%%"r",%%"REG_d",4), %%"t0"   \n\t"\
        ""op" 8(%%"r",%%"REG_d",4), %%"t1"  \n\t"\
        ""op" 16(%%"r",%%"REG_d",4), %%"t2" \n\t"\
        ""op" 24(%%"r",%%"REG_d",4), %%"t3" \n\t"

/* Load t0..t3 from memory at r. */
#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\
        snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3)

/* Add memory at r into t0..t3. */
#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\
        snow_vertical_compose_mmx_load_add("paddd",r,t0,t1,t2,t3)

/* t -= s (psubd is register-width agnostic, reuse the sse2 fragment). */
#define snow_vertical_compose_mmx_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
        snow_vertical_compose_sse2_sub(s0,s1,s2,s3,t0,t1,t2,t3)

/* Store s0..s3 to memory at w. */
#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\
        "movq %%"s0", (%%"w",%%"REG_d",4)   \n\t"\
        "movq %%"s1", 8(%%"w",%%"REG_d",4)  \n\t"\
        "movq %%"s2", 16(%%"w",%%"REG_d",4) \n\t"\
        "movq %%"s3", 24(%%"w",%%"REG_d",4) \n\t"

/* Arithmetic right shift by immediate n. */
#define snow_vertical_compose_mmx_sra(n,t0,t1,t2,t3)\
        snow_vertical_compose_sse2_sra(n,t0,t1,t2,t3)

/* t += s, register to register. */
#define snow_vertical_compose_mmx_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
        snow_vertical_compose_sse2_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)

/* Logical left shift by immediate n. */
#define snow_vertical_compose_mmx_sll(n,t0,t1,t2,t3)\
        snow_vertical_compose_sse2_sll(n,t0,t1,t2,t3)

/* Register-to-register copy s0..s3 -> t0..t3. */
#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\
        "movq %%"s0", %%"t0" \n\t"\
        "movq %%"s1", %%"t1" \n\t"\
        "movq %%"s2", %%"t2" \n\t"\
        "movq %%"s3", %%"t3" \n\t"
552
/**
 * Vertical 9/7 integer inverse wavelet lifting, MMX version.
 *
 * Same four-step lifting pass as the SSE2 variant across line pointers
 * b0..b5, but 8 DWTELEMs per asm iteration.  The scalar while-loop peels
 * elements until the remaining count is a multiple of 8; the asm loop
 * counts i (in REG_d via the "+d" constraint) down to 0.
 */
void ff_snow_vertical_compose97i_mmx(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
    long i = width;
    // Scalar peel until the remaining width is a multiple of 8.
    while(i & 0x7)
    {
        i--;
        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
    }

    // Main loop: "jmp 2f" enters at the decrement/test so the body only
    // runs for full 8-element groups.
    asm volatile(
        "jmp 2f                                      \n\t"
        "1:                                          \n\t"

        "mov %6, %%"REG_a"                           \n\t"
        "mov %4, %%"REG_S"                           \n\t"

        snow_vertical_compose_mmx_load(REG_S,"mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_r2r_add("mm0","mm2","mm4","mm6","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")

        "pcmpeqd %%mm1, %%mm1                        \n\t"
        "pslld $31, %%mm1                            \n\t"
        "psrld $29, %%mm1                            \n\t"
        "mov %5, %%"REG_a"                           \n\t"

        snow_vertical_compose_mmx_r2r_add("mm1","mm1","mm1","mm1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_sra("3","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_load(REG_a,"mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_sub("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_store(REG_a,"mm1","mm3","mm5","mm7")
        "mov %3, %%"REG_c"                           \n\t"
        snow_vertical_compose_mmx_load(REG_S,"mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add(REG_c,"mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store(REG_S,"mm0","mm2","mm4","mm6")
        "mov %2, %%"REG_a"                           \n\t"
        snow_vertical_compose_mmx_load(REG_c,"mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_sll("2","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")

        "pcmpeqd %%mm1, %%mm1                        \n\t"
        "pslld $31, %%mm1                            \n\t"
        "psrld $28, %%mm1                            \n\t"
        "mov %1, %%"REG_S"                           \n\t"

        snow_vertical_compose_mmx_r2r_add("mm1","mm1","mm1","mm1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_sra("4","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add(REG_c,"mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store(REG_c,"mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add(REG_S,"mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_r2r_add("mm0","mm2","mm4","mm6","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_sra("1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store(REG_a,"mm0","mm2","mm4","mm6")

        "2:                                          \n\t"
        "sub $8, %%"REG_d"                           \n\t"
        "jge 1b                                      \n\t"
        :"+d"(i)
        :
        "m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5):
        "%"REG_a"","%"REG_S"","%"REG_c"");
}
623
624 #define snow_inner_add_yblock_sse2_header \
625     DWTELEM * * dst_array = sb->line + src_y;\
626     long tmp;\
627     asm volatile(\
628              "mov  %7, %%"REG_c"             \n\t"\
629              "mov  %6, %2                    \n\t"\
630              "mov  %4, %%"REG_S"             \n\t"\
631              "pxor %%xmm7, %%xmm7            \n\t" /* 0 */\
632              "pcmpeqd %%xmm3, %%xmm3         \n\t"\
633              "pslld $31, %%xmm3              \n\t"\
634              "psrld $24, %%xmm3              \n\t" /* FRAC_BITS >> 1 */\
635              "1:                             \n\t"\
636              "mov %1, %%"REG_D"              \n\t"\
637              "mov (%%"REG_D"), %%"REG_D"     \n\t"\
638              "add %3, %%"REG_D"              \n\t"
639
640 #define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
641              "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
642              "movq (%%"REG_d"), %%"out_reg1" \n\t"\
643              "movq (%%"REG_d", %%"REG_c"), %%"out_reg2" \n\t"\
644              "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
645              "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
646              "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
647              "movq "s_offset"+16(%%"REG_S"), %%xmm4 \n\t"\
648              "punpcklbw %%xmm7, %%xmm0       \n\t"\
649              "punpcklbw %%xmm7, %%xmm4       \n\t"\
650              "pmullw %%xmm0, %%"out_reg1"    \n\t"\
651              "pmullw %%xmm4, %%"out_reg2"    \n\t"
652
653 #define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\
654              "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
655              "movq (%%"REG_d"), %%"out_reg1" \n\t"\
656              "movq 8(%%"REG_d"), %%"out_reg2" \n\t"\
657              "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
658              "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
659              "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
660              "movq "s_offset"+8(%%"REG_S"), %%xmm4 \n\t"\
661              "punpcklbw %%xmm7, %%xmm0       \n\t"\
662              "punpcklbw %%xmm7, %%xmm4       \n\t"\
663              "pmullw %%xmm0, %%"out_reg1"    \n\t"\
664              "pmullw %%xmm4, %%"out_reg2"    \n\t"
665
/* Compute one more 8-wide weighted tap into the xmm2/xmm6 scratch registers
 * and accumulate it into the running sums xmm1/xmm5 with unsigned
 * saturating adds. */
#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \
             snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\
             "paddusw %%xmm2, %%xmm1         \n\t"\
             "paddusw %%xmm6, %%xmm5         \n\t"
670
/* 16-wide counterpart of the _accum_8 macro: one more weighted tap
 * accumulated into xmm1/xmm5 with unsigned saturating adds. */
#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \
             snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\
             "paddusw %%xmm2, %%xmm1         \n\t"\
             "paddusw %%xmm6, %%xmm5         \n\t"
675
/* Per-iteration pointer bookkeeping shared by both SSE2 loop tails:
 * advance the obmc pointer (REG_S) by 32 bytes, and advance dst8 (%0) and
 * all four source-block pointers stored in the array at REG_a by REG_c
 * (one stride; the _end_8 tail temporarily doubles REG_c first). */
#define snow_inner_add_yblock_sse2_end_common1\
             "add $32, %%"REG_S"             \n\t"\
             "add %%"REG_c", %0              \n\t"\
             "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
             "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
             "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
             "add %%"REG_c", (%%"REG_a")     \n\t"
683
/* Close the asm statement: loop back to label 1 while the row counter
 * (%2, initialized from b_h) is nonzero, then declare the operands:
 *   outputs: %0 dst8, %1 dst_array, %2 tmp (row counter)
 *   inputs:  %3 src_x*4 (byte offset into the DWTELEM line), %4 obmc,
 *            %5 block pointer array (pinned to REG_a), %6 b_h,
 *            %7 src_stride
 * REG_c/REG_S/REG_D/REG_d are clobbered. */
#define snow_inner_add_yblock_sse2_end_common2\
             "jnz 1b                         \n\t"\
             :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
             :\
             "rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"((long)b_h),"m"((long)src_stride):\
             "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
690
/* Loop tail for the 8-wide/two-rows-per-iteration kernel: double REG_c so
 * end_common1 advances every pointer by 2*src_stride, step dst_array (%1)
 * past two line pointers, restore REG_c, and count the row counter (%2)
 * down by 2. */
#define snow_inner_add_yblock_sse2_end_8\
             "sal $1, %%"REG_c"              \n\t"\
             "add $"PTR_SIZE"*2, %1          \n\t"\
             snow_inner_add_yblock_sse2_end_common1\
             "sar $1, %%"REG_c"              \n\t"\
             "sub $2, %2                     \n\t"\
             snow_inner_add_yblock_sse2_end_common2
698
/* Loop tail for the 16-wide kernel: one row per iteration — step dst_array
 * (%1) past one line pointer and decrement the row counter (%2). */
#define snow_inner_add_yblock_sse2_end_16\
             "add $"PTR_SIZE"*1, %1          \n\t"\
             snow_inner_add_yblock_sse2_end_common1\
             "dec %2                         \n\t"\
             snow_inner_add_yblock_sse2_end_common2
704
/* Add an 8-pixel-wide OBMC-weighted motion-compensated block into the
 * slice buffer and write the clipped 8-bit result to dst8, processing two
 * image rows per loop iteration (so b_h must be even — enforced by the
 * dispatcher in ff_snow_inner_add_yblock_sse2).  xmm1 accumulates the
 * weighted sum for the first row, xmm5 for the second. */
static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
                      int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_sse2_header
/* Weighted sum of the four overlapping source blocks.  The obmc offsets
 * 0/8/128/136 address the four quadrants of the 16-wide OBMC window. */
snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0")
snow_inner_add_yblock_sse2_accum_8("2", "8")
snow_inner_add_yblock_sse2_accum_8("1", "128")
snow_inner_add_yblock_sse2_accum_8("0", "136")

             /* First row: widen xmm1 to 32 bits, add the DWTELEM line at
              * REG_D, add the rounding bias in xmm3 (0x80, i.e. half of
              * 1 << FRAC_BITS, built in the header macro). */
             "mov %0, %%"REG_d"              \n\t"
             "movdqa (%%"REG_D"), %%xmm0     \n\t"
             "movdqa %%xmm1, %%xmm2          \n\t"

             "punpckhwd %%xmm7, %%xmm1       \n\t"
             "punpcklwd %%xmm7, %%xmm2       \n\t"
             "paddd %%xmm2, %%xmm0           \n\t"
             "movdqa 16(%%"REG_D"), %%xmm2   \n\t"
             "paddd %%xmm1, %%xmm2           \n\t"
             "paddd %%xmm3, %%xmm0           \n\t"
             "paddd %%xmm3, %%xmm2           \n\t"

             /* Point REG_D at the next slice-buffer line (dst_array[1]),
              * offset by src_x*4 (%3). */
             "mov %1, %%"REG_D"              \n\t"
             "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t"
             "add %3, %%"REG_D"              \n\t"

             /* Second row: same widen/add/bias sequence on xmm5. */
             "movdqa (%%"REG_D"), %%xmm4     \n\t"
             "movdqa %%xmm5, %%xmm6          \n\t"
             "punpckhwd %%xmm7, %%xmm5       \n\t"
             "punpcklwd %%xmm7, %%xmm6       \n\t"
             "paddd %%xmm6, %%xmm4           \n\t"
             "movdqa 16(%%"REG_D"), %%xmm6   \n\t"
             "paddd %%xmm5, %%xmm6           \n\t"
             "paddd %%xmm3, %%xmm4           \n\t"
             "paddd %%xmm3, %%xmm6           \n\t"

             /* Scale down by FRAC_BITS, then pack with signed->unsigned
              * saturation to clip into [0,255] and store 8 bytes. */
             "psrad $8, %%xmm0               \n\t" /* FRAC_BITS. */
             "psrad $8, %%xmm2               \n\t" /* FRAC_BITS. */
             "packssdw %%xmm2, %%xmm0        \n\t"
             "packuswb %%xmm7, %%xmm0        \n\t"
             "movq %%xmm0, (%%"REG_d")       \n\t"

             "psrad $8, %%xmm4               \n\t" /* FRAC_BITS. */
             "psrad $8, %%xmm6               \n\t" /* FRAC_BITS. */
             "packssdw %%xmm6, %%xmm4        \n\t"
             "packuswb %%xmm7, %%xmm4        \n\t"
             /* Second row's output goes one src_stride (REG_c) below. */
             "movq %%xmm4, (%%"REG_d",%%"REG_c");\n\t"
snow_inner_add_yblock_sse2_end_8
}
752
/* Add a 16-pixel-wide OBMC-weighted motion-compensated block into the
 * slice buffer and write the clipped 8-bit result to dst8, one image row
 * per loop iteration.  xmm1 holds the weighted sum of the low 8 pixels,
 * xmm5 of the high 8. */
static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
                      int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_sse2_header
/* Weighted sum of the four overlapping source blocks.  The obmc offsets
 * 0/16/512/528 address the four quadrants of the 32-wide OBMC window. */
snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0")
snow_inner_add_yblock_sse2_accum_16("2", "16")
snow_inner_add_yblock_sse2_accum_16("1", "512")
snow_inner_add_yblock_sse2_accum_16("0", "528")

             /* Widen the 16 word sums to 32 bits, add the DWTELEM line at
              * REG_D, add the rounding bias in xmm3 (0x80, half of
              * 1 << FRAC_BITS), scale down by FRAC_BITS, and pack with
              * saturation into 16 output bytes. */
             "mov %0, %%"REG_d"              \n\t"
             "movdqa %%xmm1, %%xmm0          \n\t"
             "movdqa %%xmm5, %%xmm4          \n\t"
             "punpcklwd %%xmm7, %%xmm0       \n\t"
             "paddd (%%"REG_D"), %%xmm0      \n\t"
             "punpckhwd %%xmm7, %%xmm1       \n\t"
             "paddd 16(%%"REG_D"), %%xmm1    \n\t"
             "punpcklwd %%xmm7, %%xmm4       \n\t"
             "paddd 32(%%"REG_D"), %%xmm4    \n\t"
             "punpckhwd %%xmm7, %%xmm5       \n\t"
             "paddd 48(%%"REG_D"), %%xmm5    \n\t"
             "paddd %%xmm3, %%xmm0           \n\t"
             "paddd %%xmm3, %%xmm1           \n\t"
             "paddd %%xmm3, %%xmm4           \n\t"
             "paddd %%xmm3, %%xmm5           \n\t"
             "psrad $8, %%xmm0               \n\t" /* FRAC_BITS. */
             "psrad $8, %%xmm1               \n\t" /* FRAC_BITS. */
             "psrad $8, %%xmm4               \n\t" /* FRAC_BITS. */
             "psrad $8, %%xmm5               \n\t" /* FRAC_BITS. */

             "packssdw %%xmm1, %%xmm0        \n\t"
             "packssdw %%xmm5, %%xmm4        \n\t"
             "packuswb %%xmm4, %%xmm0        \n\t"

             /* dst8 is not guaranteed 16-byte aligned, hence movdqu. */
             "movdqu %%xmm0, (%%"REG_d")       \n\t"

snow_inner_add_yblock_sse2_end_16
}
789
/* Loop prologue for the MMX kernels: set up dst_array (slice-buffer line
 * pointers starting at src_y), load src_stride into REG_c (%7), the row
 * counter into %2 (%6 = b_h), and obmc into REG_S (%4); clear mm7 for the
 * byte->word unpacks and build the rounding bias 0x80 (half of
 * 1 << FRAC_BITS) in mm3.  Each iteration then points REG_D at the current
 * DWTELEM line plus the src_x*4 byte offset (%3). */
#define snow_inner_add_yblock_mmx_header \
    DWTELEM * * dst_array = sb->line + src_y;\
    long tmp;\
    asm volatile(\
             "mov  %7, %%"REG_c"             \n\t"\
             "mov  %6, %2                    \n\t"\
             "mov  %4, %%"REG_S"             \n\t"\
             "pxor %%mm7, %%mm7              \n\t" /* 0 */\
             "pcmpeqd %%mm3, %%mm3           \n\t"\
             "pslld $31, %%mm3               \n\t"\
             "psrld $24, %%mm3               \n\t" /* FRAC_BITS >> 1 */\
             "1:                             \n\t"\
             "mov %1, %%"REG_D"              \n\t"\
             "mov (%%"REG_D"), %%"REG_D"     \n\t"\
             "add %3, %%"REG_D"              \n\t"
805
/* 8-pixel OBMC tap (MMX, 4 pixels per register): load 8 source bytes from
 * block[ptr_offset]+d_offset, zero-extend to words via mm7, and multiply
 * by the 8 OBMC weights at s_offset / s_offset+4 in the obmc table
 * (REG_S).  Results land in out_reg1/out_reg2. */
#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\
             "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
             "movd "d_offset"(%%"REG_d"), %%"out_reg1" \n\t"\
             "movd "d_offset"+4(%%"REG_d"), %%"out_reg2" \n\t"\
             "punpcklbw %%mm7, %%"out_reg1" \n\t"\
             "punpcklbw %%mm7, %%"out_reg2" \n\t"\
             "movd "s_offset"(%%"REG_S"), %%mm0 \n\t"\
             "movd "s_offset"+4(%%"REG_S"), %%mm4 \n\t"\
             "punpcklbw %%mm7, %%mm0       \n\t"\
             "punpcklbw %%mm7, %%mm4       \n\t"\
             "pmullw %%mm0, %%"out_reg1"    \n\t"\
             "pmullw %%mm4, %%"out_reg2"    \n\t"
818
/* Compute one more weighted tap into the mm2/mm6 scratch registers and
 * accumulate it into the running sums mm1/mm5 with unsigned saturating
 * adds. */
#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \
             snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\
             "paddusw %%mm2, %%mm1         \n\t"\
             "paddusw %%mm6, %%mm5         \n\t"
823
/* Finish 8 output pixels: widen the word sums in mm1/mm5 to 32 bits, add
 * the DWTELEM coefficients at read_offset(REG_D), add the rounding bias in
 * mm3 (0x80, half of 1 << FRAC_BITS), shift down by FRAC_BITS, pack with
 * saturation to clip into [0,255], and store 8 bytes at
 * write_offset(dst8). */
#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\
             "mov %0, %%"REG_d"              \n\t"\
             "movq %%mm1, %%mm0              \n\t"\
             "movq %%mm5, %%mm4              \n\t"\
             "punpcklwd %%mm7, %%mm0         \n\t"\
             "paddd "read_offset"(%%"REG_D"), %%mm0 \n\t"\
             "punpckhwd %%mm7, %%mm1         \n\t"\
             "paddd "read_offset"+8(%%"REG_D"), %%mm1 \n\t"\
             "punpcklwd %%mm7, %%mm4         \n\t"\
             "paddd "read_offset"+16(%%"REG_D"), %%mm4 \n\t"\
             "punpckhwd %%mm7, %%mm5         \n\t"\
             "paddd "read_offset"+24(%%"REG_D"), %%mm5 \n\t"\
             "paddd %%mm3, %%mm0             \n\t"\
             "paddd %%mm3, %%mm1             \n\t"\
             "paddd %%mm3, %%mm4             \n\t"\
             "paddd %%mm3, %%mm5             \n\t"\
             "psrad $8, %%mm0                \n\t"\
             "psrad $8, %%mm1                \n\t"\
             "psrad $8, %%mm4                \n\t"\
             "psrad $8, %%mm5                \n\t"\
\
             "packssdw %%mm1, %%mm0          \n\t"\
             "packssdw %%mm5, %%mm4          \n\t"\
             "packuswb %%mm4, %%mm0          \n\t"\
             "movq %%mm0, "write_offset"(%%"REG_d") \n\t"
849
/* MMX loop tail: advance the obmc pointer by s_step bytes, advance the
 * four source-block pointers (array at REG_a) and dst8 (%0) by one stride
 * (REG_c), step dst_array (%1) to the next line pointer, decrement the row
 * counter (%2) and loop to label 1.  Then close the asm statement with the
 * operand lists:
 *   outputs: %0 dst8, %1 dst_array, %2 tmp (row counter)
 *   inputs:  %3 src_x*4, %4 obmc, %5 block array (pinned to REG_a),
 *            %6 b_h, %7 src_stride
 * REG_c/REG_S/REG_D/REG_d are clobbered. */
#define snow_inner_add_yblock_mmx_end(s_step)\
             "add $"s_step", %%"REG_S"             \n\t"\
             "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
             "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
             "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
             "add %%"REG_c", (%%"REG_a")     \n\t"\
             "add $"PTR_SIZE"*1, %1          \n\t"\
             "add %%"REG_c", %0              \n\t"\
             "dec %2                         \n\t"\
             "jnz 1b                         \n\t"\
             :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
             :\
             "rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"((long)b_h),"m"((long)src_stride):\
             "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
864
/* MMX version of the 8-wide OBMC add, one image row per iteration (also
 * used as the odd-b_h fallback for the SSE2 path).  The obmc offsets
 * 0/8/128/136 address the four quadrants of the 16-wide OBMC window. */
static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
                      int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_mmx_header
snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
snow_inner_add_yblock_mmx_accum("2", "8", "0")
snow_inner_add_yblock_mmx_accum("1", "128", "0")
snow_inner_add_yblock_mmx_accum("0", "136", "0")
snow_inner_add_yblock_mmx_mix("0", "0")
snow_inner_add_yblock_mmx_end("16")
}
875
/* MMX version of the 16-wide OBMC add, one image row per iteration,
 * processed as two independent 8-pixel halves (d_offset 0 and 8).  The
 * obmc offsets address the matching quadrants of the 32-wide OBMC
 * window; the obmc pointer advances by 32 per row. */
static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
                      int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_mmx_header
/* Low 8 pixels of the row. */
snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
snow_inner_add_yblock_mmx_accum("2", "16", "0")
snow_inner_add_yblock_mmx_accum("1", "512", "0")
snow_inner_add_yblock_mmx_accum("0", "528", "0")
snow_inner_add_yblock_mmx_mix("0", "0")

/* High 8 pixels of the row. */
snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8")
snow_inner_add_yblock_mmx_accum("2", "24", "8")
snow_inner_add_yblock_mmx_accum("1", "520", "8")
snow_inner_add_yblock_mmx_accum("0", "536", "8")
snow_inner_add_yblock_mmx_mix("32", "8")
snow_inner_add_yblock_mmx_end("32")
}
892
893 void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
894                            int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
895
896     if (b_w == 16)
897         inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
898     else if (b_w == 8 && obmc_stride == 16) {
899         if (!(b_h & 1))
900             inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
901         else
902             inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
903     } else
904          ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
905 }
906
907 void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
908                           int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
909     if (b_w == 16)
910         inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
911     else if (b_w == 8 && obmc_stride == 16)
912         inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
913     else
914         ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
915 }