]> git.sesse.net Git - ffmpeg/blob - postproc/postprocess_template.c
fixed a bug in the tmp buffer
[ffmpeg] / postproc / postprocess_template.c
1 /*
2     Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)
3
4     This program is free software; you can redistribute it and/or modify
5     it under the terms of the GNU General Public License as published by
6     the Free Software Foundation; either version 2 of the License, or
7     (at your option) any later version.
8
9     This program is distributed in the hope that it will be useful,
10     but WITHOUT ANY WARRANTY; without even the implied warranty of
11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12     GNU General Public License for more details.
13
14     You should have received a copy of the GNU General Public License
15     along with this program; if not, write to the Free Software
16     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17 */
18
/*
                        C       MMX     MMX2    3DNow
isVertDC                Ec      Ec
isVertMinMaxOk          Ec      Ec
doVertLowPass           E               e       e
doVertDefFilter         Ec      Ec      Ec
isHorizDC               Ec      Ec
isHorizMinMaxOk         a
doHorizLowPass          E               a       a
doHorizDefFilter        E       ac      ac
deRing
Vertical RKAlgo1        E               a       a
Vertical X1             a               E       E
Horizontal X1           a               E       E
LinIpolDeinterlace      e               E       E*
CubicIpolDeinterlace    a               e       e*
LinBlendDeinterlace     e               E       E*
MedianDeinterlace               Ec      Ec


* I don't have a 3DNow CPU -> it's untested
E = Exact implementation
e = almost exact implementation (slightly different rounding,...)
a = alternative / approximate impl
c = checked against the other implementations (-vo md5)
*/
45
/*
TODO:
verify that everything works as it should (how?)
reduce the time wasted on the mem transfer
implement dering
implement everything in C at least (done at the moment but ...)
unroll stuff if instructions depend too much on the prior one
we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
move YScale thing to the end instead of fixing QP
write a faster and higher quality deblocking filter :)
do something about the speed of the horizontal filters
make the mainloop more flexible (variable number of blocks at once)
        (the if/else stuff per block is slowing things down)
compare the quality & speed of all filters
split this huge file
fix warnings (unused vars, ...)
noise reduction filters
...

Notes:

*/
68
69 //Changelog: use the CVS log
70
71 #include <inttypes.h>
72 #include <stdio.h>
73 #include <stdlib.h>
74 #include "../config.h"
75 //#undef HAVE_MMX2
76 //#define HAVE_3DNOW
77 //#undef HAVE_MMX
78 #include "postprocess.h"
79
/*
 * Small arithmetic helpers.
 * These are function-like macros: an argument may be evaluated more than once,
 * so do not pass expressions with side effects.
 */
#define MIN(a,b) ((b) < (a) ? (b) : (a))
#define MAX(a,b) ((b) > (a) ? (b) : (a))
#define ABS(a) (0 < (a) ? (a) : (-(a)))
/* note: SIGN() yields -1 for zero as well as for negative values */
#define SIGN(a) (0 < (a) ? 1 : -1)

/*
 * PAVGB(a,b) expands to the assembler text of the packed byte average
 * instruction of the selected instruction set (pavgb for MMX2, pavgusb for
 * 3DNow). It is left undefined when neither is available.
 */
#if defined (HAVE_MMX2)
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif
90
/*
 * Constants and scratch variables for the MMX code.
 *
 * Most of these are referenced only by symbol name from inside inline
 * assembler strings (e.g. "movq b7E, %%mm7"), which the compiler cannot see.
 * PP_ASM_USED keeps them from being discarded as unused.
 *
 * Naming:
 *   wXX        16 bit value XX repeated in all 4 words
 *   bXX        8 bit value XX repeated in all 8 bytes
 *   bmDDDDDDDD byte mask, one digit per byte (most significant byte first),
 *              1 -> 0xFF, 0 -> 0x00
 */
#if defined(__GNUC__)
#define PP_ASM_USED __attribute__((used))
#else
#define PP_ASM_USED
#endif

static uint64_t PP_ASM_USED packedYOffset=  0x0000000000000000ULL;
static uint64_t PP_ASM_USED packedYScale=   0x0100010001000100ULL;
static uint64_t PP_ASM_USED w05=            0x0005000500050005ULL;
static uint64_t PP_ASM_USED w20=            0x0020002000200020ULL;
static uint64_t PP_ASM_USED w1400=          0x1400140014001400ULL;
static uint64_t PP_ASM_USED bm00000001=     0x00000000000000FFULL;
static uint64_t PP_ASM_USED bm00010000=     0x000000FF00000000ULL;
static uint64_t PP_ASM_USED bm00001000=     0x00000000FF000000ULL;
static uint64_t PP_ASM_USED bm10000000=     0xFF00000000000000ULL;
static uint64_t PP_ASM_USED bm10000001=     0xFF000000000000FFULL;
static uint64_t PP_ASM_USED bm11000011=     0xFFFF00000000FFFFULL;
static uint64_t PP_ASM_USED bm00000011=     0x000000000000FFFFULL;
static uint64_t PP_ASM_USED bm11111110=     0xFFFFFFFFFFFFFF00ULL;
static uint64_t PP_ASM_USED bm11000000=     0xFFFF000000000000ULL;
static uint64_t PP_ASM_USED bm00011000=     0x000000FFFF000000ULL;
static uint64_t PP_ASM_USED bm00110011=     0x0000FFFF0000FFFFULL;
static uint64_t PP_ASM_USED bm11001100=     0xFFFF0000FFFF0000ULL;
static uint64_t PP_ASM_USED b00=            0x0000000000000000ULL;
static uint64_t PP_ASM_USED b01=            0x0101010101010101ULL;
static uint64_t PP_ASM_USED b02=            0x0202020202020202ULL;
static uint64_t PP_ASM_USED b0F=            0x0F0F0F0F0F0F0F0FULL;
static uint64_t PP_ASM_USED bFF=            0xFFFFFFFFFFFFFFFFULL;
static uint64_t PP_ASM_USED b20=            0x2020202020202020ULL;
static uint64_t PP_ASM_USED b80=            0x8080808080808080ULL;
static uint64_t PP_ASM_USED b7E=            0x7E7E7E7E7E7E7E7EULL;
static uint64_t PP_ASM_USED b7C=            0x7C7C7C7C7C7C7C7CULL;
static uint64_t PP_ASM_USED b3F=            0x3F3F3F3F3F3F3F3FULL;

/* scratch space for the assembler code */
static uint64_t PP_ASM_USED temp0=0;
static uint64_t PP_ASM_USED temp1=0;
static uint64_t PP_ASM_USED temp2=0;
static uint64_t PP_ASM_USED temp3=0;
static uint64_t PP_ASM_USED temp4=0;
static uint64_t PP_ASM_USED temp5=0;
/* the assembler code loads this as "QP,..., QP", i.e. QP in every byte */
static uint64_t PP_ASM_USED pQPb=0;
static uint8_t PP_ASM_USED tempBlock[16*16]; //used so the horizontal code gets aligned data
126
/*
 * Flatness thresholds.
 * isVertDC() compares 7 pairs of neighbouring lines, 8 pixels each
 * (56 comparisons), and calls the block flat when more than
 * vFlatnessThreshold of them are (nearly) equal.
 * hFlatnessThreshold is presumably the horizontal counterpart.
 */
int vFlatnessThreshold = 56 - 16;
int hFlatnessThreshold = 56 - 16;

/* amount of "black" you are willing to lose to get a brightness corrected picture */
double maxClippedThreshold = 0.01;

/* FIXME can never make a movie's black brighter (does anyone need that?) */
int minAllowedY = 16;
int maxAllowedY = 255;
136
137 #ifdef TIMING
/**
 * Read the x86 time stamp counter (RDTSC instruction).
 * @return the 64 bit counter value
 */
static inline long long rdtsc(void)
{
        unsigned int lo, hi;
        /* RDTSC delivers the counter in EDX:EAX. The halves are fetched
           separately because the "A" constraint only means the EDX:EAX pair
           on 32 bit x86; on x86-64 the upper half would be lost. */
        __asm__ volatile(   "rdtsc\n\t"
                : "=a" (lo), "=d" (hi)
        );
        return (long long)(((unsigned long long)hi << 32) | lo);
}
147 #endif
148
149 #ifdef HAVE_MMX2
/**
 * Issue the x86 PREFETCHNTA cache hint for the address p.
 * Only a hint to the CPU; nothing is returned.
 */
static inline void prefetchnta(void *p)
{
        __asm__ volatile(
                "prefetchnta (%[addr])\n\t"
                : /* no outputs */
                : [addr] "r" (p)
        );
}
156
/**
 * Issue the x86 PREFETCHT0 cache hint for the address p.
 * Only a hint to the CPU; nothing is returned.
 */
static inline void prefetcht0(void *p)
{
        __asm__ volatile(
                "prefetcht0 (%[addr])\n\t"
                : /* no outputs */
                : [addr] "r" (p)
        );
}
163
/**
 * Issue the x86 PREFETCHT1 cache hint for the address p.
 * Only a hint to the CPU; nothing is returned.
 */
static inline void prefetcht1(void *p)
{
        __asm__ volatile(
                "prefetcht1 (%[addr])\n\t"
                : /* no outputs */
                : [addr] "r" (p)
        );
}
170
/**
 * Issue the x86 PREFETCHT2 cache hint for the address p.
 * Only a hint to the CPU; nothing is returned.
 */
static inline void prefetcht2(void *p)
{
        __asm__ volatile(
                "prefetcht2 (%[addr])\n\t"
                : /* no outputs */
                : [addr] "r" (p)
        );
}
177 #endif
178
179 //FIXME? |255-0| = 1 (shouldnt be a problem ...)
180 /**
181  * Check if the middle 8x8 Block in the given 8x16 block is flat
182  */
183 static inline int isVertDC(uint8_t src[], int stride){
184         int numEq= 0;
185         int y;
186         src+= stride*4; // src points to begin of the 8x8 Block
187 #ifdef HAVE_MMX
188         asm volatile(
189                 "pushl %1\n\t"
190                 "movq b7E, %%mm7                                        \n\t" // mm7 = 0x7F
191                 "movq b7C, %%mm6                                        \n\t" // mm6 = 0x7D
192                 "movq (%1), %%mm0                               \n\t"
193                 "addl %2, %1                                    \n\t"
194                 "movq (%1), %%mm1                               \n\t"
195                 "psubb %%mm1, %%mm0                             \n\t" // mm0 = differnece
196                 "paddb %%mm7, %%mm0                             \n\t"
197                 "pcmpgtb %%mm6, %%mm0                           \n\t"
198
199                 "addl %2, %1                                    \n\t"
200                 "movq (%1), %%mm2                               \n\t"
201                 "psubb %%mm2, %%mm1                             \n\t"
202                 "paddb %%mm7, %%mm1                             \n\t"
203                 "pcmpgtb %%mm6, %%mm1                           \n\t"
204                 "paddb %%mm1, %%mm0                             \n\t"
205
206                 "addl %2, %1                                    \n\t"
207                 "movq (%1), %%mm1                               \n\t"
208                 "psubb %%mm1, %%mm2                             \n\t"
209                 "paddb %%mm7, %%mm2                             \n\t"
210                 "pcmpgtb %%mm6, %%mm2                           \n\t"
211                 "paddb %%mm2, %%mm0                             \n\t"
212
213                 "addl %2, %1                                    \n\t"
214                 "movq (%1), %%mm2                               \n\t"
215                 "psubb %%mm2, %%mm1                             \n\t"
216                 "paddb %%mm7, %%mm1                             \n\t"
217                 "pcmpgtb %%mm6, %%mm1                           \n\t"
218                 "paddb %%mm1, %%mm0                             \n\t"
219
220                 "addl %2, %1                                    \n\t"
221                 "movq (%1), %%mm1                               \n\t"
222                 "psubb %%mm1, %%mm2                             \n\t"
223                 "paddb %%mm7, %%mm2                             \n\t"
224                 "pcmpgtb %%mm6, %%mm2                           \n\t"
225                 "paddb %%mm2, %%mm0                             \n\t"
226
227                 "addl %2, %1                                    \n\t"
228                 "movq (%1), %%mm2                               \n\t"
229                 "psubb %%mm2, %%mm1                             \n\t"
230                 "paddb %%mm7, %%mm1                             \n\t"
231                 "pcmpgtb %%mm6, %%mm1                           \n\t"
232                 "paddb %%mm1, %%mm0                             \n\t"
233
234                 "addl %2, %1                                    \n\t"
235                 "movq (%1), %%mm1                               \n\t"
236                 "psubb %%mm1, %%mm2                             \n\t"
237                 "paddb %%mm7, %%mm2                             \n\t"
238                 "pcmpgtb %%mm6, %%mm2                           \n\t"
239                 "paddb %%mm2, %%mm0                             \n\t"
240
241                 "                                               \n\t"
242                 "movq %%mm0, %%mm1                              \n\t"
243                 "psrlw $8, %%mm0                                \n\t"
244                 "paddb %%mm1, %%mm0                             \n\t"
245                 "movq %%mm0, %%mm1                              \n\t"
246                 "psrlq $16, %%mm0                               \n\t"
247                 "paddb %%mm1, %%mm0                             \n\t"
248                 "movq %%mm0, %%mm1                              \n\t"
249                 "psrlq $32, %%mm0                               \n\t"
250                 "paddb %%mm1, %%mm0                             \n\t"
251                 "popl %1\n\t"
252                 "movd %%mm0, %0                                 \n\t"
253                 : "=r" (numEq)
254                 : "r" (src), "r" (stride)
255                 );
256 //      printf("%d\n", numEq);
257         numEq= (256 - (numEq & 0xFF)) &0xFF;
258
259 //      int asmEq= numEq;
260 //      numEq=0;
261 //      uint8_t *temp= src;
262
263 #else
264         for(y=0; y<BLOCK_SIZE-1; y++)
265         {
266                 if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
267                 if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
268                 if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++;
269                 if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++;
270                 if(((src[4] - src[4+stride] + 1)&0xFFFF) < 3) numEq++;
271                 if(((src[5] - src[5+stride] + 1)&0xFFFF) < 3) numEq++;
272                 if(((src[6] - src[6+stride] + 1)&0xFFFF) < 3) numEq++;
273                 if(((src[7] - src[7+stride] + 1)&0xFFFF) < 3) numEq++;
274                 src+= stride;
275         }
276 #endif
277 /*      if(abs(numEq - asmEq) > 0)
278         {
279                 printf("\nasm:%d  c:%d\n", asmEq, numEq);
280                 for(int y=0; y<8; y++)
281                 {
282                         for(int x=0; x<8; x++)
283                         {
284                                 printf("%d ", temp[x + y*stride]);
285                         }
286                         printf("\n");
287                 }
288         }
289 */
290 //      for(int i=0; i<numEq/8; i++) src[i]=255;
291         return (numEq > vFlatnessThreshold) ? 1 : 0;
292 }
293
294 static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
295 {
296 #ifdef HAVE_MMX
297         int isOk;
298         src+= stride*3;
299         asm volatile(
300 //              "int $3 \n\t"
301                 "movq (%1, %2), %%mm0                           \n\t"
302                 "movq (%1, %2, 8), %%mm1                        \n\t"
303                 "movq %%mm0, %%mm2                              \n\t"
304                 "psubusb %%mm1, %%mm0                           \n\t"
305                 "psubusb %%mm2, %%mm1                           \n\t"
306                 "por %%mm1, %%mm0                               \n\t" // ABS Diff
307
308                 "movq pQPb, %%mm7                               \n\t" // QP,..., QP
309                 "paddusb %%mm7, %%mm7                           \n\t" // 2QP ... 2QP
310                 "psubusb %%mm7, %%mm0                           \n\t" // Diff <= 2QP -> 0
311                 "pcmpeqd b00, %%mm0                             \n\t"
312                 "psrlq $16, %%mm0                               \n\t"
313                 "pcmpeqd bFF, %%mm0                             \n\t"
314 //              "movd %%mm0, (%1, %2, 4)\n\t"
315                 "movd %%mm0, %0                                 \n\t"
316                 : "=r" (isOk)
317                 : "r" (src), "r" (stride)
318                 );
319         return isOk ? 1 : 0;
320 #else
321
322         int isOk2= 1;
323         int x;
324         src+= stride*3;
325         for(x=0; x<BLOCK_SIZE; x++)
326         {
327                 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
328         }
329 /*      if(isOk && !isOk2 || !isOk && isOk2)
330         {
331                 printf("\nasm:%d  c:%d QP:%d\n", isOk, isOk2, QP);
332                 for(int y=0; y<9; y++)
333                 {
334                         for(int x=0; x<8; x++)
335                         {
336                                 printf("%d ", src[x + y*stride]);
337                         }
338                         printf("\n");
339                 }
340         } */
341
342         return isOk2;
343 #endif
344
345 }
346
347 /**
348  * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
349  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
350  */
351 static inline void doVertLowPass(uint8_t *src, int stride, int QP)
352 {
353 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
354         src+= stride*3;
355         asm volatile(   //"movv %0 %1 %2\n\t"
356                 "pushl %0 \n\t"
357                 "movq pQPb, %%mm0                               \n\t"  // QP,..., QP
358
359                 "movq (%0), %%mm6                               \n\t"
360                 "movq (%0, %1), %%mm5                           \n\t"
361                 "movq %%mm5, %%mm1                              \n\t"
362                 "movq %%mm6, %%mm2                              \n\t"
363                 "psubusb %%mm6, %%mm5                           \n\t"
364                 "psubusb %%mm1, %%mm2                           \n\t"
365                 "por %%mm5, %%mm2                               \n\t" // ABS Diff of lines
366                 "psubusb %%mm0, %%mm2                           \n\t" // diff <= QP -> 0
367                 "pcmpeqb b00, %%mm2                             \n\t" // diff <= QP -> FF
368
369                 "pand %%mm2, %%mm6                              \n\t"
370                 "pandn %%mm1, %%mm2                             \n\t"
371                 "por %%mm2, %%mm6                               \n\t"// First Line to Filter
372
373                 "movq (%0, %1, 8), %%mm5                        \n\t"
374                 "leal (%0, %1, 4), %%eax                        \n\t"
375                 "leal (%0, %1, 8), %%ebx                        \n\t"
376                 "subl %1, %%ebx                                 \n\t"
377                 "addl %1, %0                                    \n\t" // %0 points to line 1 not 0
378                 "movq (%0, %1, 8), %%mm7                        \n\t"
379                 "movq %%mm5, %%mm1                              \n\t"
380                 "movq %%mm7, %%mm2                              \n\t"
381                 "psubusb %%mm7, %%mm5                           \n\t"
382                 "psubusb %%mm1, %%mm2                           \n\t"
383                 "por %%mm5, %%mm2                               \n\t" // ABS Diff of lines
384                 "psubusb %%mm0, %%mm2                           \n\t" // diff <= QP -> 0
385                 "pcmpeqb b00, %%mm2                             \n\t" // diff <= QP -> FF
386
387                 "pand %%mm2, %%mm7                              \n\t"
388                 "pandn %%mm1, %%mm2                             \n\t"
389                 "por %%mm2, %%mm7                               \n\t" // First Line to Filter
390
391
392                 //      1       2       3       4       5       6       7       8
393                 //      %0      %0+%1   %0+2%1  eax     %0+4%1  eax+2%1 ebx     eax+4%1
394                 // 6 4 2 2 1 1
395                 // 6 4 4 2
396                 // 6 8 2
397
398                 "movq (%0, %1), %%mm0                           \n\t" //  1
399                 "movq %%mm0, %%mm1                              \n\t" //  1
400                 PAVGB(%%mm6, %%mm0)                                   //1 1     /2
401                 PAVGB(%%mm6, %%mm0)                                   //3 1     /4
402
403                 "movq (%0, %1, 4), %%mm2                        \n\t" //     1
404                 "movq %%mm2, %%mm5                              \n\t" //     1
405                 PAVGB((%%eax), %%mm2)                                 //    11  /2
406                 PAVGB((%0, %1, 2), %%mm2)                             //   211  /4
407                 "movq %%mm2, %%mm3                              \n\t" //   211  /4
408                 "movq (%0), %%mm4                               \n\t" // 1
409                 PAVGB(%%mm4, %%mm3)                                   // 4 211  /8
410                 PAVGB(%%mm0, %%mm3)                                   //642211  /16
411                 "movq %%mm3, (%0)                               \n\t" // X
412                 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
413                 "movq %%mm1, %%mm0                              \n\t" //  1
414                 PAVGB(%%mm6, %%mm0)                                   //1 1     /2
415                 "movq %%mm4, %%mm3                              \n\t" // 1
416                 PAVGB((%0,%1,2), %%mm3)                               // 1 1    /2
417                 PAVGB((%%eax,%1,2), %%mm5)                            //     11 /2
418                 PAVGB((%%eax), %%mm5)                                 //    211 /4
419                 PAVGB(%%mm5, %%mm3)                                   // 2 2211 /8
420                 PAVGB(%%mm0, %%mm3)                                   //4242211 /16
421                 "movq %%mm3, (%0,%1)                            \n\t" //  X
422                 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
423                 PAVGB(%%mm4, %%mm6)                                   //11      /2
424                 "movq (%%ebx), %%mm0                            \n\t" //       1
425                 PAVGB((%%eax, %1, 2), %%mm0)                          //      11/2
426                 "movq %%mm0, %%mm3                              \n\t" //      11/2
427                 PAVGB(%%mm1, %%mm0)                                   //  2   11/4
428                 PAVGB(%%mm6, %%mm0)                                   //222   11/8
429                 PAVGB(%%mm2, %%mm0)                                   //22242211/16
430                 "movq (%0, %1, 2), %%mm2                        \n\t" //   1
431                 "movq %%mm0, (%0, %1, 2)                        \n\t" //   X
432                 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
433                 "movq (%%eax, %1, 4), %%mm0                     \n\t" //        1
434                 PAVGB((%%ebx), %%mm0)                                 //       11       /2
435                 PAVGB(%%mm0, %%mm6)                                   //11     11       /4
436                 PAVGB(%%mm1, %%mm4)                                   // 11             /2
437                 PAVGB(%%mm2, %%mm1)                                   //  11            /2
438                 PAVGB(%%mm1, %%mm6)                                   //1122   11       /8
439                 PAVGB(%%mm5, %%mm6)                                   //112242211       /16
440                 "movq (%%eax), %%mm5                            \n\t" //    1
441                 "movq %%mm6, (%%eax)                            \n\t" //    X
442                 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
443                 "movq (%%eax, %1, 4), %%mm6                     \n\t" //        1
444                 PAVGB(%%mm7, %%mm6)                                   //        11      /2
445                 PAVGB(%%mm4, %%mm6)                                   // 11     11      /4
446                 PAVGB(%%mm3, %%mm6)                                   // 11   2211      /8
447                 PAVGB(%%mm5, %%mm2)                                   //   11           /2
448                 "movq (%0, %1, 4), %%mm4                        \n\t" //     1
449                 PAVGB(%%mm4, %%mm2)                                   //   112          /4
450                 PAVGB(%%mm2, %%mm6)                                   // 112242211      /16
451                 "movq %%mm6, (%0, %1, 4)                        \n\t" //     X
452                 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
453                 PAVGB(%%mm7, %%mm1)                                   //  11     2      /4
454                 PAVGB(%%mm4, %%mm5)                                   //    11          /2
455                 PAVGB(%%mm5, %%mm0)                                   //    11 11       /4
456                 "movq (%%eax, %1, 2), %%mm6                     \n\t" //      1
457                 PAVGB(%%mm6, %%mm1)                                   //  11  4  2      /8
458                 PAVGB(%%mm0, %%mm1)                                   //  11224222      /16
459                 "movq %%mm1, (%%eax, %1, 2)                     \n\t" //      X
460                 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
461                 PAVGB((%%ebx), %%mm2)                                 //   112 4        /8
462                 "movq (%%eax, %1, 4), %%mm0                     \n\t" //        1
463                 PAVGB(%%mm0, %%mm6)                                   //      1 1       /2
464                 PAVGB(%%mm7, %%mm6)                                   //      1 12      /4
465                 PAVGB(%%mm2, %%mm6)                                   //   1122424      /4
466                 "movq %%mm6, (%%ebx)                            \n\t" //       X
467                 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
468                 PAVGB(%%mm7, %%mm5)                                   //    11   2      /4
469                 PAVGB(%%mm7, %%mm5)                                   //    11   6      /8
470
471                 PAVGB(%%mm3, %%mm0)                                   //      112       /4
472                 PAVGB(%%mm0, %%mm5)                                   //    112246      /16
473                 "movq %%mm5, (%%eax, %1, 4)                     \n\t" //        X
474                 "popl %0\n\t"
475
476                 :
477                 : "r" (src), "r" (stride)
478                 : "%eax", "%ebx"
479         );
480 #else
481         const int l1= stride;
482         const int l2= stride + l1;
483         const int l3= stride + l2;
484         const int l4= stride + l3;
485         const int l5= stride + l4;
486         const int l6= stride + l5;
487         const int l7= stride + l6;
488         const int l8= stride + l7;
489         const int l9= stride + l8;
490         int x;
491         src+= stride*3;
492         for(x=0; x<BLOCK_SIZE; x++)
493         {
494                 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
495                 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
496
497                 int sums[9];
498                 sums[0] = first + src[l1];
499                 sums[1] = src[l1] + src[l2];
500                 sums[2] = src[l2] + src[l3];
501                 sums[3] = src[l3] + src[l4];
502                 sums[4] = src[l4] + src[l5];
503                 sums[5] = src[l5] + src[l6];
504                 sums[6] = src[l6] + src[l7];
505                 sums[7] = src[l7] + src[l8];
506                 sums[8] = src[l8] + last;
507
508                 src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
509                 src[l2]= ((src[l2]<<2) + (first + sums[0] + sums[3]<<1) + sums[5] + 8)>>4;
510                 src[l3]= ((src[l3]<<2) + (first + sums[1] + sums[4]<<1) + sums[6] + 8)>>4;
511                 src[l4]= ((src[l4]<<2) + (sums[2] + sums[5]<<1) + sums[0] + sums[7] + 8)>>4;
512                 src[l5]= ((src[l5]<<2) + (sums[3] + sums[6]<<1) + sums[1] + sums[8] + 8)>>4;
513                 src[l6]= ((src[l6]<<2) + (last + sums[7] + sums[4]<<1) + sums[2] + 8)>>4;
514                 src[l7]= ((last + src[l7]<<2) + (src[l8] + sums[5]<<1) + sums[3] + 8)>>4;
515                 src[l8]= ((sums[8]<<2) + (last + sums[6]<<1) + sums[4] + 8)>>4;
516
517                 src++;
518         }
519
520 #endif
521 }
522
523 /**
524  * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
525  * values are correctly clipped (MMX2)
526  * values are wraparound (C)
527  * conclusion: its fast, but introduces ugly horizontal patterns if there is a continious gradient
528         0 8 16 24
529         x = 8
530         x/2 = 4
531         x/8 = 1
532         1 12 12 23
533  */
534 static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
535 {
536 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
537         src+= stride*3;
538 // FIXME rounding
539         asm volatile(
540                 "pxor %%mm7, %%mm7                              \n\t" // 0
541                 "movq b80, %%mm6                                \n\t" // MIN_SIGNED_BYTE
542                 "leal (%0, %1), %%eax                           \n\t"
543                 "leal (%%eax, %1, 4), %%ebx                     \n\t"
544 //      0       1       2       3       4       5       6       7       8       9
545 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
546                 "movq pQPb, %%mm0                               \n\t" // QP,..., QP
547                 "movq %%mm0, %%mm1                              \n\t" // QP,..., QP
548                 "paddusb b02, %%mm0                             \n\t"
549                 "psrlw $2, %%mm0                                \n\t"
550                 "pand b3F, %%mm0                                \n\t" // QP/4,..., QP/4
551                 "paddusb %%mm1, %%mm0                           \n\t" // QP*1.25 ...
552                 "movq (%0, %1, 4), %%mm2                        \n\t" // line 4
553                 "movq (%%ebx), %%mm3                            \n\t" // line 5
554                 "movq %%mm2, %%mm4                              \n\t" // line 4
555                 "pcmpeqb %%mm5, %%mm5                           \n\t" // -1
556                 "pxor %%mm2, %%mm5                              \n\t" // -line 4 - 1
557                 PAVGB(%%mm3, %%mm5)
558                 "paddb %%mm6, %%mm5                             \n\t" // (l5-l4)/2
559                 "psubusb %%mm3, %%mm4                           \n\t"
560                 "psubusb %%mm2, %%mm3                           \n\t"
561                 "por %%mm3, %%mm4                               \n\t" // |l4 - l5|
562                 "psubusb %%mm0, %%mm4                           \n\t"
563                 "pcmpeqb %%mm7, %%mm4                           \n\t"
564                 "pand %%mm4, %%mm5                              \n\t" // d/2
565
566 //              "paddb %%mm6, %%mm2                             \n\t" // line 4 + 0x80
567                 "paddb %%mm5, %%mm2                             \n\t"
568 //              "psubb %%mm6, %%mm2                             \n\t"
569                 "movq %%mm2, (%0,%1, 4)                         \n\t"
570
571                 "movq (%%ebx), %%mm2                            \n\t"
572 //              "paddb %%mm6, %%mm2                             \n\t" // line 5 + 0x80
573                 "psubb %%mm5, %%mm2                             \n\t"
574 //              "psubb %%mm6, %%mm2                             \n\t"
575                 "movq %%mm2, (%%ebx)                            \n\t"
576
577                 "paddb %%mm6, %%mm5                             \n\t"
578                 "psrlw $2, %%mm5                                \n\t"
579                 "pand b3F, %%mm5                                \n\t"
580                 "psubb b20, %%mm5                               \n\t" // (l5-l4)/8
581
582                 "movq (%%eax, %1, 2), %%mm2                     \n\t"
583                 "paddb %%mm6, %%mm2                             \n\t" // line 3 + 0x80
584                 "paddsb %%mm5, %%mm2                            \n\t"
585                 "psubb %%mm6, %%mm2                             \n\t"
586                 "movq %%mm2, (%%eax, %1, 2)                     \n\t"
587
588                 "movq (%%ebx, %1), %%mm2                        \n\t"
589                 "paddb %%mm6, %%mm2                             \n\t" // line 6 + 0x80
590                 "psubsb %%mm5, %%mm2                            \n\t"
591                 "psubb %%mm6, %%mm2                             \n\t"
592                 "movq %%mm2, (%%ebx, %1)                        \n\t"
593
594                 :
595                 : "r" (src), "r" (stride)
596                 : "%eax", "%ebx"
597         );
598 #else
599         const int l1= stride;
600         const int l2= stride + l1;
601         const int l3= stride + l2;
602         const int l4= stride + l3;
603         const int l5= stride + l4;
604         const int l6= stride + l5;
605         const int l7= stride + l6;
606         const int l8= stride + l7;
607         const int l9= stride + l8;
608         int x;
609         src+= stride*3;
610         for(x=0; x<BLOCK_SIZE; x++)
611         {
612                 if(ABS(src[l4]-src[l5]) < QP + QP/4)
613                 {
614                         int v = (src[l5] - src[l4]);
615
616                         src[l3] +=v/8;
617                         src[l4] +=v/2;
618                         src[l5] -=v/2;
619                         src[l6] -=v/8;
620
621                 }
622                 src++;
623         }
624
625 #endif
626 }
627
628 /**
629  * Experimental Filter 1
630  * will not damage linear gradients
631  * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
632  * can only smooth blocks at the expected locations (it cant smooth them if they did move)
633  * MMX2 version does correct clipping C version doesnt
634  */
635 static inline void vertX1Filter(uint8_t *src, int stride, int QP)
636 {
637 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
638         src+= stride*3;
639
640         asm volatile(
641                 "pxor %%mm7, %%mm7                              \n\t" // 0
642 //              "movq b80, %%mm6                                \n\t" // MIN_SIGNED_BYTE
643                 "leal (%0, %1), %%eax                           \n\t"
644                 "leal (%%eax, %1, 4), %%ebx                     \n\t"
645 //      0       1       2       3       4       5       6       7       8       9
646 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
647                 "movq (%%eax, %1, 2), %%mm0                     \n\t" // line 3
648                 "movq (%0, %1, 4), %%mm1                        \n\t" // line 4
649                 "movq %%mm1, %%mm2                              \n\t" // line 4
650                 "psubusb %%mm0, %%mm1                           \n\t"
651                 "psubusb %%mm2, %%mm0                           \n\t"
652                 "por %%mm1, %%mm0                               \n\t" // |l2 - l3|
653                 "movq (%%ebx), %%mm3                            \n\t" // line 5
654                 "movq (%%ebx, %1), %%mm4                                \n\t" // line 6
655                 "movq %%mm3, %%mm5                              \n\t" // line 5
656                 "psubusb %%mm4, %%mm3                           \n\t"
657                 "psubusb %%mm5, %%mm4                           \n\t"
658                 "por %%mm4, %%mm3                               \n\t" // |l5 - l6|
659                 PAVGB(%%mm3, %%mm0)                                   // (|l2 - l3| + |l5 - l6|)/2
660                 "movq %%mm2, %%mm1                              \n\t" // line 4
661                 "psubusb %%mm5, %%mm2                           \n\t"
662                 "movq %%mm2, %%mm4                              \n\t"
663                 "pcmpeqb %%mm7, %%mm2                           \n\t" // (l4 - l5) <= 0 ? -1 : 0
664                 "psubusb %%mm1, %%mm5                           \n\t"
665                 "por %%mm5, %%mm4                               \n\t" // |l4 - l5|
666                 "psubusb %%mm0, %%mm4           \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
667                 "movq %%mm4, %%mm3                              \n\t" // d
668                 "psubusb pQPb, %%mm4                            \n\t"
669                 "pcmpeqb %%mm7, %%mm4                           \n\t" // d <= QP ? -1 : 0
670                 "psubusb b01, %%mm3                             \n\t"
671                 "pand %%mm4, %%mm3                              \n\t" // d <= QP ? d : 0
672
673                 PAVGB(%%mm7, %%mm3)                                   // d/2
674                 "movq %%mm3, %%mm1                              \n\t" // d/2
675                 PAVGB(%%mm7, %%mm3)                                   // d/4
676                 PAVGB(%%mm1, %%mm3)                                   // 3*d/8
677
678                 "movq (%0, %1, 4), %%mm0                        \n\t" // line 4
679                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
680                 "psubusb %%mm3, %%mm0                           \n\t"
681                 "pxor %%mm2, %%mm0                              \n\t"
682                 "movq %%mm0, (%0, %1, 4)                        \n\t" // line 4
683
684                 "movq (%%ebx), %%mm0                            \n\t" // line 5
685                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
686                 "paddusb %%mm3, %%mm0                           \n\t"
687                 "pxor %%mm2, %%mm0                              \n\t"
688                 "movq %%mm0, (%%ebx)                            \n\t" // line 5
689
690                 PAVGB(%%mm7, %%mm1)                                   // d/4
691
692                 "movq (%%eax, %1, 2), %%mm0                     \n\t" // line 3
693                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
694                 "psubusb %%mm1, %%mm0                           \n\t"
695                 "pxor %%mm2, %%mm0                              \n\t"
696                 "movq %%mm0, (%%eax, %1, 2)                     \n\t" // line 3
697
698                 "movq (%%ebx, %1), %%mm0                        \n\t" // line 6
699                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
700                 "paddusb %%mm1, %%mm0                           \n\t"
701                 "pxor %%mm2, %%mm0                              \n\t"
702                 "movq %%mm0, (%%ebx, %1)                        \n\t" // line 6
703
704                 PAVGB(%%mm7, %%mm1)                                   // d/8
705
706                 "movq (%%eax, %1), %%mm0                        \n\t" // line 2
707                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
708                 "psubusb %%mm1, %%mm0                           \n\t"
709                 "pxor %%mm2, %%mm0                              \n\t"
710                 "movq %%mm0, (%%eax, %1)                        \n\t" // line 2
711
712                 "movq (%%ebx, %1, 2), %%mm0                     \n\t" // line 7
713                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
714                 "paddusb %%mm1, %%mm0                           \n\t"
715                 "pxor %%mm2, %%mm0                              \n\t"
716                 "movq %%mm0, (%%ebx, %1, 2)                     \n\t" // line 7
717
718                 :
719                 : "r" (src), "r" (stride)
720                 : "%eax", "%ebx"
721         );
722 #else
723
724         const int l1= stride;
725         const int l2= stride + l1;
726         const int l3= stride + l2;
727         const int l4= stride + l3;
728         const int l5= stride + l4;
729         const int l6= stride + l5;
730         const int l7= stride + l6;
731         const int l8= stride + l7;
732         const int l9= stride + l8;
733         int x;
734
735         src+= stride*3;
736         for(x=0; x<BLOCK_SIZE; x++)
737         {
738                 int a= src[l3] - src[l4];
739                 int b= src[l4] - src[l5];
740                 int c= src[l5] - src[l6];
741
742                 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
743
744                 if(d < QP)
745                 {
746                         int v = d * SIGN(-b);
747
748                         src[l2] +=v/8;
749                         src[l3] +=v/4;
750                         src[l4] +=3*v/8;
751                         src[l5] -=3*v/8;
752                         src[l6] -=v/4;
753                         src[l7] -=v/8;
754
755                 }
756                 src++;
757         }
758         /*
759         const int l1= stride;
760         const int l2= stride + l1;
761         const int l3= stride + l2;
762         const int l4= stride + l3;
763         const int l5= stride + l4;
764         const int l6= stride + l5;
765         const int l7= stride + l6;
766         const int l8= stride + l7;
767         const int l9= stride + l8;
768         for(int x=0; x<BLOCK_SIZE; x++)
769         {
770                 int v2= src[l2];
771                 int v3= src[l3];
772                 int v4= src[l4];
773                 int v5= src[l5];
774                 int v6= src[l6];
775                 int v7= src[l7];
776
777                 if(ABS(v4-v5)<QP &&  ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 )
778                 {
779                         src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6         )/16;
780                         src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7  )/16;
781                         src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16;
782                         src[l6] = (       1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16;
783                 }
784                 src++;
785         }
786 */
787 #endif
788 }
789
790 /**
791  * Experimental Filter 1 (Horizontal)
792  * will not damage linear gradients
793  * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
794  * can only smooth blocks at the expected locations (it cant smooth them if they did move)
795  * MMX2 version does correct clipping C version doesnt
796  * not identical with the vertical one
797  */
798 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
799 {
800         int y;
801         static uint64_t *lut= NULL;
802         if(lut==NULL)
803         {
804                 int i;
805                 lut= (uint64_t*)memalign(8, 256*8);
806                 for(i=0; i<256; i++)
807                 {
808                         int v= i < 128 ? 2*i : 2*(i-256);
809 /*
810 //Simulate 112242211 9-Tap filter
811                         uint64_t a= (v/16) & 0xFF;
812                         uint64_t b= (v/8) & 0xFF;
813                         uint64_t c= (v/4) & 0xFF;
814                         uint64_t d= (3*v/8) & 0xFF;
815 */
816 //Simulate piecewise linear interpolation
817                         uint64_t a= (v/16) & 0xFF;
818                         uint64_t b= (v*3/16) & 0xFF;
819                         uint64_t c= (v*5/16) & 0xFF;
820                         uint64_t d= (7*v/16) & 0xFF;
821                         uint64_t A= (0x100 - a)&0xFF;
822                         uint64_t B= (0x100 - b)&0xFF;
823                         uint64_t C= (0x100 - c)&0xFF;
824                         uint64_t D= (0x100 - c)&0xFF;
825
826                         lut[i]   = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
827                                 (D<<24) | (C<<16) | (B<<8) | (A);
828                         //lut[i] = (v<<32) | (v<<24);
829                 }
830         }
831
832 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
833         asm volatile(
834                 "pxor %%mm7, %%mm7                              \n\t" // 0
835 //              "movq b80, %%mm6                                \n\t" // MIN_SIGNED_BYTE
836                 "leal (%0, %1), %%eax                           \n\t"
837                 "leal (%%eax, %1, 4), %%ebx                     \n\t"
838
839                 "movq b80, %%mm6                                \n\t"
840                 "movd pQPb, %%mm5                               \n\t" // QP
841                 "movq %%mm5, %%mm4                              \n\t"
842                 "paddusb %%mm5, %%mm5                           \n\t" // 2QP
843                 "paddusb %%mm5, %%mm4                           \n\t" // 3QP
844                 "pxor %%mm5, %%mm5                              \n\t" // 0
845                 "psubb %%mm4, %%mm5                             \n\t" // -3QP
846                 "por bm11111110, %%mm5                          \n\t" // ...,FF,FF,-3QP
847                 "psllq $24, %%mm5                               \n\t"
848
849 //      0       1       2       3       4       5       6       7       8       9
850 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
851
852 #define HX1old(a) \
853                 "movd " #a ", %%mm0                             \n\t"\
854                 "movd 4" #a ", %%mm1                            \n\t"\
855                 "punpckldq %%mm1, %%mm0                         \n\t"\
856                 "movq %%mm0, %%mm1                              \n\t"\
857                 "movq %%mm0, %%mm2                              \n\t"\
858                 "psrlq $8, %%mm1                                \n\t"\
859                 "psubusb %%mm1, %%mm2                           \n\t"\
860                 "psubusb %%mm0, %%mm1                           \n\t"\
861                 "por %%mm2, %%mm1                               \n\t" /* p´x = |px - p(x+1)| */\
862                 "pcmpeqb %%mm7, %%mm2                           \n\t" /* p´x = sgn[px - p(x+1)] */\
863                 "pshufw $0x00, %%mm1, %%mm3                     \n\t" /* p´5 = |p1 - p2| */\
864                 PAVGB(%%mm1, %%mm3)                                   /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
865                 "psrlq $16, %%mm3                               \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
866                 "psubusb %%mm3, %%mm1                   \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
867                 "paddb %%mm5, %%mm1                             \n\t"\
868                 "psubusb %%mm5, %%mm1                           \n\t"\
869                 PAVGB(%%mm7, %%mm1)\
870                 "pxor %%mm2, %%mm1                              \n\t"\
871                 "psubb %%mm2, %%mm1                             \n\t"\
872                 "psrlq $24, %%mm1                               \n\t"\
873                 "movd %%mm1, %%ecx                              \n\t"\
874                 "paddb %%mm6, %%mm0                             \n\t"\
875                 "paddsb (%3, %%ecx, 8), %%mm0                   \n\t"\
876                 "paddb %%mm6, %%mm0                             \n\t"\
877                 "movq %%mm0, " #a "                             \n\t"\
878
879 /*
880 HX1old((%0))
881 HX1old((%%eax))
882 HX1old((%%eax, %1))
883 HX1old((%%eax, %1, 2))
884 HX1old((%0, %1, 4))
885 HX1old((%%ebx))
886 HX1old((%%ebx, %1))
887 HX1old((%%ebx, %1, 2))
888 */
889
890 //FIXME add some comments, its unreadable ...
891 #define HX1b(a, c, b, d) \
892                 "movd " #a ", %%mm0                             \n\t"\
893                 "movd 4" #a ", %%mm1                            \n\t"\
894                 "punpckldq %%mm1, %%mm0                         \n\t"\
895                 "movd " #b ", %%mm4                             \n\t"\
896                 "movq %%mm0, %%mm1                              \n\t"\
897                 "movq %%mm0, %%mm2                              \n\t"\
898                 "psrlq $8, %%mm1                                \n\t"\
899                 "movd 4" #b ", %%mm3                            \n\t"\
900                 "psubusb %%mm1, %%mm2                           \n\t"\
901                 "psubusb %%mm0, %%mm1                           \n\t"\
902                 "por %%mm2, %%mm1                               \n\t" /* p´x = |px - p(x+1)| */\
903                 "pcmpeqb %%mm7, %%mm2                           \n\t" /* p´x = sgn[px - p(x+1)] */\
904                 "punpckldq %%mm3, %%mm4                         \n\t"\
905                 "movq %%mm1, %%mm3                              \n\t"\
906                 "psllq $32, %%mm3                               \n\t" /* p´5 = |p1 - p2| */\
907                 PAVGB(%%mm1, %%mm3)                                   /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
908                 "paddb %%mm6, %%mm0                             \n\t"\
909                 "psrlq $16, %%mm3                               \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
910                 "psubusb %%mm3, %%mm1                   \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
911                 "movq %%mm4, %%mm3                              \n\t"\
912                 "paddb %%mm5, %%mm1                             \n\t"\
913                 "psubusb %%mm5, %%mm1                           \n\t"\
914                 "psrlq $8, %%mm3                                \n\t"\
915                 PAVGB(%%mm7, %%mm1)\
916                 "pxor %%mm2, %%mm1                              \n\t"\
917                 "psubb %%mm2, %%mm1                             \n\t"\
918                 "movq %%mm4, %%mm2                              \n\t"\
919                 "psrlq $24, %%mm1                               \n\t"\
920                 "psubusb %%mm3, %%mm2                           \n\t"\
921                 "movd %%mm1, %%ecx                              \n\t"\
922                 "psubusb %%mm4, %%mm3                           \n\t"\
923                 "paddsb (%2, %%ecx, 8), %%mm0                   \n\t"\
924                 "por %%mm2, %%mm3                               \n\t" /* p´x = |px - p(x+1)| */\
925                 "paddb %%mm6, %%mm0                             \n\t"\
926                 "pcmpeqb %%mm7, %%mm2                           \n\t" /* p´x = sgn[px - p(x+1)] */\
927                 "movq %%mm3, %%mm1                              \n\t"\
928                 "psllq $32, %%mm1                               \n\t" /* p´5 = |p1 - p2| */\
929                 "movq %%mm0, " #a "                             \n\t"\
930                 PAVGB(%%mm3, %%mm1)                                   /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
931                 "paddb %%mm6, %%mm4                             \n\t"\
932                 "psrlq $16, %%mm1                               \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
933                 "psubusb %%mm1, %%mm3                   \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
934                 "paddb %%mm5, %%mm3                             \n\t"\
935                 "psubusb %%mm5, %%mm3                           \n\t"\
936                 PAVGB(%%mm7, %%mm3)\
937                 "pxor %%mm2, %%mm3                              \n\t"\
938                 "psubb %%mm2, %%mm3                             \n\t"\
939                 "psrlq $24, %%mm3                               \n\t"\
940                 "movd " #c ", %%mm0                             \n\t"\
941                 "movd 4" #c ", %%mm1                            \n\t"\
942                 "punpckldq %%mm1, %%mm0                         \n\t"\
943                 "paddb %%mm6, %%mm0                             \n\t"\
944                 "paddsb (%2, %%ecx, 8), %%mm0                   \n\t"\
945                 "paddb %%mm6, %%mm0                             \n\t"\
946                 "movq %%mm0, " #c "                             \n\t"\
947                 "movd %%mm3, %%ecx                              \n\t"\
948                 "movd " #d ", %%mm0                             \n\t"\
949                 "paddsb (%2, %%ecx, 8), %%mm4                   \n\t"\
950                 "movd 4" #d ", %%mm1                            \n\t"\
951                 "paddb %%mm6, %%mm4                             \n\t"\
952                 "punpckldq %%mm1, %%mm0                         \n\t"\
953                 "movq %%mm4, " #b "                             \n\t"\
954                 "paddb %%mm6, %%mm0                             \n\t"\
955                 "paddsb (%2, %%ecx, 8), %%mm0                   \n\t"\
956                 "paddb %%mm6, %%mm0                             \n\t"\
957                 "movq %%mm0, " #d "                             \n\t"\
958
959 HX1b((%0),(%%eax),(%%eax, %1),(%%eax, %1, 2))
960 HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2))
961
962
963                 :
964                 : "r" (src), "r" (stride), "r" (lut)
965                 : "%eax", "%ebx", "%ecx"
966         );
967 #else
968
969 //FIXME (has little in common with the mmx2 version)
970         for(y=0; y<BLOCK_SIZE; y++)
971         {
972                 int a= src[1] - src[2];
973                 int b= src[3] - src[4];
974                 int c= src[5] - src[6];
975
976                 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
977
978                 if(d < QP)
979                 {
980                         int v = d * SIGN(-b);
981
982                         src[1] +=v/8;
983                         src[2] +=v/4;
984                         src[3] +=3*v/8;
985                         src[4] -=3*v/8;
986                         src[5] -=v/4;
987                         src[6] -=v/8;
988
989                 }
990                 src+=stride;
991         }
992 #endif
993 }
994
995
996 static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
997 {
998 #ifdef HAVE_MMX
999         src+= stride*4;
1000         //FIXME try pmul for *5 stuff
1001 //      src[0]=0;
1002         asm volatile(
1003                 "pxor %%mm7, %%mm7                              \n\t"
1004                 "leal (%0, %1), %%eax                           \n\t"
1005                 "leal (%%eax, %1, 4), %%ebx                     \n\t"
1006 //      0       1       2       3       4       5       6       7
1007 //      %0      %0+%1   %0+2%1  eax+2%1 %0+4%1  eax+4%1 ebx+%1  ebx+2%1
1008 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1
1009
1010                 "movq (%0), %%mm0                               \n\t"
1011                 "movq %%mm0, %%mm1                              \n\t"
1012                 "punpcklbw %%mm7, %%mm0                         \n\t" // low part of line 0
1013                 "punpckhbw %%mm7, %%mm1                         \n\t" // high part of line 0
1014
1015                 "movq (%%eax), %%mm2                            \n\t"
1016                 "movq %%mm2, %%mm3                              \n\t"
1017                 "punpcklbw %%mm7, %%mm2                         \n\t" // low part of line 1
1018                 "punpckhbw %%mm7, %%mm3                         \n\t" // high part of line 1
1019
1020                 "movq (%%eax, %1), %%mm4                        \n\t"
1021                 "movq %%mm4, %%mm5                              \n\t"
1022                 "punpcklbw %%mm7, %%mm4                         \n\t" // low part of line 2
1023                 "punpckhbw %%mm7, %%mm5                         \n\t" // high part of line 2
1024
1025                 "paddw %%mm0, %%mm0                             \n\t" // 2L0
1026                 "paddw %%mm1, %%mm1                             \n\t" // 2H0
1027                 "psubw %%mm4, %%mm2                             \n\t" // L1 - L2
1028                 "psubw %%mm5, %%mm3                             \n\t" // H1 - H2
1029                 "psubw %%mm2, %%mm0                             \n\t" // 2L0 - L1 + L2
1030                 "psubw %%mm3, %%mm1                             \n\t" // 2H0 - H1 + H2
1031
1032                 "psllw $2, %%mm2                                \n\t" // 4L1 - 4L2
1033                 "psllw $2, %%mm3                                \n\t" // 4H1 - 4H2
1034                 "psubw %%mm2, %%mm0                             \n\t" // 2L0 - 5L1 + 5L2
1035                 "psubw %%mm3, %%mm1                             \n\t" // 2H0 - 5H1 + 5H2
1036
1037                 "movq (%%eax, %1, 2), %%mm2                     \n\t"
1038                 "movq %%mm2, %%mm3                              \n\t"
1039                 "punpcklbw %%mm7, %%mm2                         \n\t" // L3
1040                 "punpckhbw %%mm7, %%mm3                         \n\t" // H3
1041
1042                 "psubw %%mm2, %%mm0                             \n\t" // 2L0 - 5L1 + 5L2 - L3
1043                 "psubw %%mm3, %%mm1                             \n\t" // 2H0 - 5H1 + 5H2 - H3
1044                 "psubw %%mm2, %%mm0                             \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1045                 "psubw %%mm3, %%mm1                             \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1046                 "movq %%mm0, temp0                              \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1047                 "movq %%mm1, temp1                              \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1048
1049                 "movq (%0, %1, 4), %%mm0                        \n\t"
1050                 "movq %%mm0, %%mm1                              \n\t"
1051                 "punpcklbw %%mm7, %%mm0                         \n\t" // L4
1052                 "punpckhbw %%mm7, %%mm1                         \n\t" // H4
1053
1054                 "psubw %%mm0, %%mm2                             \n\t" // L3 - L4
1055                 "psubw %%mm1, %%mm3                             \n\t" // H3 - H4
1056                 "movq %%mm2, temp2                              \n\t" // L3 - L4
1057                 "movq %%mm3, temp3                              \n\t" // H3 - H4
1058                 "paddw %%mm4, %%mm4                             \n\t" // 2L2
1059                 "paddw %%mm5, %%mm5                             \n\t" // 2H2
1060                 "psubw %%mm2, %%mm4                             \n\t" // 2L2 - L3 + L4
1061                 "psubw %%mm3, %%mm5                             \n\t" // 2H2 - H3 + H4
1062
1063                 "psllw $2, %%mm2                                \n\t" // 4L3 - 4L4
1064                 "psllw $2, %%mm3                                \n\t" // 4H3 - 4H4
1065                 "psubw %%mm2, %%mm4                             \n\t" // 2L2 - 5L3 + 5L4
1066                 "psubw %%mm3, %%mm5                             \n\t" // 2H2 - 5H3 + 5H4
1067 //50 opcodes so far
1068                 "movq (%%ebx), %%mm2                            \n\t"
1069                 "movq %%mm2, %%mm3                              \n\t"
1070                 "punpcklbw %%mm7, %%mm2                         \n\t" // L5
1071                 "punpckhbw %%mm7, %%mm3                         \n\t" // H5
1072                 "psubw %%mm2, %%mm4                             \n\t" // 2L2 - 5L3 + 5L4 - L5
1073                 "psubw %%mm3, %%mm5                             \n\t" // 2H2 - 5H3 + 5H4 - H5
1074                 "psubw %%mm2, %%mm4                             \n\t" // 2L2 - 5L3 + 5L4 - 2L5
1075                 "psubw %%mm3, %%mm5                             \n\t" // 2H2 - 5H3 + 5H4 - 2H5
1076
1077                 "movq (%%ebx, %1), %%mm6                        \n\t"
1078                 "punpcklbw %%mm7, %%mm6                         \n\t" // L6
1079                 "psubw %%mm6, %%mm2                             \n\t" // L5 - L6
1080                 "movq (%%ebx, %1), %%mm6                        \n\t"
1081                 "punpckhbw %%mm7, %%mm6                         \n\t" // H6
1082                 "psubw %%mm6, %%mm3                             \n\t" // H5 - H6
1083
1084                 "paddw %%mm0, %%mm0                             \n\t" // 2L4
1085                 "paddw %%mm1, %%mm1                             \n\t" // 2H4
1086                 "psubw %%mm2, %%mm0                             \n\t" // 2L4 - L5 + L6
1087                 "psubw %%mm3, %%mm1                             \n\t" // 2H4 - H5 + H6
1088
1089                 "psllw $2, %%mm2                                \n\t" // 4L5 - 4L6
1090                 "psllw $2, %%mm3                                \n\t" // 4H5 - 4H6
1091                 "psubw %%mm2, %%mm0                             \n\t" // 2L4 - 5L5 + 5L6
1092                 "psubw %%mm3, %%mm1                             \n\t" // 2H4 - 5H5 + 5H6
1093
1094                 "movq (%%ebx, %1, 2), %%mm2                     \n\t"
1095                 "movq %%mm2, %%mm3                              \n\t"
1096                 "punpcklbw %%mm7, %%mm2                         \n\t" // L7
1097                 "punpckhbw %%mm7, %%mm3                         \n\t" // H7
1098
1099                 "paddw %%mm2, %%mm2                             \n\t" // 2L7
1100                 "paddw %%mm3, %%mm3                             \n\t" // 2H7
1101                 "psubw %%mm2, %%mm0                             \n\t" // 2L4 - 5L5 + 5L6 - 2L7
1102                 "psubw %%mm3, %%mm1                             \n\t" // 2H4 - 5H5 + 5H6 - 2H7
1103
1104                 "movq temp0, %%mm2                              \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1105                 "movq temp1, %%mm3                              \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1106 //FIXME pxor, psubw, pmax for abs
1107                 "movq %%mm7, %%mm6                              \n\t" // 0
1108                 "pcmpgtw %%mm0, %%mm6                           \n\t"
1109                 "pxor %%mm6, %%mm0                              \n\t"
1110                 "psubw %%mm6, %%mm0                             \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1111                 "movq %%mm7, %%mm6                              \n\t" // 0
1112                 "pcmpgtw %%mm1, %%mm6                           \n\t"
1113                 "pxor %%mm6, %%mm1                              \n\t"
1114                 "psubw %%mm6, %%mm1                             \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1115
1116                 "movq %%mm7, %%mm6                              \n\t" // 0
1117                 "pcmpgtw %%mm2, %%mm6                           \n\t"
1118                 "pxor %%mm6, %%mm2                              \n\t"
1119                 "psubw %%mm6, %%mm2                             \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1120                 "movq %%mm7, %%mm6                              \n\t" // 0
1121                 "pcmpgtw %%mm3, %%mm6                           \n\t"
1122                 "pxor %%mm6, %%mm3                              \n\t"
1123                 "psubw %%mm6, %%mm3                             \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1124
1125 #ifdef HAVE_MMX2
1126                 "pminsw %%mm2, %%mm0                            \n\t"
1127                 "pminsw %%mm3, %%mm1                            \n\t"
1128 #else
1129                 "movq %%mm0, %%mm6                              \n\t"
1130                 "psubusw %%mm2, %%mm6                           \n\t"
1131                 "psubw %%mm6, %%mm0                             \n\t"
1132                 "movq %%mm1, %%mm6                              \n\t"
1133                 "psubusw %%mm3, %%mm6                           \n\t"
1134                 "psubw %%mm6, %%mm1                             \n\t"
1135 #endif
1136
1137                 "movq %%mm7, %%mm6                              \n\t" // 0
1138                 "pcmpgtw %%mm4, %%mm6                           \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
1139                 "pxor %%mm6, %%mm4                              \n\t"
1140                 "psubw %%mm6, %%mm4                             \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
1141                 "pcmpgtw %%mm5, %%mm7                           \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
1142                 "pxor %%mm7, %%mm5                              \n\t"
1143                 "psubw %%mm7, %%mm5                             \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
1144 // 100 opcodes
1145                 "movd %2, %%mm2                                 \n\t" // QP
1146                 "punpcklwd %%mm2, %%mm2                         \n\t"
1147                 "punpcklwd %%mm2, %%mm2                         \n\t"
1148                 "psllw $3, %%mm2                                \n\t" // 8QP
1149                 "movq %%mm2, %%mm3                              \n\t" // 8QP
1150                 "pcmpgtw %%mm4, %%mm2                           \n\t"
1151                 "pcmpgtw %%mm5, %%mm3                           \n\t"
1152                 "pand %%mm2, %%mm4                              \n\t"
1153                 "pand %%mm3, %%mm5                              \n\t"
1154
1155
1156                 "psubusw %%mm0, %%mm4                           \n\t" // hd
1157                 "psubusw %%mm1, %%mm5                           \n\t" // ld
1158
1159
1160                 "movq w05, %%mm2                                \n\t" // 5
1161                 "pmullw %%mm2, %%mm4                            \n\t"
1162                 "pmullw %%mm2, %%mm5                            \n\t"
1163                 "movq w20, %%mm2                                \n\t" // 32
1164                 "paddw %%mm2, %%mm4                             \n\t"
1165                 "paddw %%mm2, %%mm5                             \n\t"
1166                 "psrlw $6, %%mm4                                \n\t"
1167                 "psrlw $6, %%mm5                                \n\t"
1168
1169 /*
1170                 "movq w06, %%mm2                                \n\t" // 6
1171                 "paddw %%mm2, %%mm4                             \n\t"
1172                 "paddw %%mm2, %%mm5                             \n\t"
1173                 "movq w1400, %%mm2                              \n\t" // 1400h = 5120 = 5/64*2^16
1174 //FIXME if *5/64 is supposed to be /13 then we should use 5041 instead of 5120
1175                 "pmulhw %%mm2, %%mm4                            \n\t" // hd/13
1176                 "pmulhw %%mm2, %%mm5                            \n\t" // ld/13
1177 */
1178
1179                 "movq temp2, %%mm0                              \n\t" // L3 - L4
1180                 "movq temp3, %%mm1                              \n\t" // H3 - H4
1181
1182                 "pxor %%mm2, %%mm2                              \n\t"
1183                 "pxor %%mm3, %%mm3                              \n\t"
1184
1185                 // FIXME rounding error
1186                 "psraw $1, %%mm0                                \n\t" // (L3 - L4)/2
1187                 "psraw $1, %%mm1                                \n\t" // (H3 - H4)/2
1188                 "pcmpgtw %%mm0, %%mm2                           \n\t" // sign (L3-L4)
1189                 "pcmpgtw %%mm1, %%mm3                           \n\t" // sign (H3-H4)
1190                 "pxor %%mm2, %%mm0                              \n\t"
1191                 "pxor %%mm3, %%mm1                              \n\t"
1192                 "psubw %%mm2, %%mm0                             \n\t" // |L3-L4|
1193                 "psubw %%mm3, %%mm1                             \n\t" // |H3-H4|
1194 //              "psrlw $1, %%mm0                                \n\t" // |L3 - L4|/2
1195 //              "psrlw $1, %%mm1                                \n\t" // |H3 - H4|/2
1196
1197                 "pxor %%mm6, %%mm2                              \n\t"
1198                 "pxor %%mm7, %%mm3                              \n\t"
1199                 "pand %%mm2, %%mm4                              \n\t"
1200                 "pand %%mm3, %%mm5                              \n\t"
1201
1202 #ifdef HAVE_MMX2
1203                 "pminsw %%mm0, %%mm4                            \n\t"
1204                 "pminsw %%mm1, %%mm5                            \n\t"
1205 #else
1206                 "movq %%mm4, %%mm2                              \n\t"
1207                 "psubusw %%mm0, %%mm2                           \n\t"
1208                 "psubw %%mm2, %%mm4                             \n\t"
1209                 "movq %%mm5, %%mm2                              \n\t"
1210                 "psubusw %%mm1, %%mm2                           \n\t"
1211                 "psubw %%mm2, %%mm5                             \n\t"
1212 #endif
1213                 "pxor %%mm6, %%mm4                              \n\t"
1214                 "pxor %%mm7, %%mm5                              \n\t"
1215                 "psubw %%mm6, %%mm4                             \n\t"
1216                 "psubw %%mm7, %%mm5                             \n\t"
1217                 "packsswb %%mm5, %%mm4                          \n\t"
1218                 "movq (%%eax, %1, 2), %%mm0                     \n\t"
1219                 "paddb   %%mm4, %%mm0                           \n\t"
1220                 "movq %%mm0, (%%eax, %1, 2)                     \n\t"
1221                 "movq (%0, %1, 4), %%mm0                        \n\t"
1222                 "psubb %%mm4, %%mm0                             \n\t"
1223                 "movq %%mm0, (%0, %1, 4)                        \n\t"
1224
1225                 :
1226                 : "r" (src), "r" (stride), "r" (QP)
1227                 : "%eax", "%ebx"
1228         );
1229 #else
1230         const int l1= stride;
1231         const int l2= stride + l1;
1232         const int l3= stride + l2;
1233         const int l4= stride + l3;
1234         const int l5= stride + l4;
1235         const int l6= stride + l5;
1236         const int l7= stride + l6;
1237         const int l8= stride + l7;
1238 //      const int l9= stride + l8;
1239         int x;
1240         src+= stride*3;
1241         for(x=0; x<BLOCK_SIZE; x++)
1242         {
1243                 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1244                 if(ABS(middleEnergy) < 8*QP)
1245                 {
1246                         const int q=(src[l4] - src[l5])/2;
1247                         const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1248                         const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1249
1250                         int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1251                         d= MAX(d, 0);
1252
1253                         d= (5*d + 32) >> 6;
1254                         d*= SIGN(-middleEnergy);
1255
1256                         if(q>0)
1257                         {
1258                                 d= d<0 ? 0 : d;
1259                                 d= d>q ? q : d;
1260                         }
1261                         else
1262                         {
1263                                 d= d>0 ? 0 : d;
1264                                 d= d<q ? q : d;
1265                         }
1266
1267                         src[l4]-= d;
1268                         src[l5]+= d;
1269                 }
1270                 src++;
1271         }
1272 #endif
1273 }
1274
1275 //FIXME?  |255-0| = 1
1276 /**
1277  * Check if the given 8x8 Block is mostly "flat" and copy the unaliged data into tempBlock.
1278  */
1279 static inline int isHorizDCAndCopy2Temp(uint8_t src[], int stride)
1280 {
1281 //      src++;
1282         int numEq= 0;
1283 #ifdef HAVE_MMX
1284 asm volatile (
1285 //              "int $3 \n\t"
1286                 "pushl %1\n\t"
1287                 "movq b7E, %%mm7                                \n\t" // mm7 = 0x7F
1288                 "movq b7C, %%mm6                                \n\t" // mm6 = 0x7D
1289                 "leal tempBlock, %%eax                          \n\t"
1290                 "pxor %%mm0, %%mm0                              \n\t"
1291
1292 #define HDC_CHECK_AND_CPY(i) \
1293                 "movq -4(%1), %%mm2                             \n\t"\
1294                 "psrlq $32, %%mm2                               \n\t"\
1295                 "punpckldq 4(%1), %%mm2                         \n\t" /* (%1) */\
1296                 "movq %%mm2, %%mm1                              \n\t"\
1297                 "psrlq $8, %%mm2                                \n\t"\
1298                 "psubb %%mm1, %%mm2                             \n\t"\
1299                 "paddb %%mm7, %%mm2                             \n\t"\
1300                 "pcmpgtb %%mm6, %%mm2                           \n\t"\
1301                 "paddb %%mm2, %%mm0                             \n\t"\
1302                 "movq %%mm1," #i "(%%eax)                       \n\t"
1303
1304                 HDC_CHECK_AND_CPY(0)
1305                 "addl %2, %1                                    \n\t"
1306                 HDC_CHECK_AND_CPY(8)
1307                 "addl %2, %1                                    \n\t"
1308                 HDC_CHECK_AND_CPY(16)
1309                 "addl %2, %1                                    \n\t"
1310                 HDC_CHECK_AND_CPY(24)
1311                 "addl %2, %1                                    \n\t"
1312                 HDC_CHECK_AND_CPY(32)
1313                 "addl %2, %1                                    \n\t"
1314                 HDC_CHECK_AND_CPY(40)
1315                 "addl %2, %1                                    \n\t"
1316                 HDC_CHECK_AND_CPY(48)
1317                 "addl %2, %1                                    \n\t"
1318                 HDC_CHECK_AND_CPY(56)
1319
1320                 "psllq $8, %%mm0                                \n\t" // remove dummy value
1321                 "movq %%mm0, %%mm1                              \n\t"
1322                 "psrlw $8, %%mm0                                \n\t"
1323                 "paddb %%mm1, %%mm0                             \n\t"
1324                 "movq %%mm0, %%mm1                              \n\t"
1325                 "psrlq $16, %%mm0                               \n\t"
1326                 "paddb %%mm1, %%mm0                             \n\t"
1327                 "movq %%mm0, %%mm1                              \n\t"
1328                 "psrlq $32, %%mm0                               \n\t"
1329                 "paddb %%mm1, %%mm0                             \n\t"
1330                 "popl %1\n\t"
1331                 "movd %%mm0, %0                                 \n\t"
1332                 : "=r" (numEq)
1333                 : "r" (src), "r" (stride)
1334                 : "%eax"
1335                 );
1336 //      printf("%d\n", numEq);
1337         numEq= (256 - (numEq & 0xFF)) &0xFF;
1338 #else
1339         int y;
1340         for(y=0; y<BLOCK_SIZE; y++)
1341         {
1342                 if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;
1343                 if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++;
1344                 if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++;
1345                 if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++;
1346                 if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++;
1347                 if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++;
1348                 if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++;
1349                 tempBlock[0 + y*TEMP_STRIDE] = src[0];
1350                 tempBlock[1 + y*TEMP_STRIDE] = src[1];
1351                 tempBlock[2 + y*TEMP_STRIDE] = src[2];
1352                 tempBlock[3 + y*TEMP_STRIDE] = src[3];
1353                 tempBlock[4 + y*TEMP_STRIDE] = src[4];
1354                 tempBlock[5 + y*TEMP_STRIDE] = src[5];
1355                 tempBlock[6 + y*TEMP_STRIDE] = src[6];
1356                 tempBlock[7 + y*TEMP_STRIDE] = src[7];
1357                 src+= stride;
1358         }
1359 #endif
1360 /*      if(abs(numEq - asmEq) > 0)
1361         {
1362 //              printf("\nasm:%d  c:%d\n", asmEq, numEq);
1363                 for(int y=0; y<8; y++)
1364                 {
1365                         for(int x=0; x<8; x++)
1366                         {
1367                                 printf("%d ", src[x + y*stride]);
1368                         }
1369                         printf("\n");
1370                 }
1371         }
1372 */
1373 //      printf("%d\n", numEq);
1374         return numEq > hFlatnessThreshold;
1375 }
1376
/**
 * Check that the first and the last pixel of an 8 pixel line are close enough to each other.
 *
 * @param src    first pixel of the line
 * @param stride only used by the disabled MMX variant; ignored by the C code
 * @param QP     quantizer, the allowed difference is 2*QP
 * @return 1 if |src[0] - src[7]| <= 2*QP, 0 otherwise
 */
static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
{
#ifdef MMX_FIXME
/* unfinished MMX variant; the bare token below deliberately breaks the build if it gets enabled */
FIXME
        int isOk;
        asm volatile(
                "movq (%1, %2), %%mm0                           \n\t"
                "movq (%1, %2, 8), %%mm1                        \n\t"
                "movq %%mm0, %%mm2                              \n\t"
                "psubusb %%mm1, %%mm0                           \n\t"
                "psubusb %%mm2, %%mm1                           \n\t"
                "por %%mm1, %%mm0                               \n\t" // absolute difference

                "movq pQPb, %%mm7                               \n\t" // QP,..., QP
                "paddusb %%mm7, %%mm7                           \n\t" // 2QP ... 2QP
                "psubusb %%mm7, %%mm0                           \n\t" // difference <= 2QP -> 0
                "pcmpeqd b00, %%mm0                             \n\t"
                "psrlq $16, %%mm0                               \n\t"
                "pcmpeqd bFF, %%mm0                             \n\t"
                "movd %%mm0, %0                                 \n\t"
                : "=r" (isOk)
                : "r" (src), "r" (stride)
                );
        return isOk;
#else
        const int outerDiff= abs(src[0] - src[7]);

        return outerDiff <= 2*QP;
#endif
}
1409
1410 static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP)
1411 {
1412 #ifdef HAVE_MMX
1413         asm volatile(
1414                 "pushl %0                                       \n\t"
1415                 "pxor %%mm7, %%mm7                              \n\t"
1416                 "movq bm00001000, %%mm6                         \n\t"
1417                 "movd %2, %%mm5                                 \n\t" // QP
1418                 "movq %%mm5, %%mm4                              \n\t"
1419                 "paddusb %%mm5, %%mm5                           \n\t" // 2QP
1420                 "paddusb %%mm5, %%mm4                           \n\t" // 3QP
1421                 "psllq $24, %%mm4                               \n\t"
1422                 "pxor %%mm5, %%mm5                              \n\t" // 0
1423                 "psubb %%mm4, %%mm5                             \n\t" // -QP
1424                 "leal tempBlock, %%eax                          \n\t"
1425
1426 //FIXME? "unroll by 2" and mix
1427 #ifdef HAVE_MMX2
1428 #define HDF(i)  \
1429                 "movq " #i "(%%eax), %%mm0                      \n\t"\
1430                 "movq %%mm0, %%mm1                              \n\t"\
1431                 "movq %%mm0, %%mm2                              \n\t"\
1432                 "psrlq $8, %%mm1                                \n\t"\
1433                 "psubusb %%mm1, %%mm2                           \n\t"\
1434                 "psubusb %%mm0, %%mm1                           \n\t"\
1435                 "por %%mm2, %%mm1                               \n\t" /* p´x = |px - p(x+1)| */\
1436                 "pcmpeqb %%mm7, %%mm2                           \n\t" /* p´x = sgn[px - p(x+1)] */\
1437                 "pshufw $0x00, %%mm1, %%mm3                     \n\t" /* p´5 = |p1 - p2| */\
1438                 "pminub %%mm1, %%mm3                            \n\t" /* p´5 = min(|p2-p1|, |p6-p5|)*/\
1439                 "psrlq $16, %%mm3                               \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\
1440                 "psubusb %%mm3, %%mm1                   \n\t" /* |p3-p4|-min(|p1-p2|,|p5-p6|) */\
1441                 "paddb %%mm5, %%mm1                             \n\t"\
1442                 "psubusb %%mm5, %%mm1                           \n\t"\
1443                 "psrlw $2, %%mm1                                \n\t"\
1444                 "pxor %%mm2, %%mm1                              \n\t"\
1445                 "psubb %%mm2, %%mm1                             \n\t"\
1446                 "pand %%mm6, %%mm1                              \n\t"\
1447                 "psubb %%mm1, %%mm0                             \n\t"\
1448                 "psllq $8, %%mm1                                \n\t"\
1449                 "paddb %%mm1, %%mm0                             \n\t"\
1450                 "movd %%mm0, (%0)                               \n\t"\
1451                 "psrlq $32, %%mm0                               \n\t"\
1452                 "movd %%mm0, 4(%0)                              \n\t"
1453 #else
1454 #define HDF(i)\
1455                 "movq " #i "(%%eax), %%mm0                      \n\t"\
1456                 "movq %%mm0, %%mm1                              \n\t"\
1457                 "movq %%mm0, %%mm2                              \n\t"\
1458                 "psrlq $8, %%mm1                                \n\t"\
1459                 "psubusb %%mm1, %%mm2                           \n\t"\
1460                 "psubusb %%mm0, %%mm1                           \n\t"\
1461                 "por %%mm2, %%mm1                               \n\t" /* p´x = |px - p(x+1)| */\
1462                 "pcmpeqb %%mm7, %%mm2                           \n\t" /* p´x = sgn[px - p(x+1)] */\
1463                 "movq %%mm1, %%mm3                              \n\t"\
1464                 "psllq $32, %%mm3                               \n\t"\
1465                 "movq %%mm3, %%mm4                              \n\t"\
1466                 "psubusb %%mm1, %%mm4                           \n\t"\
1467                 "psubb %%mm4, %%mm3                             \n\t"\
1468                 "psrlq $16, %%mm3                               \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\
1469                 "psubusb %%mm3, %%mm1                   \n\t" /* |p3-p4|-min(|p1-p2|,|p5,ü6|) */\
1470                 "paddb %%mm5, %%mm1                             \n\t"\
1471                 "psubusb %%mm5, %%mm1                           \n\t"\
1472                 "psrlw $2, %%mm1                                \n\t"\
1473                 "pxor %%mm2, %%mm1                              \n\t"\
1474                 "psubb %%mm2, %%mm1                             \n\t"\
1475                 "pand %%mm6, %%mm1                              \n\t"\
1476                 "psubb %%mm1, %%mm0                             \n\t"\
1477                 "psllq $8, %%mm1                                \n\t"\
1478                 "paddb %%mm1, %%mm0                             \n\t"\
1479                 "movd %%mm0, (%0)                               \n\t"\
1480                 "psrlq $32, %%mm0                               \n\t"\
1481                 "movd %%mm0, 4(%0)                              \n\t"
1482 #endif
1483                 HDF(0)
1484                 "addl %1, %0                                    \n\t"
1485                 HDF(8)
1486                 "addl %1, %0                                    \n\t"
1487                 HDF(16)
1488                 "addl %1, %0                                    \n\t"
1489                 HDF(24)
1490                 "addl %1, %0                                    \n\t"
1491                 HDF(32)
1492                 "addl %1, %0                                    \n\t"
1493                 HDF(40)
1494                 "addl %1, %0                                    \n\t"
1495                 HDF(48)
1496                 "addl %1, %0                                    \n\t"
1497                 HDF(56)
1498                 "popl %0                                        \n\t"
1499                 :
1500                 : "r" (dst), "r" (stride), "r" (QP)
1501                 : "%eax"
1502         );
1503 #else
1504         uint8_t *src= tempBlock;
1505
1506         int y;
1507         for(y=0; y<BLOCK_SIZE; y++)
1508         {
1509                 const int middleEnergy= 5*(src[4] - src[5]) + 2*(src[2] - src[5]);
1510
1511                 dst[0] = src[0];
1512                 dst[1] = src[1];
1513                 dst[2] = src[2];
1514                 dst[3] = src[3];
1515                 dst[4] = src[4];
1516                 dst[5] = src[5];
1517                 dst[6] = src[6];
1518                 dst[7] = src[7];
1519
1520                 if(ABS(middleEnergy) < 8*QP)
1521                 {
1522                         const int q=(src[3] - src[4])/2;
1523                         const int leftEnergy=  5*(src[2] - src[1]) + 2*(src[0] - src[3]);
1524                         const int rightEnergy= 5*(src[6] - src[5]) + 2*(src[4] - src[7]);
1525
1526                         int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1527                         d= MAX(d, 0);
1528
1529                         d= (5*d + 32) >> 6;
1530                         d*= SIGN(-middleEnergy);
1531
1532                         if(q>0)
1533                         {
1534                                 d= d<0 ? 0 : d;
1535                                 d= d>q ? q : d;
1536                         }
1537                         else
1538                         {
1539                                 d= d>0 ? 0 : d;
1540                                 d= d<q ? q : d;
1541                         }
1542
1543                         dst[3]-= d;
1544                         dst[4]+= d;
1545                 }
1546                 dst+= stride;
1547                 src+= TEMP_STRIDE;
1548         }
1549 #endif
1550 }
1551
1552 /**
1553  * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
1554  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
1555  * using the 7-Tap Filter   (2,2,2,4,2,2,2)/16 (MMX2/3DNOW version)
1556  */
1557 static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP)
1558 {
1559 //return;
1560 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1561         asm volatile(   //"movv %0 %1 %2\n\t"
1562                 "pushl %0\n\t"
1563                 "pxor %%mm7, %%mm7                                      \n\t"
1564                 "leal tempBlock, %%eax                                  \n\t"
1565 /*
1566 #define HLP1    "movq (%0), %%mm0                                       \n\t"\
1567                 "movq %%mm0, %%mm1                                      \n\t"\
1568                 "psllq $8, %%mm0                                        \n\t"\
1569                 PAVGB(%%mm1, %%mm0)\
1570                 "psrlw $8, %%mm0                                        \n\t"\
1571                 "pxor %%mm1, %%mm1                                      \n\t"\
1572                 "packuswb %%mm1, %%mm0                                  \n\t"\
1573                 "movq %%mm0, %%mm1                                      \n\t"\
1574                 "movq %%mm0, %%mm2                                      \n\t"\
1575                 "psllq $32, %%mm0                                       \n\t"\
1576                 "paddb %%mm0, %%mm1                                     \n\t"\
1577                 "psllq $16, %%mm2                                       \n\t"\
1578                 PAVGB(%%mm2, %%mm0)\
1579                 "movq %%mm0, %%mm3                                      \n\t"\
1580                 "pand bm11001100, %%mm0                                 \n\t"\
1581                 "paddusb %%mm0, %%mm3                                   \n\t"\
1582                 "psrlq $8, %%mm3                                        \n\t"\
1583                 PAVGB(%%mm1, %%mm4)\
1584                 PAVGB(%%mm3, %%mm2)\
1585                 "psrlq $16, %%mm2                                       \n\t"\
1586                 "punpcklbw %%mm2, %%mm2                                 \n\t"\
1587                 "movq %%mm2, (%0)                                       \n\t"\
1588
1589 #define HLP2    "movq (%0), %%mm0                                       \n\t"\
1590                 "movq %%mm0, %%mm1                                      \n\t"\
1591                 "psllq $8, %%mm0                                        \n\t"\
1592                 PAVGB(%%mm1, %%mm0)\
1593                 "psrlw $8, %%mm0                                        \n\t"\
1594                 "pxor %%mm1, %%mm1                                      \n\t"\
1595                 "packuswb %%mm1, %%mm0                                  \n\t"\
1596                 "movq %%mm0, %%mm2                                      \n\t"\
1597                 "psllq $32, %%mm0                                       \n\t"\
1598                 "psllq $16, %%mm2                                       \n\t"\
1599                 PAVGB(%%mm2, %%mm0)\
1600                 "movq %%mm0, %%mm3                                      \n\t"\
1601                 "pand bm11001100, %%mm0                                 \n\t"\
1602                 "paddusb %%mm0, %%mm3                                   \n\t"\
1603                 "psrlq $8, %%mm3                                        \n\t"\
1604                 PAVGB(%%mm3, %%mm2)\
1605                 "psrlq $16, %%mm2                                       \n\t"\
1606                 "punpcklbw %%mm2, %%mm2                                 \n\t"\
1607                 "movq %%mm2, (%0)                                       \n\t"\
1608 */
1609 // approximately a 7-Tap Filter with Vector (1,2,3,4,3,2,1)/16
1610 /*
1611 Implemented     Exact 7-Tap
1612  9421           A321
1613  36421          64321
1614  334321         =
1615  1234321        =
1616   1234321       =
1617    123433       =
1618     12463         12346
1619      1249          123A
1620
1621 */
1622
1623 #ifdef HAVE_MMX2
1624 #define HLP3(i) "movq " #i "(%%eax), %%mm0                              \n\t"\
1625                 "movq %%mm0, %%mm1                                      \n\t"\
1626                 "movq %%mm0, %%mm2                                      \n\t"\
1627                 "movq %%mm0, %%mm3                                      \n\t"\
1628                 "movq %%mm0, %%mm4                                      \n\t"\
1629                 "psllq $8, %%mm1                                        \n\t"\
1630                 "psrlq $8, %%mm2                                        \n\t"\
1631                 "pand bm00000001, %%mm3                                 \n\t"\
1632                 "pand bm10000000, %%mm4                                 \n\t"\
1633                 "por %%mm3, %%mm1                                       \n\t"\
1634                 "por %%mm4, %%mm2                                       \n\t"\
1635                 PAVGB(%%mm2, %%mm1)\
1636                 PAVGB(%%mm1, %%mm0)\
1637 \
1638                 "pshufw $0xF9, %%mm0, %%mm3                             \n\t"\
1639                 "pshufw $0x90, %%mm0, %%mm4                             \n\t"\
1640                 PAVGB(%%mm3, %%mm4)\
1641                 PAVGB(%%mm4, %%mm0)\
1642                 "movd %%mm0, (%0)                                       \n\t"\
1643                 "psrlq $32, %%mm0                                       \n\t"\
1644                 "movd %%mm0, 4(%0)                                      \n\t"
1645 #else
1646 #define HLP3(i) "movq " #i "(%%eax), %%mm0                              \n\t"\
1647                 "movq %%mm0, %%mm1                                      \n\t"\
1648                 "movq %%mm0, %%mm2                                      \n\t"\
1649                 "movd -4(%0), %%mm3                                     \n\t" /*0001000*/\
1650                 "movd 8(%0), %%mm4                                      \n\t" /*0001000*/\
1651                 "psllq $8, %%mm1                                        \n\t"\
1652                 "psrlq $8, %%mm2                                        \n\t"\
1653                 "psrlq $24, %%mm3                                       \n\t"\
1654                 "psllq $56, %%mm4                                       \n\t"\
1655                 "por %%mm3, %%mm1                                       \n\t"\
1656                 "por %%mm4, %%mm2                                       \n\t"\
1657                 PAVGB(%%mm2, %%mm1)\
1658                 PAVGB(%%mm1, %%mm0)\
1659 \
1660                 "movq %%mm0, %%mm3                                      \n\t"\
1661                 "movq %%mm0, %%mm4                                      \n\t"\
1662                 "movq %%mm0, %%mm5                                      \n\t"\
1663                 "psrlq $16, %%mm3                                       \n\t"\
1664                 "psllq $16, %%mm4                                       \n\t"\
1665                 "pand bm11000000, %%mm5                                 \n\t"\
1666                 "por %%mm5, %%mm3                                       \n\t"\
1667                 "movq %%mm0, %%mm5                                      \n\t"\
1668                 "pand bm00000011, %%mm5                                 \n\t"\
1669                 "por %%mm5, %%mm4                                       \n\t"\
1670                 PAVGB(%%mm3, %%mm4)\
1671                 PAVGB(%%mm4, %%mm0)\
1672                 "movd %%mm0, (%0)                                       \n\t"\
1673                 "psrlq $32, %%mm0                                       \n\t"\
1674                 "movd %%mm0, 4(%0)                                      \n\t"
1675 #endif
1676
1677 /* uses the 7-Tap Filter: 1112111 */
1678 #define NEW_HLP(i)\
1679                 "movq " #i "(%%eax), %%mm0                              \n\t"\
1680                 "movq %%mm0, %%mm1                                      \n\t"\
1681                 "movq %%mm0, %%mm2                                      \n\t"\
1682                 "movd -4(%0), %%mm3                                     \n\t" /*0001000*/\
1683                 "movd 8(%0), %%mm4                                      \n\t" /*0001000*/\
1684                 "psllq $8, %%mm1                                        \n\t"\
1685                 "psrlq $8, %%mm2                                        \n\t"\
1686                 "psrlq $24, %%mm3                                       \n\t"\
1687                 "psllq $56, %%mm4                                       \n\t"\
1688                 "por %%mm3, %%mm1                                       \n\t"\
1689                 "por %%mm4, %%mm2                                       \n\t"\
1690                 "movq %%mm1, %%mm5                                      \n\t"\
1691                 PAVGB(%%mm2, %%mm1)\
1692                 PAVGB(%%mm1, %%mm0)\
1693                 "psllq $8, %%mm5                                        \n\t"\
1694                 "psrlq $8, %%mm2                                        \n\t"\
1695                 "por %%mm3, %%mm5                                       \n\t"\
1696                 "por %%mm4, %%mm2                                       \n\t"\
1697                 "movq %%mm5, %%mm1                                      \n\t"\
1698                 PAVGB(%%mm2, %%mm5)\
1699                 "psllq $8, %%mm1                                        \n\t"\
1700                 "psrlq $8, %%mm2                                        \n\t"\
1701                 "por %%mm3, %%mm1                                       \n\t"\
1702                 "por %%mm4, %%mm2                                       \n\t"\
1703                 PAVGB(%%mm2, %%mm1)\
1704                 PAVGB(%%mm1, %%mm5)\
1705                 PAVGB(%%mm5, %%mm0)\
1706                 "movd %%mm0, (%0)                                       \n\t"\
1707                 "psrlq $32, %%mm0                                       \n\t"\
1708                 "movd %%mm0, 4(%0)                                      \n\t"
1709
1710 /* uses the 9-Tap Filter: 112242211 */
1711 #define NEW_HLP2(i)\
1712                 "movq " #i "(%%eax), %%mm0                              \n\t" /*0001000*/\
1713                 "movq %%mm0, %%mm1                                      \n\t" /*0001000*/\
1714                 "movq %%mm0, %%mm2                                      \n\t" /*0001000*/\
1715                 "movd -4(%0), %%mm3                                     \n\t" /*0001000*/\
1716                 "movd 8(%0), %%mm4                                      \n\t" /*0001000*/\
1717                 "psllq $8, %%mm1                                        \n\t"\
1718                 "psrlq $8, %%mm2                                        \n\t"\
1719                 "psrlq $24, %%mm3                                       \n\t"\
1720                 "psllq $56, %%mm4                                       \n\t"\
1721                 "por %%mm3, %%mm1                                       \n\t" /*0010000*/\
1722                 "por %%mm4, %%mm2                                       \n\t" /*0000100*/\
1723                 "movq %%mm1, %%mm5                                      \n\t" /*0010000*/\
1724                 PAVGB(%%mm2, %%mm1)                                           /*0010100*/\
1725                 PAVGB(%%mm1, %%mm0)                                           /*0012100*/\
1726                 "psllq $8, %%mm5                                        \n\t"\
1727                 "psrlq $8, %%mm2                                        \n\t"\
1728                 "por %%mm3, %%mm5                                       \n\t" /*0100000*/\
1729                 "por %%mm4, %%mm2                                       \n\t" /*0000010*/\
1730                 "movq %%mm5, %%mm1                                      \n\t" /*0100000*/\
1731                 PAVGB(%%mm2, %%mm5)                                           /*0100010*/\
1732                 "psllq $8, %%mm1                                        \n\t"\
1733                 "psrlq $8, %%mm2                                        \n\t"\
1734                 "por %%mm3, %%mm1                                       \n\t" /*1000000*/\
1735                 "por %%mm4, %%mm2                                       \n\t" /*0000001*/\
1736                 "movq %%mm1, %%mm6                                      \n\t" /*1000000*/\
1737                 PAVGB(%%mm2, %%mm1)                                           /*1000001*/\
1738                 "psllq $8, %%mm6                                        \n\t"\
1739                 "psrlq $8, %%mm2                                        \n\t"\
1740                 "por %%mm3, %%mm6                                       \n\t"/*100000000*/\
1741                 "por %%mm4, %%mm2                                       \n\t"/*000000001*/\
1742                 PAVGB(%%mm2, %%mm6)                                          /*100000001*/\
1743                 PAVGB(%%mm6, %%mm1)                                          /*110000011*/\
1744                 PAVGB(%%mm1, %%mm5)                                          /*112000211*/\
1745                 PAVGB(%%mm5, %%mm0)                                          /*112242211*/\
1746                 "movd %%mm0, (%0)                                       \n\t"\
1747                 "psrlq $32, %%mm0                                       \n\t"\
1748                 "movd %%mm0, 4(%0)                                      \n\t"
1749
1750 #define HLP(i) NEW_HLP(i)
1751
1752                 HLP(0)
1753                 "addl %1, %0                                            \n\t"
1754                 HLP(8)
1755                 "addl %1, %0                                            \n\t"
1756                 HLP(16)
1757                 "addl %1, %0                                            \n\t"
1758                 HLP(24)
1759                 "addl %1, %0                                            \n\t"
1760                 HLP(32)
1761                 "addl %1, %0                                            \n\t"
1762                 HLP(40)
1763                 "addl %1, %0                                            \n\t"
1764                 HLP(48)
1765                 "addl %1, %0                                            \n\t"
1766                 HLP(56)
1767
1768                 "popl %0\n\t"
1769                 :
1770                 : "r" (dst), "r" (stride)
1771                 : "%eax", "%ebx"
1772         );
1773
1774 #else
1775         uint8_t *temp= tempBlock;
1776         int y;
1777         for(y=0; y<BLOCK_SIZE; y++)
1778         {
1779                 const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
1780                 const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
1781
1782                 int sums[9];
1783                 sums[0] = first + temp[0];
1784                 sums[1] = temp[0] + temp[1];
1785                 sums[2] = temp[1] + temp[2];
1786                 sums[3] = temp[2] + temp[3];
1787                 sums[4] = temp[3] + temp[4];
1788                 sums[5] = temp[4] + temp[5];
1789                 sums[6] = temp[5] + temp[6];
1790                 sums[7] = temp[6] + temp[7];
1791                 sums[8] = temp[7] + last;
1792
1793                 dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
1794                 dst[1]= ((dst[1]<<2) + (first + sums[0] + sums[3]<<1) + sums[5] + 8)>>4;
1795                 dst[2]= ((dst[2]<<2) + (first + sums[1] + sums[4]<<1) + sums[6] + 8)>>4;
1796                 dst[3]= ((dst[3]<<2) + (sums[2] + sums[5]<<1) + sums[0] + sums[7] + 8)>>4;
1797                 dst[4]= ((dst[4]<<2) + (sums[3] + sums[6]<<1) + sums[1] + sums[8] + 8)>>4;
1798                 dst[5]= ((dst[5]<<2) + (last + sums[7] + sums[4]<<1) + sums[2] + 8)>>4;
1799                 dst[6]= ((last + dst[6]<<2) + (dst[7] + sums[5]<<1) + sums[3] + 8)>>4;
1800                 dst[7]= ((sums[8]<<2) + (last + sums[6]<<1) + sums[4] + 8)>>4;
1801
1802                 dst+= stride;
1803                 temp+= TEMP_STRIDE;
1804         }
1805 #endif
1806 }
1807
1808
/**
 * Deringing filter for one block - unfinished (see the FIXMEs).
 * The only code present is an MMX2 draft that is compiled solely when
 * HAVE_MMX2X is defined: it scans rows 0 to 9 of the block for the smallest and
 * the largest pixel value and averages the two, but nothing is written back yet.
 * Without HAVE_MMX2X the function does nothing.
 * src points to the first pixel of the block, stride is the distance in bytes
 * between two rows, QP is handed to the asm statement but not used so far
 */
static inline void dering(uint8_t src[], int stride, int QP)
{
//FIXME

#ifdef HAVE_MMX2X
        asm volatile(
                "leal (%0, %1), %%eax                           \n\t"
                "leal (%%eax, %1, 4), %%ebx                     \n\t"
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1

                "pcmpeqb %%mm6, %%mm6                           \n\t" // running minimum, starts at all ones
                "pxor %%mm7, %%mm7                              \n\t" // running maximum, starts at zero

// addr is a complete, parenthesized memory operand, like the arguments of
// DEINT_CUBIC and MEDIAN further down in this file
#define FIND_MIN_MAX(addr)\
                "movq " #addr ", %%mm0                          \n\t"\
                "pminub %%mm0, %%mm6                            \n\t"\
                "pmaxub %%mm0, %%mm7                            \n\t"

FIND_MIN_MAX((%0))
FIND_MIN_MAX((%%eax))
FIND_MIN_MAX((%%eax, %1))
FIND_MIN_MAX((%%eax, %1, 2))
FIND_MIN_MAX((%0, %1, 4))
FIND_MIN_MAX((%%ebx))
FIND_MIN_MAX((%%ebx, %1))
FIND_MIN_MAX((%%ebx, %1, 2))
FIND_MIN_MAX((%0, %1, 8))
FIND_MIN_MAX((%%ebx, %1, 4))

                // fold the 8 per-column minima into the lowest byte of mm6
                "movq %%mm6, %%mm4                              \n\t"
                "psrlq $32, %%mm6                               \n\t"
                "pminub %%mm4, %%mm6                            \n\t"
                "movq %%mm6, %%mm4                              \n\t"
                "psrlq $16, %%mm6                               \n\t"
                "pminub %%mm4, %%mm6                            \n\t"
                "movq %%mm6, %%mm4                              \n\t"
                "psrlq $8, %%mm6                                \n\t"
                "pminub %%mm4, %%mm6                            \n\t" // min of pixels

                // fold the 8 per-column maxima into the lowest byte of mm7
                "movq %%mm7, %%mm4                              \n\t"
                "psrlq $32, %%mm7                               \n\t"
                "pmaxub %%mm4, %%mm7                            \n\t"
                "movq %%mm7, %%mm4                              \n\t"
                "psrlq $16, %%mm7                               \n\t"
                "pmaxub %%mm4, %%mm7                            \n\t"
                "movq %%mm7, %%mm4                              \n\t"
                "psrlq $8, %%mm7                                \n\t"
                "pmaxub %%mm4, %%mm7                            \n\t" // max of pixels
                PAVGB(%%mm6, %%mm7)                                   // (max + min)/2


                : : "r" (src), "r" (stride), "r" (QP)
                : "%eax", "%ebx"
        );
#else

//FIXME
#endif
}
1869
1870 /**
1871  * Deinterlaces the given block by linear interpolation: rows 1, 3, 5 and 7 are each overwritten with the average of the row directly above and the row directly below (rows 0, 2, 4, 6 and 8 are only read)
1872  * will be called for every 8x8 block, and can read & write into an 8x16 block
1873  */
1874 static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride)
1875 {
1876 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1877         asm volatile(
1878                 "leal (%0, %1), %%eax                           \n\t" // eax = src + stride   (row 1)
1879                 "leal (%%eax, %1, 4), %%ebx                     \n\t" // ebx = src + 5*stride (row 5)
1880 //      0       1       2       3       4       5       6       7       8       9
1881 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
1882
1883                 "movq (%0), %%mm0                               \n\t" // L0
1884                 "movq (%%eax, %1), %%mm1                        \n\t" // L2
1885                 PAVGB(%%mm1, %%mm0)                                   // L0+L2 (PAVGB is defined elsewhere in this file; the stores below rely on its result being left in the 2nd operand)
1886                 "movq %%mm0, (%%eax)                            \n\t" // -> row 1
1887                 "movq (%0, %1, 4), %%mm0                        \n\t" // L4
1888                 PAVGB(%%mm0, %%mm1)                                   // L2+L4
1889                 "movq %%mm1, (%%eax, %1, 2)                     \n\t" // -> row 3
1890                 "movq (%%ebx, %1), %%mm1                        \n\t" // L6
1891                 PAVGB(%%mm1, %%mm0)                                   // L4+L6
1892                 "movq %%mm0, (%%ebx)                            \n\t" // -> row 5
1893                 "movq (%0, %1, 8), %%mm0                        \n\t" // L8
1894                 PAVGB(%%mm0, %%mm1)                                   // L6+L8
1895                 "movq %%mm1, (%%ebx, %1, 2)                     \n\t" // -> row 7
1896
1897                 : : "r" (src), "r" (stride)
1898                 : "%eax", "%ebx"
1899         );
1900 #else
1901         int x; // column inside the block
1902         for(x=0; x<8; x++)
1903         {
1904                 src[stride]   = (src[0]        + src[stride*2])>>1; // the >>1 truncates, no rounding is added here
1905                 src[stride*3] = (src[stride*2] + src[stride*4])>>1;
1906                 src[stride*5] = (src[stride*4] + src[stride*6])>>1;
1907                 src[stride*7] = (src[stride*6] + src[stride*8])>>1;
1908                 src++; // next column
1909         }
1910 #endif
1911 }
1912
/**
 * Deinterlaces the given block by cubic interpolation
 * rows 3, 5, 7 and 9 are each replaced by (-a + 9b + 9d - e)/16, where b and d
 * are the rows directly above and below and a and e the rows 3 lines above and
 * below; rows 0 to 12 are read, even rows are never written
 * will be called for every 8x8 block, and can read & write into an 8x16 block
 * the result is saturated to 0..255 in the MMX version (packuswb) and in the
 * C version (explicit clamp); the MMX version works with averaged intermediate
 * values, so the two can differ slightly in rounding
 * src points to the first pixel of the block, stride is the distance in bytes
 * between two rows
 */
static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        asm volatile(
                "leal (%0, %1), %%eax                           \n\t"
                "leal (%%eax, %1, 4), %%ebx                     \n\t"
                "leal (%%ebx, %1, 4), %%ecx                     \n\t"
                "addl %1, %%ecx                                 \n\t"
                "pxor %%mm7, %%mm7                              \n\t"
//      0       1       2       3       4       5       6       7       8       9       10
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1 ecx

// a, b, d, e are the 4 input rows (top to bottom), c is the row that is written
#define DEINT_CUBIC(a,b,c,d,e)\
                "movq " #a ", %%mm0                             \n\t"\
                "movq " #b ", %%mm1                             \n\t"\
                "movq " #d ", %%mm2                             \n\t"\
                "movq " #e ", %%mm3                             \n\t"\
                PAVGB(%%mm2, %%mm1)                                     /* (b+d) /2 */\
                PAVGB(%%mm3, %%mm0)                                     /* (a+e) /2 */\
                "movq %%mm0, %%mm2                              \n\t"\
                "punpcklbw %%mm7, %%mm0                         \n\t"\
                "punpckhbw %%mm7, %%mm2                         \n\t"\
                "movq %%mm1, %%mm3                              \n\t"\
                "punpcklbw %%mm7, %%mm1                         \n\t"\
                "punpckhbw %%mm7, %%mm3                         \n\t"\
                "psubw %%mm1, %%mm0                             \n\t"   /* L(a+e - (b+d))/2 */\
                "psubw %%mm3, %%mm2                             \n\t"   /* H(a+e - (b+d))/2 */\
                "psraw $3, %%mm0                                \n\t"   /* L(a+e - (b+d))/16 */\
                "psraw $3, %%mm2                                \n\t"   /* H(a+e - (b+d))/16 */\
                "psubw %%mm0, %%mm1                             \n\t"   /* L(9b + 9d - a - e)/16 */\
                "psubw %%mm2, %%mm3                             \n\t"   /* H(9b + 9d - a - e)/16 */\
                "packuswb %%mm3, %%mm1                          \n\t"\
                "movq %%mm1, " #c "                             \n\t"

DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%ebx, %1))
DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%ebx), (%%ebx, %1), (%0, %1, 8))
DEINT_CUBIC((%0, %1, 4), (%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8), (%%ecx))
DEINT_CUBIC((%%ebx, %1), (%0, %1, 8), (%%ebx, %1, 4), (%%ecx), (%%ecx, %1, 2))

                : : "r" (src), "r" (stride)
                : "%eax", "%ebx", "ecx"
        );
#else
        int x;
        for(x=0; x<8; x++)
        {
                int k;
                // output rows 3, 5, 7, 9; the taps sit on the even rows 2k, 2k+2, 2k+4, 2k+6,
                // which are never written, so the 4 results do not depend on each other
                for(k=0; k<4; k++)
                {
                        const uint8_t *taps= src + stride*2*k;
                        int v= -taps[0] + 9*taps[stride*2] + 9*taps[stride*4] - taps[stride*6];

                        // clamp instead of letting the store into uint8_t wrap around;
                        // negative sums are caught before the shift so that no negative value is shifted
                        if(v < 0) v= 0;
                        else
                        {
                                v>>= 4;
                                if(v > 255) v= 255;
                        }
                        src[stride*(2*k + 3)]= v;
                }
                src++;
        }
#endif
}
1972
1973 /**
1974  * Deinterlaces the given block by blending every row with its two vertical neighbours, weights 1 2 1 (sum divided by 4)
1975  * will be called for every 8x8 block, and can read & write into an 8x16 block
1976  * will shift the image up by 1 line (FIXME if this is a problem): output row n is the blend centred on input row n+1, rows 0 to 7 are written and rows 0 to 9 are read
1977  */
1978 static inline void deInterlaceBlendLinear(uint8_t src[], int stride)
1979 {
1980 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1981         asm volatile(
1982                 "leal (%0, %1), %%eax                           \n\t" // eax = src + stride   (row 1)
1983                 "leal (%%eax, %1, 4), %%ebx                     \n\t" // ebx = src + 5*stride (row 5)
1984 //      0       1       2       3       4       5       6       7       8       9
1985 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
1986
1987                 "movq (%0), %%mm0                               \n\t" // L0
1988                 "movq (%%eax, %1), %%mm1                        \n\t" // L2
1989                 PAVGB(%%mm1, %%mm0)                                   // L0+L2
1990                 "movq (%%eax), %%mm2                            \n\t" // L1
1991                 PAVGB(%%mm2, %%mm0)                                   // 2L1 + L0 + L2
1992                 "movq %%mm0, (%0)                               \n\t" // -> row 0
1993                 "movq (%%eax, %1, 2), %%mm0                     \n\t" // L3
1994                 PAVGB(%%mm0, %%mm2)                                   // L1+L3
1995                 PAVGB(%%mm1, %%mm2)                                   // 2L2 + L1 + L3
1996                 "movq %%mm2, (%%eax)                            \n\t" // -> row 1
1997                 "movq (%0, %1, 4), %%mm2                        \n\t" // L4
1998                 PAVGB(%%mm2, %%mm1)                                   // L2+L4
1999                 PAVGB(%%mm0, %%mm1)                                   // 2L3 + L2 + L4
2000                 "movq %%mm1, (%%eax, %1)                        \n\t" // -> row 2
2001                 "movq (%%ebx), %%mm1                            \n\t" // L5
2002                 PAVGB(%%mm1, %%mm0)                                   // L3+L5
2003                 PAVGB(%%mm2, %%mm0)                                   // 2L4 + L3 + L5
2004                 "movq %%mm0, (%%eax, %1, 2)                     \n\t" // -> row 3
2005                 "movq (%%ebx, %1), %%mm0                        \n\t" // L6
2006                 PAVGB(%%mm0, %%mm2)                                   // L4+L6
2007                 PAVGB(%%mm1, %%mm2)                                   // 2L5 + L4 + L6
2008                 "movq %%mm2, (%0, %1, 4)                        \n\t" // -> row 4
2009                 "movq (%%ebx, %1, 2), %%mm2                     \n\t" // L7
2010                 PAVGB(%%mm2, %%mm1)                                   // L5+L7
2011                 PAVGB(%%mm0, %%mm1)                                   // 2L6 + L5 + L7
2012                 "movq %%mm1, (%%ebx)                            \n\t" // -> row 5
2013                 "movq (%0, %1, 8), %%mm1                        \n\t" // L8
2014                 PAVGB(%%mm1, %%mm0)                                   // L6+L8
2015                 PAVGB(%%mm2, %%mm0)                                   // 2L7 + L6 + L8
2016                 "movq %%mm0, (%%ebx, %1)                        \n\t" // -> row 6
2017                 "movq (%%ebx, %1, 4), %%mm0                     \n\t" // L9
2018                 PAVGB(%%mm0, %%mm2)                                   // L7+L9
2019                 PAVGB(%%mm1, %%mm2)                                   // 2L8 + L7 + L9
2020                 "movq %%mm2, (%%ebx, %1, 2)                     \n\t" // -> row 7
2021
2022
2023                 : : "r" (src), "r" (stride)
2024                 : "%eax", "%ebx"
2025         );
2026 #else
2027         int x; // column inside the block
2028         for(x=0; x<8; x++)
2029         {
2030                 src[0       ] = (src[0       ] + 2*src[stride  ] + src[stride*2])>>2; // each line only reads rows at or below the one it writes, so working top to bottom in place always sees unmodified input
2031                 src[stride  ] = (src[stride  ] + 2*src[stride*2] + src[stride*3])>>2;
2032                 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
2033                 src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
2034                 src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
2035                 src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
2036                 src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
2037                 src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
2038                 src++; // next column
2039         }
2040 #endif
2041 }
2042
/**
 * Deinterlaces the given block with a vertical 3 tap median filter
 * rows 1, 3, 5 and 7 are each replaced by the median of the row itself and the
 * rows directly above and below; rows 0 to 8 are read, even rows are never written
 * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block
 * src points to the first pixel of the block, stride is the distance in bytes
 * between two rows
 */
static inline void deInterlaceMedian(uint8_t src[], int stride)
{
#ifdef HAVE_MMX
#ifdef HAVE_MMX2
        asm volatile(
                "leal (%0, %1), %%eax                           \n\t"
                "leal (%%eax, %1, 4), %%ebx                     \n\t"
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1

                // each group computes min( max(a,b), max( min(a,b), c ) ), the median of 3 rows
                "movq (%0), %%mm0                               \n\t" // L0
                "movq (%%eax, %1), %%mm2                        \n\t" // L2
                "movq (%%eax), %%mm1                            \n\t" // L1
                "movq %%mm0, %%mm3                              \n\t"
                "pmaxub %%mm1, %%mm0                            \n\t" //
                "pminub %%mm3, %%mm1                            \n\t" //
                "pmaxub %%mm2, %%mm1                            \n\t" //
                "pminub %%mm1, %%mm0                            \n\t"
                "movq %%mm0, (%%eax)                            \n\t" // -> row 1

                "movq (%0, %1, 4), %%mm0                        \n\t" // L4
                "movq (%%eax, %1, 2), %%mm1                     \n\t" // L3
                "movq %%mm2, %%mm3                              \n\t"
                "pmaxub %%mm1, %%mm2                            \n\t" //
                "pminub %%mm3, %%mm1                            \n\t" //
                "pmaxub %%mm0, %%mm1                            \n\t" //
                "pminub %%mm1, %%mm2                            \n\t"
                "movq %%mm2, (%%eax, %1, 2)                     \n\t" // -> row 3

                "movq (%%ebx), %%mm2                            \n\t" // L5
                "movq (%%ebx, %1), %%mm1                        \n\t" // L6
                "movq %%mm2, %%mm3                              \n\t"
                "pmaxub %%mm0, %%mm2                            \n\t" //
                "pminub %%mm3, %%mm0                            \n\t" //
                "pmaxub %%mm1, %%mm0                            \n\t" //
                "pminub %%mm0, %%mm2                            \n\t"
                "movq %%mm2, (%%ebx)                            \n\t" // -> row 5

                "movq (%%ebx, %1, 2), %%mm2                     \n\t" // L7
                "movq (%0, %1, 8), %%mm0                        \n\t" // L8
                "movq %%mm2, %%mm3                              \n\t"
                "pmaxub %%mm0, %%mm2                            \n\t" //
                "pminub %%mm3, %%mm0                            \n\t" //
                "pmaxub %%mm1, %%mm0                            \n\t" //
                "pminub %%mm0, %%mm2                            \n\t"
                "movq %%mm2, (%%ebx, %1, 2)                     \n\t" // -> row 7


                : : "r" (src), "r" (stride)
                : "%eax", "%ebx"
        );

#else // MMX without MMX2
        asm volatile(
                "leal (%0, %1), %%eax                           \n\t"
                "leal (%%eax, %1, 4), %%ebx                     \n\t"
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
                "pxor %%mm7, %%mm7                              \n\t"

// a and c are the rows above and below, b is the row that is read and overwritten
// pminub/pmaxub are not used here; the selection is done with comparison masks
// built from psubusb + pcmpeqb
#define MEDIAN(a,b,c)\
                "movq " #a ", %%mm0                             \n\t"\
                "movq " #b ", %%mm2                             \n\t"\
                "movq " #c ", %%mm1                             \n\t"\
                "movq %%mm0, %%mm3                              \n\t"\
                "movq %%mm1, %%mm4                              \n\t"\
                "movq %%mm2, %%mm5                              \n\t"\
                "psubusb %%mm1, %%mm3                           \n\t"\
                "psubusb %%mm2, %%mm4                           \n\t"\
                "psubusb %%mm0, %%mm5                           \n\t"\
                "pcmpeqb %%mm7, %%mm3                           \n\t"\
                "pcmpeqb %%mm7, %%mm4                           \n\t"\
                "pcmpeqb %%mm7, %%mm5                           \n\t"\
                "movq %%mm3, %%mm6                              \n\t"\
                "pxor %%mm4, %%mm3                              \n\t"\
                "pxor %%mm5, %%mm4                              \n\t"\
                "pxor %%mm6, %%mm5                              \n\t"\
                "por %%mm3, %%mm1                               \n\t"\
                "por %%mm4, %%mm2                               \n\t"\
                "por %%mm5, %%mm0                               \n\t"\
                "pand %%mm2, %%mm0                              \n\t"\
                "pand %%mm1, %%mm0                              \n\t"\
                "movq %%mm0, " #b "                             \n\t"

MEDIAN((%0), (%%eax), (%%eax, %1))
MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1))
MEDIAN((%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8))

                : : "r" (src), "r" (stride)
                : "%eax", "%ebx"
        );
#endif // HAVE_MMX2
#else // no MMX
        int x;
        for(x=0; x<8; x++)
        {
                int y;
                // rows 1, 3, 5, 7; only even (never written) rows serve as neighbours,
                // so the 4 medians do not depend on each other
                for(y=0; y<4; y++)
                {
                        uint8_t *col= src + stride*2*y;
                        const int a= col[0];
                        const int b= col[stride];
                        const int c= col[stride*2];
                        const int lo= a < b ? a : b;
                        const int hi= a < b ? b : a;
                        // same formula as the MMX2 version: min( max(a,b), max( min(a,b), c ) )
                        const int mid= lo > c ? lo : c;

                        col[stride]= hi < mid ? hi : mid;
                }
                src++;
        }
#endif
}
2157
2158 #ifdef HAVE_ODIVX_POSTPROCESS
2159 #include "../opendivx/postprocess.h"
2160 int use_old_pp=0; // if nonzero, postprocess() calls odivx_postprocess() and getPpModeForQuality() returns the flags from its odivx table
2161 #endif // HAVE_ODIVX_POSTPROCESS
2162
2163 static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2164         QP_STORE_T QPs[], int QPStride, int isColor, int mode); // forward declaration only, postprocess() below calls it once per plane; the definition is not in this part of the file
2165
2166 /**
2167  * ...
2168  */
2169 void  postprocess(unsigned char * src[], int src_stride,
2170                  unsigned char * dst[], int dst_stride,
2171                  int horizontal_size,   int vertical_size,
2172                  QP_STORE_T *QP_store,  int QP_stride,
2173                                           int mode)
2174 {
2175
2176 #ifdef HAVE_ODIVX_POSTPROCESS
2177 // Note: I could make this shit outside of this file, but it would mean one
2178 // more function call...
2179         if(use_old_pp){
2180             odivx_postprocess(src,src_stride,dst,dst_stride,horizontal_size,vertical_size,QP_store,QP_stride,mode);
2181             return;
2182         }
2183 #endif
2184
2185 /*
2186         long long T= rdtsc();
2187         for(int y=vertical_size-1; y>=0 ; y--)
2188                 memcpy(dst[0] + y*src_stride, src[0] + y*src_stride,src_stride);
2189 //      memcpy(dst[0], src[0],src_stride*vertical_size);
2190         printf("%4dk\r", (rdtsc()-T)/1000);
2191
2192         return;
2193 */
2194 /*
2195         long long T= rdtsc();
2196         while( (rdtsc() - T)/1000 < 4000);
2197
2198         return;
2199 */
2200         postProcess(src[0], src_stride, dst[0], dst_stride,
2201                 horizontal_size, vertical_size, QP_store, QP_stride, 0, mode);
2202
2203         horizontal_size >>= 1;
2204         vertical_size   >>= 1;
2205         src_stride      >>= 1;
2206         dst_stride      >>= 1;
2207         mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00);
2208
2209         if(1)
2210         {
2211                 postProcess(src[1], src_stride, dst[1], dst_stride,
2212                         horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
2213                 postProcess(src[2], src_stride, dst[2], dst_stride,
2214                         horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
2215         }
2216         else
2217         {
2218                 memcpy(dst[1], src[1], src_stride*horizontal_size);
2219                 memcpy(dst[2], src[2], src_stride*horizontal_size);
2220         }
2221 }
2222
2223 /**
2224  * gets the mode flags for a given quality (larger values mean slower but better postprocessing)
2225  * 0 <= quality <= 6
2226  */
2227 int getPpModeForQuality(int quality){
2228         int modes[1+GET_PP_QUALITY_MAX]= {
2229                 0,
2230 #if 1
2231                 // horizontal filters first
2232                 LUM_H_DEBLOCK,
2233                 LUM_H_DEBLOCK | LUM_V_DEBLOCK,
2234                 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK,
2235                 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK,
2236                 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING,
2237                 LUM_H_DEBLOCK | LUM_V_DEBLOCK | CHROM_H_DEBLOCK | CHROM_V_DEBLOCK | LUM_DERING | CHROM_DERING
2238 #else
2239                 // vertical filters first
2240                 LUM_V_DEBLOCK,
2241                 LUM_V_DEBLOCK | LUM_H_DEBLOCK,
2242                 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK,
2243                 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK,
2244                 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING,
2245                 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING | CHROM_DERING
2246 #endif
2247         };
2248
2249 #ifdef HAVE_ODIVX_POSTPROCESS
2250         int odivx_modes[1+GET_PP_QUALITY_MAX]= {
2251                 0,
2252                 PP_DEBLOCK_Y_H,
2253                 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V,
2254                 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H,
2255                 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V,
2256                 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y,
2257                 PP_DEBLOCK_Y_H|PP_DEBLOCK_Y_V|PP_DEBLOCK_C_H|PP_DEBLOCK_C_V|PP_DERING_Y|PP_DERING_C
2258         };
2259         if(use_old_pp) return odivx_modes[quality];
2260 #endif
2261         return modes[quality];
2262 }
2263
2264 //} // extern "C"
2265
2266 /**
2267  * Copies a block from src to dst and fixes the blacklevel
2268  * numLines must be a multiple of 4
2269  * levelFix == 0 -> dont touch the brighness & contrast
2270  */
2271 static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
2272         int numLines, int levelFix)
2273 {
2274         int i;
2275         if(levelFix)
2276         {
2277 #ifdef HAVE_MMX
2278                                         asm volatile(
2279                                                 "movl %4, %%eax \n\t"
2280                                                 "movl %%eax, temp0\n\t"
2281                                                 "pushl %0 \n\t"
2282                                                 "pushl %1 \n\t"
2283                                                 "leal (%2,%2), %%eax    \n\t"
2284                                                 "leal (%3,%3), %%ebx    \n\t"
2285                                                 "movq packedYOffset, %%mm2      \n\t"
2286                                                 "movq packedYScale, %%mm3       \n\t"
2287                                                 "pxor %%mm4, %%mm4      \n\t"
2288
2289 #define SCALED_CPY                                      \
2290                                                 "movq (%0), %%mm0       \n\t"\
2291                                                 "movq (%0,%2), %%mm1    \n\t"\
2292                                                 "psubusb %%mm2, %%mm0   \n\t"\
2293                                                 "psubusb %%mm2, %%mm1   \n\t"\
2294                                                 "movq %%mm0, %%mm5      \n\t"\
2295                                                 "punpcklbw %%mm4, %%mm0 \n\t"\
2296                                                 "punpckhbw %%mm4, %%mm5 \n\t"\
2297                                                 "psllw $7, %%mm0        \n\t"\
2298                                                 "psllw $7, %%mm5        \n\t"\
2299                                                 "pmulhw %%mm3, %%mm0    \n\t"\
2300                                                 "pmulhw %%mm3, %%mm5    \n\t"\
2301                                                 "packuswb %%mm5, %%mm0  \n\t"\
2302                                                 "movq %%mm0, (%1)       \n\t"\
2303                                                 "movq %%mm1, %%mm5      \n\t"\
2304                                                 "punpcklbw %%mm4, %%mm1 \n\t"\
2305                                                 "punpckhbw %%mm4, %%mm5 \n\t"\
2306                                                 "psllw $7, %%mm1        \n\t"\
2307                                                 "psllw $7, %%mm5        \n\t"\
2308                                                 "pmulhw %%mm3, %%mm1    \n\t"\
2309                                                 "pmulhw %%mm3, %%mm5    \n\t"\
2310                                                 "packuswb %%mm5, %%mm1  \n\t"\
2311                                                 "movq %%mm1, (%1, %3)   \n\t"\
2312
2313                                                 "1:                     \n\t"
2314 SCALED_CPY
2315                                                 "addl %%eax, %0         \n\t"
2316                                                 "addl %%ebx, %1         \n\t"
2317 SCALED_CPY
2318                                                 "addl %%eax, %0         \n\t"
2319                                                 "addl %%ebx, %1         \n\t"
2320                                                 "decl temp0             \n\t"
2321                                                 "jnz 1b                 \n\t"
2322
2323                                                 "popl %1 \n\t"
2324                                                 "popl %0 \n\t"
2325                                                 : : "r" (src),
2326                                                 "r" (dst),
2327                                                 "r" (srcStride),
2328                                                 "r" (dstStride),
2329                                                 "m" (numLines>>2)
2330                                                 : "%eax", "%ebx"
2331                                         );
2332 #else
2333                                 for(i=0; i<numLines; i++)
2334                                         memcpy( &(dst[dstStride*i]),
2335                                                 &(src[srcStride*i]), BLOCK_SIZE);
2336 #endif
2337         }
2338         else
2339         {
2340 #ifdef HAVE_MMX
2341                                         asm volatile(
2342                                                 "movl %4, %%eax \n\t"
2343                                                 "movl %%eax, temp0\n\t"
2344                                                 "pushl %0 \n\t"
2345                                                 "pushl %1 \n\t"
2346                                                 "leal (%2,%2), %%eax    \n\t"
2347                                                 "leal (%3,%3), %%ebx    \n\t"
2348                                                 "movq packedYOffset, %%mm2      \n\t"
2349                                                 "movq packedYScale, %%mm3       \n\t"
2350
2351 #define SIMPLE_CPY                                      \
2352                                                 "movq (%0), %%mm0       \n\t"\
2353                                                 "movq (%0,%2), %%mm1    \n\t"\
2354                                                 "movq %%mm0, (%1)       \n\t"\
2355                                                 "movq %%mm1, (%1, %3)   \n\t"\
2356
2357                                                 "1:                     \n\t"
2358 SIMPLE_CPY
2359                                                 "addl %%eax, %0         \n\t"
2360                                                 "addl %%ebx, %1         \n\t"
2361 SIMPLE_CPY
2362                                                 "addl %%eax, %0         \n\t"
2363                                                 "addl %%ebx, %1         \n\t"
2364                                                 "decl temp0             \n\t"
2365                                                 "jnz 1b                 \n\t"
2366
2367                                                 "popl %1 \n\t"
2368                                                 "popl %0 \n\t"
2369                                                 : : "r" (src),
2370                                                 "r" (dst),
2371                                                 "r" (srcStride),
2372                                                 "r" (dstStride),
2373                                                 "m" (numLines>>2)
2374                                                 : "%eax", "%ebx"
2375                                         );
2376 #else
2377                                 for(i=0; i<numLines; i++)
2378                                         memcpy( &(dst[dstStride*i]),
2379                                                 &(src[srcStride*i]), BLOCK_SIZE);
2380 #endif
2381         }
2382 }
2383
2384
2385 /**
2386  * Filters one plane of bytes (Y or U or V values) from src into dst.
      *
      * The plane is processed in steps of BLOCK_SIZE in x and y. For every step,
      * 8 lines starting 5 lines below the current position are copied from src
      * to dst with blockCopy(); then, depending on the bits set in mode, a
      * deinterlacing filter, the vertical deblocking filter and the horizontal
      * deblocking filter are run in place on dst, followed by dering().
      *
      * Rows that cannot provide 16 lines below the current position
      * (y+15 >= height) and the last block of a row (x+7 >= width) are processed
      * in scratch buffers and copied back afterwards.
      *
      * State kept across calls: the histogram, the frame counter and the scratch
      * buffers are function-level statics; they are allocated on first use and
      * not released here. The function also writes packedYOffset, packedYScale
      * and (with HAVE_MMX) pQPb, and may write maxAllowedY / minAllowedY; those
      * are defined outside this function.
      *
      * NOTE(review): the scratch buffers have a fixed size of 1024*24 bytes while
      * the code addresses up to 16 lines of srcStride / dstStride bytes in them;
      * this looks like it limits the usable stride to about 1536 -- confirm.
      *
      * @param src        source plane
      * @param srcStride  offset in bytes between two lines of src
      * @param dst        destination plane
      * @param dstStride  offset in bytes between two lines of dst
      * @param width      width of the plane in bytes (bound of the x loops)
      * @param height     number of lines (bound of the y loop)
      * @param QPs        QP table; read at [(y>>3)*QPStride + (x>>3)] when isColor
      *                   is set and at [(y>>4)*QPStride + (x>>4)] otherwise
      * @param QPStride   number of QPs entries per table row
      * @param isColor    non-zero skips the histogram based level computation;
      *                   presumably set for the U and V planes -- confirm with caller
      * @param mode       bit mask selecting the filters (FULL_Y_RANGE, LEVEL_FIX,
      *                   the *_DEINT_FILTER bits, V_DEBLOCK, V_RK1_FILTER,
      *                   V_X1_FILTER, H_DEBLOCK, H_X1_FILTER)
2387  */
2388 static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2389         QP_STORE_T QPs[], int QPStride, int isColor, int mode)
2390 {
2391         int x,y;
2392         /* 64 bit counters are needed here, otherwise they would overflow
2393            after watching a black picture for 5 hours */
2394         static uint64_t *yHistogram= NULL;
2395         int black=0, white=255; // blackest black and whitest white in the picture
2396
2397         /* Temporary buffers for handling the last row(s) */
2398         static uint8_t *tempDst= NULL;
2399         static uint8_t *tempSrc= NULL;
2400
2401         /* Temporary buffers for handling the last block */
2402         static uint8_t *tempDstBlock= NULL;
2403         static uint8_t *tempSrcBlock= NULL;
2404
             /* Real block pointers, saved while the temporary block buffers are in use */
2405         uint8_t *dstBlockPtrBackup;
2406         uint8_t *srcBlockPtrBackup;
2407
2408 #ifdef TIMING
2409         long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0;
2410         sumTime= rdtsc();
2411 #endif
2412
             /* First call: allocate the four scratch buffers with 8 byte alignment.
                NOTE(review): the memalign() results are not checked for NULL. */
2413         if(tempDst==NULL)
2414         {
2415                 tempDst= (uint8_t*)memalign(8, 1024*24);
2416                 tempSrc= (uint8_t*)memalign(8, 1024*24);
2417                 tempDstBlock= (uint8_t*)memalign(8, 1024*24);
2418                 tempSrcBlock= (uint8_t*)memalign(8, 1024*24);
2419         }
2420
             /* First call: allocate the 256 bin histogram and seed every bin.
                FULL_Y_RANGE is only evaluated here, i.e. on the call that allocates
                the histogram.
                NOTE(review): the malloc() result is not checked for NULL. */
2421         if(!yHistogram)
2422         {
2423                 int i;
2424                 yHistogram= (uint64_t*)malloc(8*256);
2425                 for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256;
2426
2427                 if(mode & FULL_Y_RANGE)
2428                 {
2429                         maxAllowedY=255;
2430                         minAllowedY=0;
2431                 }
2432         }
2433
             /* With isColor == 0: derive the black and white level from the histogram
                collected so far and pack the resulting offset and scale into
                packedYOffset / packedYScale */
2434         if(!isColor)
2435         {
2436                 uint64_t sum= 0;
2437                 int i;
2438                 static int framenum= -1;
2439                 uint64_t maxClipped;
2440                 uint64_t clipped;
2441                 double scale;
2442
                     /* framenum counts these calls from 0; on the second one bin 0 is
                        reset to its seed value */
2443                 framenum++;
2444                 if(framenum == 1) yHistogram[0]= width*height/64*15/256;
2445
2446                 for(i=0; i<256; i++)
2447                 {
2448                         sum+= yHistogram[i];
2449 //                      printf("%d ", yHistogram[i]);
2450                 }
2451 //              printf("\n\n");
2452
2453                 /* we always get a completely black picture first */
2454                 maxClipped= (uint64_t)(sum * maxClippedThreshold);
2455
                     /* walk down from bin 255, removing bins from the total, and stop
                        as soon as less than maxClipped is left */
2456                 clipped= sum;
2457                 for(black=255; black>0; black--)
2458                 {
2459                         if(clipped < maxClipped) break;
2460                         clipped-= yHistogram[black];
2461                 }
2462
                     /* same walk, upwards from bin 0 */
2463                 clipped= sum;
2464                 for(white=0; white<256; white++)
2465                 {
2466                         if(clipped < maxClipped) break;
2467                         clipped-= yHistogram[white];
2468                 }
2469
2470                 // we cannot handle negative corrections
2471                 packedYOffset= MAX(black - minAllowedY, 0);
                     /* replicate the value into the bytes above it
                        (assumes packedYOffset is 64 bit wide -- declared out of view) */
2472                 packedYOffset|= packedYOffset<<32;
2473                 packedYOffset|= packedYOffset<<16;
2474                 packedYOffset|= packedYOffset<<8;
2475
2476                 scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);
2477
                     /* scale as a rounded 16 bit value in which 512 stands for 1.0,
                        replicated into the 16 bit words above it */
2478                 packedYScale= (uint16_t)(scale*512.0 + 0.5);
2479                 packedYScale|= packedYScale<<32;
2480                 packedYScale|= packedYScale<<16;
2481         }
2482         else
2483         {
                     /* NOTE(review): 0x0100 per word is 256, while the branch above uses
                        512 for a scale of 1.0 -- confirm which unit is intended */
2484                 packedYScale= 0x0100010001000100LL;
2485                 packedYOffset= 0;
2486         }
2487
2488         /* copy first row of 8x8 blocks */
2489         for(x=0; x<width; x+=BLOCK_SIZE)
2490                 blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX);
2491
2492         for(y=0; y<height; y+=BLOCK_SIZE)
2493         {
2494                 //1% speedup if these are here instead of the inner loop
2495                 uint8_t *srcBlock= &(src[y*srcStride]);
2496                 uint8_t *dstBlock= &(dst[y*dstStride]);
2497
2498                 /* can we mess with a 8x16 block from srcBlock/dstBlock downwards, if not
2499                    then use a temporary buffer */
2500                 if(y+15 >= height)
2501                 {
2502                         /* copy from line 5 to 12 of src, these will be copied with
2503                            blockcopy to dst later */
2504                         memcpy(tempSrc + srcStride*5, srcBlock + srcStride*5,
2505                                 srcStride*MAX(height-y-5, 0) );
2506
2507                         /* duplicate last line to fill the void up to line 12 */
2508                         if(y+12 >= height)
2509                         {
2510                                 int i;
2511                                 for(i=height-y; i<=12; i++)
2512                                         memcpy(tempSrc + srcStride*i,
2513                                                 src + srcStride*(height-1), srcStride);
2514                         }
2515
2516
2517                         /* copy up to 5 lines of dst */
2518                         memcpy(tempDst, dstBlock, dstStride*MIN(height-y, 5) );
2519                         dstBlock= tempDst;
2520                         srcBlock= tempSrc;
2521                 }
2522
2523                 // From this point on it is guaranteed that we can read and write 16 lines downward
2524                 // finish 1 block before the next, otherwise we might have a problem
2525                 // with the L1 Cache of the P4 ... or only a few blocks at a time or something
2526                 for(x=0; x<width; x+=BLOCK_SIZE)
2527                 {
                             /* QP of the current block; the table is indexed per 8x8 area when
                                isColor is set and per 16x16 area otherwise */
2528                         const int stride= dstStride;
2529                         int QP= isColor ?
2530                                 QPs[(y>>3)*QPStride + (x>>3)]:
2531                                 QPs[(y>>4)*QPStride + (x>>4)];
                             /* with level fixing and isColor == 0, QP is multiplied by the low
                                16 bits of packedYScale and shifted right by 8.
                                NOTE(review): this treats 256 as 1.0 while packedYScale is
                                computed with 512 as 1.0 above -- confirm */
2532                         if(!isColor && (mode & LEVEL_FIX)) QP= (QP* (packedYScale &0xFFFF))>>8;
                             /* store QP replicated into every byte of pQPb */
2533 #ifdef HAVE_MMX
2534                         asm volatile(
2535                                 "movd %0, %%mm7                                 \n\t"
2536                                 "packuswb %%mm7, %%mm7                          \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
2537                                 "packuswb %%mm7, %%mm7                          \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
2538                                 "packuswb %%mm7, %%mm7                          \n\t" // QP,..., QP
2539                                 "movq %%mm7, pQPb                               \n\t"
2540                                 : : "r" (QP)
2541                         );
2542 #endif
2543
2544 #ifdef MORE_TIMING
2545                         T0= rdtsc();
2546 #endif
2547
                             /* prefetch lines of src and dst 32 bytes ahead; the line that is
                                prefetched cycles with (x>>3)&3 */
2548 #ifdef HAVE_MMX2
2549                         prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
2550                         prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
2551                         prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
2552                         prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
2553 #elif defined(HAVE_3DNOW)
2554 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
2555 /*                      prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
2556                         prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
2557                         prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
2558                         prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
2559 */
2560 #endif
2561
                             /* sample one byte per block (first byte of line 5) into the histogram */
2562                         if(!isColor) yHistogram[ srcBlock[srcStride*5] ]++;
2563
2564                         //can we mess with a 8x16 block, if not use a temp buffer, yes again
2565                         if(x+7 >= width)
2566                         {
2567                                 int i;
2568                                 dstBlockPtrBackup= dstBlock;
2569                                 srcBlockPtrBackup= srcBlock;
2570
                                     /* copy the width-x valid bytes of 16 lines into the block buffers */
2571                                 for(i=0;i<BLOCK_SIZE*2; i++)
2572                                 {
2573                                         memcpy(tempSrcBlock+i*srcStride, srcBlock+i*srcStride, width-x);
2574                                         memcpy(tempDstBlock+i*dstStride, dstBlock+i*dstStride, width-x);
2575                                 }
2576
2577                                 dstBlock= tempDstBlock;
2578                                 srcBlock= tempSrcBlock;
2579                         }
2580
                             /* copy 8 lines, starting at line 5, from src to dst */
2581                         blockCopy(dstBlock + dstStride*5, dstStride,
2582                                 srcBlock + srcStride*5, srcStride, 8, mode & LEVEL_FIX);
2583
                             /* at most one deinterlacing filter runs; the first matching bit wins */
2584                         if(mode & LINEAR_IPOL_DEINT_FILTER)
2585                                 deInterlaceInterpolateLinear(dstBlock, dstStride);
2586                         else if(mode & LINEAR_BLEND_DEINT_FILTER)
2587                                 deInterlaceBlendLinear(dstBlock, dstStride);
2588                         else if(mode & MEDIAN_DEINT_FILTER)
2589                                 deInterlaceMedian(dstBlock, dstStride);
2590                         else if(mode & CUBIC_IPOL_DEINT_FILTER)
2591                                 deInterlaceInterpolateCubic(dstBlock, dstStride);
2592 /*                      else if(mode & CUBIC_BLEND_DEINT_FILTER)
2593                                 deInterlaceBlendCubic(dstBlock, dstStride);
2594 */
2595
2596                         /* only deblock if we have 2 blocks */
2597                         if(y + 8 < height)
2598                         {
2599 #ifdef MORE_TIMING
2600                                 T1= rdtsc();
2601                                 memcpyTime+= T1-T0;
2602                                 T0=T1;
2603 #endif
2604                                 if(mode & V_DEBLOCK)
2605                                 {
2606                                         if(mode & V_RK1_FILTER)
2607                                                 vertRK1Filter(dstBlock, stride, QP);
2608                                         else if(mode & V_X1_FILTER)
2609                                                 vertX1Filter(dstBlock, stride, QP);
2610                                         else
2611                                         {
                                                     /* default: low pass if isVertDC() and isVertMinMaxOk()
                                                        both hold, nothing if only isVertDC() holds,
                                                        doVertDefFilter() otherwise */
2612                                                 if( isVertDC(dstBlock, stride))
2613                                                 {
2614                                                         if(isVertMinMaxOk(dstBlock, stride, QP))
2615                                                                 doVertLowPass(dstBlock, stride, QP);
2616                                                 }
2617                                                 else
2618                                                         doVertDefFilter(dstBlock, stride, QP);
2619                                         }
2620                                 }
2621 #ifdef MORE_TIMING
2622                                 T1= rdtsc();
2623                                 vertTime+= T1-T0;
2624                                 T0=T1;
2625 #endif
2626                         }
2627
2628                         /* check if we have a previous block to deblock it with dstBlock */
2629                         if(x - 8 >= 0)
2630                         {
2631 #ifdef MORE_TIMING
2632                                 T0= rdtsc();
2633 #endif
                                     /* the horizontal filters are handed dstBlock-4, i.e. they start
                                        4 bytes left of the current block */
2634                                 if(mode & H_DEBLOCK)
2635                                 {
2636                                         if(mode & H_X1_FILTER)
2637                                                 horizX1Filter(dstBlock-4, stride, QP);
2638                                         else
2639                                         {
                                                     /* same decision structure as the vertical default filter,
                                                        with the min/max test run on tempBlock */
2640                                                 if( isHorizDCAndCopy2Temp(dstBlock-4, stride))
2641                                                 {
2642                                                         if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP))
2643                                                                 doHorizLowPassAndCopyBack(dstBlock-4, stride, QP);
2644                                                 }
2645                                                 else
2646                                                         doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP);
2647                                         }
2648                                 }
2649 #ifdef MORE_TIMING
2650                                 T1= rdtsc();
2651                                 horizTime+= T1-T0;
2652                                 T0=T1;
2653 #endif
                                     /* dering() gets the address 9 bytes left of and one line above
                                        dstBlock.
                                        NOTE(review): while dstBlock points into tempDst or
                                        tempDstBlock this address looks like it precedes the start
                                        of the buffer -- confirm */
2654                                 dering(dstBlock - 9 - stride, stride, QP);
2655                         }
                             /* first block of every row but the first: dering() gets the address
                                9 lines up and width-9 bytes to the right.
                                NOTE(review): same concern as above when dstBlock is tempDst */
2656                         else if(y!=0)
2657                                 dering(dstBlock - stride*9 + width-9, stride, QP);
2658                         //FIXME dering filter will not be applied to last block (bottom right)
2659
2660                         /* did we use a tmp-block buffer */
2661                         if(x+7 >= width)
2662                         {
2663                                 int i;
2664                                 dstBlock= dstBlockPtrBackup;
2665                                 srcBlock= srcBlockPtrBackup;
2666
                                     /* copy the width-x valid bytes of the 16 lines back to dst */
2667                                 for(i=0;i<BLOCK_SIZE*2; i++)
2668                                 {
2669                                         memcpy(dstBlock+i*dstStride, tempDstBlock+i*dstStride, width-x);
2670                                 }
2671                         }
2672
2673                         dstBlock+=8;
2674                         srcBlock+=8;
2675                 }
2676
2677                 /* did we use a tmp buffer */
2678                 if(y+15 >= height)
2679                 {
                             /* copy the height-y lines that exist in the picture back to dst */
2680                         uint8_t *dstBlock= &(dst[y*dstStride]);
2681                         memcpy(dstBlock, tempDst, dstStride*(height-y) );
2682                 }
2683         }
             /* leave the MMX state after the inline asm above */
2684 #ifdef HAVE_3DNOW
2685         asm volatile("femms");
2686 #elif defined (HAVE_MMX)
2687         asm volatile("emms");
2688 #endif
2689
2690 #ifdef TIMING
2691         // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...)
2692         sumTime= rdtsc() - sumTime;
2693         if(!isColor)
2694                 printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d    \r",
2695                         (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
2696                         (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)
2697                         , black, white);
2698 #endif
2699 }