]> git.sesse.net Git - ffmpeg/blob - postproc/postprocess_template.c
fixed a rounding bug thing in the X1 Filter
[ffmpeg] / postproc / postprocess_template.c
1 /*
2     Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)
3
4     This program is free software; you can redistribute it and/or modify
5     it under the terms of the GNU General Public License as published by
6     the Free Software Foundation; either version 2 of the License, or
7     (at your option) any later version.
8
9     This program is distributed in the hope that it will be useful,
10     but WITHOUT ANY WARRANTY; without even the implied warranty of
11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12     GNU General Public License for more details.
13
14     You should have received a copy of the GNU General Public License
15     along with this program; if not, write to the Free Software
16     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17 */
18
19 /*
20                         C       MMX     MMX2    3DNow*
21 isVertDC                Ec      Ec
22 isVertMinMaxOk          Ec      Ec
23 doVertLowPass           E               e       e*
24 doVertDefFilter         Ec      Ec      Ec
25 isHorizDC               Ec      Ec
26 isHorizMinMaxOk         a
27 doHorizLowPass          E               a       a*
28 doHorizDefFilter        E       ac      ac
29 deRing
30 Vertical RKAlgo1        E               a       a*
31 Vertical X1             a               E       E*
32 Horizontal X1           a               E       E*
33
34
35 * I don't have a 3DNow CPU -> it's untested
36 E = Exact implementation
37 e = almost exact implementation
38 a = alternative / approximate impl
39 c = checked against the other implementations (-vo md5)
40 */
41
42 /*
43 TODO:
44 verify that everything works as it should (how?)
45 reduce the time wasted on the mem transfer
46 implement dering
47 implement everything in C at least (done at the moment but ...)
48 unroll stuff if instructions depend too much on the prior one
49 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4?
50 move YScale thing to the end instead of fixing QP
51 write a faster and higher quality deblocking filter :)
52 do something about the speed of the horizontal filters
53 make the mainloop more flexible (variable number of blocks at once)
54         (the if/else stuff per block is slowing things down)
55 compare the quality & speed of all filters
56 implement a few simple deinterlacing filters
57 split this huge file
58 ...
59
60 Notes:
61
62 */
63
64 /*
65 Changelog: use the CVS log
66 0.1.3
67         bugfixes: last 3 lines not brightness/contrast corrected
68                 brightness statistics messed up with initial black pic
69         changed initial values of the brightness statistics
70         C++ -> C conversion
71         QP range question solved (very likely 1<=QP<=32 according to arpi)
72         new experimental vertical deblocking filter
73         RK filter has 3dNow support now (untested)
74 0.1.2
75         fixed a bug in the horizontal default filter
76         3dnow version of the Horizontal & Vertical Lowpass filters
77         mmx version of the Horizontal Default filter
78         mmx2 & C versions of a simple filter described in a paper from ramkishor & karandikar
79         added mode flags & quality2mode function
80 0.1.1
81 */
82
83
84 #include <inttypes.h>
85 #include <stdio.h>
86 #include <stdlib.h>
87 #include "../config.h"
88 //#undef HAVE_MMX2
89 //#define HAVE_3DNOW
90 //#undef HAVE_MMX
91 #include "postprocess.h"
92
93
/*
 * Constants and scratch variables for the inline assembly in this file.
 *
 * The asm blocks address these objects by symbol name inside the asm string
 * (e.g. "movq b7E, %%mm7", "movq pQPb, %%mm0"), which the compiler cannot see.
 * A file-local object that looks unreferenced may be discarded by the
 * optimizer, which then breaks the link.  PP_ASM_VAR therefore marks every
 * object as "used" and also requests 8-byte alignment for the 64-bit movq
 * loads.
 *
 * Naming scheme, as established by the initializers below:
 *   wXX        four 16-bit words, each holding 0xXX
 *   bmDDDDDDDD byte mask, one digit per byte (most significant byte first),
 *              1 = 0xFF, 0 = 0x00
 *   bXX        eight bytes, each holding 0xXX
 */
#ifndef PP_ASM_VAR
#ifdef __GNUC__
#define PP_ASM_VAR __attribute__((used, aligned(8)))
#else
#define PP_ASM_VAR
#endif
#endif

/* NOTE(review): presumably the packed brightness offset / contrast scale
 * (four 16-bit words each); the code using them is not visible here. */
static uint64_t packedYOffset PP_ASM_VAR = 0x0000000000000000ULL;
static uint64_t packedYScale  PP_ASM_VAR = 0x0100010001000100ULL;

/* packed 16-bit word constants */
static uint64_t w05           PP_ASM_VAR = 0x0005000500050005ULL;
static uint64_t w20           PP_ASM_VAR = 0x0020002000200020ULL;
static uint64_t w1400         PP_ASM_VAR = 0x1400140014001400ULL;

/* byte masks */
static uint64_t bm00000001    PP_ASM_VAR = 0x00000000000000FFULL;
static uint64_t bm00010000    PP_ASM_VAR = 0x000000FF00000000ULL;
static uint64_t bm00001000    PP_ASM_VAR = 0x00000000FF000000ULL;
static uint64_t bm10000000    PP_ASM_VAR = 0xFF00000000000000ULL;
static uint64_t bm10000001    PP_ASM_VAR = 0xFF000000000000FFULL;
static uint64_t bm11000011    PP_ASM_VAR = 0xFFFF00000000FFFFULL;
static uint64_t bm00000011    PP_ASM_VAR = 0x000000000000FFFFULL;
static uint64_t bm11111110    PP_ASM_VAR = 0xFFFFFFFFFFFFFF00ULL;
static uint64_t bm11000000    PP_ASM_VAR = 0xFFFF000000000000ULL;
static uint64_t bm00011000    PP_ASM_VAR = 0x000000FFFF000000ULL;
static uint64_t bm00110011    PP_ASM_VAR = 0x0000FFFF0000FFFFULL;
static uint64_t bm11001100    PP_ASM_VAR = 0xFFFF0000FFFF0000ULL;

/* packed byte constants */
static uint64_t b00           PP_ASM_VAR = 0x0000000000000000ULL;
static uint64_t b01           PP_ASM_VAR = 0x0101010101010101ULL;
static uint64_t b02           PP_ASM_VAR = 0x0202020202020202ULL;
static uint64_t b0F           PP_ASM_VAR = 0x0F0F0F0F0F0F0F0FULL;
static uint64_t bFF           PP_ASM_VAR = 0xFFFFFFFFFFFFFFFFULL;
static uint64_t b20           PP_ASM_VAR = 0x2020202020202020ULL;
static uint64_t b80           PP_ASM_VAR = 0x8080808080808080ULL;
static uint64_t b7E           PP_ASM_VAR = 0x7E7E7E7E7E7E7E7EULL;
static uint64_t b7C           PP_ASM_VAR = 0x7C7C7C7C7C7C7C7CULL;
static uint64_t b3F           PP_ASM_VAR = 0x3F3F3F3F3F3F3F3FULL;

/* 64-bit scratch slots */
static uint64_t temp0         PP_ASM_VAR = 0;
static uint64_t temp1         PP_ASM_VAR = 0;
static uint64_t temp2         PP_ASM_VAR = 0;
static uint64_t temp3         PP_ASM_VAR = 0;
static uint64_t temp4         PP_ASM_VAR = 0;
static uint64_t temp5         PP_ASM_VAR = 0;

/* The filters load this as "QP,..., QP", i.e. the quantizer replicated into
 * all eight bytes; whoever sets it lives outside this part of the file. */
static uint64_t pQPb          PP_ASM_VAR = 0;

/* 16x16 byte scratch block */
static uint8_t tempBlock[16*16] PP_ASM_VAR;
129
// NOTE(review): presumably the horizontal counterpart of vFlatnessThreshold; its user is not visible in this part of the file
130 int hFlatnessThreshold= 56 - 16;
// isVertDC() reports a block as flat when more than this many of its
// vertically adjacent pixel pairs differ by at most 1
131 int vFlatnessThreshold= 56 - 16;
132
133 //amount of "black" you are willing to lose to get a brightness corrected picture
134 double maxClippedThreshold= 0.01;
135
// NOTE(review): presumably the luma range targeted by the brightness/contrast correction - confirm against the code that reads them
136 int maxAllowedY=255;
137 //FIXME can never make a movie's black brighter (does anyone need that?)
138 int minAllowedY=0;
139
140
/**
 * Read the CPU time stamp counter.
 *
 * RDTSC returns the counter in edx:eax.  The two halves are fetched through
 * separate "=a" / "=d" outputs and combined in C, because the "=A" constraint
 * only denotes the edx:eax pair on 32-bit x86; on x86-64 it selects a single
 * register and would drop half of the result.
 *
 * @return the 64-bit time stamp counter value
 */
static inline long long rdtsc(void)
{
        uint32_t lo, hi;
        __asm__ __volatile__(   "rdtsc\n\t"
                : "=a" (lo), "=d" (hi)
        );
        return (long long)(((uint64_t)hi << 32) | lo);
}
150
/**
 * Issue a PREFETCHNTA (non-temporal hint) for the memory at p.
 * The pointed-to data is neither read nor modified by the C code, hence the
 * const qualifier.
 * @param p address handed to the prefetch instruction
 */
static inline void prefetchnta(const void *p)
{
        /* __asm__ rather than asm so this also builds with strict -std= modes */
        __asm__ __volatile__(   "prefetchnta (%0)\n\t"
                : : "r" (p)
        );
}
157
/**
 * Issue a PREFETCHT0 hint for the memory at p.
 * The pointed-to data is neither read nor modified by the C code, hence the
 * const qualifier.
 * @param p address handed to the prefetch instruction
 */
static inline void prefetcht0(const void *p)
{
        /* __asm__ rather than asm so this also builds with strict -std= modes */
        __asm__ __volatile__(   "prefetcht0 (%0)\n\t"
                : : "r" (p)
        );
}
164
/**
 * Issue a PREFETCHT1 hint for the memory at p.
 * The pointed-to data is neither read nor modified by the C code, hence the
 * const qualifier.
 * @param p address handed to the prefetch instruction
 */
static inline void prefetcht1(const void *p)
{
        /* __asm__ rather than asm so this also builds with strict -std= modes */
        __asm__ __volatile__(   "prefetcht1 (%0)\n\t"
                : : "r" (p)
        );
}
171
/**
 * Issue a PREFETCHT2 hint for the memory at p.
 * The pointed-to data is neither read nor modified by the C code, hence the
 * const qualifier.
 * @param p address handed to the prefetch instruction
 */
static inline void prefetcht2(const void *p)
{
        /* __asm__ rather than asm so this also builds with strict -std= modes */
        __asm__ __volatile__(   "prefetcht2 (%0)\n\t"
                : : "r" (p)
        );
}
178
179 //FIXME? in the MMX path the byte difference wraps around, so |255-0| counts as 1 (shouldn't be a problem ...)
180 /**
181  * Check if the middle 8x8 Block in the given 8x10 block is flat
 *
 * Counts the vertically adjacent pixel pairs (row y against row y+1) whose
 * difference is -1, 0 or 1, starting at the row one stride below src, and
 * compares that count against the global vFlatnessThreshold.
 * The MMX path compares 7 row pairs of 8 bytes each (at most 56 matches);
 * the C path runs BLOCK_SIZE-1 row pairs of 8 columns.
 *
 * @param src    top-left pixel of the 8x10 block (one row above the 8x8 block)
 * @param stride distance in bytes between two rows
 * @return 1 if the count exceeds vFlatnessThreshold, 0 otherwise
182  */
183 static inline int isVertDC(uint8_t src[], int stride){
184 //      return true;
185         int numEq= 0;
186         int y; // only used by the C path
187         src+= stride; // src points to begin of the 8x8 Block
188 #ifdef HAVE_MMX
189         asm volatile(
190 //              "int $3 \n\t"
                // %1 (src) is advanced row by row below, so it is saved here and restored by the popl at the end
191                 "pushl %1\n\t"
                // a byte pair counts as equal when (a - b + 0x7E) > 0x7C as signed bytes,
                // which holds exactly for a - b = -1, 0 or 1 (mod 256)
192                 "movq b7E, %%mm7                                        \n\t" // mm7 = 0x7E in every byte
193                 "movq b7C, %%mm6                                        \n\t" // mm6 = 0x7C in every byte
194                 "movq (%1), %%mm0                               \n\t"
195                 "addl %2, %1                                    \n\t"
196                 "movq (%1), %%mm1                               \n\t"
197                 "psubb %%mm1, %%mm0                             \n\t" // mm0 = difference
198                 "paddb %%mm7, %%mm0                             \n\t"
199                 "pcmpgtb %%mm6, %%mm0                           \n\t" // 0xFF (-1) for every equal pair
200
                // the remaining 6 row pairs: same test, each result (-1 per match) is accumulated in mm0
201                 "addl %2, %1                                    \n\t"
202                 "movq (%1), %%mm2                               \n\t"
203                 "psubb %%mm2, %%mm1                             \n\t"
204                 "paddb %%mm7, %%mm1                             \n\t"
205                 "pcmpgtb %%mm6, %%mm1                           \n\t"
206                 "paddb %%mm1, %%mm0                             \n\t"
207
208                 "addl %2, %1                                    \n\t"
209                 "movq (%1), %%mm1                               \n\t"
210                 "psubb %%mm1, %%mm2                             \n\t"
211                 "paddb %%mm7, %%mm2                             \n\t"
212                 "pcmpgtb %%mm6, %%mm2                           \n\t"
213                 "paddb %%mm2, %%mm0                             \n\t"
214
215                 "addl %2, %1                                    \n\t"
216                 "movq (%1), %%mm2                               \n\t"
217                 "psubb %%mm2, %%mm1                             \n\t"
218                 "paddb %%mm7, %%mm1                             \n\t"
219                 "pcmpgtb %%mm6, %%mm1                           \n\t"
220                 "paddb %%mm1, %%mm0                             \n\t"
221
222                 "addl %2, %1                                    \n\t"
223                 "movq (%1), %%mm1                               \n\t"
224                 "psubb %%mm1, %%mm2                             \n\t"
225                 "paddb %%mm7, %%mm2                             \n\t"
226                 "pcmpgtb %%mm6, %%mm2                           \n\t"
227                 "paddb %%mm2, %%mm0                             \n\t"
228
229                 "addl %2, %1                                    \n\t"
230                 "movq (%1), %%mm2                               \n\t"
231                 "psubb %%mm2, %%mm1                             \n\t"
232                 "paddb %%mm7, %%mm1                             \n\t"
233                 "pcmpgtb %%mm6, %%mm1                           \n\t"
234                 "paddb %%mm1, %%mm0                             \n\t"
235
236                 "addl %2, %1                                    \n\t"
237                 "movq (%1), %%mm1                               \n\t"
238                 "psubb %%mm1, %%mm2                             \n\t"
239                 "paddb %%mm7, %%mm2                             \n\t"
240                 "pcmpgtb %%mm6, %%mm2                           \n\t"
241                 "paddb %%mm2, %%mm0                             \n\t"
242
                // horizontal add: fold the 8 per-column byte counters of mm0 into its lowest byte
243                 "                                               \n\t"
244                 "movq %%mm0, %%mm1                              \n\t"
245                 "psrlw $8, %%mm0                                \n\t"
246                 "paddb %%mm1, %%mm0                             \n\t"
247                 "movq %%mm0, %%mm1                              \n\t"
248                 "psrlq $16, %%mm0                               \n\t"
249                 "paddb %%mm1, %%mm0                             \n\t"
250                 "movq %%mm0, %%mm1                              \n\t"
251                 "psrlq $32, %%mm0                               \n\t"
252                 "paddb %%mm1, %%mm0                             \n\t"
253                 "popl %1\n\t"
254                 "movd %%mm0, %0                                 \n\t"
255                 : "=r" (numEq)
256                 : "r" (src), "r" (stride)
257                 );
258 //      printf("%d\n", numEq);
        // the low byte holds minus the number of matches (mod 256); negate it to get the count
259         numEq= (256 - (numEq & 0xFF)) &0xFF;
260
261 //      int asmEq= numEq;
262 //      numEq=0;
263 //      uint8_t *temp= src;
264
265 #else
266         for(y=0; y<BLOCK_SIZE-1; y++)
267         {
                // (a - b + 1) is 0, 1 or 2 exactly when a - b is -1, 0 or 1; negative values
                // become large after the mask and fail the < 3 test
268                 if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++;
269                 if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++;
270                 if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++;
271                 if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++;
272                 if(((src[4] - src[4+stride] + 1)&0xFFFF) < 3) numEq++;
273                 if(((src[5] - src[5+stride] + 1)&0xFFFF) < 3) numEq++;
274                 if(((src[6] - src[6+stride] + 1)&0xFFFF) < 3) numEq++;
275                 if(((src[7] - src[7+stride] + 1)&0xFFFF) < 3) numEq++;
276                 src+= stride;
277         }
278 #endif
279 /*      if(abs(numEq - asmEq) > 0)
280         {
281                 printf("\nasm:%d  c:%d\n", asmEq, numEq);
282                 for(int y=0; y<8; y++)
283                 {
284                         for(int x=0; x<8; x++)
285                         {
286                                 printf("%d ", temp[x + y*stride]);
287                         }
288                         printf("\n");
289                 }
290         }
291 */
292 //      for(int i=0; i<numEq/8; i++) src[i]=255;
293         return (numEq > vFlatnessThreshold) ? 1 : 0;
294 }
295
/**
 * Check that row 1 and row 8 of the block do not differ by more than 2*QP
 * in any of the 8 columns.
 *
 * The C path uses the QP argument; the MMX path ignores it and reads the
 * quantizer from the global pQPb instead.
 *
 * @param src    top-left pixel of the block (row 0)
 * @param stride distance in bytes between two rows
 * @param QP     quantizer (only used by the C path)
 * @return 1 if every column satisfies |row1 - row8| <= 2*QP, 0 otherwise
 */
296 static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP)
297 {
298 #ifdef HAVE_MMX
299         int isOk;
300         asm volatile(
301 //              "int $3 \n\t"
302                 "movq (%1, %2), %%mm0                           \n\t" // row 1
303                 "movq (%1, %2, 8), %%mm1                        \n\t" // row 8
304                 "movq %%mm0, %%mm2                              \n\t"
305                 "psubusb %%mm1, %%mm0                           \n\t"
306                 "psubusb %%mm2, %%mm1                           \n\t"
307                 "por %%mm1, %%mm0                               \n\t" // ABS Diff
308
309                 "movq pQPb, %%mm7                               \n\t" // QP,..., QP
310                 "paddusb %%mm7, %%mm7                           \n\t" // 2QP ... 2QP
311                 "psubusb %%mm7, %%mm0                           \n\t" // Diff <= 2QP -> 0
312                 "pcmpeqd b00, %%mm0                             \n\t" // each 32-bit half: all ones if its 4 bytes are 0
313                 "psrlq $16, %%mm0                               \n\t" // low dword now spans both halves
314                 "pcmpeqd bFF, %%mm0                             \n\t" // low dword all ones only if both halves passed
315 //              "movd %%mm0, (%1, %2, 4)\n\t"
316                 "movd %%mm0, %0                                 \n\t"
317                 : "=r" (isOk)
318                 : "r" (src), "r" (stride)
319                 );
320         return isOk ? 1 : 0;
321 #else
322
323         int isOk2= 1;
324         int x;
325         for(x=0; x<BLOCK_SIZE; x++)
326         {
                // stride<<3 is 8*stride, i.e. row 8
327                 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0;
328         }
329 /*      if(isOk && !isOk2 || !isOk && isOk2)
330         {
331                 printf("\nasm:%d  c:%d QP:%d\n", isOk, isOk2, QP);
332                 for(int y=0; y<9; y++)
333                 {
334                         for(int x=0; x<8; x++)
335                         {
336                                 printf("%d ", src[x + y*stride]);
337                         }
338                         printf("\n");
339                 }
340         } */
341
342         return isOk2;
343 #endif
344
345 }
346
347 /**
348  * Do a vertical low pass filter on the 8x10 block (only write to the 8x8 block in the middle)
349  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
 *
 * Rows 1..8 are rewritten in place. Taps that would fall above row 1 use
 * "first" and taps below row 8 use "last": row 0 resp. row 9 if it is close
 * enough to its neighbour row, otherwise the neighbour row (1 resp. 8) itself.
 * The C path tests |diff| < QP with the QP argument; the MMX2/3DNow path
 * tests |diff| <= QP with the quantizer taken from the global pQPb and builds
 * the filter from chained PAVGB averages.
 *
 * @param src    top-left pixel of the 8x10 block (row 0)
 * @param stride distance in bytes between two rows
 * @param QP     quantizer (only used by the C path)
350  */
351 static inline void doVertLowPass(uint8_t *src, int stride, int QP)
352 {
353 //      QP= 64;
354
355 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
356 //#ifdef HAVE_MMX2
357         asm volatile(   //"movv %0 %1 %2\n\t"
                // %0 (src) is advanced by one row further down, so it is saved here and restored by the popl at the end
358                 "pushl %0 \n\t"
359                 "movq pQPb, %%mm0                               \n\t"  // QP,..., QP
360 //              "movq bFF  , %%mm0                              \n\t"  // QP,..., QP
361
362                 "movq (%0), %%mm6                               \n\t"
363                 "movq (%0, %1), %%mm5                           \n\t"
364                 "movq %%mm5, %%mm1                              \n\t"
365                 "movq %%mm6, %%mm2                              \n\t"
366                 "psubusb %%mm6, %%mm5                           \n\t"
367                 "psubusb %%mm1, %%mm2                           \n\t"
368                 "por %%mm5, %%mm2                               \n\t" // ABS Diff of lines
369                 "psubusb %%mm0, %%mm2                           \n\t" // diff <= QP -> 0
370                 "pcmpeqb b00, %%mm2                             \n\t" // diff <= QP -> FF
371
372                 "pand %%mm2, %%mm6                              \n\t"
373                 "pandn %%mm1, %%mm2                             \n\t"
374                 "por %%mm2, %%mm6                               \n\t"// First Line to Filter
375
376                 "movq (%0, %1, 8), %%mm5                        \n\t"
377                 "leal (%0, %1, 4), %%eax                        \n\t"
378                 "leal (%0, %1, 8), %%ebx                        \n\t"
379                 "subl %1, %%ebx                                 \n\t"
380                 "addl %1, %0                                    \n\t" // %0 points to line 1 not 0
381                 "movq (%0, %1, 8), %%mm7                        \n\t"
382                 "movq %%mm5, %%mm1                              \n\t"
383                 "movq %%mm7, %%mm2                              \n\t"
384                 "psubusb %%mm7, %%mm5                           \n\t"
385                 "psubusb %%mm1, %%mm2                           \n\t"
386                 "por %%mm5, %%mm2                               \n\t" // ABS Diff of lines
387                 "psubusb %%mm0, %%mm2                           \n\t" // diff <= QP -> 0
388                 "pcmpeqb b00, %%mm2                             \n\t" // diff <= QP -> FF
389
390                 "pand %%mm2, %%mm7                              \n\t"
391                 "pandn %%mm1, %%mm2                             \n\t"
392                 "por %%mm2, %%mm7                               \n\t" // Last Line to Filter
393
394
                // addressing of lines 1..8 from here on (%0 already points to line 1):
395                 //      1       2       3       4       5       6       7       8
396                 //      %0      %0+%1   %0+2%1  eax     %0+4%1  eax+2%1 ebx     eax+4%1
397                 // 6 4 2 2 1 1
398                 // 6 4 4 2
399                 // 6 8 2
400 /*
401                 "movq %%mm6, %%mm2                              \n\t" //1
402                 "movq %%mm6, %%mm3                              \n\t" //1
403                 "paddusb b02, %%mm3                             \n\t"
404                 "psrlw $2, %%mm3                                \n\t" //1       /4
405                 "pand b3F, %%mm3                                \n\t"
406                 "psubb %%mm3, %%mm2                             \n\t"
407                 "movq (%0, %1), %%mm0                           \n\t" //  1
408                 "movq %%mm0, %%mm1                              \n\t" //  1
409                 "paddusb b02, %%mm0                             \n\t"
410                 "psrlw $2, %%mm0                                \n\t" //  1     /4
411                 "pand b3F, %%mm0                                \n\t"
412                 "paddusb %%mm2, %%mm0                           \n\t" //3 1     /4
413 */
                // in the comments below each column is one input line (leftmost = "first"),
                // the digit is its accumulated weight and /N the common divisor; X marks the line written
414                 "movq (%0, %1), %%mm0                           \n\t" //  1
415                 "movq %%mm0, %%mm1                              \n\t" //  1
416                 PAVGB(%%mm6, %%mm0)                                   //1 1     /2
417                 PAVGB(%%mm6, %%mm0)                                   //3 1     /4
418
419                 "movq (%0, %1, 4), %%mm2                        \n\t" //     1
420                 "movq %%mm2, %%mm5                              \n\t" //     1
421                 PAVGB((%%eax), %%mm2)                                 //    11  /2
422                 PAVGB((%0, %1, 2), %%mm2)                             //   211  /4
423                 "movq %%mm2, %%mm3                              \n\t" //   211  /4
424                 "movq (%0), %%mm4                               \n\t" // 1
425                 PAVGB(%%mm4, %%mm3)                                   // 4 211  /8
426                 PAVGB(%%mm0, %%mm3)                                   //642211  /16
427                 "movq %%mm3, (%0)                               \n\t" // X
428                 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
429                 "movq %%mm1, %%mm0                              \n\t" //  1
430                 PAVGB(%%mm6, %%mm0)                                   //1 1     /2
431                 "movq %%mm4, %%mm3                              \n\t" // 1
432                 PAVGB((%0,%1,2), %%mm3)                               // 1 1    /2
433                 PAVGB((%%eax,%1,2), %%mm5)                            //     11 /2
434                 PAVGB((%%eax), %%mm5)                                 //    211 /4
435                 PAVGB(%%mm5, %%mm3)                                   // 2 2211 /8
436                 PAVGB(%%mm0, %%mm3)                                   //4242211 /16
437                 "movq %%mm3, (%0,%1)                            \n\t" //  X
438                 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
439                 PAVGB(%%mm4, %%mm6)                                   //11      /2
440                 "movq (%%ebx), %%mm0                            \n\t" //       1
441                 PAVGB((%%eax, %1, 2), %%mm0)                          //      11/2
442                 "movq %%mm0, %%mm3                              \n\t" //      11/2
443                 PAVGB(%%mm1, %%mm0)                                   //  2   11/4
444                 PAVGB(%%mm6, %%mm0)                                   //222   11/8
445                 PAVGB(%%mm2, %%mm0)                                   //22242211/16
446                 "movq (%0, %1, 2), %%mm2                        \n\t" //   1
447                 "movq %%mm0, (%0, %1, 2)                        \n\t" //   X
448                 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
449                 "movq (%%eax, %1, 4), %%mm0                     \n\t" //        1
450                 PAVGB((%%ebx), %%mm0)                                 //       11       /2
451                 PAVGB(%%mm0, %%mm6)                                   //11     11       /4
452                 PAVGB(%%mm1, %%mm4)                                   // 11             /2
453                 PAVGB(%%mm2, %%mm1)                                   //  11            /2
454                 PAVGB(%%mm1, %%mm6)                                   //1122   11       /8
455                 PAVGB(%%mm5, %%mm6)                                   //112242211       /16
456                 "movq (%%eax), %%mm5                            \n\t" //    1
457                 "movq %%mm6, (%%eax)                            \n\t" //    X
458                 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
459                 "movq (%%eax, %1, 4), %%mm6                     \n\t" //        1
460                 PAVGB(%%mm7, %%mm6)                                   //        11      /2
461                 PAVGB(%%mm4, %%mm6)                                   // 11     11      /4
462                 PAVGB(%%mm3, %%mm6)                                   // 11   2211      /8
463                 PAVGB(%%mm5, %%mm2)                                   //   11           /2
464                 "movq (%0, %1, 4), %%mm4                        \n\t" //     1
465                 PAVGB(%%mm4, %%mm2)                                   //   112          /4
466                 PAVGB(%%mm2, %%mm6)                                   // 112242211      /16
467                 "movq %%mm6, (%0, %1, 4)                        \n\t" //     X
468                 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
469                 PAVGB(%%mm7, %%mm1)                                   //  11     2      /4
470                 PAVGB(%%mm4, %%mm5)                                   //    11          /2
471                 PAVGB(%%mm5, %%mm0)                                   //    11 11       /4
472                 "movq (%%eax, %1, 2), %%mm6                     \n\t" //      1
473                 PAVGB(%%mm6, %%mm1)                                   //  11  4  2      /8
474                 PAVGB(%%mm0, %%mm1)                                   //  11224222      /16
475 //              "pxor %%mm1, %%mm1 \n\t"
476                 "movq %%mm1, (%%eax, %1, 2)                     \n\t" //      X
477                 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
478                 PAVGB((%%ebx), %%mm2)                                 //   112 4        /8
479                 "movq (%%eax, %1, 4), %%mm0                     \n\t" //        1
480                 PAVGB(%%mm0, %%mm6)                                   //      1 1       /2
481                 PAVGB(%%mm7, %%mm6)                                   //      1 12      /4
482                 PAVGB(%%mm2, %%mm6)                                   //   1122424      /16
483 //              "pxor %%mm6, %%mm6 \n\t"
484                 "movq %%mm6, (%%ebx)                            \n\t" //       X
485                 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
486                 PAVGB(%%mm7, %%mm5)                                   //    11   2      /4
487                 PAVGB(%%mm7, %%mm5)                                   //    11   6      /8
488
489                 PAVGB(%%mm3, %%mm0)                                   //      112       /4
490                 PAVGB(%%mm0, %%mm5)                                   //    112246      /16
491 //              "pxor %%mm5, %%mm5 \n\t"
492 //              "movq pQPb, %%mm5 \n\t"
493                 "movq %%mm5, (%%eax, %1, 4)                     \n\t" //        X
494                 "popl %0\n\t"
495
496                 :
497                 : "r" (src), "r" (stride)
498                 : "%eax", "%ebx"
499         );
500 #else
        // l1..l9: byte offsets of rows 1..9 relative to src
501         const int l1= stride;
502         const int l2= stride + l1;
503         const int l3= stride + l2;
504         const int l4= stride + l3;
505         const int l5= stride + l4;
506         const int l6= stride + l5;
507         const int l7= stride + l6;
508         const int l8= stride + l7;
509         const int l9= stride + l8;
510         int x;
        // one column per iteration, src is advanced by one pixel at the end
511         for(x=0; x<BLOCK_SIZE; x++)
512         {
                // value standing in for all rows above row 1 resp. below row 8
513                 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1];
514                 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8];
515
                // sums of neighbouring rows, taken before any row is overwritten
516                 int sums[9];
517                 sums[0] = first + src[l1];
518                 sums[1] = src[l1] + src[l2];
519                 sums[2] = src[l2] + src[l3];
520                 sums[3] = src[l3] + src[l4];
521                 sums[4] = src[l4] + src[l5];
522                 sums[5] = src[l5] + src[l6];
523                 sums[6] = src[l6] + src[l7];
524                 sums[7] = src[l7] + src[l8];
525                 sums[8] = src[l8] + last;
526
                // note: + binds tighter than <<, so e.g. (first + sums[0] + sums[3]<<1)
                // doubles the whole sum; the + 8 rounds before the division by 16
527                 src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
528                 src[l2]= ((src[l2]<<2) + (first + sums[0] + sums[3]<<1) + sums[5] + 8)>>4;
529                 src[l3]= ((src[l3]<<2) + (first + sums[1] + sums[4]<<1) + sums[6] + 8)>>4;
530                 src[l4]= ((src[l4]<<2) + (sums[2] + sums[5]<<1) + sums[0] + sums[7] + 8)>>4;
531                 src[l5]= ((src[l5]<<2) + (sums[3] + sums[6]<<1) + sums[1] + sums[8] + 8)>>4;
532                 src[l6]= ((src[l6]<<2) + (last + sums[7] + sums[4]<<1) + sums[2] + 8)>>4;
533                 src[l7]= ((last + src[l7]<<2) + (src[l8] + sums[5]<<1) + sums[3] + 8)>>4;
534                 src[l8]= ((sums[8]<<2) + (last + sums[6]<<1) + sums[4] + 8)>>4;
535
536                 src++;
537         }
538
539 #endif
540 }
541
542 /**
543  * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
544  * values are correctly clipped (MMX2)
545  * values are wraparound (C)
546  * conclusion: it's fast, but introduces ugly horizontal patterns if there is a continuous gradient
547         0 8 16 24
548         x = 8
549         x/2 = 4
550         x/8 = 1
551         1 12 12 23
 *
 * For every column whose step between row 4 and row 5 is small enough
 * (C path: |l4 - l5| < QP + QP/4), with v = l5 - l4:
 * row 3 += v/8, row 4 += v/2, row 5 -= v/2, row 6 -= v/8.
 * The MMX2/3DNow path reads the quantizer from the global pQPb instead of
 * the QP argument and accepts |l4 - l5| <= threshold.
 *
 * @param src    top-left pixel of the block (row 0)
 * @param stride distance in bytes between two rows
 * @param QP     quantizer (only used by the C path)
552  */
553 static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
554 {
555 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
556 // FIXME rounding
557         asm volatile(
558                 "pxor %%mm7, %%mm7                              \n\t" // 0
559                 "movq b80, %%mm6                                \n\t" // MIN_SIGNED_BYTE
560                 "leal (%0, %1), %%eax                           \n\t"
561                 "leal (%%eax, %1, 4), %%ebx                     \n\t"
562 //      0       1       2       3       4       5       6       7       8       9
563 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
564                 "movq pQPb, %%mm0                               \n\t" // QP,..., QP
565                 "movq %%mm0, %%mm1                              \n\t" // QP,..., QP
566                 "paddusb b02, %%mm0                             \n\t"
567                 "psrlw $2, %%mm0                                \n\t"
568                 "pand b3F, %%mm0                                \n\t" // QP/4,..., QP/4
569                 "paddusb %%mm1, %%mm0                           \n\t" // QP*1.25 ...
570                 "movq (%0, %1, 4), %%mm2                        \n\t" // line 4
571                 "movq (%%ebx), %%mm3                            \n\t" // line 5
572                 "movq %%mm2, %%mm4                              \n\t" // line 4
573                 "pcmpeqb %%mm5, %%mm5                           \n\t" // -1
574                 "pxor %%mm2, %%mm5                              \n\t" // -line 4 - 1
575                 PAVGB(%%mm3, %%mm5)
576                 "paddb %%mm6, %%mm5                             \n\t" // (l5-l4)/2
577                 "psubusb %%mm3, %%mm4                           \n\t"
578                 "psubusb %%mm2, %%mm3                           \n\t"
579                 "por %%mm3, %%mm4                               \n\t" // |l4 - l5|
580                 "psubusb %%mm0, %%mm4                           \n\t" // 0 where |l4 - l5| <= threshold
581                 "pcmpeqb %%mm7, %%mm4                           \n\t" // FF where the column gets filtered
582                 "pand %%mm4, %%mm5                              \n\t" // d/2
583
584 //              "paddb %%mm6, %%mm2                             \n\t" // line 4 + 0x80
585                 "paddb %%mm5, %%mm2                             \n\t"
586 //              "psubb %%mm6, %%mm2                             \n\t"
587                 "movq %%mm2, (%0,%1, 4)                         \n\t"
588
589                 "movq (%%ebx), %%mm2                            \n\t"
590 //              "paddb %%mm6, %%mm2                             \n\t" // line 5 + 0x80
591                 "psubb %%mm5, %%mm2                             \n\t"
592 //              "psubb %%mm6, %%mm2                             \n\t"
593                 "movq %%mm2, (%%ebx)                            \n\t"
594
                // bias d/2 by 0x80, shift right by 2 within each byte and remove the shifted bias (0x20)
595                 "paddb %%mm6, %%mm5                             \n\t"
596                 "psrlw $2, %%mm5                                \n\t"
597                 "pand b3F, %%mm5                                \n\t"
598                 "psubb b20, %%mm5                               \n\t" // (l5-l4)/8
599
                // lines 3 and 6 are biased by 0x80 so the signed saturating add/sub clips them
600                 "movq (%%eax, %1, 2), %%mm2                     \n\t"
601                 "paddb %%mm6, %%mm2                             \n\t" // line 3 + 0x80
602                 "paddsb %%mm5, %%mm2                            \n\t"
603                 "psubb %%mm6, %%mm2                             \n\t"
604                 "movq %%mm2, (%%eax, %1, 2)                     \n\t"
605
606                 "movq (%%ebx, %1), %%mm2                        \n\t"
607                 "paddb %%mm6, %%mm2                             \n\t" // line 6 + 0x80
608                 "psubsb %%mm5, %%mm2                            \n\t"
609                 "psubb %%mm6, %%mm2                             \n\t"
610                 "movq %%mm2, (%%ebx, %1)                        \n\t"
611
612                 :
613                 : "r" (src), "r" (stride)
614                 : "%eax", "%ebx"
615         );
616 #else
        // byte offsets of rows 1..9 relative to src; only l3..l6 are used below
617         const int l1= stride;
618         const int l2= stride + l1;
619         const int l3= stride + l2;
620         const int l4= stride + l3;
621         const int l5= stride + l4;
622         const int l6= stride + l5;
623         const int l7= stride + l6;
624         const int l8= stride + l7;
625         const int l9= stride + l8;
626         int x;
        // one column per iteration, src is advanced by one pixel at the end
627         for(x=0; x<BLOCK_SIZE; x++)
628         {
629                 if(ABS(src[l4]-src[l5]) < QP + QP/4)
630                 {
631                         int v = (src[l5] - src[l4]);
632
                        // stored back into uint8_t without clipping, so results wrap around
633                         src[l3] +=v/8;
634                         src[l4] +=v/2;
635                         src[l5] -=v/2;
636                         src[l6] -=v/8;
637
638                 }
639                 src++;
640         }
641
642 #endif
643 }
644
645 /**
646  * Experimental Filter 1
647  * will not damage linear gradients
648  * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
649  * can only smooth blocks at the expected locations (it cant smooth them if they did move)
650  * MMX2 version does correct clipping C version doesnt
651  */
652 static inline void vertX1Filter(uint8_t *src, int stride, int QP)
653 {
654 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
655         asm volatile(
656                 "pxor %%mm7, %%mm7                              \n\t" // 0
657 //              "movq b80, %%mm6                                \n\t" // MIN_SIGNED_BYTE
658                 "leal (%0, %1), %%eax                           \n\t"
659                 "leal (%%eax, %1, 4), %%ebx                     \n\t"
660 //      0       1       2       3       4       5       6       7       8       9
661 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
662                 "movq (%%eax, %1, 2), %%mm0                     \n\t" // line 3
663                 "movq (%0, %1, 4), %%mm1                        \n\t" // line 4
664                 "movq %%mm1, %%mm2                              \n\t" // line 4
665                 "psubusb %%mm0, %%mm1                           \n\t"
666                 "psubusb %%mm2, %%mm0                           \n\t"
667                 "por %%mm1, %%mm0                               \n\t" // |l2 - l3|
668                 "movq (%%ebx), %%mm3                            \n\t" // line 5
669                 "movq (%%ebx, %1), %%mm4                                \n\t" // line 6
670                 "movq %%mm3, %%mm5                              \n\t" // line 5
671                 "psubusb %%mm4, %%mm3                           \n\t"
672                 "psubusb %%mm5, %%mm4                           \n\t"
673                 "por %%mm4, %%mm3                               \n\t" // |l5 - l6|
674                 PAVGB(%%mm3, %%mm0)                                   // (|l2 - l3| + |l5 - l6|)/2
675                 "movq %%mm2, %%mm1                              \n\t" // line 4
676                 "psubusb %%mm5, %%mm2                           \n\t"
677                 "movq %%mm2, %%mm4                              \n\t"
678                 "pcmpeqb %%mm7, %%mm2                           \n\t" // (l4 - l5) <= 0 ? -1 : 0
679                 "psubusb %%mm1, %%mm5                           \n\t"
680                 "por %%mm5, %%mm4                               \n\t" // |l4 - l5|
681                 "psubusb %%mm0, %%mm4           \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
682                 "movq %%mm4, %%mm3                              \n\t" // d
683                 "psubusb pQPb, %%mm4                            \n\t"
684                 "pcmpeqb %%mm7, %%mm4                           \n\t" // d <= QP ? -1 : 0
685                 "psubusb b01, %%mm3                             \n\t"
686                 "pand %%mm4, %%mm3                              \n\t" // d <= QP ? d : 0
687
688                 PAVGB(%%mm7, %%mm3)                                   // d/2
689                 "movq %%mm3, %%mm1                              \n\t" // d/2
690                 PAVGB(%%mm7, %%mm3)                                   // d/4
691                 PAVGB(%%mm1, %%mm3)                                   // 3*d/8
692
693                 "movq (%0, %1, 4), %%mm0                        \n\t" // line 4
694                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
695                 "psubusb %%mm3, %%mm0                           \n\t"
696                 "pxor %%mm2, %%mm0                              \n\t"
697                 "movq %%mm0, (%0, %1, 4)                        \n\t" // line 4
698
699                 "movq (%%ebx), %%mm0                            \n\t" // line 5
700                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
701                 "paddusb %%mm3, %%mm0                           \n\t"
702                 "pxor %%mm2, %%mm0                              \n\t"
703                 "movq %%mm0, (%%ebx)                            \n\t" // line 5
704
705                 PAVGB(%%mm7, %%mm1)                                   // d/4
706
707                 "movq (%%eax, %1, 2), %%mm0                     \n\t" // line 3
708                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
709                 "psubusb %%mm1, %%mm0                           \n\t"
710                 "pxor %%mm2, %%mm0                              \n\t"
711                 "movq %%mm0, (%%eax, %1, 2)                     \n\t" // line 3
712
713                 "movq (%%ebx, %1), %%mm0                        \n\t" // line 6
714                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
715                 "paddusb %%mm1, %%mm0                           \n\t"
716                 "pxor %%mm2, %%mm0                              \n\t"
717                 "movq %%mm0, (%%ebx, %1)                        \n\t" // line 6
718
719                 PAVGB(%%mm7, %%mm1)                                   // d/8
720
721                 "movq (%%eax, %1), %%mm0                        \n\t" // line 2
722                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
723                 "psubusb %%mm1, %%mm0                           \n\t"
724                 "pxor %%mm2, %%mm0                              \n\t"
725                 "movq %%mm0, (%%eax, %1)                        \n\t" // line 2
726
727                 "movq (%%ebx, %1, 2), %%mm0                     \n\t" // line 7
728                 "pxor %%mm2, %%mm0                              \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
729                 "paddusb %%mm1, %%mm0                           \n\t"
730                 "pxor %%mm2, %%mm0                              \n\t"
731                 "movq %%mm0, (%%ebx, %1, 2)                     \n\t" // line 7
732
733                 :
734                 : "r" (src), "r" (stride)
735                 : "%eax", "%ebx"
736         );
737 #else
738
739         const int l1= stride;
740         const int l2= stride + l1;
741         const int l3= stride + l2;
742         const int l4= stride + l3;
743         const int l5= stride + l4;
744         const int l6= stride + l5;
745         const int l7= stride + l6;
746         const int l8= stride + l7;
747         const int l9= stride + l8;
748         int x;
749         for(x=0; x<BLOCK_SIZE; x++)
750         {
751                 int a= src[l3] - src[l4];
752                 int b= src[l4] - src[l5];
753                 int c= src[l5] - src[l6];
754
755                 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
756
757                 if(d < QP)
758                 {
759                         int v = d * SIGN(-b);
760
761                         src[l2] +=v/8;
762                         src[l3] +=v/4;
763                         src[l4] +=3*v/8;
764                         src[l5] -=3*v/8;
765                         src[l6] -=v/4;
766                         src[l7] -=v/8;
767
768                 }
769                 src++;
770         }
771         /*
772         const int l1= stride;
773         const int l2= stride + l1;
774         const int l3= stride + l2;
775         const int l4= stride + l3;
776         const int l5= stride + l4;
777         const int l6= stride + l5;
778         const int l7= stride + l6;
779         const int l8= stride + l7;
780         const int l9= stride + l8;
781         for(int x=0; x<BLOCK_SIZE; x++)
782         {
783                 int v2= src[l2];
784                 int v3= src[l3];
785                 int v4= src[l4];
786                 int v5= src[l5];
787                 int v6= src[l6];
788                 int v7= src[l7];
789
790                 if(ABS(v4-v5)<QP &&  ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 )
791                 {
792                         src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6         )/16;
793                         src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7  )/16;
794                         src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16;
795                         src[l6] = (       1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16;
796                 }
797                 src++;
798         }
799 */
800 #endif
801 }
802
803 /**
804  * Experimental Filter 1 (Horizontal)
805  * will not damage linear gradients
806  * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
807  * can only smooth blocks at the expected locations (it cant smooth them if they did move)
808  * MMX2 version does correct clipping C version doesnt
809  * not identical with the vertical one
810  */
811 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
812 {
813         int y;
814         static uint64_t *lut= NULL;
815         if(lut==NULL)
816         {
817                 int i;
818                 lut= (uint64_t*)memalign(8, 256*8);
819                 for(i=0; i<256; i++)
820                 {
821                         int v= i < 128 ? 2*i : 2*(i-256);
822 /*
823 //Simulate 112242211 9-Tap filter
824                         uint64_t a= (v/16) & 0xFF;
825                         uint64_t b= (v/8) & 0xFF;
826                         uint64_t c= (v/4) & 0xFF;
827                         uint64_t d= (3*v/8) & 0xFF;
828 */
829 //Simulate piecewise linear interpolation
830                         uint64_t a= (v/16) & 0xFF;
831                         uint64_t b= (v*3/16) & 0xFF;
832                         uint64_t c= (v*5/16) & 0xFF;
833                         uint64_t d= (7*v/16) & 0xFF;
834                         uint64_t A= (0x100 - a)&0xFF;
835                         uint64_t B= (0x100 - b)&0xFF;
836                         uint64_t C= (0x100 - c)&0xFF;
837                         uint64_t D= (0x100 - c)&0xFF;
838
839                         lut[i]   = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
840                                 (D<<24) | (C<<16) | (B<<8) | (A);
841                         //lut[i] = (v<<32) | (v<<24);
842                 }
843         }
844
845 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
846         asm volatile(
847                 "pxor %%mm7, %%mm7                              \n\t" // 0
848 //              "movq b80, %%mm6                                \n\t" // MIN_SIGNED_BYTE
849                 "leal (%0, %1), %%eax                           \n\t"
850                 "leal (%%eax, %1, 4), %%ebx                     \n\t"
851
852                 "movq b80, %%mm6                                \n\t"
853                 "movd %2, %%mm5                                 \n\t" // QP
854                 "movq %%mm5, %%mm4                              \n\t"
855                 "paddusb %%mm5, %%mm5                           \n\t" // 2QP
856                 "paddusb %%mm5, %%mm4                           \n\t" // 3QP
857                 "pxor %%mm5, %%mm5                              \n\t" // 0
858                 "psubb %%mm4, %%mm5                             \n\t" // -3QP
859                 "por bm11111110, %%mm5                          \n\t" // ...,FF,FF,-3QP
860                 "psllq $24, %%mm5                               \n\t"
861
862 //      0       1       2       3       4       5       6       7       8       9
863 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
864
865 #define HX1old(a) \
866                 "movd " #a ", %%mm0                             \n\t"\
867                 "movd 4" #a ", %%mm1                            \n\t"\
868                 "punpckldq %%mm1, %%mm0                         \n\t"\
869                 "movq %%mm0, %%mm1                              \n\t"\
870                 "movq %%mm0, %%mm2                              \n\t"\
871                 "psrlq $8, %%mm1                                \n\t"\
872                 "psubusb %%mm1, %%mm2                           \n\t"\
873                 "psubusb %%mm0, %%mm1                           \n\t"\
874                 "por %%mm2, %%mm1                               \n\t" /* p´x = |px - p(x+1)| */\
875                 "pcmpeqb %%mm7, %%mm2                           \n\t" /* p´x = sgn[px - p(x+1)] */\
876                 "pshufw $0x00, %%mm1, %%mm3                     \n\t" /* p´5 = |p1 - p2| */\
877                 PAVGB(%%mm1, %%mm3)                                   /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
878                 "psrlq $16, %%mm3                               \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
879                 "psubusb %%mm3, %%mm1                   \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
880                 "paddb %%mm5, %%mm1                             \n\t"\
881                 "psubusb %%mm5, %%mm1                           \n\t"\
882                 PAVGB(%%mm7, %%mm1)\
883                 "pxor %%mm2, %%mm1                              \n\t"\
884                 "psubb %%mm2, %%mm1                             \n\t"\
885                 "psrlq $24, %%mm1                               \n\t"\
886                 "movd %%mm1, %%ecx                              \n\t"\
887                 "paddb %%mm6, %%mm0                             \n\t"\
888                 "paddsb (%3, %%ecx, 8), %%mm0                   \n\t"\
889                 "paddb %%mm6, %%mm0                             \n\t"\
890                 "movq %%mm0, " #a "                             \n\t"\
891
892 /*
893 HX1old((%0))
894 HX1old((%%eax))
895 HX1old((%%eax, %1))
896 HX1old((%%eax, %1, 2))
897 HX1old((%0, %1, 4))
898 HX1old((%%ebx))
899 HX1old((%%ebx, %1))
900 HX1old((%%ebx, %1, 2))
901 */
902
903 //FIXME add some comments, its unreadable ...
904 #define HX1b(a, c, b, d) \
905                 "movd " #a ", %%mm0                             \n\t"\
906                 "movd 4" #a ", %%mm1                            \n\t"\
907                 "punpckldq %%mm1, %%mm0                         \n\t"\
908                 "movd " #b ", %%mm4                             \n\t"\
909                 "movq %%mm0, %%mm1                              \n\t"\
910                 "movq %%mm0, %%mm2                              \n\t"\
911                 "psrlq $8, %%mm1                                \n\t"\
912                 "movd 4" #b ", %%mm3                            \n\t"\
913                 "psubusb %%mm1, %%mm2                           \n\t"\
914                 "psubusb %%mm0, %%mm1                           \n\t"\
915                 "por %%mm2, %%mm1                               \n\t" /* p´x = |px - p(x+1)| */\
916                 "pcmpeqb %%mm7, %%mm2                           \n\t" /* p´x = sgn[px - p(x+1)] */\
917                 "punpckldq %%mm3, %%mm4                         \n\t"\
918                 "movq %%mm1, %%mm3                              \n\t"\
919                 "psllq $32, %%mm3                               \n\t" /* p´5 = |p1 - p2| */\
920                 PAVGB(%%mm1, %%mm3)                                   /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
921                 "paddb %%mm6, %%mm0                             \n\t"\
922                 "psrlq $16, %%mm3                               \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
923                 "psubusb %%mm3, %%mm1                   \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
924                 "movq %%mm4, %%mm3                              \n\t"\
925                 "paddb %%mm5, %%mm1                             \n\t"\
926                 "psubusb %%mm5, %%mm1                           \n\t"\
927                 "psrlq $8, %%mm3                                \n\t"\
928                 PAVGB(%%mm7, %%mm1)\
929                 "pxor %%mm2, %%mm1                              \n\t"\
930                 "psubb %%mm2, %%mm1                             \n\t"\
931                 "movq %%mm4, %%mm2                              \n\t"\
932                 "psrlq $24, %%mm1                               \n\t"\
933                 "psubusb %%mm3, %%mm2                           \n\t"\
934                 "movd %%mm1, %%ecx                              \n\t"\
935                 "psubusb %%mm4, %%mm3                           \n\t"\
936                 "paddsb (%3, %%ecx, 8), %%mm0                   \n\t"\
937                 "por %%mm2, %%mm3                               \n\t" /* p´x = |px - p(x+1)| */\
938                 "paddb %%mm6, %%mm0                             \n\t"\
939                 "pcmpeqb %%mm7, %%mm2                           \n\t" /* p´x = sgn[px - p(x+1)] */\
940                 "movq %%mm3, %%mm1                              \n\t"\
941                 "psllq $32, %%mm1                               \n\t" /* p´5 = |p1 - p2| */\
942                 "movq %%mm0, " #a "                             \n\t"\
943                 PAVGB(%%mm3, %%mm1)                                   /* p´5 = (|p2-p1| + |p6-p5|)/2 */\
944                 "paddb %%mm6, %%mm4                             \n\t"\
945                 "psrlq $16, %%mm1                               \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\
946                 "psubusb %%mm1, %%mm3                   \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
947                 "paddb %%mm5, %%mm3                             \n\t"\
948                 "psubusb %%mm5, %%mm3                           \n\t"\
949                 PAVGB(%%mm7, %%mm3)\
950                 "pxor %%mm2, %%mm3                              \n\t"\
951                 "psubb %%mm2, %%mm3                             \n\t"\
952                 "psrlq $24, %%mm3                               \n\t"\
953                 "movd " #c ", %%mm0                             \n\t"\
954                 "movd 4" #c ", %%mm1                            \n\t"\
955                 "punpckldq %%mm1, %%mm0                         \n\t"\
956                 "paddb %%mm6, %%mm0                             \n\t"\
957                 "paddsb (%3, %%ecx, 8), %%mm0                   \n\t"\
958                 "paddb %%mm6, %%mm0                             \n\t"\
959                 "movq %%mm0, " #c "                             \n\t"\
960                 "movd %%mm3, %%ecx                              \n\t"\
961                 "movd " #d ", %%mm0                             \n\t"\
962                 "paddsb (%3, %%ecx, 8), %%mm4                   \n\t"\
963                 "movd 4" #d ", %%mm1                            \n\t"\
964                 "paddb %%mm6, %%mm4                             \n\t"\
965                 "punpckldq %%mm1, %%mm0                         \n\t"\
966                 "movq %%mm4, " #b "                             \n\t"\
967                 "paddb %%mm6, %%mm0                             \n\t"\
968                 "paddsb (%3, %%ecx, 8), %%mm0                   \n\t"\
969                 "paddb %%mm6, %%mm0                             \n\t"\
970                 "movq %%mm0, " #d "                             \n\t"\
971
972 HX1b((%0),(%%eax),(%%eax, %1),(%%eax, %1, 2))
973 HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2))
974
975
976                 :
977                 : "r" (src), "r" (stride), "r" (QP), "r" (lut)
978                 : "%eax", "%ebx", "%ecx"
979         );
980 #else
981
982 //FIXME (has little in common with the mmx2 version)
983         for(y=0; y<BLOCK_SIZE; y++)
984         {
985                 int a= src[1] - src[2];
986                 int b= src[3] - src[4];
987                 int c= src[5] - src[6];
988
989                 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
990
991                 if(d < QP)
992                 {
993                         int v = d * SIGN(-b);
994
995                         src[1] +=v/8;
996                         src[2] +=v/4;
997                         src[3] +=3*v/8;
998                         src[4] -=3*v/8;
999                         src[5] -=v/4;
1000                         src[6] -=v/8;
1001
1002                 }
1003                 src+=stride;
1004         }
1005 #endif
1006 }
1007
1008
1009 static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
1010 {
1011 #ifdef HAVE_MMX
1012         src+= stride;
1013         //FIXME try pmul for *5 stuff
1014 //      src[0]=0;
1015         asm volatile(
1016                 "pxor %%mm7, %%mm7                              \n\t"
1017                 "leal (%0, %1), %%eax                           \n\t"
1018                 "leal (%%eax, %1, 4), %%ebx                     \n\t"
1019 //      0       1       2       3       4       5       6       7
1020 //      %0      %0+%1   %0+2%1  eax+2%1 %0+4%1  eax+4%1 ebx+%1  ebx+2%1
1021 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1
1022
1023                 "movq (%0), %%mm0                               \n\t"
1024                 "movq %%mm0, %%mm1                              \n\t"
1025                 "punpcklbw %%mm7, %%mm0                         \n\t" // low part of line 0
1026                 "punpckhbw %%mm7, %%mm1                         \n\t" // high part of line 0
1027
1028                 "movq (%%eax), %%mm2                            \n\t"
1029                 "movq %%mm2, %%mm3                              \n\t"
1030                 "punpcklbw %%mm7, %%mm2                         \n\t" // low part of line 1
1031                 "punpckhbw %%mm7, %%mm3                         \n\t" // high part of line 1
1032
1033                 "movq (%%eax, %1), %%mm4                        \n\t"
1034                 "movq %%mm4, %%mm5                              \n\t"
1035                 "punpcklbw %%mm7, %%mm4                         \n\t" // low part of line 2
1036                 "punpckhbw %%mm7, %%mm5                         \n\t" // high part of line 2
1037
1038                 "paddw %%mm0, %%mm0                             \n\t" // 2L0
1039                 "paddw %%mm1, %%mm1                             \n\t" // 2H0
1040                 "psubw %%mm4, %%mm2                             \n\t" // L1 - L2
1041                 "psubw %%mm5, %%mm3                             \n\t" // H1 - H2
1042                 "psubw %%mm2, %%mm0                             \n\t" // 2L0 - L1 + L2
1043                 "psubw %%mm3, %%mm1                             \n\t" // 2H0 - H1 + H2
1044
1045                 "psllw $2, %%mm2                                \n\t" // 4L1 - 4L2
1046                 "psllw $2, %%mm3                                \n\t" // 4H1 - 4H2
1047                 "psubw %%mm2, %%mm0                             \n\t" // 2L0 - 5L1 + 5L2
1048                 "psubw %%mm3, %%mm1                             \n\t" // 2H0 - 5H1 + 5H2
1049
1050                 "movq (%%eax, %1, 2), %%mm2                     \n\t"
1051                 "movq %%mm2, %%mm3                              \n\t"
1052                 "punpcklbw %%mm7, %%mm2                         \n\t" // L3
1053                 "punpckhbw %%mm7, %%mm3                         \n\t" // H3
1054
1055                 "psubw %%mm2, %%mm0                             \n\t" // 2L0 - 5L1 + 5L2 - L3
1056                 "psubw %%mm3, %%mm1                             \n\t" // 2H0 - 5H1 + 5H2 - H3
1057                 "psubw %%mm2, %%mm0                             \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1058                 "psubw %%mm3, %%mm1                             \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1059                 "movq %%mm0, temp0                              \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1060                 "movq %%mm1, temp1                              \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1061
1062                 "movq (%0, %1, 4), %%mm0                        \n\t"
1063                 "movq %%mm0, %%mm1                              \n\t"
1064                 "punpcklbw %%mm7, %%mm0                         \n\t" // L4
1065                 "punpckhbw %%mm7, %%mm1                         \n\t" // H4
1066
1067                 "psubw %%mm0, %%mm2                             \n\t" // L3 - L4
1068                 "psubw %%mm1, %%mm3                             \n\t" // H3 - H4
1069                 "movq %%mm2, temp2                              \n\t" // L3 - L4
1070                 "movq %%mm3, temp3                              \n\t" // H3 - H4
1071                 "paddw %%mm4, %%mm4                             \n\t" // 2L2
1072                 "paddw %%mm5, %%mm5                             \n\t" // 2H2
1073                 "psubw %%mm2, %%mm4                             \n\t" // 2L2 - L3 + L4
1074                 "psubw %%mm3, %%mm5                             \n\t" // 2H2 - H3 + H4
1075
1076                 "psllw $2, %%mm2                                \n\t" // 4L3 - 4L4
1077                 "psllw $2, %%mm3                                \n\t" // 4H3 - 4H4
1078                 "psubw %%mm2, %%mm4                             \n\t" // 2L2 - 5L3 + 5L4
1079                 "psubw %%mm3, %%mm5                             \n\t" // 2H2 - 5H3 + 5H4
1080 //50 opcodes so far
1081                 "movq (%%ebx), %%mm2                            \n\t"
1082                 "movq %%mm2, %%mm3                              \n\t"
1083                 "punpcklbw %%mm7, %%mm2                         \n\t" // L5
1084                 "punpckhbw %%mm7, %%mm3                         \n\t" // H5
1085                 "psubw %%mm2, %%mm4                             \n\t" // 2L2 - 5L3 + 5L4 - L5
1086                 "psubw %%mm3, %%mm5                             \n\t" // 2H2 - 5H3 + 5H4 - H5
1087                 "psubw %%mm2, %%mm4                             \n\t" // 2L2 - 5L3 + 5L4 - 2L5
1088                 "psubw %%mm3, %%mm5                             \n\t" // 2H2 - 5H3 + 5H4 - 2H5
1089
1090                 "movq (%%ebx, %1), %%mm6                        \n\t"
1091                 "punpcklbw %%mm7, %%mm6                         \n\t" // L6
1092                 "psubw %%mm6, %%mm2                             \n\t" // L5 - L6
1093                 "movq (%%ebx, %1), %%mm6                        \n\t"
1094                 "punpckhbw %%mm7, %%mm6                         \n\t" // H6
1095                 "psubw %%mm6, %%mm3                             \n\t" // H5 - H6
1096
1097                 "paddw %%mm0, %%mm0                             \n\t" // 2L4
1098                 "paddw %%mm1, %%mm1                             \n\t" // 2H4
1099                 "psubw %%mm2, %%mm0                             \n\t" // 2L4 - L5 + L6
1100                 "psubw %%mm3, %%mm1                             \n\t" // 2H4 - H5 + H6
1101
1102                 "psllw $2, %%mm2                                \n\t" // 4L5 - 4L6
1103                 "psllw $2, %%mm3                                \n\t" // 4H5 - 4H6
1104                 "psubw %%mm2, %%mm0                             \n\t" // 2L4 - 5L5 + 5L6
1105                 "psubw %%mm3, %%mm1                             \n\t" // 2H4 - 5H5 + 5H6
1106
1107                 "movq (%%ebx, %1, 2), %%mm2                     \n\t"
1108                 "movq %%mm2, %%mm3                              \n\t"
1109                 "punpcklbw %%mm7, %%mm2                         \n\t" // L7
1110                 "punpckhbw %%mm7, %%mm3                         \n\t" // H7
1111
1112                 "paddw %%mm2, %%mm2                             \n\t" // 2L7
1113                 "paddw %%mm3, %%mm3                             \n\t" // 2H7
1114                 "psubw %%mm2, %%mm0                             \n\t" // 2L4 - 5L5 + 5L6 - 2L7
1115                 "psubw %%mm3, %%mm1                             \n\t" // 2H4 - 5H5 + 5H6 - 2H7
1116
1117                 "movq temp0, %%mm2                              \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1118                 "movq temp1, %%mm3                              \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1119 //FIXME pxor, psubw, pmax for abs
1120                 "movq %%mm7, %%mm6                              \n\t" // 0
1121                 "pcmpgtw %%mm0, %%mm6                           \n\t"
1122                 "pxor %%mm6, %%mm0                              \n\t"
1123                 "psubw %%mm6, %%mm0                             \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1124                 "movq %%mm7, %%mm6                              \n\t" // 0
1125                 "pcmpgtw %%mm1, %%mm6                           \n\t"
1126                 "pxor %%mm6, %%mm1                              \n\t"
1127                 "psubw %%mm6, %%mm1                             \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1128
1129                 "movq %%mm7, %%mm6                              \n\t" // 0
1130                 "pcmpgtw %%mm2, %%mm6                           \n\t"
1131                 "pxor %%mm6, %%mm2                              \n\t"
1132                 "psubw %%mm6, %%mm2                             \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1133                 "movq %%mm7, %%mm6                              \n\t" // 0
1134                 "pcmpgtw %%mm3, %%mm6                           \n\t"
1135                 "pxor %%mm6, %%mm3                              \n\t"
1136                 "psubw %%mm6, %%mm3                             \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1137
1138 #ifdef HAVE_MMX2
1139                 "pminsw %%mm2, %%mm0                            \n\t"
1140                 "pminsw %%mm3, %%mm1                            \n\t"
1141 #else
1142                 "movq %%mm0, %%mm6                              \n\t"
1143                 "psubusw %%mm2, %%mm6                           \n\t"
1144                 "psubw %%mm6, %%mm0                             \n\t"
1145                 "movq %%mm1, %%mm6                              \n\t"
1146                 "psubusw %%mm3, %%mm6                           \n\t"
1147                 "psubw %%mm6, %%mm1                             \n\t"
1148 #endif
1149
1150                 "movq %%mm7, %%mm6                              \n\t" // 0
1151                 "pcmpgtw %%mm4, %%mm6                           \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
1152                 "pxor %%mm6, %%mm4                              \n\t"
1153                 "psubw %%mm6, %%mm4                             \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
1154                 "pcmpgtw %%mm5, %%mm7                           \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
1155                 "pxor %%mm7, %%mm5                              \n\t"
1156                 "psubw %%mm7, %%mm5                             \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
1157 // 100 opcodes
1158                 "movd %2, %%mm2                                 \n\t" // QP
1159 //"pcmpeqb %%mm2, %%mm2\n\t"
1160                 "punpcklwd %%mm2, %%mm2                         \n\t"
1161                 "punpcklwd %%mm2, %%mm2                         \n\t"
1162                 "psllw $3, %%mm2                                \n\t" // 8QP
1163                 "movq %%mm2, %%mm3                              \n\t" // 8QP
1164                 "pcmpgtw %%mm4, %%mm2                           \n\t"
1165                 "pcmpgtw %%mm5, %%mm3                           \n\t"
1166                 "pand %%mm2, %%mm4                              \n\t"
1167                 "pand %%mm3, %%mm5                              \n\t"
1168
1169
1170                 "psubusw %%mm0, %%mm4                           \n\t" // hd
1171                 "psubusw %%mm1, %%mm5                           \n\t" // ld
1172
1173
1174                 "movq w05, %%mm2                                \n\t" // 5
1175                 "pmullw %%mm2, %%mm4                            \n\t"
1176                 "pmullw %%mm2, %%mm5                            \n\t"
1177                 "movq w20, %%mm2                                \n\t" // 32
1178                 "paddw %%mm2, %%mm4                             \n\t"
1179                 "paddw %%mm2, %%mm5                             \n\t"
1180                 "psrlw $6, %%mm4                                \n\t"
1181                 "psrlw $6, %%mm5                                \n\t"
1182
1183 /*
1184                 "movq w06, %%mm2                                \n\t" // 6
1185                 "paddw %%mm2, %%mm4                             \n\t"
1186                 "paddw %%mm2, %%mm5                             \n\t"
1187                 "movq w1400, %%mm2                              \n\t" // 1400h = 5120 = 5/64*2^16
1188 //FIXME if *5/64 is supposed to be /13 then we should use 5041 instead of 5120
1189                 "pmulhw %%mm2, %%mm4                            \n\t" // hd/13
1190                 "pmulhw %%mm2, %%mm5                            \n\t" // ld/13
1191 */
1192
1193                 "movq temp2, %%mm0                              \n\t" // L3 - L4
1194                 "movq temp3, %%mm1                              \n\t" // H3 - H4
1195
1196                 "pxor %%mm2, %%mm2                              \n\t"
1197                 "pxor %%mm3, %%mm3                              \n\t"
1198
1199                 // FIXME rounding error
1200                 "psraw $1, %%mm0                                \n\t" // (L3 - L4)/2
1201                 "psraw $1, %%mm1                                \n\t" // (H3 - H4)/2
1202                 "pcmpgtw %%mm0, %%mm2                           \n\t" // sign (L3-L4)
1203                 "pcmpgtw %%mm1, %%mm3                           \n\t" // sign (H3-H4)
1204                 "pxor %%mm2, %%mm0                              \n\t"
1205                 "pxor %%mm3, %%mm1                              \n\t"
1206                 "psubw %%mm2, %%mm0                             \n\t" // |L3-L4|
1207                 "psubw %%mm3, %%mm1                             \n\t" // |H3-H4|
1208 //              "psrlw $1, %%mm0                                \n\t" // |L3 - L4|/2
1209 //              "psrlw $1, %%mm1                                \n\t" // |H3 - H4|/2
1210
1211                 "pxor %%mm6, %%mm2                              \n\t"
1212                 "pxor %%mm7, %%mm3                              \n\t"
1213                 "pand %%mm2, %%mm4                              \n\t"
1214                 "pand %%mm3, %%mm5                              \n\t"
1215
1216 #ifdef HAVE_MMX2
1217                 "pminsw %%mm0, %%mm4                            \n\t"
1218                 "pminsw %%mm1, %%mm5                            \n\t"
1219 #else
1220                 "movq %%mm4, %%mm2                              \n\t"
1221                 "psubusw %%mm0, %%mm2                           \n\t"
1222                 "psubw %%mm2, %%mm4                             \n\t"
1223                 "movq %%mm5, %%mm2                              \n\t"
1224                 "psubusw %%mm1, %%mm2                           \n\t"
1225                 "psubw %%mm2, %%mm5                             \n\t"
1226 #endif
1227                 "pxor %%mm6, %%mm4                              \n\t"
1228                 "pxor %%mm7, %%mm5                              \n\t"
1229                 "psubw %%mm6, %%mm4                             \n\t"
1230                 "psubw %%mm7, %%mm5                             \n\t"
1231                 "packsswb %%mm5, %%mm4                          \n\t"
1232                 "movq (%%eax, %1, 2), %%mm0                     \n\t"
1233                 "paddb   %%mm4, %%mm0                           \n\t"
1234                 "movq %%mm0, (%%eax, %1, 2)                     \n\t"
1235                 "movq (%0, %1, 4), %%mm0                        \n\t"
1236                 "psubb %%mm4, %%mm0                             \n\t"
1237 //              "pxor %%mm0, %%mm0 \n\t"
1238                 "movq %%mm0, (%0, %1, 4)                        \n\t"
1239
1240                 :
1241                 : "r" (src), "r" (stride), "r" (QP)
1242                 : "%eax", "%ebx"
1243         );
1244 #else
1245         const int l1= stride;
1246         const int l2= stride + l1;
1247         const int l3= stride + l2;
1248         const int l4= stride + l3;
1249         const int l5= stride + l4;
1250         const int l6= stride + l5;
1251         const int l7= stride + l6;
1252         const int l8= stride + l7;
1253 //      const int l9= stride + l8;
1254         int x;
1255         for(x=0; x<BLOCK_SIZE; x++)
1256         {
1257                 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1258                 if(ABS(middleEnergy) < 8*QP)
1259                 {
1260                         const int q=(src[l4] - src[l5])/2;
1261                         const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1262                         const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1263
1264                         int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1265                         d= MAX(d, 0);
1266
1267                         d= (5*d + 32) >> 6;
1268                         d*= SIGN(-middleEnergy);
1269
1270                         if(q>0)
1271                         {
1272                                 d= d<0 ? 0 : d;
1273                                 d= d>q ? q : d;
1274                         }
1275                         else
1276                         {
1277                                 d= d>0 ? 0 : d;
1278                                 d= d<q ? q : d;
1279                         }
1280
1281                         src[l4]-= d;
1282                         src[l5]+= d;
1283                 }
1284                 src++;
1285         }
1286 #endif
1287 }
1288
1289 //FIXME?  |255-0| = 1
// NOTE(review): the FIXME above presumably refers to the MMX path, where the byte
// subtraction (psubb) wraps around, so the pixel values 255 and 0 look like they differ by 1.
1290 /**
1291  * Check if the given 8x8 Block is mostly "flat" and copy the unaligned data into tempBlock.
 *
 * In every row the 7 horizontally adjacent pixel pairs are compared; a pair is counted
 * when its two pixels differ by at most 1. While scanning, the 8 pixels of each row are
 * copied into tempBlock (row pitch TEMP_STRIDE in the C path, 8 bytes in the MMX path).
 *
 * src    points to the first pixel of the first row of the block
 * stride distance in bytes between two rows of src
 * returns non-zero if the number of counted pairs is above hFlatnessThreshold
 *
 * The MMX path loads the qword at src-4, so it reads 4 bytes in front of src.
1292  */
1293 static inline int isHorizDCAndCopy2Temp(uint8_t src[], int stride)
1294 {
1295 //      src++;
1296         int numEq= 0;
1297 #ifdef HAVE_MMX
1298 asm volatile (
1299 //              "int $3 \n\t"
1300                 "pushl %1\n\t" // save src, the addl instructions below advance it row by row
// NOTE(review): the two constants are named b7E and b7C, the comments say 0x7F and 0x7D;
// confirm the real values where the constants are defined.
1301                 "movq b7E, %%mm7                                \n\t" // mm7 = 0x7F
1302                 "movq b7C, %%mm6                                \n\t" // mm6 = 0x7D
1303                 "leal tempBlock, %%eax                          \n\t" // eax = destination of the row copies
1304                 "pxor %%mm0, %%mm0                              \n\t" // mm0 = per byte match accumulator
1305
// HDC_CHECK_AND_CPY(i) handles one row:
//  - mm2 = the 8 pixels at %1, assembled from the qword at -4(%1) and the dword at 4(%1)
//  - mm1 = copy of the row, stored unchanged at offset i of tempBlock at the end
//  - mm2 is shifted right by one byte so every byte is subtracted from its right neighbour
//    (the top byte is compared against 0, this is the dummy value removed further down)
//  - the difference is biased with mm7 and compared (signed) against mm6, giving 0xFF per hit
//  - the hits are added to mm0, so every hit decrements the matching byte counter by 1
1306 #define HDC_CHECK_AND_CPY(i) \
1307                 "movq -4(%1), %%mm2                             \n\t"\
1308                 "psrlq $32, %%mm2                               \n\t"\
1309                 "punpckldq 4(%1), %%mm2                         \n\t" /* (%1) */\
1310                 "movq %%mm2, %%mm1                              \n\t"\
1311                 "psrlq $8, %%mm2                                \n\t"\
1312                 "psubb %%mm1, %%mm2                             \n\t"\
1313                 "paddb %%mm7, %%mm2                             \n\t"\
1314                 "pcmpgtb %%mm6, %%mm2                           \n\t"\
1315                 "paddb %%mm2, %%mm0                             \n\t"\
1316                 "movq %%mm1," #i "(%%eax)                       \n\t"
1317
1318                 HDC_CHECK_AND_CPY(0)
1319                 "addl %2, %1                                    \n\t"
1320                 HDC_CHECK_AND_CPY(8)
1321                 "addl %2, %1                                    \n\t"
1322                 HDC_CHECK_AND_CPY(16)
1323                 "addl %2, %1                                    \n\t"
1324                 HDC_CHECK_AND_CPY(24)
1325                 "addl %2, %1                                    \n\t"
1326                 HDC_CHECK_AND_CPY(32)
1327                 "addl %2, %1                                    \n\t"
1328                 HDC_CHECK_AND_CPY(40)
1329                 "addl %2, %1                                    \n\t"
1330                 HDC_CHECK_AND_CPY(48)
1331                 "addl %2, %1                                    \n\t"
1332                 HDC_CHECK_AND_CPY(56)
1333
// Drop the counter of the dummy comparison (top byte), then fold the remaining
// byte counters together so that the low byte of mm0 holds their sum.
1334                 "psllq $8, %%mm0                                \n\t" // remove dummy value
1335                 "movq %%mm0, %%mm1                              \n\t"
1336                 "psrlw $8, %%mm0                                \n\t"
1337                 "paddb %%mm1, %%mm0                             \n\t"
1338                 "movq %%mm0, %%mm1                              \n\t"
1339                 "psrlq $16, %%mm0                               \n\t"
1340                 "paddb %%mm1, %%mm0                             \n\t"
1341                 "movq %%mm0, %%mm1                              \n\t"
1342                 "psrlq $32, %%mm0                               \n\t"
1343                 "paddb %%mm1, %%mm0                             \n\t"
1344                 "popl %1\n\t" // restore src
1345                 "movd %%mm0, %0                                 \n\t"
1346                 : "=r" (numEq)
1347                 : "r" (src), "r" (stride)
1348                 : "%eax"
1349                 );
1350 //      printf("%d\n", numEq);
        // every hit added 0xFF (-1), so the low byte holds the negated count: negate it modulo 256
1351         numEq= (256 - (numEq & 0xFF)) &0xFF;
1352 #else
1353         int y;
1354         for(y=0; y<BLOCK_SIZE; y++)
1355         {
                // (a - b + 1) masked with 0xFFFF is below 3 exactly when a - b is -1, 0 or 1:
                // a negative sum turns into a large value after the mask
1356                 if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++;
1357                 if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++;
1358                 if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++;
1359                 if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++;
1360                 if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++;
1361                 if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++;
1362                 if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++;
                // copy the row into the temporary block
1363                 tempBlock[0 + y*TEMP_STRIDE] = src[0];
1364                 tempBlock[1 + y*TEMP_STRIDE] = src[1];
1365                 tempBlock[2 + y*TEMP_STRIDE] = src[2];
1366                 tempBlock[3 + y*TEMP_STRIDE] = src[3];
1367                 tempBlock[4 + y*TEMP_STRIDE] = src[4];
1368                 tempBlock[5 + y*TEMP_STRIDE] = src[5];
1369                 tempBlock[6 + y*TEMP_STRIDE] = src[6];
1370                 tempBlock[7 + y*TEMP_STRIDE] = src[7];
1371                 src+= stride;
1372         }
1373 #endif
1374 /*      if(abs(numEq - asmEq) > 0)
1375         {
1376 //              printf("\nasm:%d  c:%d\n", asmEq, numEq);
1377                 for(int y=0; y<8; y++)
1378                 {
1379                         for(int x=0; x<8; x++)
1380                         {
1381                                 printf("%d ", src[x + y*stride]);
1382                         }
1383                         printf("\n");
1384                 }
1385         }
1386 */
1387 //      printf("%d\n", numEq);
1388         return numEq > hFlatnessThreshold;
1389 }
1390
/**
 * Check whether the brightness range of the block is small enough for the low pass filter.
 *
 * The C path only looks at the first row: it returns 0 when the first and the last pixel
 * of that row differ by more than 2*QP, and 1 otherwise. stride is not used by the C path.
 *
 * The branch guarded by MMX_FIXME is an unfinished MMX version; the bare FIXME token in
 * it keeps it from compiling should MMX_FIXME ever be defined.
 */
1391 static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
1392 {
1393 #ifdef MMX_FIXME
1394 FIXME
1395         int isOk;
1396         asm volatile(
1397 //              "int $3 \n\t"
1398                 "movq (%1, %2), %%mm0                           \n\t"
1399                 "movq (%1, %2, 8), %%mm1                        \n\t"
1400                 "movq %%mm0, %%mm2                              \n\t"
1401                 "psubusb %%mm1, %%mm0                           \n\t"
1402                 "psubusb %%mm2, %%mm1                           \n\t"
1403                 "por %%mm1, %%mm0                               \n\t" // ABS Diff
1404
1405                 "movq pQPb, %%mm7                               \n\t" // QP,..., QP
1406                 "paddusb %%mm7, %%mm7                           \n\t" // 2QP ... 2QP
1407                 "psubusb %%mm7, %%mm0                           \n\t" // Diff <= 2QP -> 0
1408                 "pcmpeqd b00, %%mm0                             \n\t"
1409                 "psrlq $16, %%mm0                               \n\t"
1410                 "pcmpeqd bFF, %%mm0                             \n\t"
1411 //              "movd %%mm0, (%1, %2, 4)\n\t"
1412                 "movd %%mm0, %0                                 \n\t"
1413                 : "=r" (isOk)
1414                 : "r" (src), "r" (stride)
1415                 );
1416         return isOk;
1417 #else
        // only the two outermost pixels of the first row are compared
1418         if(abs(src[0] - src[7]) > 2*QP) return 0;
1419
1420         return 1;
1421 #endif
1422 }
1423
/**
 * Horizontal default deblocking filter.
 *
 * Reads the 8x8 block that was copied into tempBlock, filters across the vertical edge
 * between the pixels 3 and 4 of every row and writes all 8 rows to dst.
 *
 * dst    first pixel of the first destination row
 * stride distance in bytes between two rows of dst
 * QP     quantizer of the block; the C path only filters a row while
 *        the absolute middle energy is below 8*QP
 *
 * C path, per row:
 *   middleEnergy = 5*(p4 - p3) + 2*(p2 - p5)   (gradient across the edge)
 *   leftEnergy   = 5*(p2 - p1) + 2*(p0 - p3)   (same kernel, shifted 2 pixels to the left)
 *   rightEnergy  = 5*(p6 - p5) + 2*(p4 - p7)   (same kernel, shifted 2 pixels to the right)
 *   d = (5*max(|middle| - min(|left|, |right|), 0) + 32) >> 6 with the sign of -middle,
 *   clipped to the range between 0 and q = (p3 - p4)/2; then p3 -= d and p4 += d.
 *
 * The file header lists the MMX and MMX2 variants of this filter as approximations
 * of the C version.
 */
1424 static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP)
1425 {
1426 #ifdef HAVE_MMX
1427         asm volatile(
1428                 "pushl %0                                       \n\t" // save dst, it is advanced row by row below
1429                 "pxor %%mm7, %%mm7                              \n\t" // mm7 = 0
1430                 "movq bm00001000, %%mm6                         \n\t" // byte mask applied to the correction
1431                 "movd %2, %%mm5                                 \n\t" // QP
1432                 "movq %%mm5, %%mm4                              \n\t"
1433                 "paddusb %%mm5, %%mm5                           \n\t" // 2QP
1434                 "paddusb %%mm5, %%mm4                           \n\t" // 3QP
1435                 "psllq $24, %%mm4                               \n\t" // move 3QP up by 3 bytes
1436                 "pxor %%mm5, %%mm5                              \n\t" // 0
1437                 "psubb %%mm4, %%mm5                             \n\t" // 0 - mm4, i.e. -3QP in the shifted byte
1438                 "leal tempBlock, %%eax                          \n\t" // eax = source of the rows
1439
1440 //FIXME? "unroll by 2" and mix
// HDF(i) filters the row stored at offset i of tempBlock and writes it as two dwords to (%0).
// The MMX2 and the plain MMX variant only differ in how the minimum of the two
// neighbouring differences is formed (pshufw + pminub versus shift and saturated subtract).
1441 #ifdef HAVE_MMX2
1442 #define HDF(i)  \
1443                 "movq " #i "(%%eax), %%mm0                      \n\t"\
1444                 "movq %%mm0, %%mm1                              \n\t"\
1445                 "movq %%mm0, %%mm2                              \n\t"\
1446                 "psrlq $8, %%mm1                                \n\t"\
1447                 "psubusb %%mm1, %%mm2                           \n\t"\
1448                 "psubusb %%mm0, %%mm1                           \n\t"\
1449                 "por %%mm2, %%mm1                               \n\t" /* p´x = |px - p(x+1)| */\
1450                 "pcmpeqb %%mm7, %%mm2                           \n\t" /* p´x = sgn[px - p(x+1)] */\
1451                 "pshufw $0x00, %%mm1, %%mm3                     \n\t" /* p´5 = |p1 - p2| */\
1452                 "pminub %%mm1, %%mm3                            \n\t" /* p´5 = min(|p2-p1|, |p6-p5|)*/\
1453                 "psrlq $16, %%mm3                               \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\
1454                 "psubusb %%mm3, %%mm1                   \n\t" /* |p3-p4|-min(|p1-p2|,|p5-p6|) */\
1455                 "paddb %%mm5, %%mm1                             \n\t"\
1456                 "psubusb %%mm5, %%mm1                           \n\t"\
1457                 "psrlw $2, %%mm1                                \n\t"\
1458                 "pxor %%mm2, %%mm1                              \n\t"\
1459                 "psubb %%mm2, %%mm1                             \n\t"\
1460                 "pand %%mm6, %%mm1                              \n\t"\
1461                 "psubb %%mm1, %%mm0                             \n\t"\
1462                 "psllq $8, %%mm1                                \n\t"\
1463                 "paddb %%mm1, %%mm0                             \n\t"\
1464                 "movd %%mm0, (%0)                               \n\t"\
1465                 "psrlq $32, %%mm0                               \n\t"\
1466                 "movd %%mm0, 4(%0)                              \n\t"
1467 #else
1468 #define HDF(i)\
1469                 "movq " #i "(%%eax), %%mm0                      \n\t"\
1470                 "movq %%mm0, %%mm1                              \n\t"\
1471                 "movq %%mm0, %%mm2                              \n\t"\
1472                 "psrlq $8, %%mm1                                \n\t"\
1473                 "psubusb %%mm1, %%mm2                           \n\t"\
1474                 "psubusb %%mm0, %%mm1                           \n\t"\
1475                 "por %%mm2, %%mm1                               \n\t" /* p´x = |px - p(x+1)| */\
1476                 "pcmpeqb %%mm7, %%mm2                           \n\t" /* p´x = sgn[px - p(x+1)] */\
1477                 "movq %%mm1, %%mm3                              \n\t"\
1478                 "psllq $32, %%mm3                               \n\t"\
1479                 "movq %%mm3, %%mm4                              \n\t"\
1480                 "psubusb %%mm1, %%mm4                           \n\t"\
1481                 "psubb %%mm4, %%mm3                             \n\t"\
1482                 "psrlq $16, %%mm3                               \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\
1483                 "psubusb %%mm3, %%mm1                   \n\t" /* |p3-p4|-min(|p1-p2|,|p5-p6|) */\
1484                 "paddb %%mm5, %%mm1                             \n\t"\
1485                 "psubusb %%mm5, %%mm1                           \n\t"\
1486                 "psrlw $2, %%mm1                                \n\t"\
1487                 "pxor %%mm2, %%mm1                              \n\t"\
1488                 "psubb %%mm2, %%mm1                             \n\t"\
1489                 "pand %%mm6, %%mm1                              \n\t"\
1490                 "psubb %%mm1, %%mm0                             \n\t"\
1491                 "psllq $8, %%mm1                                \n\t"\
1492                 "paddb %%mm1, %%mm0                             \n\t"\
1493                 "movd %%mm0, (%0)                               \n\t"\
1494                 "psrlq $32, %%mm0                               \n\t"\
1495                 "movd %%mm0, 4(%0)                              \n\t"
1496 #endif
1497                 HDF(0)
1498                 "addl %1, %0                                    \n\t"
1499                 HDF(8)
1500                 "addl %1, %0                                    \n\t"
1501                 HDF(16)
1502                 "addl %1, %0                                    \n\t"
1503                 HDF(24)
1504                 "addl %1, %0                                    \n\t"
1505                 HDF(32)
1506                 "addl %1, %0                                    \n\t"
1507                 HDF(40)
1508                 "addl %1, %0                                    \n\t"
1509                 HDF(48)
1510                 "addl %1, %0                                    \n\t"
1511                 HDF(56)
1512                 "popl %0                                        \n\t" // restore dst
1513                 :
1514                 : "r" (dst), "r" (stride), "r" (QP)
1515                 : "%eax"
1516         );
1517 #else
1518         uint8_t *src= tempBlock;
1519
1520         int y;
1521         for(y=0; y<BLOCK_SIZE; y++)
1522         {
                // copy the unfiltered row back first, only pixels 3 and 4 are corrected below
1523                 dst[0] = src[0];
1524                 dst[1] = src[1];
1525                 dst[2] = src[2];
1526                 dst[3] = src[3];
1527                 dst[4] = src[4];
1528                 dst[5] = src[5];
1529                 dst[6] = src[6];
1530                 dst[7] = src[7];
1531
                // gradient across the edge between pixel 3 and pixel 4; the 5x term has to use
                // the two edge pixels (4 and 3), matching leftEnergy, rightEnergy and the vertical filter
1532                 const int middleEnergy= 5*(src[4] - src[3]) + 2*(src[2] - src[5]);
1533                 if(ABS(middleEnergy) < 8*QP)
1534                 {
                        // q limits the correction to half of the step across the edge
1535                         const int q=(src[3] - src[4])/2;
1536                         const int leftEnergy=  5*(src[2] - src[1]) + 2*(src[0] - src[3]);
1537                         const int rightEnergy= 5*(src[6] - src[5]) + 2*(src[4] - src[7]);
1538
1539                         int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1540                         d= MAX(d, 0);
1541
                        // scale by 5/64 with rounding, then give d the sign of -middleEnergy
1542                         d= (5*d + 32) >> 6;
1543                         d*= SIGN(-middleEnergy);
1544
                        // clip d to the range between 0 and q
1545                         if(q>0)
1546                         {
1547                                 d= d<0 ? 0 : d;
1548                                 d= d>q ? q : d;
1549                         }
1550                         else
1551                         {
1552                                 d= d>0 ? 0 : d;
1553                                 d= d<q ? q : d;
1554                         }
1555
1556                         dst[3]-= d;
1557                         dst[4]+= d;
1558                 }
1559                 dst+= stride;
1560                 src+= TEMP_STRIDE;
1561         }
1562 #endif
1563 }
1564
1565 /**
1566  * Do a horizontal low pass filter on the 8x8 block
1567  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
1568  * using approximately the 7-Tap Filter (1,2,3,4,3,2,1)/16 (MMX2/3DNOW version)
 *
 * The pixels are read from tempBlock and the filtered rows are written to dst.
 *
 * dst    first pixel of the first destination row
 * stride distance in bytes between two rows of dst
 * QP     only used by the C version: the pixel next to the block (dst[-1] or dst[8]) is
 *        used as border value when it differs from the edge pixel by less than QP,
 *        otherwise the edge pixel itself is repeated
 *
 * The C version reads dst[-1] and dst[8], i.e. one pixel on each side of the row.
1569  */
1570 static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP)
1571 {
1572 //return;
1573 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1574         asm volatile(   //"movv %0 %1 %2\n\t"
1575                 "pushl %0\n\t" // save dst, it is advanced row by row below
1576                 "pxor %%mm7, %%mm7                                      \n\t"
1577                 "leal tempBlock, %%eax                                  \n\t" // eax = source of the rows
1578 /*
1579 #define HLP1    "movq (%0), %%mm0                                       \n\t"\
1580                 "movq %%mm0, %%mm1                                      \n\t"\
1581                 "psllq $8, %%mm0                                        \n\t"\
1582                 PAVGB(%%mm1, %%mm0)\
1583                 "psrlw $8, %%mm0                                        \n\t"\
1584                 "pxor %%mm1, %%mm1                                      \n\t"\
1585                 "packuswb %%mm1, %%mm0                                  \n\t"\
1586                 "movq %%mm0, %%mm1                                      \n\t"\
1587                 "movq %%mm0, %%mm2                                      \n\t"\
1588                 "psllq $32, %%mm0                                       \n\t"\
1589                 "paddb %%mm0, %%mm1                                     \n\t"\
1590                 "psllq $16, %%mm2                                       \n\t"\
1591                 PAVGB(%%mm2, %%mm0)\
1592                 "movq %%mm0, %%mm3                                      \n\t"\
1593                 "pand bm11001100, %%mm0                                 \n\t"\
1594                 "paddusb %%mm0, %%mm3                                   \n\t"\
1595                 "psrlq $8, %%mm3                                        \n\t"\
1596                 PAVGB(%%mm1, %%mm4)\
1597                 PAVGB(%%mm3, %%mm2)\
1598                 "psrlq $16, %%mm2                                       \n\t"\
1599                 "punpcklbw %%mm2, %%mm2                                 \n\t"\
1600                 "movq %%mm2, (%0)                                       \n\t"\
1601
1602 #define HLP2    "movq (%0), %%mm0                                       \n\t"\
1603                 "movq %%mm0, %%mm1                                      \n\t"\
1604                 "psllq $8, %%mm0                                        \n\t"\
1605                 PAVGB(%%mm1, %%mm0)\
1606                 "psrlw $8, %%mm0                                        \n\t"\
1607                 "pxor %%mm1, %%mm1                                      \n\t"\
1608                 "packuswb %%mm1, %%mm0                                  \n\t"\
1609                 "movq %%mm0, %%mm2                                      \n\t"\
1610                 "psllq $32, %%mm0                                       \n\t"\
1611                 "psllq $16, %%mm2                                       \n\t"\
1612                 PAVGB(%%mm2, %%mm0)\
1613                 "movq %%mm0, %%mm3                                      \n\t"\
1614                 "pand bm11001100, %%mm0                                 \n\t"\
1615                 "paddusb %%mm0, %%mm3                                   \n\t"\
1616                 "psrlq $8, %%mm3                                        \n\t"\
1617                 PAVGB(%%mm3, %%mm2)\
1618                 "psrlq $16, %%mm2                                       \n\t"\
1619                 "punpcklbw %%mm2, %%mm2                                 \n\t"\
1620                 "movq %%mm2, (%0)                                       \n\t"\
1621 */
1622 // approximately a 7-Tap Filter with Vector (1,2,3,4,3,2,1)/16
1623 /*
1624  31
1625  121
1626   121
1627    121
1628     121
1629      121
1630       121
1631        13
1632 Implemented     Exact 7-Tap
1633  9421           A321
1634  36421          64321
1635  334321         =
1636  1234321        =
1637   1234321       =
1638    123433       =
1639     12463         12346
1640      1249          123A
1641
1642 */
// HLP3(i) filters the row stored at offset i of tempBlock in two averaging passes and
// writes the result as two dwords to (%0):
//  pass 1: average of the left and the right neighbour (the outermost pixel is repeated
//          at both ends), averaged again with the pixel itself
//  pass 2: the same scheme with a distance of 2 pixels (the two outermost values repeated)
// The MMX2 variant builds the pass 2 neighbours with pshufw, the other variant with
// shifts and the byte masks bm11000000 and bm00000011.
1643 #ifdef HAVE_MMX2
1644 #define HLP3(i) "movq " #i "(%%eax), %%mm0                              \n\t"\
1645                 "movq %%mm0, %%mm1                                      \n\t"\
1646                 "movq %%mm0, %%mm2                                      \n\t"\
1647                 "movq %%mm0, %%mm3                                      \n\t"\
1648                 "movq %%mm0, %%mm4                                      \n\t"\
1649                 "psllq $8, %%mm1                                        \n\t"\
1650                 "psrlq $8, %%mm2                                        \n\t"\
1651                 "pand bm00000001, %%mm3                                 \n\t"\
1652                 "pand bm10000000, %%mm4                                 \n\t"\
1653                 "por %%mm3, %%mm1                                       \n\t"\
1654                 "por %%mm4, %%mm2                                       \n\t"\
1655                 PAVGB(%%mm2, %%mm1)\
1656                 PAVGB(%%mm1, %%mm0)\
1657 \
1658                 "pshufw $0xF9, %%mm0, %%mm3                             \n\t"\
1659                 "pshufw $0x90, %%mm0, %%mm4                             \n\t"\
1660                 PAVGB(%%mm3, %%mm4)\
1661                 PAVGB(%%mm4, %%mm0)\
1662                 "movd %%mm0, (%0)                                       \n\t"\
1663                 "psrlq $32, %%mm0                                       \n\t"\
1664                 "movd %%mm0, 4(%0)                                      \n\t"
1665 #else
1666 #define HLP3(i) "movq " #i "(%%eax), %%mm0                              \n\t"\
1667                 "movq %%mm0, %%mm1                                      \n\t"\
1668                 "movq %%mm0, %%mm2                                      \n\t"\
1669                 "movq %%mm0, %%mm3                                      \n\t"\
1670                 "movq %%mm0, %%mm4                                      \n\t"\
1671                 "psllq $8, %%mm1                                        \n\t"\
1672                 "psrlq $8, %%mm2                                        \n\t"\
1673                 "pand bm00000001, %%mm3                                 \n\t"\
1674                 "pand bm10000000, %%mm4                                 \n\t"\
1675                 "por %%mm3, %%mm1                                       \n\t"\
1676                 "por %%mm4, %%mm2                                       \n\t"\
1677                 PAVGB(%%mm2, %%mm1)\
1678                 PAVGB(%%mm1, %%mm0)\
1679 \
1680                 "movq %%mm0, %%mm3                                      \n\t"\
1681                 "movq %%mm0, %%mm4                                      \n\t"\
1682                 "movq %%mm0, %%mm5                                      \n\t"\
1683                 "psrlq $16, %%mm3                                       \n\t"\
1684                 "psllq $16, %%mm4                                       \n\t"\
1685                 "pand bm11000000, %%mm5                                 \n\t"\
1686                 "por %%mm5, %%mm3                                       \n\t"\
1687                 "movq %%mm0, %%mm5                                      \n\t"\
1688                 "pand bm00000011, %%mm5                                 \n\t"\
1689                 "por %%mm5, %%mm4                                       \n\t"\
1690                 PAVGB(%%mm3, %%mm4)\
1691                 PAVGB(%%mm4, %%mm0)\
1692                 "movd %%mm0, (%0)                                       \n\t"\
1693                 "psrlq $32, %%mm0                                       \n\t"\
1694                 "movd %%mm0, 4(%0)                                      \n\t"
1695 #endif
1696
1697 #define HLP(i) HLP3(i)
1698
1699                 HLP(0)
1700                 "addl %1, %0                                            \n\t"
1701                 HLP(8)
1702                 "addl %1, %0                                            \n\t"
1703                 HLP(16)
1704                 "addl %1, %0                                            \n\t"
1705                 HLP(24)
1706                 "addl %1, %0                                            \n\t"
1707                 HLP(32)
1708                 "addl %1, %0                                            \n\t"
1709                 HLP(40)
1710                 "addl %1, %0                                            \n\t"
1711                 HLP(48)
1712                 "addl %1, %0                                            \n\t"
1713                 HLP(56)
1714
1715                 "popl %0\n\t" // restore dst
1716                 :
1717                 : "r" (dst), "r" (stride)
// NOTE(review): ebx is listed as clobbered although the instructions above only use eax
1718                 : "%eax", "%ebx"
1719         );
1720
1721 #else
1722         uint8_t *temp= tempBlock;
1723         int y;
1724         for(y=0; y<BLOCK_SIZE; y++)
1725         {
                // border values used for the taps that fall outside of the block
1726                 const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
1727                 const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
1728
                // sums[i] = sum of the two neighbouring pixels i-1 and i, with first and last as borders
1729                 int sums[9];
1730                 sums[0] = first + temp[0];
1731                 sums[1] = temp[0] + temp[1];
1732                 sums[2] = temp[1] + temp[2];
1733                 sums[3] = temp[2] + temp[3];
1734                 sums[4] = temp[3] + temp[4];
1735                 sums[5] = temp[4] + temp[5];
1736                 sums[6] = temp[5] + temp[6];
1737                 sums[7] = temp[6] + temp[7];
1738                 sums[8] = temp[7] + last;
1739
                // 9 tap filter assembled from the pair sums, + 8 rounds before the division by 16.
                // Precedence: + binds tighter than <<, so a + b + c<<1 means (a + b + c)<<1.
                // The centre tap is read from dst[], which has not been overwritten yet at that
                // point; presumably dst still holds the same pixels as temp - confirm against caller.
1740                 dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
1741                 dst[1]= ((dst[1]<<2) + (first + sums[0] + sums[3]<<1) + sums[5] + 8)>>4;
1742                 dst[2]= ((dst[2]<<2) + (first + sums[1] + sums[4]<<1) + sums[6] + 8)>>4;
1743                 dst[3]= ((dst[3]<<2) + (sums[2] + sums[5]<<1) + sums[0] + sums[7] + 8)>>4;
1744                 dst[4]= ((dst[4]<<2) + (sums[3] + sums[6]<<1) + sums[1] + sums[8] + 8)>>4;
1745                 dst[5]= ((dst[5]<<2) + (last + sums[7] + sums[4]<<1) + sums[2] + 8)>>4;
1746                 dst[6]= ((last + dst[6]<<2) + (dst[7] + sums[5]<<1) + sums[3] + 8)>>4;
1747                 dst[7]= ((sums[8]<<2) + (last + sums[6]<<1) + sums[4] + 8)>>4;
1748
1749                 dst+= stride;
1750                 temp+= TEMP_STRIDE;
1751         }
1752 #endif
1753 }
1754
1755
/**
 * Deringing filter -- unfinished stub.
 * The only code present is inline asm guarded by HAVE_MMX2X; it reduces the addressed
 * rows to a byte-wise minimum and maximum and averages them, but the result is neither
 * stored nor used. The #else branch is empty, so nothing is ever written back to src.
 * NOTE(review): HAVE_MMX2X (with the trailing X) looks like a deliberate way to keep the
 * unfinished asm disabled -- confirm before "fixing" the macro name.
 *
 * src    first row to inspect (the asm reads 8 bytes per row with movq)
 * stride distance in bytes between two rows
 * QP     handed to the asm as operand %2 but not referenced by it yet
 */
1756 static inline void dering(uint8_t src[], int stride, int QP)
1757 {
1758 //FIXME: not implemented yet, see the notes below
1759
1760 #ifdef HAVE_MMX2X
1761         asm volatile(
1762                 "leal (%0, %1), %%eax                           \n\t" // eax = src + stride (row 1)
1763                 "leal (%%eax, %1, 4), %%ebx                     \n\t" // ebx = src + 5*stride (row 5)
// row index and the address expression that reaches it:
1764 //      0       1       2       3       4       5       6       7       8       9
1765 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
1766
// mm6 is meant to start as all ones (running minimum), mm7 as zero (running maximum)
// NOTE(review): the MMX compare mnemonics are pcmpeqb/pcmpeqw/pcmpeqd; a bare "pcmpeq"
// probably does not assemble -- confirm.
1767                 "pcmpeq %%mm6, %%mm6                            \n\t"
1768                 "pxor %%mm7, %%mm7                              \n\t"
1769
// FIND_MIN_MAX loads 8 bytes from the given address and folds them into the running
// byte-wise minimum (mm6) and maximum (mm7).
// NOTE(review): the movq line has a trailing comma after %%mm0, and the macro takes one
// parameter but is invoked below with comma separated addressing operands; both look
// like they would fail to build once this branch is enabled -- confirm.
1770 #define FIND_MIN_MAX(addr)\
1771                 "movq (" #addr "), %%mm0,                       \n\t"\
1772                 "pminub %%mm0, %%mm6                            \n\t"\
1773                 "pmaxub %%mm0, %%mm7                            \n\t"
1774
1775 FIND_MIN_MAX(%0)
1776 FIND_MIN_MAX(%%eax)
1777 FIND_MIN_MAX(%%eax, %1)
1778 FIND_MIN_MAX(%%eax, %1, 2)
1779 FIND_MIN_MAX(%0, %1, 4)
1780 FIND_MIN_MAX(%%ebx)
1781 FIND_MIN_MAX(%%ebx, %1)
1782 FIND_MIN_MAX(%%ebx, %1, 2)
1783 FIND_MIN_MAX(%0, %1, 8)
// NOTE(review): this repeats the row 7 address; the table above lists row 9 as
// ebx+4%1, so (%%ebx, %1, 4) was presumably intended -- confirm.
1784 FIND_MIN_MAX(%%ebx, %1, 2)
1785
// horizontal reduction: fold the 8 byte-wise minima into the lowest byte of mm6
1786                 "movq %%mm6, %%mm4                              \n\t"
1787                 "psrlq $32, %%mm6                               \n\t"
1788                 "pminub %%mm4, %%mm6                            \n\t"
1789                 "movq %%mm6, %%mm4                              \n\t"
1790                 "psrlq $16, %%mm6                               \n\t"
1791                 "pminub %%mm4, %%mm6                            \n\t"
1792                 "movq %%mm6, %%mm4                              \n\t"
1793                 "psrlq $8, %%mm6                                \n\t"
1794                 "pminub %%mm4, %%mm6                            \n\t" // min of pixels
1795
// same reduction for the maxima in mm7
1796                 "movq %%mm7, %%mm4                              \n\t"
1797                 "psrlq $32, %%mm7                               \n\t"
1798                 "pmaxub %%mm4, %%mm7                            \n\t"
1799                 "movq %%mm7, %%mm4                              \n\t"
1800                 "psrlq $16, %%mm7                               \n\t"
1801                 "pmaxub %%mm4, %%mm7                            \n\t"
1802                 "movq %%mm7, %%mm4                              \n\t"
1803                 "psrlq $8, %%mm7                                \n\t"
1804                 "pmaxub %%mm4, %%mm7                            \n\t" // max of pixels
1805                 PAVGB(%%mm6, %%mm7)                                   // (max + min)/2
1806
// the asm ends here: the average is left in an MMX register and never used
1807
1808                 : : "r" (src), "r" (stride), "r" (QP)
1809                 : "%eax", "%ebx"
1810         );
1811 #else
1812
1813 //FIXME: no C implementation yet
1814 #endif
1815 }
1816
1817
1818
1819
1820 /**
1821  * ...
1822  * the mode value is interpreted as a quality value if its negative, its range is then (-1 ... -63)
1823  * -63 is best quality -1 is worst
1824  */
1825 //extern "C"{
1826 void  postprocess(unsigned char * src[], int src_stride,
1827                  unsigned char * dst[], int dst_stride,
1828                  int horizontal_size,   int vertical_size,
1829                  QP_STORE_T *QP_store,  int QP_stride,
1830                                           int mode)
1831 {
1832
1833         if(mode<0) mode= getModeForQuality(-mode);
1834
1835 /*
1836         long long T= rdtsc();
1837         for(int y=vertical_size-1; y>=0 ; y--)
1838                 memcpy(dst[0] + y*src_stride, src[0] + y*src_stride,src_stride);
1839 //      memcpy(dst[0], src[0],src_stride*vertical_size);
1840         printf("%4dk\r", (rdtsc()-T)/1000);
1841
1842         return;
1843 */
1844 /*
1845         long long T= rdtsc();
1846         while( (rdtsc() - T)/1000 < 4000);
1847
1848         return;
1849 */
1850         postProcess(src[0], src_stride, dst[0], dst_stride,
1851                 horizontal_size, vertical_size, QP_store, QP_stride, 0, mode);
1852
1853         horizontal_size >>= 1;
1854         vertical_size   >>= 1;
1855         src_stride      >>= 1;
1856         dst_stride      >>= 1;
1857         mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00);
1858
1859         if(1)
1860         {
1861                 postProcess(src[1], src_stride, dst[1], dst_stride,
1862                         horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
1863                 postProcess(src[2], src_stride, dst[2], dst_stride,
1864                         horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
1865         }
1866         else
1867         {
1868                 memcpy(dst[1], src[1], src_stride*horizontal_size);
1869                 memcpy(dst[2], src[2], src_stride*horizontal_size);
1870         }
1871 }
1872 /**
1873  * gets the mode flags for a given quality (larger values mean slower but better postprocessing)
1874  * 0 <= quality < 64
1875  */
1876 int getModeForQuality(int quality){
1877         int modes[6]= {
1878                 LUM_V_DEBLOCK,
1879                 LUM_V_DEBLOCK | LUM_H_DEBLOCK,
1880                 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK,
1881                 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK,
1882                 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING,
1883                 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING | CHROM_DERING
1884                 };
1885
1886         return modes[ (quality*6) >>6 ];
1887 }
1888
1889 //} // extern "C"
1890
1891 /**
1892  * Copies a block of numLines rows from src to dst and optionally fixes the blacklevel
 * The MMX paths copy 8 bytes per row (one movq), the C paths copy BLOCK_SIZE bytes per row.
1893  * numLines must be a multiple of 4 (the MMX loops handle 4 rows per iteration and
 * run numLines>>2 times)
1894  * levelFix == 0 -> dont touch the brightness & contrast
 * levelFix != 0 -> MMX only: every byte gets packedYOffset subtracted (saturating at 0)
 *                  and is then multiplied by packedYScale, keeping (value*scale)>>8
 * NOTE(review): without HAVE_MMX the levelFix branch is a plain memcpy, i.e. no level
 * correction is applied in the C build -- confirm whether that is intended.
 *
 * dst, dstStride  destination and the distance in bytes between its rows
 * src, srcStride  source and the distance in bytes between its rows
1895  */
1896 static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
1897         int numLines, int levelFix)
1898 {
1899         int i;
1900         if(levelFix)
1901         {
1902 #ifdef HAVE_MMX
1903                                         asm volatile(
// temp0 is a symbol defined outside of this function; it is used as the loop counter.
// %0 and %1 are input operands that the loop advances, so they are saved on the
// stack here and restored by the popl pair at the end.
1904                                                 "movl %4, %%eax \n\t" // eax = numLines>>2
1905                                                 "movl %%eax, temp0\n\t"
1906                                                 "pushl %0 \n\t"
1907                                                 "pushl %1 \n\t"
1908                                                 "leal (%2,%2), %%eax    \n\t" // eax = 2*srcStride
1909                                                 "leal (%3,%3), %%ebx    \n\t" // ebx = 2*dstStride
1910                                                 "movq packedYOffset, %%mm2      \n\t"
1911                                                 "movq packedYScale, %%mm3       \n\t"
1912
// SCALED_CPY processes two rows, (%0) and (%0,%2):
//  - psubusb subtracts the offset with unsigned saturation
//  - punpck[lh]bw into a zeroed register puts every byte into the high half of a
//    16 bit word, so pmulhuw (high 16 bits of the product) yields (byte*scale)>>8
//  - packuswb packs the words back into bytes for the store to (%1) / (%1,%3)
1913 #define SCALED_CPY                                      \
1914                                                 "movq (%0), %%mm0       \n\t"\
1915                                                 "movq (%0,%2), %%mm1    \n\t"\
1916                                                 "psubusb %%mm2, %%mm0   \n\t"\
1917                                                 "psubusb %%mm2, %%mm1   \n\t"\
1918                                                 "pxor %%mm4, %%mm4      \n\t"\
1919                                                 "pxor %%mm5, %%mm5      \n\t"\
1920                                                 "punpcklbw %%mm0, %%mm4 \n\t"\
1921                                                 "punpckhbw %%mm0, %%mm5 \n\t"\
1922                                                 "pmulhuw %%mm3, %%mm4   \n\t"\
1923                                                 "pmulhuw %%mm3, %%mm5   \n\t"\
1924                                                 "packuswb %%mm5, %%mm4  \n\t"\
1925                                                 "movq %%mm4, (%1)       \n\t"\
1926                                                 "pxor %%mm4, %%mm4      \n\t"\
1927                                                 "pxor %%mm5, %%mm5      \n\t"\
1928                                                 "punpcklbw %%mm1, %%mm4 \n\t"\
1929                                                 "punpckhbw %%mm1, %%mm5 \n\t"\
1930                                                 "pmulhuw %%mm3, %%mm4   \n\t"\
1931                                                 "pmulhuw %%mm3, %%mm5   \n\t"\
1932                                                 "packuswb %%mm5, %%mm4  \n\t"\
1933                                                 "movq %%mm4, (%1, %3)   \n\t"\
1934
// one loop iteration = 2 x SCALED_CPY = 4 rows
1935                                                 "1:                     \n\t"
1936 SCALED_CPY
1937                                                 "addl %%eax, %0         \n\t"
1938                                                 "addl %%ebx, %1         \n\t"
1939 SCALED_CPY
1940                                                 "addl %%eax, %0         \n\t"
1941                                                 "addl %%ebx, %1         \n\t"
1942                                                 "decl temp0             \n\t"
1943                                                 "jnz 1b                 \n\t"
1944
1945                                                 "popl %1 \n\t"
1946                                                 "popl %0 \n\t"
1947                                                 : : "r" (src),
1948                                                 "r" (dst),
1949                                                 "r" (srcStride),
1950                                                 "r" (dstStride),
1951                                                 "m" (numLines>>2)
1952                                                 : "%eax", "%ebx"
1953                                         );
1954 #else
// C fallback: plain row by row copy, the levels are left untouched
1955                                 for(i=0; i<numLines; i++)
1956                                         memcpy( &(dst[dstStride*i]),
1957                                                 &(src[srcStride*i]), BLOCK_SIZE);
1958 #endif
1959         }
1960         else
1961         {
1962 #ifdef HAVE_MMX
1963                                         asm volatile(
// same loop skeleton as above, but the rows are copied unchanged
1964                                                 "movl %4, %%eax \n\t" // eax = numLines>>2
1965                                                 "movl %%eax, temp0\n\t"
1966                                                 "pushl %0 \n\t"
1967                                                 "pushl %1 \n\t"
1968                                                 "leal (%2,%2), %%eax    \n\t" // eax = 2*srcStride
1969                                                 "leal (%3,%3), %%ebx    \n\t" // ebx = 2*dstStride
// mm2 / mm3 are loaded here although SIMPLE_CPY does not read them
1970                                                 "movq packedYOffset, %%mm2      \n\t"
1971                                                 "movq packedYScale, %%mm3       \n\t"
1972
// SIMPLE_CPY copies two rows of 8 bytes without modification
1973 #define SIMPLE_CPY                                      \
1974                                                 "movq (%0), %%mm0       \n\t"\
1975                                                 "movq (%0,%2), %%mm1    \n\t"\
1976                                                 "movq %%mm0, (%1)       \n\t"\
1977                                                 "movq %%mm1, (%1, %3)   \n\t"\
1978
// one loop iteration = 2 x SIMPLE_CPY = 4 rows
1979                                                 "1:                     \n\t"
1980 SIMPLE_CPY
1981                                                 "addl %%eax, %0         \n\t"
1982                                                 "addl %%ebx, %1         \n\t"
1983 SIMPLE_CPY
1984                                                 "addl %%eax, %0         \n\t"
1985                                                 "addl %%ebx, %1         \n\t"
1986                                                 "decl temp0             \n\t"
1987                                                 "jnz 1b                 \n\t"
1988
1989                                                 "popl %1 \n\t"
1990                                                 "popl %0 \n\t"
1991                                                 : : "r" (src),
1992                                                 "r" (dst),
1993                                                 "r" (srcStride),
1994                                                 "r" (dstStride),
1995                                                 "m" (numLines>>2)
1996                                                 : "%eax", "%ebx"
1997                                         );
1998 #else
// C fallback: plain row by row copy
1999                                 for(i=0; i<numLines; i++)
2000                                         memcpy( &(dst[dstStride*i]),
2001                                                 &(src[srcStride*i]), BLOCK_SIZE);
2002 #endif
2003         }
2004 }
2005
2006
2007 /**
2008  * Filters array of bytes (Y or U or V values)
 * Copies one plane from src to dst with blockCopy() and, block by block, applies the
 * vertical and horizontal deblocking filters selected by mode to the copy in dst.
 * For the luma plane (isColor == 0) a brightness histogram kept across calls is used
 * to derive packedYOffset / packedYScale, which blockCopy() uses when LEVEL_FIX is set.
 *
 * src, srcStride  input plane and the distance in bytes between its rows
 * dst, dstStride  output plane and the distance in bytes between its rows
 * width, height   plane size in pixels; both loops advance in steps of BLOCK_SIZE
 * QPs, QPStride   quantizer table and its row stride; indexed with (x>>4, y>>4) for
 *                 luma and (x>>3, y>>3) for chroma
 * isColor         nonzero for a chroma plane (no histogram, no level scaling)
 * mode            filter flags (LEVEL_FIX, V_DEBLOCK, V_RK1_FILTER, V_X1_FILTER,
 *                 H_DEBLOCK, H_X1_FILTER are tested here)
 *
 * NOTE(review): yHistogram and framenum are function-level statics and
 * packedYOffset / packedYScale / pQPb are written here, so the function keeps state
 * between calls -- presumably not safe for concurrent use; confirm.
2009  */
2010 void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2011         QP_STORE_T QPs[], int QPStride, int isColor, int mode)
2012 {
2013         int x,y;
2014         /* we need 64bit here, otherwise we would get a problem
2015            after watching a black picture for 5 hours */
2016         static uint64_t *yHistogram= NULL;
2017         int black=0, white=255; // blackest black and whitest white in the picture
2018
2019 #ifdef TIMEING
2020         long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0;
2021         sumTime= rdtsc();
2022 #endif
2023
        // first call: allocate the 256 bin histogram and seed every bin with the same value
        // NOTE(review): the malloc result is not checked and the buffer is never freed
2024         if(!yHistogram)
2025         {
2026                 int i;
2027                 yHistogram= (uint64_t*)malloc(8*256);
2028                 for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256;
2029         }
2030
2031         if(!isColor)
2032         {
2033                 uint64_t sum= 0;
2034                 int i;
2035                 static int framenum= -1;
2036                 uint64_t maxClipped;
2037                 uint64_t clipped;
2038                 double scale;
2039
2040                 framenum++;
                // on the second luma call bin 0 is reset to its seed value
                // (see the remark about the black first picture below)
2041                 if(framenum == 1) yHistogram[0]= width*height/64*15/256;
2042
2043                 for(i=0; i<256; i++)
2044                 {
2045                         sum+= yHistogram[i];
2046 //                      printf("%d ", yHistogram[i]);
2047                 }
2048 //              printf("\n\n");
2049
2050                 /* we always get a completely black picture first */
2051
                // number of histogram entries that may be clipped at each end
2052                 maxClipped= (uint64_t)(sum * maxClippedThreshold);
2053
                // scan down from 255: stop at the first level for which fewer than
                // maxClipped entries lie at or below it
2054                 clipped= sum;
2055                 for(black=255; black>0; black--)
2056                 {
2057                         if(clipped < maxClipped) break;
2058                         clipped-= yHistogram[black];
2059                 }
2060
                // scan up from 0: stop at the first level for which fewer than
                // maxClipped entries lie at or above it
2061                 clipped= sum;
2062                 for(white=0; white<256; white++)
2063                 {
2064                         if(clipped < maxClipped) break;
2065                         clipped-= yHistogram[white];
2066                 }
2067
2068                 // we cant handle negative corrections
                // replicate the 8 bit offset into all 8 bytes of packedYOffset
2069                 packedYOffset= MAX(black - minAllowedY, 0);
2070                 packedYOffset|= packedYOffset<<32;
2071                 packedYOffset|= packedYOffset<<16;
2072                 packedYOffset|= packedYOffset<<8;
2073
                // NOTE(review): white == black would make this a division by zero -- confirm
                // whether the histogram seeding rules that out
2074                 scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black);
2075
                // scale as 8.8 fixed point, replicated into all 4 words of packedYScale
2076                 packedYScale= (uint16_t)(scale*256.0 + 0.5);
2077                 packedYScale|= packedYScale<<32;
2078                 packedYScale|= packedYScale<<16;
2079         }
2080         else
2081         {
                // chroma: scale 1.0 (0x0100 in every word) and no offset
2082                 packedYScale= 0x0100010001000100LL;
2083                 packedYOffset= 0;
2084         }
2085
        // copy the first 8 rows of the plane; the main loop below only copies rows
        // further down
2086         for(x=0; x<width; x+=BLOCK_SIZE)
2087                 blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX);
2088
2089         for(y=0; y<height; y+=BLOCK_SIZE)
2090         {
2091                 //1% speedup if these are here instead of the inner loop
2092                 uint8_t *srcBlock= &(src[y*srcStride]);
2093                 uint8_t *dstBlock= &(dst[y*dstStride]);
2094                 uint8_t *vertSrcBlock= &(srcBlock[srcStride*3]); // Blocks are 10x8 -> *3 to start
2095                 uint8_t *vertBlock= &(dstBlock[dstStride*3]);
2096
2097                 // finish 1 block before the next otherwise we might have a problem
2098                 // with the L1 Cache of the P4 ... or only a few blocks at a time or something
2099                 for(x=0; x<width; x+=BLOCK_SIZE)
2100                 {
2101                         const int stride= dstStride;
                        // chroma uses the table entry as is, luma scales it with the
                        // current level scale (8.8 fixed point)
2102                         int QP= isColor ?
2103                                 QPs[(y>>3)*QPStride + (x>>3)]:
2104                                 (QPs[(y>>4)*QPStride + (x>>4)] * (packedYScale &0xFFFF))>>8;
2105 #ifdef HAVE_MMX
                // broadcast QP into all 8 bytes of pQPb
2106                 asm volatile(
2107                         "movd %0, %%mm7                                 \n\t"
2108                         "packuswb %%mm7, %%mm7                          \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
2109                         "packuswb %%mm7, %%mm7                          \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
2110                         "packuswb %%mm7, %%mm7                          \n\t" // QP,..., QP
2111                         "movq %%mm7, pQPb                               \n\t"
2112                         : : "r" (QP)
2113                 );
2114 #endif
2115
2116
                        // enough rows below: copy the next 8 rows and filter the
                        // horizontal block edge
2117                         if(y + 12 < height)
2118                         {
2119 #ifdef MORE_TIMEING
2120                                 T0= rdtsc();
2121 #endif
2122
2123 #ifdef HAVE_MMX2
2124                                 prefetchnta(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32);
2125                                 prefetchnta(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32);
2126                                 prefetcht0(vertBlock + (((x>>3)&3) + 2)*dstStride + 32);
2127                                 prefetcht0(vertBlock + (((x>>3)&3) + 6)*dstStride + 32);
2128 #elif defined(HAVE_3DNOW)
2129 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
2130 /*                              prefetch(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32);
2131                                 prefetch(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32);
2132                                 prefetchw(vertBlock + (((x>>3)&3) + 2)*dstStride + 32);
2133                                 prefetchw(vertBlock + (((x>>3)&3) + 6)*dstStride + 32);
2134 */
2135 #endif
                                // the histogram samples one pixel (the top left one) per block
2136                                 if(!isColor) yHistogram[ srcBlock[0] ]++;
2137
                                // copy rows y+5 .. y+12 of this block column
2138                                 blockCopy(vertBlock + dstStride*2, dstStride,
2139                                         vertSrcBlock + srcStride*2, srcStride, 8, mode & LEVEL_FIX);
2140
2141
2142 #ifdef MORE_TIMEING
2143                                 T1= rdtsc();
2144                                 memcpyTime+= T1-T0;
2145                                 T0=T1;
2146 #endif
                                // vertical deblocking: RK1 or X1 filter if selected, otherwise
                                // lowpass for flat (DC) blocks within the QP limit and the
                                // default filter for all other blocks
2147                                 if(mode & V_DEBLOCK)
2148                                 {
2149                                         if(mode & V_RK1_FILTER)
2150                                                 vertRK1Filter(vertBlock, stride, QP);
2151                                         else if(mode & V_X1_FILTER)
2152                                                 vertX1Filter(vertBlock, stride, QP);
2153                                         else
2154                                         {
2155                                                 if( isVertDC(vertBlock, stride))
2156                                                 {
2157                                                         if(isVertMinMaxOk(vertBlock, stride, QP))
2158                                                                 doVertLowPass(vertBlock, stride, QP);
2159                                                 }
2160                                                 else
2161                                                         doVertDefFilter(vertBlock, stride, QP);
2162                                         }
2163                                 }
2164 #ifdef MORE_TIMEING
2165                                 T1= rdtsc();
2166                                 vertTime+= T1-T0;
2167                                 T0=T1;
2168 #endif
2169                         }
2170                         else
                                // near the bottom: only copy rows y+4 .. y+7, no vertical filter
2171                                 blockCopy(vertBlock + dstStride*1, dstStride,
2172                                         vertSrcBlock + srcStride*1, srcStride, 4, mode & LEVEL_FIX);
2173
2174
                        // horizontal deblocking of the edge to the left of this block
                        // (skipped for the first block column)
2175                         if(x - 8 >= 0 && x<width)
2176                         {
2177 #ifdef MORE_TIMEING
2178                                 T0= rdtsc();
2179 #endif
2180                                 if(mode & H_DEBLOCK)
2181                                 {
2182                                         if(mode & H_X1_FILTER)
2183                                                 horizX1Filter(dstBlock-4, stride, QP);
2184                                         else
2185                                         {
2186                                                 if( isHorizDCAndCopy2Temp(dstBlock-4, stride))
2187                                                 {
2188                                                         if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP))
2189                                                                 doHorizLowPassAndCopyBack(dstBlock-4, stride, QP);
2190                                                 }
2191                                                 else
2192                                                         doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP);
2193                                         }
2194                                 }
2195 #ifdef MORE_TIMEING
2196                                 T1= rdtsc();
2197                                 horizTime+= T1-T0;
2198                                 T0=T1;
2199 #endif
                                // dering() is called regardless of the mode flags
2200                                 dering(dstBlock - 9 - stride, stride, QP);
2201                         }
2202                         else if(y!=0)
2203                                 dering(dstBlock - stride*9 + width-9, stride, QP);
2204                         //FIXME dering filter will not be applied to last block (bottom right)
2205
2206
                        // advance all block pointers to the next block column
2207                         dstBlock+=8;
2208                         srcBlock+=8;
2209                         vertBlock+=8;
2210                         vertSrcBlock+=8;
2211                 }
2212         }
        // leave MMX state so that the FPU can be used again
2213 #ifdef HAVE_3DNOW
2214         asm volatile("femms");
2215 #elif defined (HAVE_MMX)
2216         asm volatile("emms");
2217 #endif
2218
2219 #ifdef TIMEING
2220         // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...)
2221         sumTime= rdtsc() - sumTime;
2222         if(!isColor)
2223                 printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d    \r",
2224                         (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000),
2225                         (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000)
2226                         , black, white);
2227 #endif
2228 }
2229
2230