git.sesse.net Git - ffmpeg/blob - libavcodec/i386/fdct_mmx.c

   1 /*
   2  * MMX optimized forward DCT
   3  * The gcc porting is Copyright (c) 2001 Fabrice Bellard.
   4  * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  * SSE2 optimization is Copyright (c) 2004 Denes Balatoni.
   6  *
   7  * from  fdctam32.c - AP922 MMX(3D-Now) forward-DCT
   8  *
   9  *  Intel Application Note AP-922 - fast, precise implementation of DCT
  10  *        http://developer.intel.com/vtune/cbts/appnotes.htm
  11  *
  12  * Also of inspiration:
  13  * a page about fdct at http://www.geocities.com/ssavekar/dct.htm
  14  * Skal's fdct at http://skal.planet-d.net/coding/dct.html
  15  *
  16  * This file is part of FFmpeg.
  17  *
  18  * FFmpeg is free software; you can redistribute it and/or
  19  * modify it under the terms of the GNU Lesser General Public
  20  * License as published by the Free Software Foundation; either
  21  * version 2.1 of the License, or (at your option) any later version.
  22  *
  23  * FFmpeg is distributed in the hope that it will be useful,
  24  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  25  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  26  * Lesser General Public License for more details.
  27  *
  28  * You should have received a copy of the GNU Lesser General Public
  29  * License along with FFmpeg; if not, write to the Free Software
  30  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  31  */
  32 #include "common.h"
  33 #include "../dsputil.h"
  34 #include "mmx.h"
  35
  // GCC-style attribute that places the annotated object on an `align`-byte boundary.
  // Used below on the constant tables and on the scratch buffers of the ff_fdct_* entry points.
  36 #define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align)))
  37
  38 //////////////////////////////////////////////////////////////////////
  39 //
  40 // constants for the forward DCT
  41 // -----------------------------
  42 //
  43 // Be sure to check that your compiler is aligning all constants to QWORD
  44 // (8-byte) memory boundaries!  Otherwise the unaligned memory access will
  45 // severely stall MMX execution.
  46 //
  47 //////////////////////////////////////////////////////////////////////
  48
  49 #define BITS_FRW_ACC   3 // extra precision bits carried from the column pass into the row pass; 2 or 3
  50 #define SHIFT_FRW_COL  BITS_FRW_ACC // left shift applied to the inputs in fdct_col
  51 #define SHIFT_FRW_ROW  (BITS_FRW_ACC + 17 - 3) // right shift applied to the 32-bit row sums (17 when BITS_FRW_ACC is 3)
  52 #define RND_FRW_ROW    (1 << (SHIFT_FRW_ROW-1)) // half of (1 << SHIFT_FRW_ROW); added before the shift so the result is rounded
  53 //#define RND_FRW_COL    (1 << (SHIFT_FRW_COL-1))
  54
  55 // Concatenated tangent constants for the column pass (fdct_col).
  // Each value is repeated four times so that one quadword covers four 16-bit lanes;
  // fdct_col reads the three quadwords at element offsets 0, 4 and 8.
  56 static const int16_t fdct_tg_all_16[] ATTR_ALIGN(8) = {
  57     13036,  13036,  13036,  13036,        // tan(1*pi/16) * (1<<16) + 0.5
  58     27146,  27146,  27146,  27146,        // tan(2*pi/16) * (1<<16) + 0.5
  59    -21746, -21746, -21746, -21746,        // tan(3*pi/16) * (1<<16) + 0.5 - (1<<16): stored minus 1.0 to fit int16_t; fdct_col adds the multiplicand back after pmulhw
  60 };
  61
  // cos(pi/4) repeated in four 16-bit lanes. fdct_col shifts the values it multiplies by
  // this constant one bit further (SHIFT_FRW_COL + 1) than its other inputs.
  62 static const int16_t ocos_4_16[4] ATTR_ALIGN(8) = {
  63     23170, 23170, 23170, 23170,           // cos(pi/4) * (1<<15) + 0.5
  64 };
  65
  // The value 1 in each of the four 16-bit lanes. fdct_col ORs it into several of its
  // results, which forces the least significant bit of every lane to 1.
  66 static const int64_t fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL;
  67
  // Rounding term for the MMX/MMX2 row passes: RND_FRW_ROW in both 32-bit lanes,
  // added to the row sums before they are shifted right by SHIFT_FRW_ROW.
  68 static const int32_t fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW };
  69
  // Rounding term for the SSE2 row pass: RND_FRW_ROW in all four 32-bit lanes.
  // fdct_row_sse2 loads it with movdqa, which requires a 16-byte aligned address.
  // NOTE(review): the array is wrapped in a struct with the alignment attribute on both
  // the member and the object, presumably because the plain aligned array (kept commented
  // out below) was not reliably aligned by some compiler -- confirm before simplifying.
  70 static struct
  71 {
  72  const int32_t fdct_r_row_sse2[4] ATTR_ALIGN(16);
  73 } fdct_r_row_sse2 ATTR_ALIGN(16)=
  74 {{
  75  RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW
  76 }};
  77 //static const long fdct_r_row_sse2[4] ATTR_ALIGN(16) = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW};
  78
  // Coefficient table for the MMX and MMX2 row passes: eight sets of 32 int16_t, one set
  // per row of the block. ff_fdct_mmx and ff_fdct_mmx2 start at set 0 and advance the
  // table pointer by 32 entries after each row; within a set, fdct_row_mmx and
  // fdct_row_mmx2 read the eight quadwords at element offsets 0, 4, ..., 28 as pmaddwd
  // multipliers. Rows 0/4, 1/7, 2/6 and 3/5 use identical coefficient sets.
  79 static const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = {  // forward_dct coeff table
  // row 0 (identical to row 4)
  80   16384,   16384,   22725,   19266,
  81   16384,   16384,   12873,    4520,
  82   21407,    8867,   19266,   -4520,
  83   -8867,  -21407,  -22725,  -12873,
  84   16384,  -16384,   12873,  -22725,
  85  -16384,   16384,    4520,   19266,
  86    8867,  -21407,    4520,  -12873,
  87   21407,   -8867,   19266,  -22725,
  88 // row 1 (identical to row 7)
  89   22725,   22725,   31521,   26722,
  90   22725,   22725,   17855,    6270,
  91   29692,   12299,   26722,   -6270,
  92  -12299,  -29692,  -31521,  -17855,
  93   22725,  -22725,   17855,  -31521,
  94  -22725,   22725,    6270,   26722,
  95   12299,  -29692,    6270,  -17855,
  96   29692,  -12299,   26722,  -31521,
  97 // row 2 (identical to row 6)
  98   21407,   21407,   29692,   25172,
  99   21407,   21407,   16819,    5906,
 100   27969,   11585,   25172,   -5906,
 101  -11585,  -27969,  -29692,  -16819,
 102   21407,  -21407,   16819,  -29692,
 103  -21407,   21407,    5906,   25172,
 104   11585,  -27969,    5906,  -16819,
 105   27969,  -11585,   25172,  -29692,
 106 // row 3 (identical to row 5)
 107   19266,   19266,   26722,   22654,
 108   19266,   19266,   15137,    5315,
 109   25172,   10426,   22654,   -5315,
 110  -10426,  -25172,  -26722,  -15137,
 111   19266,  -19266,   15137,  -26722,
 112  -19266,   19266,    5315,   22654,
 113   10426,  -25172,    5315,  -15137,
 114   25172,  -10426,   22654,  -26722,
 115 // row 4 (identical to row 0)
 116   16384,   16384,   22725,   19266,
 117   16384,   16384,   12873,    4520,
 118   21407,    8867,   19266,   -4520,
 119   -8867,  -21407,  -22725,  -12873,
 120   16384,  -16384,   12873,  -22725,
 121  -16384,   16384,    4520,   19266,
 122    8867,  -21407,    4520,  -12873,
 123   21407,   -8867,   19266,  -22725,
 124 // row 5 (identical to row 3)
 125   19266,   19266,   26722,   22654,
 126   19266,   19266,   15137,    5315,
 127   25172,   10426,   22654,   -5315,
 128  -10426,  -25172,  -26722,  -15137,
 129   19266,  -19266,   15137,  -26722,
 130  -19266,   19266,    5315,   22654,
 131   10426,  -25172,    5315,  -15137,
 132   25172,  -10426,   22654,  -26722,
 133 // row 6 (identical to row 2)
 134   21407,   21407,   29692,   25172,
 135   21407,   21407,   16819,    5906,
 136   27969,   11585,   25172,   -5906,
 137  -11585,  -27969,  -29692,  -16819,
 138   21407,  -21407,   16819,  -29692,
 139  -21407,   21407,    5906,   25172,
 140   11585,  -27969,    5906,  -16819,
 141   27969,  -11585,   25172,  -29692,
 142 // row 7 (identical to row 1)
 143   22725,   22725,   31521,   26722,
 144   22725,   22725,   17855,    6270,
 145   29692,   12299,   26722,   -6270,
 146  -12299,  -29692,  -31521,  -17855,
 147   22725,  -22725,   17855,  -31521,
 148  -22725,   22725,    6270,   26722,
 149   12299,  -29692,    6270,  -17855,
 150   29692,  -12299,   26722,  -31521,
 151 };
 152
 153 static struct
 154 {
 155  const int16_t tab_frw_01234567_sse2[256] ATTR_ALIGN(16);
 156 } tab_frw_01234567_sse2 ATTR_ALIGN(16) =
 157 {{
 158 //static const int16_t tab_frw_01234567_sse2[] ATTR_ALIGN(16) = {  // forward_dct coeff table
 159 #define TABLE_SSE2 C4,  C4,  C1,  C3, -C6, -C2, -C1, -C5, \
 160                    C4,  C4,  C5,  C7,  C2,  C6,  C3, -C7, \
 161                   -C4,  C4,  C7,  C3,  C6, -C2,  C7, -C5, \
 162                    C4, -C4,  C5, -C1,  C2, -C6,  C3, -C1,
 163 // c1..c7 * cos(pi/4) * 2^15
 164 #define C1 22725
 165 #define C2 21407
 166 #define C3 19266
 167 #define C4 16384
 168 #define C5 12873
 169 #define C6 8867
 170 #define C7 4520
 171 TABLE_SSE2
 172
 173 #undef C1
 174 #undef C2
 175 #undef C3
 176 #undef C4
 177 #undef C5
 178 #undef C6
 179 #undef C7
 180 #define C1 31521
 181 #define C2 29692
 182 #define C3 26722
 183 #define C4 22725
 184 #define C5 17855
 185 #define C6 12299
 186 #define C7 6270
 187 TABLE_SSE2
 188
 189 #undef C1
 190 #undef C2
 191 #undef C3
 192 #undef C4
 193 #undef C5
 194 #undef C6
 195 #undef C7
 196 #define C1 29692
 197 #define C2 27969
 198 #define C3 25172
 199 #define C4 21407
 200 #define C5 16819
 201 #define C6 11585
 202 #define C7 5906
 203 TABLE_SSE2
 204
 205 #undef C1
 206 #undef C2
 207 #undef C3
 208 #undef C4
 209 #undef C5
 210 #undef C6
 211 #undef C7
 212 #define C1 26722
 213 #define C2 25172
 214 #define C3 22654
 215 #define C4 19266
 216 #define C5 15137
 217 #define C6 10426
 218 #define C7 5315
 219 TABLE_SSE2
 220
 221 #undef C1
 222 #undef C2
 223 #undef C3
 224 #undef C4
 225 #undef C5
 226 #undef C6
 227 #undef C7
 228 #define C1 22725
 229 #define C2 21407
 230 #define C3 19266
 231 #define C4 16384
 232 #define C5 12873
 233 #define C6 8867
 234 #define C7 4520
 235 TABLE_SSE2
 236
 237 #undef C1
 238 #undef C2
 239 #undef C3
 240 #undef C4
 241 #undef C5
 242 #undef C6
 243 #undef C7
 244 #define C1 26722
 245 #define C2 25172
 246 #define C3 22654
 247 #define C4 19266
 248 #define C5 15137
 249 #define C6 10426
 250 #define C7 5315
 251 TABLE_SSE2
 252
 253 #undef C1
 254 #undef C2
 255 #undef C3
 256 #undef C4
 257 #undef C5
 258 #undef C6
 259 #undef C7
 260 #define C1 29692
 261 #define C2 27969
 262 #define C3 25172
 263 #define C4 21407
 264 #define C5 16819
 265 #define C6 11585
 266 #define C7 5906
 267 TABLE_SSE2
 268
 269 #undef C1
 270 #undef C2
 271 #undef C3
 272 #undef C4
 273 #undef C5
 274 #undef C6
 275 #undef C7
 276 #define C1 31521
 277 #define C2 29692
 278 #define C3 26722
 279 #define C4 22725
 280 #define C5 17855
 281 #define C6 12299
 282 #define C7 6270
 283 TABLE_SSE2
 284 }};
 285
 286
 /**
  * Column pass of the forward DCT for a strip of four adjacent columns.
  *
  * Row k of the strip is the quadword at in + offset + k * 8 (k = 0..7); the
  * results are stored at the same positions relative to out. The callers in
  * this file invoke it with offset 0 and then 4, which covers all eight columns.
  *
  * Inputs are scaled up by SHIFT_FRW_COL bits (one bit more for the terms that
  * are multiplied by ocos_4_16), and every addition/subtraction uses the
  * saturating paddsw/psubsw forms. fdct_one_corr is ORed into output rows 1, 2
  * and 6 and into one intermediate value.
  *
  * The movq/paddsw/... helpers come from mmx.h, which is not visible here; the
  * comments below assume the usual (source, destination) operand order and
  * MMX semantics. Loads, arithmetic and stores appear hand-interleaved, so do
  * not reorder statements without re-checking every register's lifetime.
  *
  * @param in     source block; elements up to index offset + 7 * 8 + 3 are read
  * @param out    destination for the column-transformed values, same layout as in
  * @param offset index of the first column of the strip (0 or 4 in this file)
  */
 287 static av_always_inline void fdct_col(const int16_t *in, int16_t *out, int offset)
 288 {
 289     movq_m2r(*(in + offset + 1 * 8), mm0); // mm0 = row 1
 290     movq_m2r(*(in + offset + 6 * 8), mm1); // mm1 = row 6
 291     movq_r2r(mm0, mm2);
 292     movq_m2r(*(in + offset + 2 * 8), mm3); // mm3 = row 2
 293     paddsw_r2r(mm1, mm0); // mm0 = row1 + row6
 294     movq_m2r(*(in + offset + 5 * 8), mm4);
 295     psllw_i2r(SHIFT_FRW_COL, mm0);
 296     movq_m2r(*(in + offset + 0 * 8), mm5);
 297     paddsw_r2r(mm3, mm4); // mm4 = row5 + row2
 298     paddsw_m2r(*(in + offset + 7 * 8), mm5); // mm5 = row0 + row7
 299     psllw_i2r(SHIFT_FRW_COL, mm4);
 300     movq_r2r(mm0, mm6);
 301     psubsw_r2r(mm1, mm2); // mm2 = row1 - row6
 302     movq_m2r(*(fdct_tg_all_16 + 4), mm1);
 303     psubsw_r2r(mm4, mm0);
 304     movq_m2r(*(in + offset + 3 * 8), mm7);
 305     pmulhw_r2r(mm0, mm1);
 306     paddsw_m2r(*(in + offset + 4 * 8), mm7); // mm7 = row3 + row4
 307     psllw_i2r(SHIFT_FRW_COL, mm5);
 308     paddsw_r2r(mm4, mm6);
 309     psllw_i2r(SHIFT_FRW_COL, mm7);
 310     movq_r2r(mm5, mm4);
 311     psubsw_r2r(mm7, mm5);
 312     paddsw_r2r(mm5, mm1);
 313     paddsw_r2r(mm7, mm4);
 314     por_m2r(fdct_one_corr, mm1); // force the LSB of every lane of output row 2
 315     psllw_i2r(SHIFT_FRW_COL + 1, mm2); // one extra bit: mm2 feeds the ocos_4_16 multiply below
 316     pmulhw_m2r(*(fdct_tg_all_16 + 4), mm5);
 317     movq_r2r(mm4, mm7);
 318     psubsw_m2r(*(in + offset + 5 * 8), mm3); // mm3 = row2 - row5
 319     psubsw_r2r(mm6, mm4);
 320     movq_r2m(mm1, *(out + offset + 2 * 8)); // store output row 2
 321     paddsw_r2r(mm6, mm7);
 322     movq_m2r(*(in + offset + 3 * 8), mm1);
 323     psllw_i2r(SHIFT_FRW_COL + 1, mm3); // one extra bit: mm3 feeds the ocos_4_16 multiply below
 324     psubsw_m2r(*(in + offset + 4 * 8), mm1); // mm1 = row3 - row4
 325     movq_r2r(mm2, mm6);
 326     movq_r2m(mm4, *(out + offset + 4 * 8)); // store output row 4
 327     paddsw_r2r(mm3, mm2);
 328     pmulhw_m2r(*ocos_4_16, mm2);
 329     psubsw_r2r(mm3, mm6);
 330     pmulhw_m2r(*ocos_4_16, mm6);
 331     psubsw_r2r(mm0, mm5);
 332     por_m2r(fdct_one_corr, mm5); // force the LSB of every lane of output row 6
 333     psllw_i2r(SHIFT_FRW_COL, mm1);
 334     por_m2r(fdct_one_corr, mm2); // same correction on an intermediate value
 335     movq_r2r(mm1, mm4);
 336     movq_m2r(*(in + offset + 0 * 8), mm3);
 337     paddsw_r2r(mm6, mm1);
 338     psubsw_m2r(*(in + offset + 7 * 8), mm3); // mm3 = row0 - row7
 339     psubsw_r2r(mm6, mm4);
 340     movq_m2r(*(fdct_tg_all_16 + 0), mm0);
 341     psllw_i2r(SHIFT_FRW_COL, mm3);
 342     movq_m2r(*(fdct_tg_all_16 + 8), mm6);
 343     pmulhw_r2r(mm1, mm0);
 344     movq_r2m(mm7, *(out + offset + 0 * 8)); // store output row 0
 345     pmulhw_r2r(mm4, mm6);
 346     movq_r2m(mm5, *(out + offset + 6 * 8)); // store output row 6
 347     movq_r2r(mm3, mm7);
 348     movq_m2r(*(fdct_tg_all_16 + 8), mm5);
 349     psubsw_r2r(mm2, mm7);
 350     paddsw_r2r(mm2, mm3);
 351     pmulhw_r2r(mm7, mm5);
 352     paddsw_r2r(mm3, mm0);
 353     paddsw_r2r(mm4, mm6); // add mm4 back: the constant at fdct_tg_all_16 + 8 is stored minus 1.0
 354     pmulhw_m2r(*(fdct_tg_all_16 + 0), mm3);
 355     por_m2r(fdct_one_corr, mm0); // force the LSB of every lane of output row 1
 356     paddsw_r2r(mm7, mm5); // add mm7 back, for the same reason as above
 357     psubsw_r2r(mm6, mm7);
 358     movq_r2m(mm0, *(out + offset + 1 * 8)); // store output row 1
 359     paddsw_r2r(mm4, mm5);
 360     movq_r2m(mm7, *(out + offset + 3 * 8)); // store output row 3
 361     psubsw_r2r(mm1, mm3);
 362     movq_r2m(mm5, *(out + offset + 5 * 8)); // store output row 5
 363     movq_r2m(mm3, *(out + offset + 7 * 8)); // store output row 7
 364 }
 365
 366
 367 static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out)
 368 {
 369     asm volatile(
 370 #define FDCT_ROW_SSE2_H1(i,t)                    \
 371         "movq      " #i "(%0), %%xmm2      \n\t" \
 372         "movq      " #i "+8(%0), %%xmm0    \n\t" \
 373         "movdqa    " #t "+32(%1), %%xmm3   \n\t" \
 374         "movdqa    " #t "+48(%1), %%xmm7   \n\t" \
 375         "movdqa    " #t "(%1), %%xmm4      \n\t" \
 376         "movdqa    " #t "+16(%1), %%xmm5   \n\t"
 377
 378 #define FDCT_ROW_SSE2_H2(i,t)                    \
 379         "movq      " #i "(%0), %%xmm2      \n\t" \
 380         "movq      " #i "+8(%0), %%xmm0    \n\t" \
 381         "movdqa    " #t "+32(%1), %%xmm3   \n\t" \
 382         "movdqa    " #t "+48(%1), %%xmm7   \n\t"
 383
 384 #define FDCT_ROW_SSE2(i)                      \
 385         "movq      %%xmm2, %%xmm1       \n\t" \
 386         "pshuflw   $27, %%xmm0, %%xmm0  \n\t" \
 387         "paddsw    %%xmm0, %%xmm1       \n\t" \
 388         "psubsw    %%xmm0, %%xmm2       \n\t" \
 389         "punpckldq %%xmm2, %%xmm1       \n\t" \
 390         "pshufd    $78, %%xmm1, %%xmm2  \n\t" \
 391         "pmaddwd   %%xmm2, %%xmm3       \n\t" \
 392         "pmaddwd   %%xmm1, %%xmm7       \n\t" \
 393         "pmaddwd   %%xmm5, %%xmm2       \n\t" \
 394         "pmaddwd   %%xmm4, %%xmm1       \n\t" \
 395         "paddd     %%xmm7, %%xmm3       \n\t" \
 396         "paddd     %%xmm2, %%xmm1       \n\t" \
 397         "paddd     %%xmm6, %%xmm3       \n\t" \
 398         "paddd     %%xmm6, %%xmm1       \n\t" \
 399         "psrad     %3, %%xmm3           \n\t" \
 400         "psrad     %3, %%xmm1           \n\t" \
 401         "packssdw  %%xmm3, %%xmm1       \n\t" \
 402         "movdqa    %%xmm1, " #i "(%4)   \n\t"
 403
 404         "movdqa    (%2), %%xmm6         \n\t"
 405         FDCT_ROW_SSE2_H1(0,0)
 406         FDCT_ROW_SSE2(0)
 407         FDCT_ROW_SSE2_H2(64,0)
 408         FDCT_ROW_SSE2(64)
 409
 410         FDCT_ROW_SSE2_H1(16,64)
 411         FDCT_ROW_SSE2(16)
 412         FDCT_ROW_SSE2_H2(112,64)
 413         FDCT_ROW_SSE2(112)
 414
 415         FDCT_ROW_SSE2_H1(32,128)
 416         FDCT_ROW_SSE2(32)
 417         FDCT_ROW_SSE2_H2(96,128)
 418         FDCT_ROW_SSE2(96)
 419
 420         FDCT_ROW_SSE2_H1(48,192)
 421         FDCT_ROW_SSE2(48)
 422         FDCT_ROW_SSE2_H2(80,192)
 423         FDCT_ROW_SSE2(80)
 424         :
 425         : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2), "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out)
 426     );
 427 }
 428
 /**
  * Row pass of the forward DCT for one row, MMX2 version (uses pshufw).
  *
  * Reads the eight values in[0..7], combines in[k] with in[7-k], multiplies
  * the result against the eight coefficient quadwords table[0..31], adds the
  * rounding term fdct_r_row, shifts right by SHIFT_FRW_ROW and stores eight
  * saturated int16_t to out[0..7].
  *
  * The *_m2r/_r2r/_i2r/_r2m helpers come from mmx.h, which is not visible
  * here; the comments assume the usual (source, destination) operand order
  * and MMX semantics.
  *
  * @param in    one row of 8 int16_t produced by the column pass
  * @param out   destination for the 8 transformed values
  * @param table the 32-entry coefficient set for this row
  */
 429 static av_always_inline void fdct_row_mmx2(const int16_t *in, int16_t *out, const int16_t *table)
 430 {
 431     pshufw_m2r(*(in + 4), mm5, 0x1B); // mm5 = in[7], in[6], in[5], in[4] (0x1B reverses the four words)
 432     movq_m2r(*(in + 0), mm0); // mm0 = in[0..3]
 433     movq_r2r(mm0, mm1);
 434     paddsw_r2r(mm5, mm0); // mm0 = in[k] + in[7-k], k = 0..3
 435     psubsw_r2r(mm5, mm1); // mm1 = in[k] - in[7-k], k = 0..3
 436     movq_r2r(mm0, mm2);
 437     punpckldq_r2r(mm1, mm0); // mm0 = low dwords of the sums and of the differences
 438     punpckhdq_r2r(mm1, mm2); // mm2 = high dwords of the sums and of the differences
 439     movq_m2r(*(table + 0), mm1); // six of the eight coefficient quadwords are preloaded ...
 440     movq_m2r(*(table + 4), mm3);
 441     movq_m2r(*(table + 8), mm4);
 442     movq_m2r(*(table + 12), mm5);
 443     movq_m2r(*(table + 16), mm6);
 444     movq_m2r(*(table + 20), mm7);
 445     pmaddwd_r2r(mm0, mm1);
 446     pmaddwd_r2r(mm2, mm3);
 447     pmaddwd_r2r(mm0, mm4);
 448     pmaddwd_r2r(mm2, mm5);
 449     pmaddwd_r2r(mm0, mm6);
 450     pmaddwd_r2r(mm2, mm7);
 451     pmaddwd_m2r(*(table + 24), mm0); // ... the last two are used straight from memory
 452     pmaddwd_m2r(*(table + 28), mm2);
 453     paddd_r2r(mm1, mm3); // add the partial products pairwise
 454     paddd_r2r(mm4, mm5);
 455     paddd_r2r(mm6, mm7);
 456     paddd_r2r(mm0, mm2);
 457     movq_m2r(*fdct_r_row, mm0); // rounding term
 458     paddd_r2r(mm0, mm3);
 459     paddd_r2r(mm0, mm5);
 460     paddd_r2r(mm0, mm7);
 461     paddd_r2r(mm0, mm2);
 462     psrad_i2r(SHIFT_FRW_ROW, mm3); // scale the 32-bit sums back down
 463     psrad_i2r(SHIFT_FRW_ROW, mm5);
 464     psrad_i2r(SHIFT_FRW_ROW, mm7);
 465     psrad_i2r(SHIFT_FRW_ROW, mm2);
 466     packssdw_r2r(mm5, mm3); // pack to int16 with signed saturation
 467     packssdw_r2r(mm2, mm7);
 468     movq_r2m(mm3, *(out + 0)); // out[0..3]
 469     movq_r2m(mm7, *(out + 4)); // out[4..7]
 470 }
 471
 /**
  * Row pass of the forward DCT for one row, plain MMX version.
  *
  * Identical to fdct_row_mmx2 from the sum/difference step onwards; the only
  * difference is that the reversed upper half in[7], in[6], in[5], in[4] is
  * assembled with movd/punpcklwd/psrlq instead of a single pshufw.
  *
  * The *_m2r/_r2r/_i2r/_r2m helpers come from mmx.h, which is not visible
  * here; the comments assume the usual (source, destination) operand order
  * and MMX semantics.
  *
  * @param in    one row of 8 int16_t produced by the column pass
  * @param out   destination for the 8 transformed values
  * @param table the 32-entry coefficient set for this row
  */
 472 static av_always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table)
 473 {
 474 // FIXME: reorder the instructions (I don't have an old MMX-only CPU here to benchmark ...)
 475     movd_m2r(*(in + 6), mm1); // mm1 = in[6], in[7], 0, 0
 476     punpcklwd_m2r(*(in + 4), mm1); // mm1 = in[6], in[4], in[7], in[5]
 477     movq_r2r(mm1, mm2);
 478     psrlq_i2r(0x20, mm1); // mm1 = in[7], in[5], 0, 0
 479     movq_m2r(*(in + 0), mm0); // mm0 = in[0..3]
 480     punpcklwd_r2r(mm2, mm1); // mm1 = in[7], in[6], in[5], in[4]
 481     movq_r2r(mm0, mm5);
 482     paddsw_r2r(mm1, mm0); // mm0 = in[k] + in[7-k], k = 0..3
 483     psubsw_r2r(mm1, mm5); // mm5 = in[k] - in[7-k], k = 0..3
 484     movq_r2r(mm0, mm2);
 485     punpckldq_r2r(mm5, mm0); // mm0 = low dwords of the sums and of the differences
 486     punpckhdq_r2r(mm5, mm2); // mm2 = high dwords of the sums and of the differences
 487     movq_m2r(*(table + 0), mm1); // six of the eight coefficient quadwords are preloaded ...
 488     movq_m2r(*(table + 4), mm3);
 489     movq_m2r(*(table + 8), mm4);
 490     movq_m2r(*(table + 12), mm5);
 491     movq_m2r(*(table + 16), mm6);
 492     movq_m2r(*(table + 20), mm7);
 493     pmaddwd_r2r(mm0, mm1);
 494     pmaddwd_r2r(mm2, mm3);
 495     pmaddwd_r2r(mm0, mm4);
 496     pmaddwd_r2r(mm2, mm5);
 497     pmaddwd_r2r(mm0, mm6);
 498     pmaddwd_r2r(mm2, mm7);
 499     pmaddwd_m2r(*(table + 24), mm0); // ... the last two are used straight from memory
 500     pmaddwd_m2r(*(table + 28), mm2);
 501     paddd_r2r(mm1, mm3); // add the partial products pairwise
 502     paddd_r2r(mm4, mm5);
 503     paddd_r2r(mm6, mm7);
 504     paddd_r2r(mm0, mm2);
 505     movq_m2r(*fdct_r_row, mm0); // rounding term
 506     paddd_r2r(mm0, mm3);
 507     paddd_r2r(mm0, mm5);
 508     paddd_r2r(mm0, mm7);
 509     paddd_r2r(mm0, mm2);
 510     psrad_i2r(SHIFT_FRW_ROW, mm3); // scale the 32-bit sums back down
 511     psrad_i2r(SHIFT_FRW_ROW, mm5);
 512     psrad_i2r(SHIFT_FRW_ROW, mm7);
 513     psrad_i2r(SHIFT_FRW_ROW, mm2);
 514     packssdw_r2r(mm5, mm3); // pack to int16 with signed saturation
 515     packssdw_r2r(mm2, mm7);
 516     movq_r2m(mm3, *(out + 0)); // out[0..3]
 517     movq_r2m(mm7, *(out + 4)); // out[4..7]
 518 }
 519
 520 void ff_fdct_mmx(int16_t *block)
 521 {
 522     int64_t align_tmp[16] ATTR_ALIGN(8);
 523     int16_t * block1= (int16_t*)align_tmp;
 524     const int16_t *table= tab_frw_01234567;
 525     int i;
 526
 527     fdct_col(block, block1, 0);
 528     fdct_col(block, block1, 4);
 529
 530     for(i=8;i>0;i--) {
 531         fdct_row_mmx(block1, block, table);
 532         block1 += 8;
 533         table += 32;
 534         block += 8;
 535     }
 536 }
 537
 538 void ff_fdct_mmx2(int16_t *block)
 539 {
 540     int64_t align_tmp[16] ATTR_ALIGN(8);
 541     int16_t *block1= (int16_t*)align_tmp;
 542     const int16_t *table= tab_frw_01234567;
 543     int i;
 544
 545     fdct_col(block, block1, 0);
 546     fdct_col(block, block1, 4);
 547
 548     for(i=8;i>0;i--) {
 549         fdct_row_mmx2(block1, block, table);
 550         block1 += 8;
 551         table += 32;
 552         block += 8;
 553     }
 554 }
 555
 556 void ff_fdct_sse2(int16_t *block)
 557 {
 558     int64_t align_tmp[16] ATTR_ALIGN(16);
 559     int16_t * const block1= (int16_t*)align_tmp;
 560
 561     fdct_col(block, block1, 0);
 562     fdct_col(block, block1, 4);
 563
 564     fdct_row_sse2(block1, block);
 565 }
 566