git.sesse.net Git - ffmpeg/blob - libavcodec/i386/vp3dsp_mmx.c

   1 /*
   2  * Copyright (C) 2004 the ffmpeg project
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public
   6  * License as published by the Free Software Foundation; either
   7  * version 2 of the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  17  */
  18
  19 /**
  20  * @file vp3dsp_mmx.c
  21  * MMX-optimized functions cribbed from the original VP3 source code.
  22  */
  23
  24 #include "../dsputil.h"
  25 #include "mmx.h"
  26
  27 #define IdctAdjustBeforeShift 8
  28
  29 /* (12 * 4) 2-byte memory locations ( = 96 bytes total)
  30  * idct_constants[0..15] = Mask table (M(I))
  31  * idct_constants[16..43] = Cosine table (C(I))
  32  * idct_constants[44..47] = 8
  33  */
  34 static uint16_t idct_constants[(4 + 7 + 1) * 4];
  35 static const uint16_t idct_cosine_table[7] = {
  36     64277, 60547, 54491, 46341, 36410, 25080, 12785
  37 };
  38
  39 #define r0 mm0
  40 #define r1 mm1
  41 #define r2 mm2
  42 #define r3 mm3
  43 #define r4 mm4
  44 #define r5 mm5
  45 #define r6 mm6
  46 #define r7 mm7
  47
  48 /* from original comments: The Macro does IDct on 4 1-D Dcts */
  49 #define BeginIDCT() { \
  50     movq_m2r(*I(3), r2); \
  51     movq_m2r(*C(3), r6); \
  52     movq_r2r(r2, r4); \
  53     movq_m2r(*J(5), r7); \
  54     pmulhw_r2r(r6, r4);       /* r4 = c3*i3 - i3 */ \
  55     movq_m2r(*C(5), r1); \
  56     pmulhw_r2r(r7, r6);       /* r6 = c3*i5 - i5 */ \
  57     movq_r2r(r1, r5); \
  58     pmulhw_r2r(r2, r1);       /* r1 = c5*i3 - i3 */ \
  59     movq_m2r(*I(1), r3); \
  60     pmulhw_r2r(r7, r5);       /* r5 = c5*i5 - i5 */ \
  61     movq_m2r(*C(1), r0);      /* (all registers are in use) */ \
  62     paddw_r2r(r2, r4);        /* r4 = c3*i3 */ \
  63     paddw_r2r(r7, r6);        /* r6 = c3*i5 */ \
  64     paddw_r2r(r1, r2);        /* r2 = c5*i3 */ \
  65     movq_m2r(*J(7), r1); \
  66     paddw_r2r(r5, r7);        /* r7 = c5*i5 */ \
  67     movq_r2r(r0, r5);         /* r5 = c1 */ \
  68     pmulhw_r2r(r3, r0);       /* r0 = c1*i1 - i1 */ \
  69     paddsw_r2r(r7, r4);       /* r4 = C = c3*i3 + c5*i5 */ \
  70     pmulhw_r2r(r1, r5);       /* r5 = c1*i7 - i7 */ \
  71     movq_m2r(*C(7), r7); \
  72     psubsw_r2r(r2, r6);       /* r6 = D = c3*i5 - c5*i3 */ \
  73     paddw_r2r(r3, r0);        /* r0 = c1*i1 */ \
  74     pmulhw_r2r(r7, r3);       /* r3 = c7*i1 */ \
  75     movq_m2r(*I(2), r2); \
  76     pmulhw_r2r(r1, r7);       /* r7 = c7*i7 */ \
  77     paddw_r2r(r1, r5);        /* r5 = c1*i7 */ \
  78     movq_r2r(r2, r1);         /* r1 = i2 */ \
  79     pmulhw_m2r(*C(2), r2);    /* r2 = c2*i2 - i2 */ \
  80     psubsw_r2r(r5, r3);       /* r3 = B = c7*i1 - c1*i7 */ \
  81     movq_m2r(*J(6), r5); \
  82     paddsw_r2r(r7, r0);       /* r0 = A = c1*i1 + c7*i7 */ \
  83     movq_r2r(r5, r7);         /* r7 = i6 */ \
  84     psubsw_r2r(r4, r0);       /* r0 = A - C */ \
  85     pmulhw_m2r(*C(2), r5);    /* r5 = c2*i6 - i6 */ \
  86     paddw_r2r(r1, r2);        /* r2 = c2*i2 */ \
  87     pmulhw_m2r(*C(6), r1);    /* r1 = c6*i2 */ \
  88     paddsw_r2r(r4, r4);       /* r4 = C + C */ \
  89     paddsw_r2r(r0, r4);       /* r4 = C. = A + C */ \
  90     psubsw_r2r(r6, r3);       /* r3 = B - D */ \
  91     paddw_r2r(r7, r5);        /* r5 = c2*i6 */ \
  92     paddsw_r2r(r6, r6);       /* r6 = D + D */ \
  93     pmulhw_m2r(*C(6), r7);    /* r7 = c6*i6 */ \
  94     paddsw_r2r(r3, r6);       /* r6 = D. = B + D */ \
  95     movq_r2m(r4, *I(1));      /* save C. at I(1) */ \
  96     psubsw_r2r(r5, r1);       /* r1 = H = c6*i2 - c2*i6 */ \
  97     movq_m2r(*C(4), r4); \
  98     movq_r2r(r3, r5);         /* r5 = B - D */ \
  99     pmulhw_r2r(r4, r3);       /* r3 = (c4 - 1) * (B - D) */ \
 100     paddsw_r2r(r2, r7);       /* r7 = G = c6*i6 + c2*i2 */ \
 101     movq_r2m(r6, *I(2));      /* save D. at I(2) */ \
 102     movq_r2r(r0, r2);         /* r2 = A - C */ \
 103     movq_m2r(*I(0), r6); \
 104     pmulhw_r2r(r4, r0);       /* r0 = (c4 - 1) * (A - C) */ \
 105     paddw_r2r(r3, r5);        /* r5 = B. = c4 * (B - D) */ \
 106     movq_m2r(*J(4), r3); \
 107     psubsw_r2r(r1, r5);       /* r5 = B.. = B. - H */ \
 108     paddw_r2r(r0, r2);        /* r0 = A. = c4 * (A - C) */ \
 109     psubsw_r2r(r3, r6);       /* r6 = i0 - i4 */ \
 110     movq_r2r(r6, r0); \
 111     pmulhw_r2r(r4, r6);       /* r6 = (c4 - 1) * (i0 - i4) */ \
 112     paddsw_r2r(r3, r3);       /* r3 = i4 + i4 */ \
 113     paddsw_r2r(r1, r1);       /* r1 = H + H */ \
 114     paddsw_r2r(r0, r3);       /* r3 = i0 + i4 */ \
 115     paddsw_r2r(r5, r1);       /* r1 = H. = B + H */ \
 116     pmulhw_r2r(r3, r4);       /* r4 = (c4 - 1) * (i0 + i4) */ \
 117     paddsw_r2r(r0, r6);       /* r6 = F = c4 * (i0 - i4) */ \
 118     psubsw_r2r(r2, r6);       /* r6 = F. = F - A. */ \
 119     paddsw_r2r(r2, r2);       /* r2 = A. + A. */ \
 120     movq_m2r(*I(1), r0);      /* r0 = C. */ \
 121     paddsw_r2r(r6, r2);       /* r2 = A.. = F + A. */ \
 122     paddw_r2r(r3, r4);        /* r4 = E = c4 * (i0 + i4) */ \
 123     psubsw_r2r(r1, r2);       /* r2 = R2 = A.. - H. */ \
 124 }
 125
 126 /* RowIDCT gets ready to transpose */
 127 #define RowIDCT() { \
 128     \
 129     BeginIDCT(); \
 130     \
 131     movq_m2r(*I(2), r3);   /* r3 = D. */ \
 132     psubsw_r2r(r7, r4);    /* r4 = E. = E - G */ \
 133     paddsw_r2r(r1, r1);    /* r1 = H. + H. */ \
 134     paddsw_r2r(r7, r7);    /* r7 = G + G */ \
 135     paddsw_r2r(r2, r1);    /* r1 = R1 = A.. + H. */ \
 136     paddsw_r2r(r4, r7);    /* r7 = G. = E + G */ \
 137     psubsw_r2r(r3, r4);    /* r4 = R4 = E. - D. */ \
 138     paddsw_r2r(r3, r3); \
 139     psubsw_r2r(r5, r6);    /* r6 = R6 = F. - B.. */ \
 140     paddsw_r2r(r5, r5); \
 141     paddsw_r2r(r4, r3);    /* r3 = R3 = E. + D. */ \
 142     paddsw_r2r(r6, r5);    /* r5 = R5 = F. + B.. */ \
 143     psubsw_r2r(r0, r7);    /* r7 = R7 = G. - C. */ \
 144     paddsw_r2r(r0, r0); \
 145     movq_r2m(r1, *I(1));   /* save R1 */ \
 146     paddsw_r2r(r7, r0);    /* r0 = R0 = G. + C. */ \
 147 }
 148
 149 /* Column IDCT normalizes and stores final results */
 150 #define ColumnIDCT() { \
 151     \
 152     BeginIDCT(); \
 153     \
 154     paddsw_m2r(*Eight, r2);    /* adjust R2 (and R1) for shift */ \
 155     paddsw_r2r(r1, r1);        /* r1 = H. + H. */ \
 156     paddsw_r2r(r2, r1);        /* r1 = R1 = A.. + H. */ \
 157     psraw_i2r(4, r2);          /* r2 = NR2 */ \
 158     psubsw_r2r(r7, r4);        /* r4 = E. = E - G */ \
 159     psraw_i2r(4, r1);          /* r1 = NR1 */ \
 160     movq_m2r(*I(2), r3);       /* r3 = D. */ \
 161     paddsw_r2r(r7, r7);        /* r7 = G + G */ \
 162     movq_r2m(r2, *I(2));       /* store NR2 at I2 */ \
 163     paddsw_r2r(r4, r7);        /* r7 = G. = E + G */ \
 164     movq_r2m(r1, *I(1));       /* store NR1 at I1 */ \
 165     psubsw_r2r(r3, r4);        /* r4 = R4 = E. - D. */ \
 166     paddsw_m2r(*Eight, r4);    /* adjust R4 (and R3) for shift */ \
 167     paddsw_r2r(r3, r3);        /* r3 = D. + D. */ \
 168     paddsw_r2r(r4, r3);        /* r3 = R3 = E. + D. */ \
 169     psraw_i2r(4, r4);          /* r4 = NR4 */ \
 170     psubsw_r2r(r5, r6);        /* r6 = R6 = F. - B.. */ \
 171     psraw_i2r(4, r3);          /* r3 = NR3 */ \
 172     paddsw_m2r(*Eight, r6);    /* adjust R6 (and R5) for shift */ \
 173     paddsw_r2r(r5, r5);        /* r5 = B.. + B.. */ \
 174     paddsw_r2r(r6, r5);        /* r5 = R5 = F. + B.. */ \
 175     psraw_i2r(4, r6);          /* r6 = NR6 */ \
 176     movq_r2m(r4, *J(4));       /* store NR4 at J4 */ \
 177     psraw_i2r(4, r5);          /* r5 = NR5 */ \
 178     movq_r2m(r3, *I(3));       /* store NR3 at I3 */ \
 179     psubsw_r2r(r0, r7);        /* r7 = R7 = G. - C. */ \
 180     paddsw_m2r(*Eight, r7);    /* adjust R7 (and R0) for shift */ \
 181     paddsw_r2r(r0, r0);        /* r0 = C. + C. */ \
 182     paddsw_r2r(r7, r0);        /* r0 = R0 = G. + C. */ \
 183     psraw_i2r(4, r7);          /* r7 = NR7 */ \
 184     movq_r2m(r6, *J(6));       /* store NR6 at J6 */ \
 185     psraw_i2r(4, r0);          /* r0 = NR0 */ \
 186     movq_r2m(r5, *J(5));       /* store NR5 at J5 */ \
 187     movq_r2m(r7, *J(7));       /* store NR7 at J7 */ \
 188     movq_r2m(r0, *I(0));       /* store NR0 at I0 */ \
 189 }
 190
 191 /* Following macro does two 4x4 transposes in place.
 192
 193   At entry (we assume):
 194
 195     r0 = a3 a2 a1 a0
 196     I(1) = b3 b2 b1 b0
 197     r2 = c3 c2 c1 c0
 198     r3 = d3 d2 d1 d0
 199
 200     r4 = e3 e2 e1 e0
 201     r5 = f3 f2 f1 f0
 202     r6 = g3 g2 g1 g0
 203     r7 = h3 h2 h1 h0
 204
 205   At exit, we have:
 206
 207     I(0) = d0 c0 b0 a0
 208     I(1) = d1 c1 b1 a1
 209     I(2) = d2 c2 b2 a2
 210     I(3) = d3 c3 b3 a3
 211
 212     J(4) = h0 g0 f0 e0
 213     J(5) = h1 g1 f1 e1
 214     J(6) = h2 g2 f2 e2
 215     J(7) = h3 g3 f3 e3
 216
 217    I(0) I(1) I(2) I(3)  is the transpose of r0 I(1) r2 r3.
 218    J(4) J(5) J(6) J(7)  is the transpose of r4 r5 r6 r7.
 219
 220    Since r1 is free at entry, we calculate the Js first. */
 221
 222 #define Transpose() { \
 223     movq_r2r(r4, r1);         /* r1 = e3 e2 e1 e0 */ \
 224     punpcklwd_r2r(r5, r4);    /* r4 = f1 e1 f0 e0 */ \
 225     movq_r2m(r0, *I(0));      /* save a3 a2 a1 a0 */ \
 226     punpckhwd_r2r(r5, r1);    /* r1 = f3 e3 f2 e2 */ \
 227     movq_r2r(r6, r0);         /* r0 = g3 g2 g1 g0 */ \
 228     punpcklwd_r2r(r7, r6);    /* r6 = h1 g1 h0 g0 */ \
 229     movq_r2r(r4, r5);         /* r5 = f1 e1 f0 e0 */ \
 230     punpckldq_r2r(r6, r4);    /* r4 = h0 g0 f0 e0 = R4 */ \
 231     punpckhdq_r2r(r6, r5);    /* r5 = h1 g1 f1 e1 = R5 */ \
 232     movq_r2r(r1, r6);         /* r6 = f3 e3 f2 e2 */ \
 233     movq_r2m(r4, *J(4)); \
 234     punpckhwd_r2r(r7, r0);    /* r0 = h3 g3 h2 g2 */ \
 235     movq_r2m(r5, *J(5)); \
 236     punpckhdq_r2r(r0, r6);    /* r6 = h3 g3 f3 e3 = R7 */ \
 237     movq_m2r(*I(0), r4);      /* r4 = a3 a2 a1 a0 */ \
 238     punpckldq_r2r(r0, r1);    /* r1 = h2 g2 f2 e2 = R6 */ \
 239     movq_m2r(*I(1), r5);      /* r5 = b3 b2 b1 b0 */ \
 240     movq_r2r(r4, r0);         /* r0 = a3 a2 a1 a0 */ \
 241     movq_r2m(r6, *J(7)); \
 242     punpcklwd_r2r(r5, r0);    /* r0 = b1 a1 b0 a0 */ \
 243     movq_r2m(r1, *J(6)); \
 244     punpckhwd_r2r(r5, r4);    /* r4 = b3 a3 b2 a2 */ \
 245     movq_r2r(r2, r5);         /* r5 = c3 c2 c1 c0 */ \
 246     punpcklwd_r2r(r3, r2);    /* r2 = d1 c1 d0 c0 */ \
 247     movq_r2r(r0, r1);         /* r1 = b1 a1 b0 a0 */ \
 248     punpckldq_r2r(r2, r0);    /* r0 = d0 c0 b0 a0 = R0 */ \
 249     punpckhdq_r2r(r2, r1);    /* r1 = d1 c1 b1 a1 = R1 */ \
 250     movq_r2r(r4, r2);         /* r2 = b3 a3 b2 a2 */ \
 251     movq_r2m(r0, *I(0)); \
 252     punpckhwd_r2r(r3, r5);    /* r5 = d3 c3 d2 c2 */ \
 253     movq_r2m(r1, *I(1)); \
 254     punpckhdq_r2r(r5, r4);    /* r4 = d3 c3 b3 a3 = R3 */ \
 255     punpckldq_r2r(r5, r2);    /* r2 = d2 c2 b2 a2 = R2 */ \
 256     movq_r2m(r4, *I(3)); \
 257     movq_r2m(r2, *I(2)); \
 258 }
 259
 260 void ff_vp3_dsp_init_mmx(void)
 261 {
 262     int j = 16;
 263     uint16_t *p;
 264
 265     j = 1;
 266     do {
 267         p = idct_constants + ((j + 3) << 2);
 268         p[0] = p[1] = p[2] = p[3] = idct_cosine_table[j - 1];
 269     } while (++j <= 7);
 270
 271     idct_constants[44] = idct_constants[45] =
 272     idct_constants[46] = idct_constants[47] = IdctAdjustBeforeShift;
 273 }
 274
 275 void ff_vp3_idct_mmx(int16_t *output_data)
 276 {
 277     /* eax = quantized input
 278      * ebx = dequantizer matrix
 279      * ecx = IDCT constants
 280      *  M(I) = ecx + MaskOffset(0) + I * 8
 281      *  C(I) = ecx + CosineOffset(32) + (I-1) * 8
 282      * edx = output
 283      * r0..r7 = mm0..mm7
 284      */
 285
 286 #define C(x) (idct_constants + 16 + (x - 1) * 4)
 287 #define Eight (idct_constants + 44)
 288
 289     /* at this point, function has completed dequantization + dezigzag +
 290      * partial transposition; now do the idct itself */
 291 #define I(K) (output_data + K * 8)
 292 #define J(K) (output_data + ((K - 4) * 8) + 4)
 293
 294     RowIDCT();
 295     Transpose();
 296
 297 #undef I
 298 #undef J
 299 #define I(K) (output_data + (K * 8) + 32)
 300 #define J(K) (output_data + ((K - 4) * 8) + 36)
 301
 302     RowIDCT();
 303     Transpose();
 304
 305 #undef I
 306 #undef J
 307 #define I(K) (output_data + K * 8)
 308 #define J(K) (output_data + K * 8)
 309
 310     ColumnIDCT();
 311
 312 #undef I
 313 #undef J
 314 #define I(K) (output_data + (K * 8) + 4)
 315 #define J(K) (output_data + (K * 8) + 4)
 316
 317     ColumnIDCT();
 318
 319 #undef I
 320 #undef J
 321
 322 }