/*
 * Alpha optimized DSP utils
 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
 */
#include "asm.h"
#include "../dsputil.h"

void simple_idct_axp(DCTELEM *block);

void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
                        int line_size, int h);
void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
                                int line_size);
void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
                                int line_size);
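
/* Prototypes only: the _asm/_mvi_asm routines are implemented in the
   assembler part of the Alpha port, not in this file.  */
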
/* These functions were the base for the optimized assembler routines,
   and remain here for documentation purposes.  */
static void put_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
                                   int line_size)
{
    int i = 8;
    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
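
    /* zap(-1, 0xaa) zeroes bytes 1, 3, 5 and 7 of the all-ones pattern,
       which yields 0x00ff00ff00ff00ff; maxsw4/minsw4 below clamp each
       signed 16-bit coefficient to [0, 255], and pkwb packs the low byte
       of each word into 32 bits for the stl store.  */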
    do {
        uint64_t shorts0, shorts1;

        shorts0 = ldq(block);
        shorts0 = maxsw4(shorts0, 0);
        shorts0 = minsw4(shorts0, clampmask);
        stl(pkwb(shorts0), pixels);

        shorts1 = ldq(block + 4);
        shorts1 = maxsw4(shorts1, 0);
        shorts1 = minsw4(shorts1, clampmask);
        stl(pkwb(shorts1), pixels + 4);

        pixels += line_size;
        block += 8;
    } while (--i);
}

void add_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
                            int line_size)
{
    int h = 8;
    /* Keep this function a leaf function by generating the constants
       manually (mainly for the hack value ;-).  */
    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
    uint64_t signmask  = zap(-1, 0x33);
    signmask ^= signmask >> 1;  /* 0x8000800080008000 */
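
    /* The loop emulates a packed signed 16-bit add (MMX paddw): the sign
       bits of each word are saved and cleared, the unpacked pixel bytes
       are added (the intermediate sums cannot carry across word
       boundaries), and XORing the saved sign bits back restores the
       correct 16-bit results before clamping to [0, 255].  */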
    do {
        uint64_t shorts0, pix0, signs0;
        uint64_t shorts1, pix1, signs1;

        shorts0 = ldq(block);
        shorts1 = ldq(block + 4);

        pix0    = unpkbw(ldl(pixels));
        /* Signed subword add (MMX paddw).  */
        signs0  = shorts0 & signmask;
        shorts0 &= ~signmask;
        shorts0 += pix0;
        shorts0 ^= signs0;
        /* Clamp.  */
        shorts0 = maxsw4(shorts0, 0);
        shorts0 = minsw4(shorts0, clampmask);

        /* Next four words.  */
        pix1    = unpkbw(ldl(pixels + 4));
        signs1  = shorts1 & signmask;
        shorts1 &= ~signmask;
        shorts1 += pix1;
        shorts1 ^= signs1;
        shorts1 = maxsw4(shorts1, 0);
        shorts1 = minsw4(shorts1, clampmask);

        stl(pkwb(shorts0), pixels);
        stl(pkwb(shorts1), pixels + 4);

        pixels += line_size;
        block += 8;
    } while (--h);
}
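
/* Byte-parallel averaging helpers.  For each byte,
 *     avg2(a, b)        == (a + b + 1) >> 1
 *     avg2_no_rnd(a, b) == (a + b) >> 1
 * using a + b == 2 * (a & b) + (a ^ b); masking with BYTE_VEC(0xfe)
 * clears the low bit of every byte before the shift so nothing leaks
 * into the neighbouring byte lane.  */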
static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
{
    return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}

static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}
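
/* avg4() computes (l1 + l2 + l3 + l4 + 2) >> 2 per byte, and
   avg4_no_rnd() the same with rounding constant 1: the upper six bits of
   every byte are pre-shifted and summed in r1, the low two bits plus the
   rounding constant are summed in r2, and neither partial sum can carry
   into the next byte lane.  */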
static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
                    + (l2 & BYTE_VEC(0x03))
                    + (l3 & BYTE_VEC(0x03))
                    + (l4 & BYTE_VEC(0x03))
                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
    return r1 + r2;
}

static inline uint64_t avg4_no_rnd(uint64_t l1, uint64_t l2,
                                   uint64_t l3, uint64_t l4)
{
    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
                    + (l2 & BYTE_VEC(0x03))
                    + (l3 & BYTE_VEC(0x03))
                    + (l4 & BYTE_VEC(0x03))
                    + BYTE_VEC(0x01)) >> 2) & BYTE_VEC(0x03);
    return r1 + r2;
}
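
/* The OP* kernels below cover the four motion-compensation cases on an
   8-pixel-wide block of height h: OP copies a row, OP_X2 averages each
   pixel with its right neighbour (horizontal half-pel), OP_Y2 averages
   with the next row (vertical half-pel), and OP_XY2 averages all four
   neighbours (diagonal half-pel).  pixels[8] supplies the ninth source
   byte needed by the right-shifted copy of the row.  */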
#define OP(LOAD, STORE, INCR)                   \
    do {                                        \
        STORE(LOAD(pixels), block);             \
        pixels += line_size;                    \
        block += INCR;                          \
    } while (--h)

#define OP_X2(LOAD, STORE, INCR)                                \
    do {                                                        \
        uint64_t pix1, pix2;                                    \
                                                                \
        pix1 = LOAD(pixels);                                    \
        pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56);        \
        STORE(AVG2(pix1, pix2), block);                         \
        pixels += line_size;                                    \
        block += INCR;                                          \
    } while (--h)

#define OP_Y2(LOAD, STORE, INCR)                \
    uint64_t pix = LOAD(pixels);                \
    do {                                        \
        uint64_t next_pix;                      \
        pixels += line_size;                    \
        next_pix = LOAD(pixels);                \
        STORE(AVG2(pix, next_pix), block);      \
        block += INCR;                          \
        pix = next_pix;                         \
    } while (--h)

#define OP_XY2(LOAD, STORE, INCR)                                       \
    uint64_t pix1 = LOAD(pixels);                                       \
    uint64_t pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56);           \
    do {                                                                \
        uint64_t next_pix1, next_pix2;                                  \
                                                                        \
        pixels += line_size;                                            \
        next_pix1 = LOAD(pixels);                                       \
        next_pix2 = next_pix1 >> 8 | ((uint64_t) pixels[8] << 56);      \
                                                                        \
        STORE(AVG4(pix1, pix2, next_pix1, next_pix2), block);           \
        block += INCR;                                                  \
        pix1 = next_pix1;                                               \
        pix2 = next_pix2;                                               \
    } while (--h)

#define MAKE_OP(BTYPE, OPNAME, SUFF, OPKIND, STORE, INCR)               \
static void OPNAME ## _pixels ## SUFF ## _axp(BTYPE *block,             \
                                              const uint8_t *pixels,    \
                                              int line_size, int h)     \
{                                                                       \
    if ((size_t) pixels & 0x7) {                                        \
        OPKIND(uldq, STORE, INCR);                                      \
    } else {                                                            \
        OPKIND(ldq, STORE, INCR);                                       \
    }                                                                   \
}

#define PIXOP(BTYPE, OPNAME, STORE, INCR)               \
    MAKE_OP(BTYPE, OPNAME, ,     OP,     STORE, INCR);  \
    MAKE_OP(BTYPE, OPNAME, _x2,  OP_X2,  STORE, INCR);  \
    MAKE_OP(BTYPE, OPNAME, _y2,  OP_Y2,  STORE, INCR);  \
    MAKE_OP(BTYPE, OPNAME, _xy2, OP_XY2, STORE, INCR);
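
/* MAKE_OP tests the source alignment once per call: uldq does an
   unaligned 64-bit load for misaligned sources, while aligned sources
   take the plain ldq path.  PIXOP then instantiates the full-pel, x2,
   y2 and xy2 variants of one operation.  */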

/* Rounding primitives.  */
#define AVG2 avg2
#define AVG4 avg4
#define STORE(l, b) stq(l, b)
PIXOP(uint8_t, put, STORE, line_size);

#undef STORE
#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
PIXOP(uint8_t, avg, STORE, line_size);

/* Not rounding primitives.  */
#undef AVG2
#undef AVG4
#undef STORE
#define AVG2 avg2_no_rnd
#define AVG4 avg4_no_rnd
#define STORE(l, b) stq(l, b)
PIXOP(uint8_t, put_no_rnd, STORE, line_size);

#undef STORE
#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
PIXOP(uint8_t, avg_no_rnd, STORE, line_size);
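
/* The put variants overwrite the destination, while the avg variants'
   STORE reads the destination back with ldq and blends it with the new
   value via AVG2 before storing.  */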

void dsputil_init_alpha(void)
{
    put_pixels_tab[0] = put_pixels_axp_asm;
    put_pixels_tab[1] = put_pixels_x2_axp;
    put_pixels_tab[2] = put_pixels_y2_axp;
    put_pixels_tab[3] = put_pixels_xy2_axp;

    put_no_rnd_pixels_tab[0] = put_pixels_axp_asm;
    put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_axp;
    put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_axp;
    put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_axp;

    avg_pixels_tab[0] = avg_pixels_axp;
    avg_pixels_tab[1] = avg_pixels_x2_axp;
    avg_pixels_tab[2] = avg_pixels_y2_axp;
    avg_pixels_tab[3] = avg_pixels_xy2_axp;

    avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_axp;
    avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_axp;
    avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_axp;
    avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_axp;

    /* amask clears all bits that correspond to present features.  */
    if (amask(AMASK_MVI) == 0) {
        put_pixels_clamped = put_pixels_clamped_mvi_asm;
        add_pixels_clamped = add_pixels_clamped_mvi_asm;
    }
}