git.sesse.net Git - ffmpeg/blob - libavcodec/vc1dsp.c

   1 /*
   2  * VC-1 and WMV3 decoder - DSP functions
   3  * Copyright (c) 2006 Konstantin Shishkov
   4  *
   5  * This file is part of Libav.
   6  *
   7  * Libav is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * Libav is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with Libav; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file
  24  * VC-1 and WMV3 decoder
  25  *
  26  */
  27
  28 #include "libavutil/common.h"
  29 #include "h264chroma.h"
  30 #include "vc1dsp.h"
  31
  32 /* Apply overlap transform to horizontal edge */
  33 static void vc1_v_overlap_c(uint8_t *src, int stride)
  34 {
  35     int i;
  36     int a, b, c, d;
  37     int d1, d2;
  38     int rnd = 1;
  39     for (i = 0; i < 8; i++) {
  40         a  = src[-2 * stride];
  41         b  = src[-stride];
  42         c  = src[0];
  43         d  = src[stride];
  44         d1 = (a - d + 3 + rnd) >> 3;
  45         d2 = (a - d + b - c + 4 - rnd) >> 3;
  46
  47         src[-2 * stride] = a - d1;
  48         src[-stride]     = av_clip_uint8(b - d2);
  49         src[0]           = av_clip_uint8(c + d2);
  50         src[stride]      = d + d1;
  51         src++;
  52         rnd = !rnd;
  53     }
  54 }
  55
  56 /* Apply overlap transform to vertical edge */
  57 static void vc1_h_overlap_c(uint8_t *src, int stride)
  58 {
  59     int i;
  60     int a, b, c, d;
  61     int d1, d2;
  62     int rnd = 1;
  63     for (i = 0; i < 8; i++) {
  64         a  = src[-2];
  65         b  = src[-1];
  66         c  = src[0];
  67         d  = src[1];
  68         d1 = (a - d + 3 + rnd) >> 3;
  69         d2 = (a - d + b - c + 4 - rnd) >> 3;
  70
  71         src[-2] = a - d1;
  72         src[-1] = av_clip_uint8(b - d2);
  73         src[0]  = av_clip_uint8(c + d2);
  74         src[1]  = d + d1;
  75         src    += stride;
  76         rnd     = !rnd;
  77     }
  78 }
  79
  80 static void vc1_v_s_overlap_c(int16_t *top, int16_t *bottom)
  81 {
  82     int i;
  83     int a, b, c, d;
  84     int d1, d2;
  85     int rnd1 = 4, rnd2 = 3;
  86     for (i = 0; i < 8; i++) {
  87         a  = top[48];
  88         b  = top[56];
  89         c  = bottom[0];
  90         d  = bottom[8];
  91         d1 = a - d;
  92         d2 = a - d + b - c;
  93
  94         top[48]   = ((a << 3) - d1 + rnd1) >> 3;
  95         top[56]   = ((b << 3) - d2 + rnd2) >> 3;
  96         bottom[0] = ((c << 3) + d2 + rnd1) >> 3;
  97         bottom[8] = ((d << 3) + d1 + rnd2) >> 3;
  98
  99         bottom++;
 100         top++;
 101         rnd2 = 7 - rnd2;
 102         rnd1 = 7 - rnd1;
 103     }
 104 }
 105
 106 static void vc1_h_s_overlap_c(int16_t *left, int16_t *right)
 107 {
 108     int i;
 109     int a, b, c, d;
 110     int d1, d2;
 111     int rnd1 = 4, rnd2 = 3;
 112     for (i = 0; i < 8; i++) {
 113         a  = left[6];
 114         b  = left[7];
 115         c  = right[0];
 116         d  = right[1];
 117         d1 = a - d;
 118         d2 = a - d + b - c;
 119
 120         left[6]  = ((a << 3) - d1 + rnd1) >> 3;
 121         left[7]  = ((b << 3) - d2 + rnd2) >> 3;
 122         right[0] = ((c << 3) + d2 + rnd1) >> 3;
 123         right[1] = ((d << 3) + d1 + rnd2) >> 3;
 124
 125         right += 8;
 126         left  += 8;
 127         rnd2   = 7 - rnd2;
 128         rnd1   = 7 - rnd1;
 129     }
 130 }
 131
 132 /**
 133  * VC-1 in-loop deblocking filter for one line
 134  * @param src source block type
 135  * @param stride block stride
 136  * @param pq block quantizer
 137  * @return whether other 3 pairs should be filtered or not
 138  * @see 8.6
 139  */
 140 static av_always_inline int vc1_filter_line(uint8_t *src, int stride, int pq)
 141 {
 142     int a0 = (2 * (src[-2 * stride] - src[1 * stride]) -
 143               5 * (src[-1 * stride] - src[0 * stride]) + 4) >> 3;
 144     int a0_sign = a0 >> 31;        /* Store sign */
 145
 146     a0 = (a0 ^ a0_sign) - a0_sign; /* a0 = FFABS(a0); */
 147     if (a0 < pq) {
 148         int a1 = FFABS((2 * (src[-4 * stride] - src[-1 * stride]) -
 149                         5 * (src[-3 * stride] - src[-2 * stride]) + 4) >> 3);
 150         int a2 = FFABS((2 * (src[ 0 * stride] - src[ 3 * stride]) -
 151                         5 * (src[ 1 * stride] - src[ 2 * stride]) + 4) >> 3);
 152         if (a1 < a0 || a2 < a0) {
 153             int clip      = src[-1 * stride] - src[0 * stride];
 154             int clip_sign = clip >> 31;
 155
 156             clip = ((clip ^ clip_sign) - clip_sign) >> 1;
 157             if (clip) {
 158                 int a3     = FFMIN(a1, a2);
 159                 int d      = 5 * (a3 - a0);
 160                 int d_sign = (d >> 31);
 161
 162                 d       = ((d ^ d_sign) - d_sign) >> 3;
 163                 d_sign ^= a0_sign;
 164
 165                 if (d_sign ^ clip_sign)
 166                     d = 0;
 167                 else {
 168                     d = FFMIN(d, clip);
 169                     d = (d ^ d_sign) - d_sign; /* Restore sign */
 170                     src[-1 * stride] = av_clip_uint8(src[-1 * stride] - d);
 171                     src[ 0 * stride] = av_clip_uint8(src[ 0 * stride] + d);
 172                 }
 173                 return 1;
 174             }
 175         }
 176     }
 177     return 0;
 178 }
 179
 180 /**
 181  * VC-1 in-loop deblocking filter
 182  * @param src source block type
 183  * @param step distance between horizontally adjacent elements
 184  * @param stride distance between vertically adjacent elements
 185  * @param len edge length to filter (4 or 8 pixels)
 186  * @param pq block quantizer
 187  * @see 8.6
 188  */
 189 static inline void vc1_loop_filter(uint8_t *src, int step, int stride,
 190                                    int len, int pq)
 191 {
 192     int i;
 193     int filt3;
 194
 195     for (i = 0; i < len; i += 4) {
 196         filt3 = vc1_filter_line(src + 2 * step, stride, pq);
 197         if (filt3) {
 198             vc1_filter_line(src + 0 * step, stride, pq);
 199             vc1_filter_line(src + 1 * step, stride, pq);
 200             vc1_filter_line(src + 3 * step, stride, pq);
 201         }
 202         src += step * 4;
 203     }
 204 }
 205
 206 static void vc1_v_loop_filter4_c(uint8_t *src, int stride, int pq)
 207 {
 208     vc1_loop_filter(src, 1, stride, 4, pq);
 209 }
 210
 211 static void vc1_h_loop_filter4_c(uint8_t *src, int stride, int pq)
 212 {
 213     vc1_loop_filter(src, stride, 1, 4, pq);
 214 }
 215
 216 static void vc1_v_loop_filter8_c(uint8_t *src, int stride, int pq)
 217 {
 218     vc1_loop_filter(src, 1, stride, 8, pq);
 219 }
 220
 221 static void vc1_h_loop_filter8_c(uint8_t *src, int stride, int pq)
 222 {
 223     vc1_loop_filter(src, stride, 1, 8, pq);
 224 }
 225
 226 static void vc1_v_loop_filter16_c(uint8_t *src, int stride, int pq)
 227 {
 228     vc1_loop_filter(src, 1, stride, 16, pq);
 229 }
 230
 231 static void vc1_h_loop_filter16_c(uint8_t *src, int stride, int pq)
 232 {
 233     vc1_loop_filter(src, stride, 1, 16, pq);
 234 }
 235
 236 /* Do inverse transform on 8x8 block */
 237 static void vc1_inv_trans_8x8_dc_c(uint8_t *dest, int linesize, int16_t *block)
 238 {
 239     int i;
 240     int dc = block[0];
 241
 242     dc = (3 * dc +  1) >> 1;
 243     dc = (3 * dc + 16) >> 5;
 244
 245     for (i = 0; i < 8; i++) {
 246         dest[0] = av_clip_uint8(dest[0] + dc);
 247         dest[1] = av_clip_uint8(dest[1] + dc);
 248         dest[2] = av_clip_uint8(dest[2] + dc);
 249         dest[3] = av_clip_uint8(dest[3] + dc);
 250         dest[4] = av_clip_uint8(dest[4] + dc);
 251         dest[5] = av_clip_uint8(dest[5] + dc);
 252         dest[6] = av_clip_uint8(dest[6] + dc);
 253         dest[7] = av_clip_uint8(dest[7] + dc);
 254         dest += linesize;
 255     }
 256 }
 257
 258 static void vc1_inv_trans_8x8_c(int16_t block[64])
 259 {
 260     int i;
 261     register int t1, t2, t3, t4, t5, t6, t7, t8;
 262     int16_t *src, *dst, temp[64];
 263
 264     src = block;
 265     dst = temp;
 266     for (i = 0; i < 8; i++) {
 267         t1 = 12 * (src[ 0] + src[32]) + 4;
 268         t2 = 12 * (src[ 0] - src[32]) + 4;
 269         t3 = 16 * src[16] +  6 * src[48];
 270         t4 =  6 * src[16] - 16 * src[48];
 271
 272         t5 = t1 + t3;
 273         t6 = t2 + t4;
 274         t7 = t2 - t4;
 275         t8 = t1 - t3;
 276
 277         t1 = 16 * src[ 8] + 15 * src[24] +  9 * src[40] +  4 * src[56];
 278         t2 = 15 * src[ 8] -  4 * src[24] - 16 * src[40] -  9 * src[56];
 279         t3 =  9 * src[ 8] - 16 * src[24] +  4 * src[40] + 15 * src[56];
 280         t4 =  4 * src[ 8] -  9 * src[24] + 15 * src[40] - 16 * src[56];
 281
 282         dst[0] = (t5 + t1) >> 3;
 283         dst[1] = (t6 + t2) >> 3;
 284         dst[2] = (t7 + t3) >> 3;
 285         dst[3] = (t8 + t4) >> 3;
 286         dst[4] = (t8 - t4) >> 3;
 287         dst[5] = (t7 - t3) >> 3;
 288         dst[6] = (t6 - t2) >> 3;
 289         dst[7] = (t5 - t1) >> 3;
 290
 291         src += 1;
 292         dst += 8;
 293     }
 294
 295     src = temp;
 296     dst = block;
 297     for (i = 0; i < 8; i++) {
 298         t1 = 12 * (src[ 0] + src[32]) + 64;
 299         t2 = 12 * (src[ 0] - src[32]) + 64;
 300         t3 = 16 * src[16] +  6 * src[48];
 301         t4 =  6 * src[16] - 16 * src[48];
 302
 303         t5 = t1 + t3;
 304         t6 = t2 + t4;
 305         t7 = t2 - t4;
 306         t8 = t1 - t3;
 307
 308         t1 = 16 * src[ 8] + 15 * src[24] +  9 * src[40] +  4 * src[56];
 309         t2 = 15 * src[ 8] -  4 * src[24] - 16 * src[40] -  9 * src[56];
 310         t3 =  9 * src[ 8] - 16 * src[24] +  4 * src[40] + 15 * src[56];
 311         t4 =  4 * src[ 8] -  9 * src[24] + 15 * src[40] - 16 * src[56];
 312
 313         dst[ 0] = (t5 + t1) >> 7;
 314         dst[ 8] = (t6 + t2) >> 7;
 315         dst[16] = (t7 + t3) >> 7;
 316         dst[24] = (t8 + t4) >> 7;
 317         dst[32] = (t8 - t4 + 1) >> 7;
 318         dst[40] = (t7 - t3 + 1) >> 7;
 319         dst[48] = (t6 - t2 + 1) >> 7;
 320         dst[56] = (t5 - t1 + 1) >> 7;
 321
 322         src++;
 323         dst++;
 324     }
 325 }
 326
 327 /* Do inverse transform on 8x4 part of block */
 328 static void vc1_inv_trans_8x4_dc_c(uint8_t *dest, int linesize, int16_t *block)
 329 {
 330     int i;
 331     int dc = block[0];
 332
 333     dc =  (3 * dc +  1) >> 1;
 334     dc = (17 * dc + 64) >> 7;
 335
 336     for (i = 0; i < 4; i++) {
 337         dest[0] = av_clip_uint8(dest[0] + dc);
 338         dest[1] = av_clip_uint8(dest[1] + dc);
 339         dest[2] = av_clip_uint8(dest[2] + dc);
 340         dest[3] = av_clip_uint8(dest[3] + dc);
 341         dest[4] = av_clip_uint8(dest[4] + dc);
 342         dest[5] = av_clip_uint8(dest[5] + dc);
 343         dest[6] = av_clip_uint8(dest[6] + dc);
 344         dest[7] = av_clip_uint8(dest[7] + dc);
 345         dest += linesize;
 346     }
 347 }
 348
 349 static void vc1_inv_trans_8x4_c(uint8_t *dest, int linesize, int16_t *block)
 350 {
 351     int i;
 352     register int t1, t2, t3, t4, t5, t6, t7, t8;
 353     int16_t *src, *dst;
 354
 355     src = block;
 356     dst = block;
 357
 358     for (i = 0; i < 4; i++) {
 359         t1 = 12 * (src[0] + src[4]) + 4;
 360         t2 = 12 * (src[0] - src[4]) + 4;
 361         t3 = 16 * src[2] +  6 * src[6];
 362         t4 =  6 * src[2] - 16 * src[6];
 363
 364         t5 = t1 + t3;
 365         t6 = t2 + t4;
 366         t7 = t2 - t4;
 367         t8 = t1 - t3;
 368
 369         t1 = 16 * src[1] + 15 * src[3] +  9 * src[5] +  4 * src[7];
 370         t2 = 15 * src[1] -  4 * src[3] - 16 * src[5] -  9 * src[7];
 371         t3 =  9 * src[1] - 16 * src[3] +  4 * src[5] + 15 * src[7];
 372         t4 =  4 * src[1] -  9 * src[3] + 15 * src[5] - 16 * src[7];
 373
 374         dst[0] = (t5 + t1) >> 3;
 375         dst[1] = (t6 + t2) >> 3;
 376         dst[2] = (t7 + t3) >> 3;
 377         dst[3] = (t8 + t4) >> 3;
 378         dst[4] = (t8 - t4) >> 3;
 379         dst[5] = (t7 - t3) >> 3;
 380         dst[6] = (t6 - t2) >> 3;
 381         dst[7] = (t5 - t1) >> 3;
 382
 383         src += 8;
 384         dst += 8;
 385     }
 386
 387     src = block;
 388     for (i = 0; i < 8; i++) {
 389         t1 = 17 * (src[ 0] + src[16]) + 64;
 390         t2 = 17 * (src[ 0] - src[16]) + 64;
 391         t3 = 22 * src[ 8] + 10 * src[24];
 392         t4 = 22 * src[24] - 10 * src[ 8];
 393
 394         dest[0 * linesize] = av_clip_uint8(dest[0 * linesize] + ((t1 + t3) >> 7));
 395         dest[1 * linesize] = av_clip_uint8(dest[1 * linesize] + ((t2 - t4) >> 7));
 396         dest[2 * linesize] = av_clip_uint8(dest[2 * linesize] + ((t2 + t4) >> 7));
 397         dest[3 * linesize] = av_clip_uint8(dest[3 * linesize] + ((t1 - t3) >> 7));
 398
 399         src++;
 400         dest++;
 401     }
 402 }
 403
 404 /* Do inverse transform on 4x8 parts of block */
 405 static void vc1_inv_trans_4x8_dc_c(uint8_t *dest, int linesize, int16_t *block)
 406 {
 407     int i;
 408     int dc = block[0];
 409
 410     dc = (17 * dc +  4) >> 3;
 411     dc = (12 * dc + 64) >> 7;
 412
 413     for (i = 0; i < 8; i++) {
 414         dest[0] = av_clip_uint8(dest[0] + dc);
 415         dest[1] = av_clip_uint8(dest[1] + dc);
 416         dest[2] = av_clip_uint8(dest[2] + dc);
 417         dest[3] = av_clip_uint8(dest[3] + dc);
 418         dest += linesize;
 419     }
 420 }
 421
 422 static void vc1_inv_trans_4x8_c(uint8_t *dest, int linesize, int16_t *block)
 423 {
 424     int i;
 425     register int t1, t2, t3, t4, t5, t6, t7, t8;
 426     int16_t *src, *dst;
 427
 428     src = block;
 429     dst = block;
 430
 431     for (i = 0; i < 8; i++) {
 432         t1 = 17 * (src[0] + src[2]) + 4;
 433         t2 = 17 * (src[0] - src[2]) + 4;
 434         t3 = 22 * src[1] + 10 * src[3];
 435         t4 = 22 * src[3] - 10 * src[1];
 436
 437         dst[0] = (t1 + t3) >> 3;
 438         dst[1] = (t2 - t4) >> 3;
 439         dst[2] = (t2 + t4) >> 3;
 440         dst[3] = (t1 - t3) >> 3;
 441
 442         src += 8;
 443         dst += 8;
 444     }
 445
 446     src = block;
 447     for (i = 0; i < 4; i++) {
 448         t1 = 12 * (src[ 0] + src[32]) + 64;
 449         t2 = 12 * (src[ 0] - src[32]) + 64;
 450         t3 = 16 * src[16] +  6 * src[48];
 451         t4 =  6 * src[16] - 16 * src[48];
 452
 453         t5 = t1 + t3;
 454         t6 = t2 + t4;
 455         t7 = t2 - t4;
 456         t8 = t1 - t3;
 457
 458         t1 = 16 * src[ 8] + 15 * src[24] +  9 * src[40] +  4 * src[56];
 459         t2 = 15 * src[ 8] -  4 * src[24] - 16 * src[40] -  9 * src[56];
 460         t3 =  9 * src[ 8] - 16 * src[24] +  4 * src[40] + 15 * src[56];
 461         t4 =  4 * src[ 8] -  9 * src[24] + 15 * src[40] - 16 * src[56];
 462
 463         dest[0 * linesize] = av_clip_uint8(dest[0 * linesize] + ((t5 + t1)     >> 7));
 464         dest[1 * linesize] = av_clip_uint8(dest[1 * linesize] + ((t6 + t2)     >> 7));
 465         dest[2 * linesize] = av_clip_uint8(dest[2 * linesize] + ((t7 + t3)     >> 7));
 466         dest[3 * linesize] = av_clip_uint8(dest[3 * linesize] + ((t8 + t4)     >> 7));
 467         dest[4 * linesize] = av_clip_uint8(dest[4 * linesize] + ((t8 - t4 + 1) >> 7));
 468         dest[5 * linesize] = av_clip_uint8(dest[5 * linesize] + ((t7 - t3 + 1) >> 7));
 469         dest[6 * linesize] = av_clip_uint8(dest[6 * linesize] + ((t6 - t2 + 1) >> 7));
 470         dest[7 * linesize] = av_clip_uint8(dest[7 * linesize] + ((t5 - t1 + 1) >> 7));
 471
 472         src++;
 473         dest++;
 474     }
 475 }
 476
 477 /* Do inverse transform on 4x4 part of block */
 478 static void vc1_inv_trans_4x4_dc_c(uint8_t *dest, int linesize, int16_t *block)
 479 {
 480     int i;
 481     int dc = block[0];
 482
 483     dc = (17 * dc +  4) >> 3;
 484     dc = (17 * dc + 64) >> 7;
 485
 486     for (i = 0; i < 4; i++) {
 487         dest[0] = av_clip_uint8(dest[0] + dc);
 488         dest[1] = av_clip_uint8(dest[1] + dc);
 489         dest[2] = av_clip_uint8(dest[2] + dc);
 490         dest[3] = av_clip_uint8(dest[3] + dc);
 491         dest += linesize;
 492     }
 493 }
 494
 495 static void vc1_inv_trans_4x4_c(uint8_t *dest, int linesize, int16_t *block)
 496 {
 497     int i;
 498     register int t1, t2, t3, t4;
 499     int16_t *src, *dst;
 500
 501     src = block;
 502     dst = block;
 503     for (i = 0; i < 4; i++) {
 504         t1 = 17 * (src[0] + src[2]) + 4;
 505         t2 = 17 * (src[0] - src[2]) + 4;
 506         t3 = 22 * src[1] + 10 * src[3];
 507         t4 = 22 * src[3] - 10 * src[1];
 508
 509         dst[0] = (t1 + t3) >> 3;
 510         dst[1] = (t2 - t4) >> 3;
 511         dst[2] = (t2 + t4) >> 3;
 512         dst[3] = (t1 - t3) >> 3;
 513
 514         src += 8;
 515         dst += 8;
 516     }
 517
 518     src = block;
 519     for (i = 0; i < 4; i++) {
 520         t1 = 17 * (src[0] + src[16]) + 64;
 521         t2 = 17 * (src[0] - src[16]) + 64;
 522         t3 = 22 * src[8] + 10 * src[24];
 523         t4 = 22 * src[24] - 10 * src[8];
 524
 525         dest[0 * linesize] = av_clip_uint8(dest[0 * linesize] + ((t1 + t3) >> 7));
 526         dest[1 * linesize] = av_clip_uint8(dest[1 * linesize] + ((t2 - t4) >> 7));
 527         dest[2 * linesize] = av_clip_uint8(dest[2 * linesize] + ((t2 + t4) >> 7));
 528         dest[3 * linesize] = av_clip_uint8(dest[3 * linesize] + ((t1 - t3) >> 7));
 529
 530         src++;
 531         dest++;
 532     }
 533 }
 534
 535 /* motion compensation functions */
 536
 537 /* Filter in case of 2 filters */
 538 #define VC1_MSPEL_FILTER_16B(DIR, TYPE)                                       \
 539 static av_always_inline int vc1_mspel_ ## DIR ## _filter_16bits(const TYPE *src, \
 540                                                                 int stride,   \
 541                                                                 int mode)     \
 542 {                                                                             \
 543     switch(mode) {                                                            \
 544     case 0: /* no shift - should not occur */                                 \
 545         return 0;                                                             \
 546     case 1: /* 1/4 shift */                                                   \
 547         return -4 * src[-stride] + 53 * src[0] +                              \
 548                18 * src[stride]  -  3 * src[stride * 2];                      \
 549     case 2: /* 1/2 shift */                                                   \
 550         return -1 * src[-stride] +  9 * src[0] +                              \
 551                 9 * src[stride]  -  1 * src[stride * 2];                      \
 552     case 3: /* 3/4 shift */                                                   \
 553         return -3 * src[-stride] + 18 * src[0] +                              \
 554                53 * src[stride]  -  4 * src[stride * 2];                      \
 555     }                                                                         \
 556     return 0; /* should not occur */                                          \
 557 }
 558
 559 VC1_MSPEL_FILTER_16B(ver, uint8_t)
 560 VC1_MSPEL_FILTER_16B(hor, int16_t)
 561
 562 /* Filter used to interpolate fractional pel values */
 563 static av_always_inline int vc1_mspel_filter(const uint8_t *src, int stride,
 564                                              int mode, int r)
 565 {
 566     switch (mode) {
 567     case 0: // no shift
 568         return src[0];
 569     case 1: // 1/4 shift
 570         return (-4 * src[-stride] + 53 * src[0] +
 571                 18 * src[stride]  -  3 * src[stride * 2] + 32 - r) >> 6;
 572     case 2: // 1/2 shift
 573         return (-1 * src[-stride] +  9 * src[0] +
 574                  9 * src[stride]  -  1 * src[stride * 2] + 8 - r) >> 4;
 575     case 3: // 3/4 shift
 576         return (-3 * src[-stride] + 18 * src[0] +
 577                 53 * src[stride]  -  4 * src[stride * 2] + 32 - r) >> 6;
 578     }
 579     return 0; // should not occur
 580 }
 581
 582 /* Function used to do motion compensation with bicubic interpolation */
 583 #define VC1_MSPEL_MC(OP, OPNAME)                                              \
 584 static av_always_inline void OPNAME ## vc1_mspel_mc(uint8_t *dst,             \
 585                                                     const uint8_t *src,       \
 586                                                     int stride,               \
 587                                                     int hmode,                \
 588                                                     int vmode,                \
 589                                                     int rnd)                  \
 590 {                                                                             \
 591     int i, j;                                                                 \
 592                                                                               \
 593     if (vmode) { /* Horizontal filter to apply */                             \
 594         int r;                                                                \
 595                                                                               \
 596         if (hmode) { /* Vertical filter to apply, output to tmp */            \
 597             static const int shift_value[] = { 0, 5, 1, 5 };                  \
 598             int shift = (shift_value[hmode] + shift_value[vmode]) >> 1;       \
 599             int16_t tmp[11 * 8], *tptr = tmp;                                 \
 600                                                                               \
 601             r = (1 << (shift - 1)) + rnd - 1;                                 \
 602                                                                               \
 603             src -= 1;                                                         \
 604             for (j = 0; j < 8; j++) {                                         \
 605                 for (i = 0; i < 11; i++)                                      \
 606                     tptr[i] = (vc1_mspel_ver_filter_16bits(src + i, stride, vmode) + r) >> shift; \
 607                 src  += stride;                                               \
 608                 tptr += 11;                                                   \
 609             }                                                                 \
 610                                                                               \
 611             r    = 64 - rnd;                                                  \
 612             tptr = tmp + 1;                                                   \
 613             for (j = 0; j < 8; j++) {                                         \
 614                 for (i = 0; i < 8; i++)                                       \
 615                     OP(dst[i], (vc1_mspel_hor_filter_16bits(tptr + i, 1, hmode) + r) >> 7); \
 616                 dst  += stride;                                               \
 617                 tptr += 11;                                                   \
 618             }                                                                 \
 619                                                                               \
 620             return;                                                           \
 621         } else { /* No horizontal filter, output 8 lines to dst */            \
 622             r = 1 - rnd;                                                      \
 623                                                                               \
 624             for (j = 0; j < 8; j++) {                                         \
 625                 for (i = 0; i < 8; i++)                                       \
 626                     OP(dst[i], vc1_mspel_filter(src + i, stride, vmode, r));  \
 627                 src += stride;                                                \
 628                 dst += stride;                                                \
 629             }                                                                 \
 630             return;                                                           \
 631         }                                                                     \
 632     }                                                                         \
 633                                                                               \
 634     /* Horizontal mode with no vertical mode */                               \
 635     for (j = 0; j < 8; j++) {                                                 \
 636         for (i = 0; i < 8; i++)                                               \
 637             OP(dst[i], vc1_mspel_filter(src + i, 1, hmode, rnd));             \
 638         dst += stride;                                                        \
 639         src += stride;                                                        \
 640     }                                                                         \
 641 }
 642
 643 #define op_put(a, b) a = av_clip_uint8(b)
 644 #define op_avg(a, b) a = (a + av_clip_uint8(b) + 1) >> 1
 645
 646 VC1_MSPEL_MC(op_put, put_)
 647 VC1_MSPEL_MC(op_avg, avg_)
 648
 649 /* pixel functions - really are entry points to vc1_mspel_mc */
 650
 651 #define PUT_VC1_MSPEL(a, b)                                                   \
 652 static void put_vc1_mspel_mc ## a ## b ## _c(uint8_t *dst,                    \
 653                                              const uint8_t *src,              \
 654                                              ptrdiff_t stride, int rnd)       \
 655 {                                                                             \
 656     put_vc1_mspel_mc(dst, src, stride, a, b, rnd);                            \
 657 }                                                                             \
 658 static void avg_vc1_mspel_mc ## a ## b ## _c(uint8_t *dst,                    \
 659                                              const uint8_t *src,              \
 660                                              ptrdiff_t stride, int rnd)       \
 661 {                                                                             \
 662     avg_vc1_mspel_mc(dst, src, stride, a, b, rnd);                            \
 663 }
 664
 665 PUT_VC1_MSPEL(1, 0)
 666 PUT_VC1_MSPEL(2, 0)
 667 PUT_VC1_MSPEL(3, 0)
 668
 669 PUT_VC1_MSPEL(0, 1)
 670 PUT_VC1_MSPEL(1, 1)
 671 PUT_VC1_MSPEL(2, 1)
 672 PUT_VC1_MSPEL(3, 1)
 673
 674 PUT_VC1_MSPEL(0, 2)
 675 PUT_VC1_MSPEL(1, 2)
 676 PUT_VC1_MSPEL(2, 2)
 677 PUT_VC1_MSPEL(3, 2)
 678
 679 PUT_VC1_MSPEL(0, 3)
 680 PUT_VC1_MSPEL(1, 3)
 681 PUT_VC1_MSPEL(2, 3)
 682 PUT_VC1_MSPEL(3, 3)
 683
 684 #define chroma_mc(a) \
 685     ((A * src[a] + B * src[a + 1] + \
 686       C * src[stride + a] + D * src[stride + a + 1] + 32 - 4) >> 6)
 687 static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst /* align 8 */,
 688                                         uint8_t *src /* align 1 */,
 689                                         int stride, int h, int x, int y)
 690 {
 691     const int A = (8 - x) * (8 - y);
 692     const int B =     (x) * (8 - y);
 693     const int C = (8 - x) *     (y);
 694     const int D =     (x) *     (y);
 695     int i;
 696
 697     assert(x < 8 && y < 8 && x >= 0 && y >= 0);
 698
 699     for (i = 0; i < h; i++) {
 700         dst[0] = chroma_mc(0);
 701         dst[1] = chroma_mc(1);
 702         dst[2] = chroma_mc(2);
 703         dst[3] = chroma_mc(3);
 704         dst[4] = chroma_mc(4);
 705         dst[5] = chroma_mc(5);
 706         dst[6] = chroma_mc(6);
 707         dst[7] = chroma_mc(7);
 708         dst += stride;
 709         src += stride;
 710     }
 711 }
 712
 713 static void put_no_rnd_vc1_chroma_mc4_c(uint8_t *dst, uint8_t *src,
 714                                         int stride, int h, int x, int y)
 715 {
 716     const int A = (8 - x) * (8 - y);
 717     const int B =     (x) * (8 - y);
 718     const int C = (8 - x) *     (y);
 719     const int D =     (x) *     (y);
 720     int i;
 721
 722     assert(x < 8 && y < 8 && x >= 0 && y >= 0);
 723
 724     for (i = 0; i < h; i++) {
 725         dst[0] = chroma_mc(0);
 726         dst[1] = chroma_mc(1);
 727         dst[2] = chroma_mc(2);
 728         dst[3] = chroma_mc(3);
 729         dst += stride;
 730         src += stride;
 731     }
 732 }
 733
 734 #define avg2(a, b) (((a) + (b) + 1) >> 1)
 735 static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst /* align 8 */,
 736                                         uint8_t *src /* align 1 */,
 737                                         int stride, int h, int x, int y)
 738 {
 739     const int A = (8 - x) * (8 - y);
 740     const int B =     (x) * (8 - y);
 741     const int C = (8 - x) *     (y);
 742     const int D =     (x) *     (y);
 743     int i;
 744
 745     assert(x < 8 && y < 8 && x >= 0 && y >= 0);
 746
 747     for (i = 0; i < h; i++) {
 748         dst[0] = avg2(dst[0], chroma_mc(0));
 749         dst[1] = avg2(dst[1], chroma_mc(1));
 750         dst[2] = avg2(dst[2], chroma_mc(2));
 751         dst[3] = avg2(dst[3], chroma_mc(3));
 752         dst[4] = avg2(dst[4], chroma_mc(4));
 753         dst[5] = avg2(dst[5], chroma_mc(5));
 754         dst[6] = avg2(dst[6], chroma_mc(6));
 755         dst[7] = avg2(dst[7], chroma_mc(7));
 756         dst += stride;
 757         src += stride;
 758     }
 759 }
 760
 761 static void avg_no_rnd_vc1_chroma_mc4_c(uint8_t *dst /* align 8 */,
 762                                         uint8_t *src /* align 1 */,
 763                                         int stride, int h, int x, int y)
 764 {
 765     const int A = (8 - x) * (8 - y);
 766     const int B = (    x) * (8 - y);
 767     const int C = (8 - x) * (    y);
 768     const int D = (    x) * (    y);
 769     int i;
 770
 771     assert(x < 8 && y < 8 && x >= 0 && y >= 0);
 772
 773     for (i = 0; i < h; i++) {
 774         dst[0] = avg2(dst[0], chroma_mc(0));
 775         dst[1] = avg2(dst[1], chroma_mc(1));
 776         dst[2] = avg2(dst[2], chroma_mc(2));
 777         dst[3] = avg2(dst[3], chroma_mc(3));
 778         dst += stride;
 779         src += stride;
 780     }
 781 }
 782
 783 #if CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER
 784
 785 static void sprite_h_c(uint8_t *dst, const uint8_t *src, int offset,
 786                        int advance, int count)
 787 {
 788     while (count--) {
 789         int a = src[(offset >> 16)];
 790         int b = src[(offset >> 16) + 1];
 791         *dst++  = a + ((b - a) * (offset & 0xFFFF) >> 16);
 792         offset += advance;
 793     }
 794 }
 795
 796 static av_always_inline void sprite_v_template(uint8_t *dst,
 797                                                const uint8_t *src1a,
 798                                                const uint8_t *src1b,
 799                                                int offset1,
 800                                                int two_sprites,
 801                                                const uint8_t *src2a,
 802                                                const uint8_t *src2b,
 803                                                int offset2,
 804                                                int alpha, int scaled,
 805                                                int width)
 806 {
 807     int a1, b1, a2, b2;
 808     while (width--) {
 809         a1 = *src1a++;
 810         if (scaled) {
 811             b1 = *src1b++;
 812             a1 = a1 + ((b1 - a1) * offset1 >> 16);
 813         }
 814         if (two_sprites) {
 815             a2 = *src2a++;
 816             if (scaled > 1) {
 817                 b2 = *src2b++;
 818                 a2 = a2 + ((b2 - a2) * offset2 >> 16);
 819             }
 820             a1 = a1 + ((a2 - a1) * alpha >> 16);
 821         }
 822         *dst++ = a1;
 823     }
 824 }
 825
 826 static void sprite_v_single_c(uint8_t *dst, const uint8_t *src1a,
 827                               const uint8_t *src1b,
 828                               int offset, int width)
 829 {
 830     sprite_v_template(dst, src1a, src1b, offset, 0, NULL, NULL, 0, 0, 1, width);
 831 }
 832
 833 static void sprite_v_double_noscale_c(uint8_t *dst, const uint8_t *src1a,
 834                                       const uint8_t *src2a,
 835                                       int alpha, int width)
 836 {
 837     sprite_v_template(dst, src1a, NULL, 0, 1, src2a, NULL, 0, alpha, 0, width);
 838 }
 839
 840 static void sprite_v_double_onescale_c(uint8_t *dst,
 841                                        const uint8_t *src1a,
 842                                        const uint8_t *src1b,
 843                                        int offset1,
 844                                        const uint8_t *src2a,
 845                                        int alpha, int width)
 846 {
 847     sprite_v_template(dst, src1a, src1b, offset1, 1, src2a, NULL, 0, alpha, 1,
 848                       width);
 849 }
 850
 851 static void sprite_v_double_twoscale_c(uint8_t *dst,
 852                                        const uint8_t *src1a,
 853                                        const uint8_t *src1b,
 854                                        int offset1,
 855                                        const uint8_t *src2a,
 856                                        const uint8_t *src2b,
 857                                        int offset2,
 858                                        int alpha,
 859                                        int width)
 860 {
 861     sprite_v_template(dst, src1a, src1b, offset1, 1, src2a, src2b, offset2,
 862                       alpha, 2, width);
 863 }
 864
 865 #endif /* CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER */
 866
 867 av_cold void ff_vc1dsp_init(VC1DSPContext *dsp)
 868 {
 869     dsp->vc1_inv_trans_8x8    = vc1_inv_trans_8x8_c;
 870     dsp->vc1_inv_trans_4x8    = vc1_inv_trans_4x8_c;
 871     dsp->vc1_inv_trans_8x4    = vc1_inv_trans_8x4_c;
 872     dsp->vc1_inv_trans_4x4    = vc1_inv_trans_4x4_c;
 873     dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_c;
 874     dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_c;
 875     dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_c;
 876     dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_c;
 877
 878     dsp->vc1_h_overlap        = vc1_h_overlap_c;
 879     dsp->vc1_v_overlap        = vc1_v_overlap_c;
 880     dsp->vc1_h_s_overlap      = vc1_h_s_overlap_c;
 881     dsp->vc1_v_s_overlap      = vc1_v_s_overlap_c;
 882
 883     dsp->vc1_v_loop_filter4   = vc1_v_loop_filter4_c;
 884     dsp->vc1_h_loop_filter4   = vc1_h_loop_filter4_c;
 885     dsp->vc1_v_loop_filter8   = vc1_v_loop_filter8_c;
 886     dsp->vc1_h_loop_filter8   = vc1_h_loop_filter8_c;
 887     dsp->vc1_v_loop_filter16  = vc1_v_loop_filter16_c;
 888     dsp->vc1_h_loop_filter16  = vc1_h_loop_filter16_c;
 889
 890     dsp->put_vc1_mspel_pixels_tab[0]  = ff_put_pixels8x8_c;
 891     dsp->put_vc1_mspel_pixels_tab[1]  = put_vc1_mspel_mc10_c;
 892     dsp->put_vc1_mspel_pixels_tab[2]  = put_vc1_mspel_mc20_c;
 893     dsp->put_vc1_mspel_pixels_tab[3]  = put_vc1_mspel_mc30_c;
 894     dsp->put_vc1_mspel_pixels_tab[4]  = put_vc1_mspel_mc01_c;
 895     dsp->put_vc1_mspel_pixels_tab[5]  = put_vc1_mspel_mc11_c;
 896     dsp->put_vc1_mspel_pixels_tab[6]  = put_vc1_mspel_mc21_c;
 897     dsp->put_vc1_mspel_pixels_tab[7]  = put_vc1_mspel_mc31_c;
 898     dsp->put_vc1_mspel_pixels_tab[8]  = put_vc1_mspel_mc02_c;
 899     dsp->put_vc1_mspel_pixels_tab[9]  = put_vc1_mspel_mc12_c;
 900     dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_c;
 901     dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_c;
 902     dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_c;
 903     dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_c;
 904     dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_c;
 905     dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_c;
 906
 907     dsp->avg_vc1_mspel_pixels_tab[0]  = ff_avg_pixels8x8_c;
 908     dsp->avg_vc1_mspel_pixels_tab[1]  = avg_vc1_mspel_mc10_c;
 909     dsp->avg_vc1_mspel_pixels_tab[2]  = avg_vc1_mspel_mc20_c;
 910     dsp->avg_vc1_mspel_pixels_tab[3]  = avg_vc1_mspel_mc30_c;
 911     dsp->avg_vc1_mspel_pixels_tab[4]  = avg_vc1_mspel_mc01_c;
 912     dsp->avg_vc1_mspel_pixels_tab[5]  = avg_vc1_mspel_mc11_c;
 913     dsp->avg_vc1_mspel_pixels_tab[6]  = avg_vc1_mspel_mc21_c;
 914     dsp->avg_vc1_mspel_pixels_tab[7]  = avg_vc1_mspel_mc31_c;
 915     dsp->avg_vc1_mspel_pixels_tab[8]  = avg_vc1_mspel_mc02_c;
 916     dsp->avg_vc1_mspel_pixels_tab[9]  = avg_vc1_mspel_mc12_c;
 917     dsp->avg_vc1_mspel_pixels_tab[10] = avg_vc1_mspel_mc22_c;
 918     dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_c;
 919     dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_c;
 920     dsp->avg_vc1_mspel_pixels_tab[13] = avg_vc1_mspel_mc13_c;
 921     dsp->avg_vc1_mspel_pixels_tab[14] = avg_vc1_mspel_mc23_c;
 922     dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_c;
 923
 924     dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = put_no_rnd_vc1_chroma_mc8_c;
 925     dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = avg_no_rnd_vc1_chroma_mc8_c;
 926     dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = put_no_rnd_vc1_chroma_mc4_c;
 927     dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = avg_no_rnd_vc1_chroma_mc4_c;
 928
 929 #if CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER
 930     dsp->sprite_h                 = sprite_h_c;
 931     dsp->sprite_v_single          = sprite_v_single_c;
 932     dsp->sprite_v_double_noscale  = sprite_v_double_noscale_c;
 933     dsp->sprite_v_double_onescale = sprite_v_double_onescale_c;
 934     dsp->sprite_v_double_twoscale = sprite_v_double_twoscale_c;
 935 #endif /* CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER */
 936
 937     if (ARCH_AARCH64)
 938         ff_vc1dsp_init_aarch64(dsp);
 939     if (ARCH_ARM)
 940         ff_vc1dsp_init_arm(dsp);
 941     if (ARCH_PPC)
 942         ff_vc1dsp_init_ppc(dsp);
 943     if (ARCH_X86)
 944         ff_vc1dsp_init_x86(dsp);
 945 }