git.sesse.net Git - ffmpeg/blob - libavcodec/x86/rv40dsp.asm

   1 ;******************************************************************************
   2 ;* MMX/SSE2-optimized functions for the RV40 decoder
   3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
   4 ;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
   5 ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
   6 ;*
   7 ;* This file is part of FFmpeg.
   8 ;*
   9 ;* FFmpeg is free software; you can redistribute it and/or
  10 ;* modify it under the terms of the GNU Lesser General Public
  11 ;* License as published by the Free Software Foundation; either
  12 ;* version 2.1 of the License, or (at your option) any later version.
  13 ;*
  14 ;* FFmpeg is distributed in the hope that it will be useful,
  15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17 ;* Lesser General Public License for more details.
  18 ;*
  19 ;* You should have received a copy of the GNU Lesser General Public
  20 ;* License along with FFmpeg; if not, write to the Free Software
  21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22 ;******************************************************************************
  23
  24 %include "libavutil/x86/x86util.asm"
  25
  26 SECTION_RODATA
  27
  28 pw_1024:   times 8 dw 1 << (16 - 6) ; 1024 per word; pmulhrsw by this value yields (x + 16) >> 5 (SSSE3 weight path)
  29
     ; Byte-pair filter taps for the SSSE3 (pmaddubsw) qpel functions.
     ; Each filter occupies 32 bytes: a 16-byte row holding the outer tap
     ; pair and a 16-byte row holding the two centre taps. The code reads
     ; them as [ptr] and [ptr+16] after LOAD has added the table base to the
     ; mx/my argument (so that argument is presumably a byte offset selecting
     ; one 32-byte filter - confirm against the C callers).
     ; The six taps of every filter sum to 64, matching the >> 6 rounding.
  30 sixtap_filter_hb_m:  times 8 db   1, -5
  31                      times 8 db  52, 20
  32                      ; multiplied by 2 to have the same shift
  33                      times 8 db   2, -10
  34                      times 8 db  40,  40
  35                      ; back to normal
  36                      times 8 db   1, -5
  37                      times 8 db  20, 52
  38
     ; Word filter taps for the pmullw-based FILTER_V / FILTER_H macros.
     ; Each filter occupies 64 bytes, read as four 16-byte rows:
     ;   +0  tap shared by samples 0 and 5 (COEFF05)
     ;   +16 tap shared by samples 1 and 4 (COEFF14)
     ;   +32 tap for sample 2              (COEFF2)
     ;   +48 tap for sample 3              (COEFF3)
     ; The six taps of every filter sum to 64, matching the >> 6 rounding.
  39 sixtap_filter_v_m:   times 8 dw   1
  40                      times 8 dw  -5
  41                      times 8 dw  52
  42                      times 8 dw  20
  43                      ; multiplied by 2 to have the same shift
  44                      times 8 dw   2
  45                      times 8 dw -10
  46                      times 8 dw  40
  47                      times 8 dw  40
  48                      ; back to normal
  49                      times 8 dw   1
  50                      times 8 dw  -5
  51                      times 8 dw  20
  52                      times 8 dw  52
  53
     ; Table base used by the LOAD macro. With PIC the base is held in an
     ; extra register (picregq, set up with lea by each function) and the
     ; functions request one more GPR (npicregs = 1); otherwise the table
     ; symbol itself is used and no extra register is needed.
     ; NOTE(review): sixtap_filter_hw is not referenced and sixtap_filter_hw_m
     ; is not defined in the visible part of this file - possibly a leftover.
  54 %ifdef PIC
  55 %define sixtap_filter_hw   picregq
  56 %define sixtap_filter_hb   picregq
  57 %define sixtap_filter_v    picregq
  58 %define npicregs 1
  59 %else
  60 %define sixtap_filter_hw   sixtap_filter_hw_m
  61 %define sixtap_filter_hb   sixtap_filter_hb_m
  62 %define sixtap_filter_v    sixtap_filter_v_m
  63 %define npicregs 0
  64 %endif
  65
     ; pshufb masks for the SSSE3 horizontal filter. They are applied to 16
     ; bytes loaded from src-2 and build, for 8 output pixels, the byte pairs
     ; that pmaddubsw multiplies by a tap pair:
     ;   shuf1: (i,   i+1) -> samples x-2, x-1
     ;   shuf2: (i+2, i+3) -> samples x,   x+1
     ;   shuf3: (i+5, i+4) -> samples x+3, x+2 (reversed so that the same
     ;                        outer tap pair as for shuf1 can be reused)
  66 filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5,  5, 6,  6,  7,  7,  8
  67 filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7,  7, 8,  8,  9,  9, 10
  68 filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11
  69
  70 cextern  pw_32
  71 cextern  pw_16
  72 cextern  pw_512
  73
  74 SECTION .text
  75
  76 ;-----------------------------------------------------------------------------
  77 ; subpel MC functions:
  78 ;
  79 ; void ff_[put|avg]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int deststride,
  80 ;                                          uint8_t *src, int srcstride,
  81 ;                                          int len, int m);
  82 ;----------------------------------------------------------------------
  83 %macro LOAD  2
     ; Turn the filter-selector argument into a pointer into a tap table.
     ;   %1 = argument name without size suffix (used as %1q / %1d)
     ;   %2 = table base to add when not building PIC
     ; With PIC, %2 is ignored and picregq is added instead; the caller must
     ; have loaded picregq with the address of the wanted table beforehand.
     ; On WIN64 the 32-bit argument is sign-extended to 64 bits first.
     ; NOTE(review): no extension is done for other 64-bit ABIs - confirm the
     ; callers guarantee clean upper bits there.
  84 %if WIN64
  85    movsxd   %1q, %1d
  86 %endif
  87 %ifdef PIC
  88    add      %1q, picregq
  89 %else
  90    add      %1q, %2
  91 %endif
  92 %endmacro
  93
  94 %macro STORE 3
     ; Pack one row of filtered words to bytes and write it to [dstq].
     ;   %1 = register holding the word results (overwritten by the pack)
     ;   %2 = scratch register, only touched in avg mode
     ;   %3 = put | avg
     ; In avg mode the current destination pixels are read first and averaged
     ; with the new bytes through PAVGB (presumably a helper macro from the
     ; included x86util.asm - it is not defined in this file).
  95 %ifidn %3, avg
  96     movh      %2, [dstq]
  97 %endif
     ; words -> bytes with unsigned saturation; this is the clip to 0..255
  98     packuswb  %1, %1
  99 %ifidn %3, avg
 100     PAVGB     %1, %2
 101 %endif
 102     movh  [dstq], %1
 103 %endmacro
 104
 105 %macro FILTER_V 1
     ; Vertical 6-tap interpolation built on word multiplies (pmullw).
     ;   %1 = put | avg: part of the function name and forwarded to STORE.
     ; Named arguments (from the cglobal line): dst, dststride, src,
     ; srcstride, height, my; picreg is an extra register requested only
     ; when PIC is defined (npicregs).
     ; my is converted by LOAD into a pointer to four 16-byte tap rows inside
     ; sixtap_filter_v_m (see the COEFF* defines below).
     ; Register use: m7 = 0 (for byte -> word unpacking), m0..m4 = the five
     ; most recent source rows as words, m5 = newly read row, m6 = accumulator.
     ; NOTE(review): the prototype comment declares the strides as int, but
     ; they are used as full-width registers without a visible sign extension
     ; - confirm against the callers.
 106 cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg
 107 %ifdef PIC
 108     lea  picregq, [sixtap_filter_v_m]
 109 %endif
 110     pxor      m7, m7
 111     LOAD      my, sixtap_filter_v
 112
 113     ; read 5 lines
         ; rows src-2*stride .. src+2*stride go to m0..m4; srcq is left
         ; pointing at row src+1*stride
 114     sub     srcq, srcstrideq
 115     sub     srcq, srcstrideq
 116     movh      m0, [srcq]
 117     movh      m1, [srcq+srcstrideq]
 118     movh      m2, [srcq+srcstrideq*2]
 119     lea     srcq, [srcq+srcstrideq*2]
 120     add     srcq, srcstrideq
 121     movh      m3, [srcq]
 122     movh      m4, [srcq+srcstrideq]
 123     punpcklbw m0, m7
 124     punpcklbw m1, m7
 125     punpcklbw m2, m7
 126     punpcklbw m3, m7
 127     punpcklbw m4, m7
 128
     ; If registers m8 and up exist, cache the four tap rows in registers;
     ; otherwise the taps are used directly as memory operands.
 129 %ifdef m8
 130     mova      m8, [myq+ 0]
 131     mova      m9, [myq+16]
 132     mova     m10, [myq+32]
 133     mova     m11, [myq+48]
 134 %define COEFF05  m8
 135 %define COEFF14  m9
 136 %define COEFF2   m10
 137 %define COEFF3   m11
 138 %else
 139 %define COEFF05  [myq+ 0]
 140 %define COEFF14  [myq+16]
 141 %define COEFF2   [myq+32]
 142 %define COEFF3   [myq+48]
 143 %endif
     ; Per output row:
     ;   m6 = (COEFF05*(m0+m5) + COEFF14*(m1+m4) + COEFF2*m2 + COEFF3*m3 + 32) >> 6
     ; The mova instructions interleaved with the arithmetic slide the row
     ; window down by one (m0<-m1, m1<-m2, m2<-m3, m3<-m4, m4<-m5), so their
     ; position relative to the multiplies matters.
 144 .nextrow:
 145     mova      m6, m1
 146     movh      m5, [srcq+2*srcstrideq]      ; read new row
 147     paddw     m6, m4
 148     punpcklbw m5, m7
 149     pmullw    m6, COEFF14
 150     paddw     m0, m5
 151     pmullw    m0, COEFF05
 152     paddw     m6, m0
 153     mova      m0, m1
 154     paddw     m6, [pw_32]
 155     mova      m1, m2
 156     pmullw    m2, COEFF2
 157     paddw     m6, m2
 158     mova      m2, m3
 159     pmullw    m3, COEFF3
 160     paddw     m6, m3
 161
 162     ; round/clip/store
 163     mova      m3, m4
 164     psraw     m6, 6
 165     mova      m4, m5
         ; m5 has just been copied to m4, so STORE may use it as scratch
 166     STORE     m6, m5, %1
 167
 168     ; go to next line
 169     add     dstq, dststrideq
 170     add     srcq, srcstrideq
 171     dec  heightd                           ; next row
 172     jg .nextrow
 173     REP_RET
 174 %endmacro
 175
 176 %macro FILTER_H  1
     ; Horizontal 6-tap interpolation built on word multiplies (pmullw).
     ;   %1 = put | avg: part of the function name and forwarded to STORE.
     ; Named arguments (from the cglobal line): dst, dststride, src,
     ; srcstride, height, mx; picreg is an extra register requested only
     ; when PIC is defined (npicregs).
     ; Although this is the horizontal filter it uses the word tap table
     ; sixtap_filter_v_m, the same one as FILTER_V.
     ; Register use: m7 = 0 (for unpacking), m6 = rounding constant pw_32,
     ; m0..m5 = the six source windows of the current row as words.
 177 cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg
 178 %ifdef PIC
 179     lea  picregq, [sixtap_filter_v_m]
 180 %endif
 181     pxor      m7, m7
 182     LOAD      mx, sixtap_filter_v
 183     mova      m6, [pw_32]
     ; If registers m8 and up exist, cache the four tap rows in registers;
     ; otherwise the taps are used directly as memory operands.
 184 %ifdef m8
 185     mova      m8, [mxq+ 0]
 186     mova      m9, [mxq+16]
 187     mova     m10, [mxq+32]
 188     mova     m11, [mxq+48]
 189 %define COEFF05  m8
 190 %define COEFF14  m9
 191 %define COEFF2   m10
 192 %define COEFF3   m11
 193 %else
 194 %define COEFF05  [mxq+ 0]
 195 %define COEFF14  [mxq+16]
 196 %define COEFF2   [mxq+32]
 197 %define COEFF3   [mxq+48]
 198 %endif
     ; Per output row, with sN = bytes loaded from src+N unpacked to words:
     ;   m0 = (COEFF05*(s-2 + s+3) + COEFF14*(s-1 + s+2)
     ;         + COEFF2*s0 + COEFF3*s+1 + 32) >> 6
 199 .nextrow:
 200     movq      m0, [srcq-2]
 201     movq      m5, [srcq+3]
 202     movq      m1, [srcq-1]
 203     movq      m4, [srcq+2]
 204     punpcklbw m0, m7
 205     punpcklbw m5, m7
 206     punpcklbw m1, m7
 207     punpcklbw m4, m7
 208     movq      m2, [srcq-0]
 209     movq      m3, [srcq+1]
 210     paddw     m0, m5
 211     paddw     m1, m4
 212     punpcklbw m2, m7
 213     punpcklbw m3, m7
 214     pmullw    m0, COEFF05
 215     pmullw    m1, COEFF14
 216     pmullw    m2, COEFF2
 217     pmullw    m3, COEFF3
 218     paddw     m0, m6
 219     paddw     m1, m2
 220     paddw     m0, m3
 221     paddw     m0, m1
 222     psraw     m0, 6
         ; m1 has already been added into m0, so STORE may use it as scratch
 223     STORE     m0, m1, %1
 224
 225     ; go to next line
 226     add     dstq, dststrideq
 227     add     srcq, srcstrideq
 228     dec  heightd            ; next row
 229     jg .nextrow
 230     REP_RET
 231 %endmacro
 232
 233 %if ARCH_X86_32
     ; MMX-register versions are only assembled for 32-bit x86 builds.
     ; put is generated once for plain mmx; avg is generated for mmxext and
     ; for 3dnow (presumably because the PAVGB used by STORE maps to a
     ; different instruction on each - confirm in x86util.asm).
 234 INIT_MMX  mmx
 235 FILTER_V  put
 236 FILTER_H  put
 237
 238 INIT_MMX  mmxext
 239 FILTER_V  avg
 240 FILTER_H  avg
 241
 242 INIT_MMX  3dnow
 243 FILTER_V  avg
 244 FILTER_H  avg
 245 %endif
 246
     ; XMM-register versions of all four put/avg x h/v functions
 247 INIT_XMM  sse2
 248 FILTER_H  put
 249 FILTER_H  avg
 250 FILTER_V  put
 251 FILTER_V  avg
 252
 253 %macro FILTER_SSSE3 1
     ; Emits both the vertical and the horizontal 6-tap qpel function using
     ; pmaddubsw on byte pairs and the byte tap table sixtap_filter_hb_m.
     ;   %1 = put | avg: part of the function names and forwarded to STORE.
     ; After LOAD, [ptr] holds the outer tap pair (applied to samples 0,1 and
     ; to samples 5,4) and [ptr+16] the centre tap pair (samples 2,3).
     ; Rounding uses pmulhrsw with pw_512, which yields (x + 32) >> 6.
     ;
     ; Vertical version. Register use: m0..m4 = the five most recent source
     ; rows as bytes, m5 = outer tap pair, m6 = accumulator, m7 = new row.
 254 cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg
 255 %ifdef PIC
 256     lea  picregq, [sixtap_filter_hb_m]
 257 %endif
 258
 259     ; read 5 lines
         ; rows src-2*stride .. src+2*stride go to m0..m4; srcq is left
         ; pointing at row src+3*stride, the next row to be read
 260     sub     srcq, srcstrideq
 261     LOAD      my, sixtap_filter_hb
 262     sub     srcq, srcstrideq
 263     movh      m0, [srcq]
 264     movh      m1, [srcq+srcstrideq]
 265     movh      m2, [srcq+srcstrideq*2]
 266     lea     srcq, [srcq+srcstrideq*2]
 267     add     srcq, srcstrideq
 268     mova      m5, [myq]
 269     movh      m3, [srcq]
 270     movh      m4, [srcq+srcstrideq]
 271     lea     srcq, [srcq+2*srcstrideq]
 272
     ; Per output row: interleave rows (0,1) and (2,3), multiply-add them with
     ; the outer and centre tap pairs, slide the row window down by one, then
     ; interleave (new row, previous last row) and apply the outer pair again.
 273 .nextrow:
 274     mova      m6, m2
 275     punpcklbw m0, m1
 276     punpcklbw m6, m3
 277     pmaddubsw m0, m5
 278     pmaddubsw m6, [myq+16]
 279     movh      m7, [srcq]      ; read new row
 280     paddw     m6, m0
 281     mova      m0, m1
 282     mova      m1, m2
 283     mova      m2, m3
 284     mova      m3, m4
 285     mova      m4, m7
         ; m3 now holds the row that was last in the window before the shift
 286     punpcklbw m7, m3
 287     pmaddubsw m7, m5
 288     paddw     m6, m7
 289     pmulhrsw  m6, [pw_512]
 290     STORE     m6, m7, %1
 291
 292     ; go to next line
 293     add     dstq, dststrideq
 294     add     srcq, srcstrideq
 295     dec       heightd                          ; next row
 296     jg       .nextrow
 297     REP_RET
 298
     ; Horizontal version. Register use: m7/m3/m4 = pshufb masks
     ; filter_h6_shuf1/2/3, m5 = outer tap pair, m6 = centre tap pair,
     ; m0..m2 = shuffled copies of the 16 bytes loaded from src-2.
 299 cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg
 300 %ifdef PIC
 301     lea  picregq, [sixtap_filter_hb_m]
 302 %endif
 303     mova      m3, [filter_h6_shuf2]
 304     mova      m4, [filter_h6_shuf3]
 305     LOAD      mx, sixtap_filter_hb
 306     mova      m5, [mxq] ; set up 6tap filter in bytes
 307     mova      m6, [mxq+16]
 308     mova      m7, [filter_h6_shuf1]
 309
 310 .nextrow:
 311     movu      m0, [srcq-2]
 312     mova      m1, m0
 313     mova      m2, m0
 314     pshufb    m0, m7
 315     pshufb    m1, m3
 316     pshufb    m2, m4
 317     pmaddubsw m0, m5
 318     pmaddubsw m1, m6
 319     pmaddubsw m2, m5
 320     paddw     m0, m1
 321     paddw     m0, m2
 322     pmulhrsw  m0, [pw_512]
         ; m1 has already been added into m0, so STORE may use it as scratch
 323     STORE     m0, m1, %1
 324
 325     ; go to next line
 326     add     dstq, dststrideq
 327     add     srcq, srcstrideq
 328     dec  heightd            ; next row
 329     jg .nextrow
 330     REP_RET
 331 %endmacro
 332
 333 INIT_XMM ssse3
     ; each invocation emits both the _v and the _h function for its mode
 334 FILTER_SSSE3  put
 335 FILTER_SSSE3  avg
 336
 337 ; %1=5-bit weights?, %2=dst %3=src1 %4=src2 %5=stride (passed only for 8-wide blocks with XMM registers)
     ; Weights and blends one register's worth of pixels from two sources.
     ; Expects the state set up by RV40_WEIGHT: m0 = 0, m1 = rounding constant
     ; (pw_1024 with SSSE3, pw_16 otherwise), m2/m3 = broadcast weights,
     ; r6 = current byte offset added to all three pointers, r5 = stride.
     ; The %3 pixels are weighted by m3 and the %4 pixels by m2.
     ; Clobbers m4..m7. The 5-argument form leaves r6 advanced by one line.
 338 %macro RV40_WCORE  4-5
 339     movh       m4, [%3 + r6 + 0]
 340     movh       m5, [%4 + r6 + 0]
     ; Second half of the data: the next mmsize/2 bytes of the same line, or,
     ; when a stride was passed, the first half of the next line.
 341 %if %0 == 4
 342 %define OFFSET r6 + mmsize / 2
 343 %else
 344     ; 8x8 block and SSE2, stride was provided
 345 %define OFFSET r6
 346     add        r6, r5
 347 %endif
 348     movh       m6, [%3 + OFFSET]
 349     movh       m7, [%4 + OFFSET]
 350
 351 %if %1 == 0
 352     ; 14-bit weights
 353     punpcklbw  m4, m0
 354     punpcklbw  m5, m0
 355     punpcklbw  m6, m0
 356     punpcklbw  m7, m0
 357
         ; pmulhw keeps the high 16 bits, so each product is
         ; ((pixel << 7) * weight) >> 16
 358     psllw      m4, 7
 359     psllw      m5, 7
 360     psllw      m6, 7
 361     psllw      m7, 7
 362     pmulhw     m4, m3
 363     pmulhw     m5, m2
 364     pmulhw     m6, m3
 365     pmulhw     m7, m2
 366
 367     paddw      m4, m5
 368     paddw      m6, m7
 369 %else
 370     ; 5-bit weights
 371 %if cpuflag(ssse3)
         ; interleave the bytes of both sources and multiply-add them with the
         ; interleaved byte weights that RV40_WEIGHT placed in m3
 372     punpcklbw  m4, m5
 373     punpcklbw  m6, m7
 374
 375     pmaddubsw  m4, m3
 376     pmaddubsw  m6, m3
 377 %else
 378     punpcklbw  m4, m0
 379     punpcklbw  m5, m0
 380     punpcklbw  m6, m0
 381     punpcklbw  m7, m0
 382
 383     pmullw     m4, m3
 384     pmullw     m5, m2
 385     pmullw     m6, m3
 386     pmullw     m7, m2
 387     paddw      m4, m5
 388     paddw      m6, m7
 389 %endif
 390
 391 %endif
 392
 393     ; bias and shift down
         ; with m1 = pw_1024, pmulhrsw computes the same (x + 16) >> 5
 394 %if cpuflag(ssse3)
 395     pmulhrsw   m4, m1
 396     pmulhrsw   m6, m1
 397 %else
 398     paddw      m4, m1
 399     paddw      m6, m1
 400     psrlw      m4, 5
 401     psrlw      m6, 5
 402 %endif
 403
 404     packuswb   m4, m6
 405 %if %0 == 5
 406     ; Only called for 8x8 blocks and SSE2
         ; low 8 bytes go to the previous line, high 8 bytes to the current one;
         ; r6 ends up pointing at the second of the two lines
 407     sub        r6, r5
 408     movh       [%2 + r6], m4
 409     add        r6, r5
 410     movhps     [%2 + r6], m4
 411 %else
 412     mova       [%2 + r6], m4
 413 %endif
 414 %endmacro
 415
 416
 417 %macro MAIN_LOOP   2
     ; One iteration of the weighting loop.
     ;   %1 = block width (8 or 16)
     ;   %2 = weight-format flag forwarded as the first RV40_WCORE argument
     ; r0/r1/r2 are the dst/src1/src2 pointers, r6 the running offset and
     ; r5 the stride. The final "add r6, r5" of every path is deliberately the
     ; last instruction: RV40_WEIGHT branches on its zero flag right after
     ; this macro.
 418 %if mmsize == 8
         ; MMX registers: one RV40_WCORE call covers 8 pixels of a line,
         ; 16-wide blocks need a second call for the right half
 419     RV40_WCORE %2, r0, r1, r2
 420 %if %1 == 16
 421     RV40_WCORE %2, r0 + 8, r1 + 8, r2 + 8
 422 %endif
 423
 424     ; Prepare for next loop
 425     add        r6, r5
 426 %else
 427 %ifidn %1, 8
         ; XMM registers, 8-wide block: two lines are handled per iteration
         ; (RV40_WCORE itself advances r6 by one line)
 428     RV40_WCORE %2, r0, r1, r2, r5
 429     ; Prepare 2 next lines
 430     add        r6, r5
 431 %else
 432     RV40_WCORE %2, r0, r1, r2
 433     ; Prepare single next line
 434     add        r6, r5
 435 %endif
 436 %endif
 437
 438 %endmacro
 439
 440 ; void ff_rv40_weight_func_%1_%2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
 441 ; %1=rnd|nornd  %2=block size (8 or 16)  %3=log2 of the block size
 442 ; The weights are FP0.14 notation of fractions depending on pts.
 443 ; For timebases without rounding error (i.e. PAL), the fractions
 444 ; can be simplified, and several operations can be avoided.
 445 ; Therefore, we check here whether they are multiples of 2^9 for
 446 ; those simplifications to occur.
     ; NOTE(review): no such check is visible inside this macro; the rnd/nornd
     ; variant is fixed at assembly time by %1, so the test is presumably done
     ; by the caller - confirm.
     ; rnd selects the 14-bit weight path of RV40_WCORE (RND = 0), nornd the
     ; 5-bit weight path (RND = 1).
     ; NOTE(review): stride is declared int but r5 is used at full width
     ; without a visible sign extension - confirm against the callers.
 447 %macro RV40_WEIGHT  3
 448 cglobal rv40_weight_func_%1_%2, 6, 7, 8
     ; m1 = rounding constant used by the "bias and shift down" step
 449 %if cpuflag(ssse3)
 450     mova       m1, [pw_1024]
 451 %else
 452     mova       m1, [pw_16]
 453 %endif
 454     pxor       m0, m0
 455     ; Set loop counter and increments
         ; r6 = stride << %3; the three pointers are moved to the end of the
         ; block and r6 is negated, so the loop runs r6 upwards until it is 0
 456     mov        r6, r5
 457     shl        r6, %3
 458     add        r0, r6
 459     add        r1, r6
 460     add        r2, r6
 461     neg        r6
 462
         ; m2 = w1 (r3), m3 = w2 (r4), broadcast to every word below
 463     movd       m2, r3d
 464     movd       m3, r4d
 465 %ifidn %1,rnd
 466 %define  RND   0
 467     SPLATW     m2, m2
 468 %else
 469 %define  RND   1
 470 %if cpuflag(ssse3)
         ; interleave the low bytes of w2 and w1 so that the words of m3 hold
         ; the byte pair pmaddubsw needs; m2 is not used by that path
 471     punpcklbw  m3, m2
 472 %else
 473     SPLATW     m2, m2
 474 %endif
 475 %endif
 476     SPLATW     m3, m3
 477
 478 .loop:
 479     MAIN_LOOP  %2, RND
         ; flags come from the final "add r6, r5" inside MAIN_LOOP
 480     jnz        .loop
 481     REP_RET
 482 %endmacro
 483
 484 INIT_MMX mmxext
     ; Arguments: variant (rnd|nornd), block size, log2(block size).
     ; The same four functions are generated for each instruction set.
 485 RV40_WEIGHT   rnd,    8, 3
 486 RV40_WEIGHT   rnd,   16, 4
 487 RV40_WEIGHT   nornd,  8, 3
 488 RV40_WEIGHT   nornd, 16, 4
 489
 490 INIT_XMM sse2
 491 RV40_WEIGHT   rnd,    8, 3
 492 RV40_WEIGHT   rnd,   16, 4
 493 RV40_WEIGHT   nornd,  8, 3
 494 RV40_WEIGHT   nornd, 16, 4
 495
 496 INIT_XMM ssse3
 497 RV40_WEIGHT   rnd,    8, 3
 498 RV40_WEIGHT   rnd,   16, 4
 499 RV40_WEIGHT   nornd,  8, 3
 500 RV40_WEIGHT   nornd, 16, 4