git.sesse.net Git - ffmpeg/blob - libavcodec/x86/rv40dsp.asm

   1 ;******************************************************************************
   2 ;* MMX/SSE2-optimized functions for the RV40 decoder
   3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
   4 ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
   5 ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
   6 ;*
   7 ;* This file is part of Libav.
   8 ;*
   9 ;* Libav is free software; you can redistribute it and/or
  10 ;* modify it under the terms of the GNU Lesser General Public
  11 ;* License as published by the Free Software Foundation; either
  12 ;* version 2.1 of the License, or (at your option) any later version.
  13 ;*
  14 ;* Libav is distributed in the hope that it will be useful,
  15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17 ;* Lesser General Public License for more details.
  18 ;*
  19 ;* You should have received a copy of the GNU Lesser General Public
  20 ;* License along with Libav; if not, write to the Free Software
  21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22 ;******************************************************************************
  23
  24 %include "libavutil/x86/x86util.asm"
  25
  26 SECTION_RODATA
  27
  28 align 16
     ; 1024 in every word. Used as the pmulhrsw multiplier by the SSSE3 weight
     ; functions: (x * 1024 + 0x4000) >> 15 is the same as (x + 16) >> 5.
  29 pw_1024:   times 8 dw 1 << (16 - 6) ; pw_1024
  30
     ; 6-tap coefficients as signed byte pairs, consumed with pmaddubsw by the
     ; SSSE3 functions. Each filter takes two 16-byte rows (32 bytes in total):
     ; the first row is applied to the (tap0, tap1) pair and again to the
     ; (tap5, tap4) pair, the second row to the two centre taps (tap2, tap3).
     ; The mx/my argument is added to the table address as a byte offset.
  31 sixtap_filter_hb_m:  times 8 db   1, -5
  32                      times 8 db  52, 20
  33                      ; multiplied by 2 to have the same shift
  34                      times 8 db   2, -10
  35                      times 8 db  40,  40
  36                      ; back to normal
  37                      times 8 db   1, -5
  38                      times 8 db  20, 52
  39
     ; The same three filters as 16-bit words, consumed with pmullw by the
     ; MMX/SSE2 functions. Each filter takes four 16-byte rows (64 bytes):
     ; taps 0 and 5, taps 1 and 4, tap 2, tap 3 (read as [m+0], [m+16],
     ; [m+32] and [m+48] by the filter macros).
  40 sixtap_filter_v_m:   times 8 dw   1
  41                      times 8 dw  -5
  42                      times 8 dw  52
  43                      times 8 dw  20
  44                      ; multiplied by 2 to have the same shift
  45                      times 8 dw   2
  46                      times 8 dw -10
  47                      times 8 dw  40
  48                      times 8 dw  40
  49                      ; back to normal
  50                      times 8 dw   1
  51                      times 8 dw  -5
  52                      times 8 dw  20
  53                      times 8 dw  52
  54
  55 %ifdef PIC
     ; PIC builds: the table names resolve to picregq, an extra register that
     ; each function loads with the table address (lea) before calling LOAD.
     ; npicregs is added to the GPR count passed to cglobal.
     ; NOTE(review): sixtap_filter_hw is not referenced anywhere in the visible
     ; source, and no sixtap_filter_hw_m label is defined here - it looks like
     ; an unused leftover; confirm before removing.
  56 %define sixtap_filter_hw   picregq
  57 %define sixtap_filter_hb   picregq
  58 %define sixtap_filter_v    picregq
  59 %define npicregs 1
  60 %else
     ; non-PIC builds: the table names resolve directly to the table labels
  61 %define sixtap_filter_hw   sixtap_filter_hw_m
  62 %define sixtap_filter_hb   sixtap_filter_hb_m
  63 %define sixtap_filter_v    sixtap_filter_v_m
  64 %define npicregs 0
  65 %endif
  66
  67 filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5,  5, 6,  6,  7,  7,  8 ; (tap0, tap1) byte pairs for 8 pixels
  68 filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7,  7, 8,  8,  9,  9, 10 ; (tap2, tap3) byte pairs
  69 filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11 ; (tap5, tap4) byte pairs, reversed so the (tap0, tap1) coefficients can be reused
     ; The pshufb masks above are loaded with mova. They carry no align
     ; directive of their own and stay 16-byte aligned only because every table
     ; between the "align 16" above and here is a multiple of 16 bytes long.
  70
     ; word constants defined outside this file
  71 cextern  pw_32
  72 cextern  pw_16
  73 cextern  pw_512
  74
  75 SECTION .text
  76
  77 ;-----------------------------------------------------------------------------
  78 ; subpel MC functions:
  79 ;
  80 ; void [put|avg]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int deststride,
  81 ;                                      uint8_t *src, int srcstride,
  82 ;                                      int len, int m);
  83 ;----------------------------------------------------------------------
  84 %macro LOAD  2
     ; LOAD <offset-arg>, <table>
     ; Turns the filter-offset argument %1 into a pointer to its coefficient
     ; rows by adding the table base to it in place.
     ;   %1 = argument name without size suffix (q/d are appended here)
     ;   %2 = table symbol, used only in non-PIC builds
  85 %if WIN64
     ; sign-extend the 32-bit argument to 64 bits before it is used in an address
  86    movsxd   %1q, %1d
  87 %endif
  88 %ifdef PIC
     ; picregq must already hold the table address (callers lea it first)
  89    add      %1q, picregq
  90 %else
  91    add      %1q, %2
  92 %endif
  93 %endmacro
  94
  95 %macro STORE 3
     ; STORE <data>, <tmp>, <put|avg>
     ; Packs the words in %1 to unsigned bytes with saturation and writes them
     ; to [dstq] with movh. When %3 is "avg" the bytes currently at [dstq] are
     ; first loaded into %2 and averaged with the new result.
     ; Clobbers %1 always, %2 only for avg.
  96 %ifidn %3, avg
  97     movh      %2, [dstq]            ; fetch existing destination pixels
  98 %endif
  99     packuswb  %1, %1                ; words -> bytes, clipped to 0..255
 100 %ifidn %3, avg
 101 %if cpuflag(3dnow)
 102     pavgusb   %1, %2                ; 3DNow! byte average
 103 %else
 104     pavgb     %1, %2
 105 %endif
 106 %endif
 107     movh  [dstq], %1
 108 %endmacro
 109
 110 %macro FILTER_V 1
     ; Vertical 6-tap qpel filter using word multiplies (pmullw).
     ;   %1 = put or avg, forwarded to STORE and used in the function name.
     ; Per output row, with row0..row5 the six consecutive source rows:
     ;   out = ((row0 + row5) * COEFF05 + (row1 + row4) * COEFF14
     ;          + row2 * COEFF2 + row3 * COEFF3 + pw_32) >> 6
     ; m0..m4 carry the five previous rows as words across iterations, m5 is
     ; the newly read row, m6 the accumulator, m7 stays zero.
 111 cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg
 112 %ifdef PIC
 113     lea  picregq, [sixtap_filter_v_m]
 114 %endif
 115     pxor      m7, m7                ; zero, used to widen bytes to words
 116     LOAD      my, sixtap_filter_v   ; myq -> coefficient rows of the selected filter
 117
 118     ; read 5 lines
 119     sub     srcq, srcstrideq
 120     sub     srcq, srcstrideq        ; srcq -> two rows above the current one
 121     movh      m0, [srcq]
 122     movh      m1, [srcq+srcstrideq]
 123     movh      m2, [srcq+srcstrideq*2]
 124     lea     srcq, [srcq+srcstrideq*2]
 125     add     srcq, srcstrideq        ; srcq advanced by three rows in total
 126     movh      m3, [srcq]
 127     movh      m4, [srcq+srcstrideq]
 128     punpcklbw m0, m7
 129     punpcklbw m1, m7
 130     punpcklbw m2, m7
 131     punpcklbw m3, m7
 132     punpcklbw m4, m7
 133
     ; Keep the coefficients in registers when m8 and up exist, otherwise use
     ; them as memory operands.
 134 %ifdef m8
 135     mova      m8, [myq+ 0]
 136     mova      m9, [myq+16]
 137     mova     m10, [myq+32]
 138     mova     m11, [myq+48]
 139 %define COEFF05  m8
 140 %define COEFF14  m9
 141 %define COEFF2   m10
 142 %define COEFF3   m11
 143 %else
 144 %define COEFF05  [myq+ 0]
 145 %define COEFF14  [myq+16]
 146 %define COEFF2   [myq+32]
 147 %define COEFF3   [myq+48]
 148 %endif
 149 .nextrow:
 150     mova      m6, m1
 151     movh      m5, [srcq+2*srcstrideq]      ; read new row
 152     paddw     m6, m4                ; row1 + row4
 153     punpcklbw m5, m7
 154     pmullw    m6, COEFF14
 155     paddw     m0, m5                ; row0 + row5
 156     pmullw    m0, COEFF05
 157     paddw     m6, m0
 158     mova      m0, m1                ; rows shift down by one for the next iteration
 159     paddw     m6, [pw_32]           ; rounding term for the >> 6 below
 160     mova      m1, m2                ; copy row2 before it is overwritten by its product
 161     pmullw    m2, COEFF2
 162     paddw     m6, m2
 163     mova      m2, m3                ; copy row3 before it is overwritten by its product
 164     pmullw    m3, COEFF3
 165     paddw     m6, m3
 166
 167     ; round/clip/store
 168     mova      m3, m4
 169     psraw     m6, 6
 170     mova      m4, m5                ; m5 is free as STORE scratch after this copy
 171     STORE     m6, m5, %1
 172
 173     ; go to next line
 174     add     dstq, dststrideq
 175     add     srcq, srcstrideq
 176     dec  heightd                           ; next row
 177     jg .nextrow
 178     REP_RET
 179 %endmacro
 180
 181 %macro FILTER_H  1
     ; Horizontal 6-tap qpel filter using word multiplies (pmullw).
     ;   %1 = put or avg, forwarded to STORE and used in the function name.
     ; Per output pixel, with s[] the source row:
     ;   out = ((s[-2] + s[3]) * COEFF05 + (s[-1] + s[2]) * COEFF14
     ;          + s[0] * COEFF2 + s[1] * COEFF3 + pw_32) >> 6
     ; It reads the word coefficient table (sixtap_filter_v_m), the same one
     ; the vertical filter uses, which is why that label is loaded under PIC.
 182 cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg
 183 %ifdef PIC
 184     lea  picregq, [sixtap_filter_v_m]
 185 %endif
 186     pxor      m7, m7                ; zero, used to widen bytes to words
 187     LOAD      mx, sixtap_filter_v   ; mxq -> coefficient rows of the selected filter
 188     mova      m6, [pw_32]           ; rounding term, kept in a register for the loop
     ; Keep the coefficients in registers when m8 and up exist, otherwise use
     ; them as memory operands.
 189 %ifdef m8
 190     mova      m8, [mxq+ 0]
 191     mova      m9, [mxq+16]
 192     mova     m10, [mxq+32]
 193     mova     m11, [mxq+48]
 194 %define COEFF05  m8
 195 %define COEFF14  m9
 196 %define COEFF2   m10
 197 %define COEFF3   m11
 198 %else
 199 %define COEFF05  [mxq+ 0]
 200 %define COEFF14  [mxq+16]
 201 %define COEFF2   [mxq+32]
 202 %define COEFF3   [mxq+48]
 203 %endif
 204 .nextrow:
     ; six loads of the same row at byte offsets -2..+3, one per tap
 205     movq      m0, [srcq-2]
 206     movq      m5, [srcq+3]
 207     movq      m1, [srcq-1]
 208     movq      m4, [srcq+2]
 209     punpcklbw m0, m7
 210     punpcklbw m5, m7
 211     punpcklbw m1, m7
 212     punpcklbw m4, m7
 213     movq      m2, [srcq-0]
 214     movq      m3, [srcq+1]
 215     paddw     m0, m5                ; s[-2] + s[3]
 216     paddw     m1, m4                ; s[-1] + s[2]
 217     punpcklbw m2, m7
 218     punpcklbw m3, m7
 219     pmullw    m0, COEFF05
 220     pmullw    m1, COEFF14
 221     pmullw    m2, COEFF2
 222     pmullw    m3, COEFF3
 223     paddw     m0, m6                ; add rounding term
 224     paddw     m1, m2
 225     paddw     m0, m3
 226     paddw     m0, m1
 227     psraw     m0, 6
 228     STORE     m0, m1, %1            ; m1 is dead here and serves as scratch
 229
 230     ; go to next line
 231     add     dstq, dststrideq
 232     add     srcq, srcstrideq
 233     dec  heightd            ; next row
 234     jg .nextrow
 235     REP_RET
 236 %endmacro
 237
 238 %if ARCH_X86_32
     ; The MMX, MMXEXT and 3DNow! versions are only assembled for 32-bit x86.
     ; "put" needs plain MMX only; "avg" needs a byte-average instruction
     ; (pavgb with mmxext, pavgusb with 3dnow - see STORE).
 239 INIT_MMX  mmx
 240 FILTER_V  put
 241 FILTER_H  put
 242
 243 INIT_MMX  mmxext
 244 FILTER_V  avg
 245 FILTER_H  avg
 246
 247 INIT_MMX  3dnow
 248 FILTER_V  avg
 249 FILTER_H  avg
 250 %endif
 251
     ; SSE2 versions of all four put/avg x h/v combinations
 252 INIT_XMM  sse2
 253 FILTER_H  put
 254 FILTER_H  avg
 255 FILTER_V  put
 256 FILTER_V  avg
 257
 258 %macro FILTER_SSSE3 1
     ; Emits both the vertical and the horizontal 6-tap qpel function, built on
     ; pmaddubsw with the byte-pair table sixtap_filter_hb_m.
     ;   %1 = put or avg, forwarded to STORE and used in the function names.
     ; Each pmaddubsw multiplies unsigned pixel bytes by signed coefficient
     ; bytes and adds adjacent products, so one instruction covers two taps.
     ; The final pmulhrsw by pw_512 computes (x * 512 + 0x4000) >> 15, i.e.
     ; (x + 32) >> 6 (pw_512 is defined outside this file).
 259 cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg
 260 %ifdef PIC
 261     lea  picregq, [sixtap_filter_hb_m]
 262 %endif
 263
 264     ; read 5 lines
 265     sub     srcq, srcstrideq
 266     LOAD      my, sixtap_filter_hb  ; myq -> coefficient rows of the selected filter
 267     sub     srcq, srcstrideq        ; srcq -> two rows above the current one
 268     movh      m0, [srcq]
 269     movh      m1, [srcq+srcstrideq]
 270     movh      m2, [srcq+srcstrideq*2]
 271     lea     srcq, [srcq+srcstrideq*2]
 272     add     srcq, srcstrideq
 273     mova      m5, [myq]             ; first coefficient row: pair for the outer taps
 274     movh      m3, [srcq]
 275     movh      m4, [srcq+srcstrideq]
 276     lea     srcq, [srcq+2*srcstrideq] ; srcq -> the next row to read inside the loop
 277
 278 .nextrow:
     ; m0..m4 hold the five previous rows as bytes, m7 receives the new row
 279     mova      m6, m2
 280     punpcklbw m0, m1                ; interleave row0/row1 bytes
 281     punpcklbw m6, m3                ; interleave row2/row3 bytes
 282     pmaddubsw m0, m5
 283     pmaddubsw m6, [myq+16]          ; second coefficient row: centre taps
 284     movh      m7, [srcq]      ; read new row
 285     paddw     m6, m0
 286     mova      m0, m1                ; rows shift down by one for the next iteration
 287     mova      m1, m2
 288     mova      m2, m3
 289     mova      m3, m4
 290     mova      m4, m7
 291     punpcklbw m7, m3                ; interleave row5/row4 so the outer-tap pair applies
 292     pmaddubsw m7, m5
 293     paddw     m6, m7
 294     pmulhrsw  m6, [pw_512]
 295     STORE     m6, m7, %1            ; m7 is dead here and serves as scratch
 296
 297     ; go to next line
 298     add     dstq, dststrideq
 299     add     srcq, srcstrideq
 300     dec       heightd                          ; next row
 301     jg       .nextrow
 302     REP_RET
 303
 304 cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg
 305 %ifdef PIC
 306     lea  picregq, [sixtap_filter_hb_m]
 307 %endif
 308     mova      m3, [filter_h6_shuf2]
 309     mova      m4, [filter_h6_shuf3]
 310     LOAD      mx, sixtap_filter_hb  ; mxq -> coefficient rows of the selected filter
 311     mova      m5, [mxq] ; set up 6tap filter in bytes
 312     mova      m6, [mxq+16]
 313     mova      m7, [filter_h6_shuf1]
 314
 315 .nextrow:
 316     movu      m0, [srcq-2]          ; unaligned 16-byte load starting two pixels to the left
 317     mova      m1, m0
 318     mova      m2, m0
 319     pshufb    m0, m7                ; (tap0, tap1) pairs
 320     pshufb    m1, m3                ; (tap2, tap3) pairs
 321     pshufb    m2, m4                ; (tap5, tap4) pairs
 322     pmaddubsw m0, m5
 323     pmaddubsw m1, m6
 324     pmaddubsw m2, m5
 325     paddw     m0, m1
 326     paddw     m0, m2
 327     pmulhrsw  m0, [pw_512]
 328     STORE     m0, m1, %1            ; m1 is dead here and serves as scratch
 329
 330     ; go to next line
 331     add     dstq, dststrideq
 332     add     srcq, srcstrideq
 333     dec  heightd            ; next row
 334     jg .nextrow
 335     REP_RET
 336 %endmacro
 337
 338 INIT_XMM ssse3
 339 FILTER_SSSE3  put               ; emits the put h and v functions
 340 FILTER_SSSE3  avg               ; emits the avg h and v functions
 341
 342 ; %1=5bits weights?, %2=dst %3=src1 %4=src2 %5=stride (optional)
     ; Weights and stores one group of pixels at byte offset r6.
     ; Expects the setup done by RV40_WEIGHT: m0 = zero, m1 = rounding
     ; constant (pw_16, or pw_1024 for SSSE3), m2/m3 = splatted weights,
     ; r5 = stride, r6 = current (negative) offset from the end pointers.
     ; With 4 arguments the two halves of one line are processed and written
     ; with a single mova. With the 5th argument (passed by MAIN_LOOP only for
     ; 8-pixel blocks when mmsize is not 8) two consecutive lines are processed
     ; and r6 is left pointing at the second one.
     ; Clobbers m4-m7.
 343 %macro RV40_WCORE  4-5
 344     movh       m4, [%3 + r6 + 0]
 345     movh       m5, [%4 + r6 + 0]
 346 %if %0 == 4
 347 %define OFFSET r6 + mmsize / 2
 348 %else
 349     ; 8x8 block and sse2, stride was provided
 350 %define OFFSET r6
 351     add        r6, r5
 352 %endif
 353     movh       m6, [%3 + OFFSET]
 354     movh       m7, [%4 + OFFSET]
 355
 356 %if %1 == 0
 357     ; 14bits weights
     ; pixel << 7 followed by pmulhw (high 16 bits of the product) gives
     ; (pixel * weight) >> 9 for each source
 358     punpcklbw  m4, m0
 359     punpcklbw  m5, m0
 360     punpcklbw  m6, m0
 361     punpcklbw  m7, m0
 362
 363     psllw      m4, 7
 364     psllw      m5, 7
 365     psllw      m6, 7
 366     psllw      m7, 7
 367     pmulhw     m4, m3
 368     pmulhw     m5, m2
 369     pmulhw     m6, m3
 370     pmulhw     m7, m2
 371
 372     paddw      m4, m5
 373     paddw      m6, m7
 374 %else
 375     ; 5bits weights
 376 %if cpuflag(ssse3)
     ; m3 holds interleaved weight bytes here, so one pmaddubsw weights and
     ; sums the interleaved src1/src2 pixels
 377     punpcklbw  m4, m5
 378     punpcklbw  m6, m7
 379
 380     pmaddubsw  m4, m3
 381     pmaddubsw  m6, m3
 382 %else
 383     punpcklbw  m4, m0
 384     punpcklbw  m5, m0
 385     punpcklbw  m6, m0
 386     punpcklbw  m7, m0
 387
 388     pmullw     m4, m3
 389     pmullw     m5, m2
 390     pmullw     m6, m3
 391     pmullw     m7, m2
 392     paddw      m4, m5
 393     paddw      m6, m7
 394 %endif
 395
 396 %endif
 397
 398     ; bias and shift down
 399 %if cpuflag(ssse3)
     ; m1 = pw_1024: pmulhrsw yields (x + 16) >> 5 in a single instruction
 400     pmulhrsw   m4, m1
 401     pmulhrsw   m6, m1
 402 %else
 403     paddw      m4, m1
 404     paddw      m6, m1
 405     psrlw      m4, 5
 406     psrlw      m6, 5
 407 %endif
 408
 409     packuswb   m4, m6               ; first group in the low half, second in the high half
 410 %if %0 == 5
 411     ; Only called for 8x8 blocks and sse2
 412     sub        r6, r5               ; back to the first of the two lines
 413     movh       [%2 + r6], m4
 414     add        r6, r5               ; second line; r6 stays here on exit
 415     movhps     [%2 + r6], m4
 416 %else
 417     mova       [%2 + r6], m4
 418 %endif
 419 %endmacro
 420
 421
 422 %macro MAIN_LOOP   2
     ; One iteration of the weight loop.
     ;   %1 = block size (8 or 16), %2 = 5-bit weights flag passed to RV40_WCORE
     ; Every path ends with "add r6, r5"; the jnz that follows this macro in
     ; RV40_WEIGHT tests the flags of that add, so it must remain the last
     ; flag-modifying instruction.
 423 %if mmsize == 8
     ; MMX: one call covers 8 pixels, a 16-pixel line needs a second call
 424     RV40_WCORE %2, r0, r1, r2
 425 %if %1 == 16
 426     RV40_WCORE %2, r0 + 8, r1 + 8, r2 + 8
 427 %endif
 428
 429     ; Prepare for next loop
 430     add        r6, r5
 431 %else
 432 %ifidn %1, 8
     ; stride passed as 5th argument: two 8-pixel lines are handled per call
 433     RV40_WCORE %2, r0, r1, r2, r5
 434     ; Prepare 2 next lines
 435     add        r6, r5
 436 %else
 437     RV40_WCORE %2, r0, r1, r2
 438     ; Prepare single next line
 439     add        r6, r5
 440 %endif
 441 %endif
 442
 443 %endmacro
 444
 445 ; rv40_weight_func_%1_%2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
 446 ; %1=rnd or nornd  %2=block size (8 or 16)  %3=log2 of the block size
 447 ; The weights are FP0.14 notation of fractions depending on pts.
 448 ; For timebases without rounding error (i.e. PAL), the fractions
 449 ; can be simplified, and several operations can be avoided.
 450 ; Therefore, we check here whether they are multiples of 2^9 for
 451 ; those simplifications to occur.
     ; In this file "rnd" selects the 14-bit weight path of RV40_WCORE (RND=0)
     ; and "nornd" the 5-bit weight path (RND=1).
 452 %macro RV40_WEIGHT  3
 453 cglobal rv40_weight_func_%1_%2, 6, 7, 8
 454 %if cpuflag(ssse3)
 455     mova       m1, [pw_1024]        ; pmulhrsw multiplier, see RV40_WCORE
 456 %else
 457     mova       m1, [pw_16]          ; bias added before the >> 5
 458 %endif
 459     pxor       m0, m0
 460     ; Set loop counter and increments
     ; r6 = stride << %3 = byte size of the block. The three pointers are moved
     ; to the end of the block and r6 is negated, so the loop walks r6 up to
     ; zero and can terminate on the zero flag.
 461     mov        r6, r5
 462     shl        r6, %3
 463     add        r0, r6
 464     add        r1, r6
 465     add        r2, r6
 466     neg        r6
 467
 468     movd       m2, r3d              ; w1
 469     movd       m3, r4d              ; w2
 470 %ifidn %1,rnd
 471 %define  RND   0
 472     SPLATW     m2, m2
 473 %else
 474 %define  RND   1
 475 %if cpuflag(ssse3)
     ; interleave the low bytes of w2 and w1 so that the SPLATW below yields
     ; (w2, w1) byte pairs for pmaddubsw
 476     punpcklbw  m3, m2
 477 %else
 478     SPLATW     m2, m2
 479 %endif
 480 %endif
 481     SPLATW     m3, m3
 482
 483 .loop:
 484     MAIN_LOOP  %2, RND
 485     jnz        .loop                ; flags come from the final "add r6, r5" in MAIN_LOOP
 486     REP_RET
 487 %endmacro
 488
 489 INIT_MMX mmxext
     ; arguments: rnd/nornd variant, block size, log2(block size)
 490 RV40_WEIGHT   rnd,    8, 3
 491 RV40_WEIGHT   rnd,   16, 4
 492 RV40_WEIGHT   nornd,  8, 3
 493 RV40_WEIGHT   nornd, 16, 4
 494
 495 INIT_XMM sse2
 496 RV40_WEIGHT   rnd,    8, 3
 497 RV40_WEIGHT   rnd,   16, 4
 498 RV40_WEIGHT   nornd,  8, 3
 499 RV40_WEIGHT   nornd, 16, 4
 500
 501 INIT_XMM ssse3
 502 RV40_WEIGHT   rnd,    8, 3
 503 RV40_WEIGHT   rnd,   16, 4
 504 RV40_WEIGHT   nornd,  8, 3
 505 RV40_WEIGHT   nornd, 16, 4