git.sesse.net Git - ffmpeg/blob - libavcodec/x86/rv40dsp.asm

   1 ;******************************************************************************
   2 ;* MMX/SSE2-optimized functions for the RV40 decoder
   3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
   4 ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
   5 ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
   6 ;*
   7 ;* This file is part of Libav.
   8 ;*
   9 ;* Libav is free software; you can redistribute it and/or
  10 ;* modify it under the terms of the GNU Lesser General Public
  11 ;* License as published by the Free Software Foundation; either
  12 ;* version 2.1 of the License, or (at your option) any later version.
  13 ;*
  14 ;* Libav is distributed in the hope that it will be useful,
  15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17 ;* Lesser General Public License for more details.
  18 ;*
  19 ;* You should have received a copy of the GNU Lesser General Public
  20 ;* License along with Libav; if not, write to the Free Software
  21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22 ;******************************************************************************
  23
  24 %include "x86inc.asm"
  25 %include "x86util.asm"
  26
  27 SECTION_RODATA
  28
  29 align 16
     ; Eight words of 1024. The SSSE3 weight functions use it as the pmulhrsw
     ; multiplier: (x * 1024 + 0x4000) >> 15 is the same as (x + 16) >> 5.
  30 pw_1024:   times 8 dw 1 << (16 - 6) ; pw_1024
  31
     ; Signed byte-pair coefficients for pmaddubsw (SSSE3 filters).
     ; Three entries of 32 bytes each. The filters read 16 bytes at +0 (the
     ; outer tap pair, applied to both ends of the window) and 16 bytes at +16
     ; (the centre tap pair) of the selected entry. The six taps of every entry
     ; sum to 64, matching the final (x + 32) >> 6.
     ; NOTE(review): the mx/my argument is added to the table address, so it is
     ; presumably a byte offset selecting the entry -- confirm against the caller.
  32 sixtap_filter_hb_m:  times 8 db   1, -5
  33                      times 8 db  52, 20
  34                      ; multiplied by 2 to have the same shift
  35                      times 8 db   2, -10
  36                      times 8 db  40,  40
  37                      ; back to normal
  38                      times 8 db   1, -5
  39                      times 8 db  20, 52
  40
     ; Word coefficients for pmullw (MMX/SSE2 filters).
     ; Three entries of 64 bytes each, made of four 16-byte rows that the
     ; filters read at +0 (taps 0 and 5), +16 (taps 1 and 4), +32 (tap 2) and
     ; +48 (tap 3).
  41 sixtap_filter_v_m:   times 8 dw   1
  42                      times 8 dw  -5
  43                      times 8 dw  52
  44                      times 8 dw  20
  45                      ; multiplied by 2 to have the same shift
  46                      times 8 dw   2
  47                      times 8 dw -10
  48                      times 8 dw  40
  49                      times 8 dw  40
  50                      ; back to normal
  51                      times 8 dw   1
  52                      times 8 dw  -5
  53                      times 8 dw  20
  54                      times 8 dw  52
  55
     ; Table-name indirection used by the LOAD macro.
     ; PIC build:     every sixtap_filter_* name resolves to picregq, which each
     ;                function points at the wanted table with lea before
     ;                calling LOAD; npicregs = 1 reserves that extra GPR in the
     ;                cglobal register count.
     ; non-PIC build: the names resolve to the table symbols themselves and no
     ;                extra register is needed.
     ; NOTE(review): sixtap_filter_hw is not referenced by any code visible in
     ; this file, and no sixtap_filter_hw_m symbol is defined here -- looks like
     ; a leftover; confirm before relying on it.
  56 %ifdef PIC
  57 %define sixtap_filter_hw   picregq
  58 %define sixtap_filter_hb   picregq
  59 %define sixtap_filter_v    picregq
  60 %define npicregs 1
  61 %else
  62 %define sixtap_filter_hw   sixtap_filter_hw_m
  63 %define sixtap_filter_hb   sixtap_filter_hb_m
  64 %define sixtap_filter_v    sixtap_filter_v_m
  65 %define npicregs 0
  66 %endif
  67
     ; pshufb masks for the SSSE3 horizontal filter. They are applied to the 16
     ; bytes loaded from srcq-2, so byte index 2 is the pixel at srcq. Each mask
     ; gathers, for eight output pixels x, the byte pair that one pmaddubsw
     ; multiplies with a coefficient pair:
     ;   shuf1: (x-2, x-1)    shuf2: (x, x+1)    shuf3: (x+3, x+2)
     ; shuf3 is reversed so the same (1, -5) pair used with shuf1 applies.
  68 filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5,  5, 6,  6,  7,  7,  8
  69 filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7,  7, 8,  8,  9,  9, 10
  70 filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11
  71
  72 cextern  pw_32
  73 cextern  pw_16
  74 cextern  pw_512
  75
  76 SECTION .text
  77
  78 ;-----------------------------------------------------------------------------
  79 ; subpel MC functions:
  80 ;
  81 ; void [put|avg]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int deststride,
  82 ;                                       uint8_t *src, int srcstride,
  83 ;                                       int len, int m);
  84 ;----------------------------------------------------------------------
  85 %macro LOAD  2
     ; LOAD sel, table
     ;   Turns the selector argument %1 into a pointer into a coefficient
     ;   table by adding the table address to it: the symbol %2 directly, or
     ;   picregq (already pointed at the table by the caller) in PIC builds.
     ;   On WIN64 the 32-bit selector is sign-extended to 64 bits first.
  86 %if WIN64
  87    movsxd   %1q, %1d
  88 %endif
     %ifndef PIC
        add      %1q, %2
     %else
        add      %1q, picregq
     %endif
  94 %endmacro
  95
  96 %macro STORE 3
     ; STORE val, tmp, op
     ;   Packs the words in %1 to unsigned-saturated bytes and writes one
     ;   movh-wide row to [dstq]. When op is "avg", the current destination
     ;   row is first fetched into %2 and averaged with the new pixels
     ;   (pavgusb when building for 3dnow, pavgb otherwise). Any other op
     ;   stores the packed pixels directly and leaves %2 untouched.
     %ifnidn %3, avg
         packuswb  %1, %1
     %else
         movh      %2, [dstq]
         packuswb  %1, %1
     %if cpuflag(3dnow)
         pavgusb   %1, %2
     %else
         pavgb     %1, %2
     %endif
     %endif
 108     movh  [dstq], %1
 109 %endmacro
 110
 111 %macro FILTER_V 1
     ; FILTER_V op   (op = put or avg, forwarded to STORE)
     ; Emits <op>_rv40_qpel_v for the instruction set chosen by the preceding
     ; INIT_* line: a vertical 6-tap filter that produces one movh-wide output
     ; row per loop iteration, for `height` rows.
     ; Register roles:
     ;   m0..m4  sliding window of source rows unpacked to words
     ;           (rows -2..+2 relative to src on entry to the loop)
     ;   m5      newly loaded row +3; reused as STORE scratch afterwards
     ;   m6      accumulator
     ;   m7      zero, used for byte -> word unpacking
     ;   myq     after LOAD: address of the selected coefficient entry
     ; NOTE(review): srcstride/dststride are used as full-width registers, which
     ; assumes their upper bits are valid on 64-bit targets -- confirm against
     ; the C prototype.
 112 cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg
 113 %ifdef PIC
 114     lea  picregq, [sixtap_filter_v_m]
 115 %endif
 116     pxor      m7, m7
 117     LOAD      my, sixtap_filter_v
 118
 119     ; read 5 lines
 120     sub     srcq, srcstrideq
 121     sub     srcq, srcstrideq                 ; srcq -> row -2
 122     movh      m0, [srcq]                     ; row -2
 123     movh      m1, [srcq+srcstrideq]          ; row -1
 124     movh      m2, [srcq+srcstrideq*2]        ; row  0
 125     lea     srcq, [srcq+srcstrideq*2]
 126     add     srcq, srcstrideq                 ; srcq -> row +1
 127     movh      m3, [srcq]                     ; row +1
 128     movh      m4, [srcq+srcstrideq]          ; row +2
 129     punpcklbw m0, m7
 130     punpcklbw m1, m7
 131     punpcklbw m2, m7
 132     punpcklbw m3, m7
 133     punpcklbw m4, m7
 134
     ; When registers m8..m11 exist, keep the four coefficient rows in them;
     ; otherwise use them as memory operands straight from the table.
 135 %ifdef m8
 136     mova      m8, [myq+ 0]
 137     mova      m9, [myq+16]
 138     mova     m10, [myq+32]
 139     mova     m11, [myq+48]
 140 %define COEFF05  m8
 141 %define COEFF14  m9
 142 %define COEFF2   m10
 143 %define COEFF3   m11
 144 %else
 145 %define COEFF05  [myq+ 0]
 146 %define COEFF14  [myq+16]
 147 %define COEFF2   [myq+32]
 148 %define COEFF3   [myq+48]
 149 %endif
 150 .nextrow:
 151     mova      m6, m1
 152     movh      m5, [srcq+2*srcstrideq]      ; read new row (row +3)
 153     paddw     m6, m4                       ; row -1 + row +2
 154     punpcklbw m5, m7
 155     pmullw    m6, COEFF14
 156     paddw     m0, m5                       ; row -2 + row +3
 157     pmullw    m0, COEFF05
 158     paddw     m6, m0
 159     mova      m0, m1                       ; window starts sliding down one row
 160     paddw     m6, [pw_32]                  ; rounding bias for the >> 6 below
 161     mova      m1, m2
 162     pmullw    m2, COEFF2                   ; row 0 term (m1 keeps the unscaled copy)
 163     paddw     m6, m2
 164     mova      m2, m3
 165     pmullw    m3, COEFF3                   ; row +1 term (m2 keeps the unscaled copy)
 166     paddw     m6, m3
 167
 168     ; round/clip/store
 169     mova      m3, m4
 170     psraw     m6, 6
 171     mova      m4, m5                       ; window is now rows -1..+3 of the old position
     ; m5 has been copied to m4, so STORE may clobber it as scratch
 172     STORE     m6, m5, %1
 173
 174     ; go to next line
 175     add     dstq, dststrideq
 176     add     srcq, srcstrideq
 177     dec  heightd                           ; next row
 178     jg .nextrow
 179     REP_RET
 180 %endmacro
 181
 182 %macro FILTER_H  1
     ; FILTER_H op   (op = put or avg, forwarded to STORE)
     ; Emits <op>_rv40_qpel_h for the instruction set chosen by the preceding
     ; INIT_* line: a horizontal 6-tap filter that produces one movh-wide
     ; output row per loop iteration, for `height` rows.
     ; It multiplies with pmullw, so it uses the word coefficient table
     ; (sixtap_filter_v_m) even though it filters horizontally.
     ; Register roles:
     ;   m0..m5  the six source taps at src-2 .. src+3, unpacked to words
     ;   m6      rounding bias loaded from pw_32
     ;   m7      zero, used for byte -> word unpacking
     ;   mxq     after LOAD: address of the selected coefficient entry
     ; NOTE(review): srcstride/dststride are used as full-width registers, which
     ; assumes their upper bits are valid on 64-bit targets -- confirm against
     ; the C prototype.
 183 cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg
 184 %ifdef PIC
 185     lea  picregq, [sixtap_filter_v_m]
 186 %endif
 187     pxor      m7, m7
 188     LOAD      mx, sixtap_filter_v
 189     mova      m6, [pw_32]
     ; When registers m8..m11 exist, keep the four coefficient rows in them;
     ; otherwise use them as memory operands straight from the table.
 190 %ifdef m8
 191     mova      m8, [mxq+ 0]
 192     mova      m9, [mxq+16]
 193     mova     m10, [mxq+32]
 194     mova     m11, [mxq+48]
 195 %define COEFF05  m8
 196 %define COEFF14  m9
 197 %define COEFF2   m10
 198 %define COEFF3   m11
 199 %else
 200 %define COEFF05  [mxq+ 0]
 201 %define COEFF14  [mxq+16]
 202 %define COEFF2   [mxq+32]
 203 %define COEFF3   [mxq+48]
 204 %endif
 205 .nextrow:
 206     movq      m0, [srcq-2]                 ; tap 0
 207     movq      m5, [srcq+3]                 ; tap 5
 208     movq      m1, [srcq-1]                 ; tap 1
 209     movq      m4, [srcq+2]                 ; tap 4
 210     punpcklbw m0, m7
 211     punpcklbw m5, m7
 212     punpcklbw m1, m7
 213     punpcklbw m4, m7
 214     movq      m2, [srcq-0]                 ; tap 2
 215     movq      m3, [srcq+1]                 ; tap 3
 216     paddw     m0, m5                       ; taps 0 and 5 share a coefficient
 217     paddw     m1, m4                       ; taps 1 and 4 share a coefficient
 218     punpcklbw m2, m7
 219     punpcklbw m3, m7
 220     pmullw    m0, COEFF05
 221     pmullw    m1, COEFF14
 222     pmullw    m2, COEFF2
 223     pmullw    m3, COEFF3
 224     paddw     m0, m6                       ; add rounding bias
 225     paddw     m1, m2
 226     paddw     m0, m3
 227     paddw     m0, m1                       ; m0 = full 6-tap sum + bias
 228     psraw     m0, 6
     ; m1 is dead at this point, so STORE may use it as scratch
 229     STORE     m0, m1, %1
 230
 231     ; go to next line
 232     add     dstq, dststrideq
 233     add     srcq, srcstrideq
 234     dec  heightd            ; next row
 235     jg .nextrow
 236     REP_RET
 237 %endmacro
 238
     ; Instantiate the pmullw-based filters.
     ; The MMX-register versions are only assembled for 32-bit x86. "put" needs
     ; nothing beyond MMX; "avg" needs a byte average, which STORE takes from
     ; pavgb (mmx2 build) or pavgusb (3dnow build).
 239 %if ARCH_X86_32
 240 INIT_MMX  mmx
 241 FILTER_V  put
 242 FILTER_H  put
 243
 244 INIT_MMX  mmx2
 245 FILTER_V  avg
 246 FILTER_H  avg
 247
 248 INIT_MMX  3dnow
 249 FILTER_V  avg
 250 FILTER_H  avg
 251 %endif
 252
     ; The SSE2 versions are assembled for every target.
 253 INIT_XMM  sse2
 254 FILTER_H  put
 255 FILTER_H  avg
 256 FILTER_V  put
 257 FILTER_V  avg
 258
 259 %macro FILTER_SSSE3 1
     ; FILTER_SSSE3 op   (op = put or avg, forwarded to STORE)
     ; Emits both <op>_rv40_qpel_v and <op>_rv40_qpel_h using pmaddubsw on
     ; interleaved byte pairs and the byte coefficient table
     ; (sixtap_filter_hb_m). Each pmaddubsw yields pixelA*coefA + pixelB*coefB
     ; per word; three of them cover the six taps. Rounding and the >> 6 are
     ; done by pmulhrsw with pw_512 (external constant; presumably words of
     ; 512, giving (x + 32) >> 6).
     ;
     ; Vertical filter register roles:
     ;   m0..m4  sliding window of source rows as packed bytes (rows -2..+2)
     ;   m5      outer coefficient pair from [myq]
     ;   m6      accumulator
     ;   m7      newly loaded row +3, then its product, then STORE scratch
 260 cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg
 261 %ifdef PIC
 262     lea  picregq, [sixtap_filter_hb_m]
 263 %endif
 264
 265     ; read 5 lines
 266     sub     srcq, srcstrideq
 267     LOAD      my, sixtap_filter_hb
 268     sub     srcq, srcstrideq                 ; srcq -> row -2
 269     movh      m0, [srcq]                     ; row -2
 270     movh      m1, [srcq+srcstrideq]          ; row -1
 271     movh      m2, [srcq+srcstrideq*2]        ; row  0
 272     lea     srcq, [srcq+srcstrideq*2]
 273     add     srcq, srcstrideq                 ; srcq -> row +1
 274     mova      m5, [myq]
 275     movh      m3, [srcq]                     ; row +1
 276     movh      m4, [srcq+srcstrideq]          ; row +2
 277     lea     srcq, [srcq+2*srcstrideq]        ; srcq -> row +3 (next row to load)
 278
 279 .nextrow:
 280     mova      m6, m2
 281     punpcklbw m0, m1                         ; pairs (row -2, row -1)
 282     punpcklbw m6, m3                         ; pairs (row  0, row +1)
 283     pmaddubsw m0, m5
 284     pmaddubsw m6, [myq+16]                   ; centre coefficient pair
 285     movh      m7, [srcq]      ; read new row
 286     paddw     m6, m0
     ; slide the window down by one row; m4 receives the new row
 287     mova      m0, m1
 288     mova      m1, m2
 289     mova      m2, m3
 290     mova      m3, m4
 291     mova      m4, m7
 292     punpcklbw m7, m3                         ; pairs (row +3, row +2): m3 is the old m4
 293     pmaddubsw m7, m5
 294     paddw     m6, m7
 295     pmulhrsw  m6, [pw_512]
 296     STORE     m6, m7, %1
 297
 298     ; go to next line
 299     add     dstq, dststrideq
 300     add     srcq, srcstrideq
 301     dec       heightd                          ; next row
 302     jg       .nextrow
 303     REP_RET
 304
     ; Horizontal filter register roles:
     ;   m0..m2  three shuffled copies of the 16 bytes at srcq-2
     ;   m3, m4, m7  pshufb masks filter_h6_shuf2, _shuf3, _shuf1
     ;   m5      outer coefficient pair [mxq], used with shuf1 and shuf3
     ;   m6      centre coefficient pair [mxq+16], used with shuf2
     ; The unaligned load reads 16 source bytes per row starting at srcq-2.
 305 cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg
 306 %ifdef PIC
 307     lea  picregq, [sixtap_filter_hb_m]
 308 %endif
 309     mova      m3, [filter_h6_shuf2]
 310     mova      m4, [filter_h6_shuf3]
 311     LOAD      mx, sixtap_filter_hb
 312     mova      m5, [mxq] ; set up 6tap filter in bytes
 313     mova      m6, [mxq+16]
 314     mova      m7, [filter_h6_shuf1]
 315
 316 .nextrow:
 317     movu      m0, [srcq-2]
 318     mova      m1, m0
 319     mova      m2, m0
 320     pshufb    m0, m7                         ; pairs (x-2, x-1)
 321     pshufb    m1, m3                         ; pairs (x,   x+1)
 322     pshufb    m2, m4                         ; pairs (x+3, x+2)
 323     pmaddubsw m0, m5
 324     pmaddubsw m1, m6
 325     pmaddubsw m2, m5
 326     paddw     m0, m1
 327     paddw     m0, m2
 328     pmulhrsw  m0, [pw_512]
 329     STORE     m0, m1, %1
 330
 331     ; go to next line
 332     add     dstq, dststrideq
 333     add     srcq, srcstrideq
 334     dec  heightd            ; next row
 335     jg .nextrow
 336     REP_RET
 337 %endmacro
 338
 339 INIT_XMM ssse3
 340 FILTER_SSSE3  put
 341 FILTER_SSSE3  avg
 342
 343 ; %1=5bits weights?, %2=dst %3=src1 %4=src2 %5=stride if sse2
     ; Weighted blend of two sources for one step of the weight loop.
     ; Expects the state set up by RV40_WEIGHT:
     ;   m0 = 0, m1 = rounding constant (pw_16, or pw_1024 for SSSE3),
     ;   m2 = weight from r3 (w1), m3 = weight from r4 (w2) broadcast to words
     ;        (for SSSE3 with 5-bit weights, m3 holds interleaved byte pairs),
     ;   r6 = current offset from the end-adjusted pointers, r5 = stride.
     ; src1 is multiplied by m3 and src2 by m2.
     ; With 4 arguments it handles mmsize bytes of one row (two movh halves).
     ; With the 5th argument it handles mmsize/2 bytes of two consecutive rows
     ; and leaves r6 advanced by one stride.
     ; Clobbers m4..m7.
 344 %macro RV40_WCORE  4-5
 345     movh       m4, [%3 + r6 + 0]
 346     movh       m5, [%4 + r6 + 0]
 347 %if %0 == 4
     ; second half of the same row
 348 %define OFFSET r6 + mmsize / 2
 349 %else
 350     ; 8x8 block and sse2, stride was provided
 351 %define OFFSET r6
 352     add        r6, r5
 353 %endif
 354     movh       m6, [%3 + OFFSET]
 355     movh       m7, [%4 + OFFSET]
 356
 357 %if %1 == 0
 358     ; 14bits weights
 359     punpcklbw  m4, m0
 360     punpcklbw  m5, m0
 361     punpcklbw  m6, m0
 362     punpcklbw  m7, m0
 363
     ; (pixel << 7) * weight >> 16 == pixel * weight >> 9; the remaining
     ; >> 5 happens in the bias-and-shift step below
 364     psllw      m4, 7
 365     psllw      m5, 7
 366     psllw      m6, 7
 367     psllw      m7, 7
 368     pmulhw     m4, m3
 369     pmulhw     m5, m2
 370     pmulhw     m6, m3
 371     pmulhw     m7, m2
 372
 373     paddw      m4, m5
 374     paddw      m6, m7
 375 %else
 376     ; 5bits weights
 377 %if cpuflag(ssse3)
     ; interleave src1/src2 bytes so one pmaddubsw forms both products and
     ; their sum
 378     punpcklbw  m4, m5
 379     punpcklbw  m6, m7
 380
 381     pmaddubsw  m4, m3
 382     pmaddubsw  m6, m3
 383 %else
 384     punpcklbw  m4, m0
 385     punpcklbw  m5, m0
 386     punpcklbw  m6, m0
 387     punpcklbw  m7, m0
 388
 389     pmullw     m4, m3
 390     pmullw     m5, m2
 391     pmullw     m6, m3
 392     pmullw     m7, m2
 393     paddw      m4, m5
 394     paddw      m6, m7
 395 %endif
 396
 397 %endif
 398
 399     ; bias and shift down
 400 %if cpuflag(ssse3)
     ; m1 = pw_1024, so pmulhrsw computes (x + 16) >> 5 in one instruction
 401     pmulhrsw   m4, m1
 402     pmulhrsw   m6, m1
 403 %else
 404     paddw      m4, m1
 405     paddw      m6, m1
 406     psrlw      m4, 5
 407     psrlw      m6, 5
 408 %endif
 409
     ; low half of m4 = first group of pixels, high half = second group
 410     packuswb   m4, m6
 411 %if %0 == 5
 412     ; Only called for 8x8 blocks and sse2
     ; step back to the first row for the low half, then forward again for
     ; the high half; r6 ends on the second row
 413     sub        r6, r5
 414     movh       [%2 + r6], m4
 415     add        r6, r5
 416     movhps     [%2 + r6], m4
 417 %else
 418     mova       [%2 + r6], m4
 419 %endif
 420 %endmacro
 421
 422
 423 %macro MAIN_LOOP   2
     ; MAIN_LOOP size, weights5bit
     ;   One iteration of the weight loop: blends the pixels for the current
     ;   offset r6 via RV40_WCORE, then advances r6 by the stride r5.
     ;     MMX registers:      one row per iteration; 16-wide blocks need a
     ;                         second RV40_WCORE for bytes 8..15 of the row.
     ;     XMM registers, 8:   two rows per iteration (RV40_WCORE itself moves
     ;                         r6 to the second row).
     ;     XMM registers, 16:  one full row per iteration.
     ;   The closing add must remain the last instruction: the loop branch that
     ;   follows the macro tests the flags it sets.
 424 %if mmsize == 8
 425     RV40_WCORE %2, r0, r1, r2
 426 %if %1 == 16
 427     RV40_WCORE %2, r0 + 8, r1 + 8, r2 + 8
 428 %endif
     %elifidn %1, 8
         RV40_WCORE %2, r0, r1, r2, r5
     %else
         RV40_WCORE %2, r0, r1, r2
     %endif

         ; Step to the next row (or next pair of rows)
         add        r6, r5
 444 %endmacro
 445
 446 ; rv40_weight_func_<rnd|nornd>_<size>(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
 447 ; %1=rnd|nornd  %2=block size (8 or 16)  %3=log2(block size)
 448 ; The weights are FP0.14 notation of fractions depending on pts.
 449 ; For timebases without rounding error (i.e. PAL), the fractions
 450 ; can be simplified, and several operations can be avoided.
 451 ; Therefore two variants are emitted: "rnd" keeps the full 14-bit weights,
 452 ; "nornd" uses the simplified 5-bit weights.
     ; NOTE(review): no test for "multiple of 2^9" is visible in this file, so
     ; the choice between the two variants presumably happens in the caller.
 453 %macro RV40_WEIGHT  3
 454 cglobal rv40_weight_func_%1_%2, 6, 7, 8
     ; m1 = rounding constant consumed by RV40_WCORE's bias-and-shift step
 455 %if cpuflag(ssse3)
 456     mova       m1, [pw_1024]
 457 %else
 458     mova       m1, [pw_16]
 459 %endif
 460     pxor       m0, m0
 461     ; Set loop counter and increments
     ; r6 = stride << log2(size) = bytes spanned by the whole block. The three
     ; pointers are moved to the end of the block and r6 is negated, so the
     ; loop indexes with a negative offset that counts up to zero.
     ; NOTE(review): stride is declared int but r5 is used at full width; this
     ; assumes its upper bits are valid on 64-bit targets.
 462     mov        r6, r5
 463     shl        r6, %3
 464     add        r0, r6
 465     add        r1, r6
 466     add        r2, r6
 467     neg        r6
 468
     ; w1/w2 are 32-bit ints: use the dword register names. The full-width
     ; names are 64-bit GPRs on x86-64, which movd only takes on lenient
     ; assemblers (they silently encode movq instead).
 469     movd       m2, r3d
 470     movd       m3, r4d
 471 %ifidn %1,rnd
 472 %define  RND   0
 473     SPLATW     m2, m2
 474 %else
 475 %define  RND   1
 476 %if cpuflag(ssse3)
     ; build (w2, w1) byte pairs for pmaddubsw on interleaved src1/src2 bytes
 477     punpcklbw  m3, m2
 478 %else
 479     SPLATW     m2, m2
 480 %endif
 481 %endif
 482     SPLATW     m3, m3
 483
 484 .loop:
 485     MAIN_LOOP  %2, RND
     ; flags come from the final "add r6, r5" inside MAIN_LOOP; the loop ends
     ; when the offset reaches zero
 486     jnz        .loop
 487     REP_RET
 488 %endmacro
 489
     ; Instantiate the weight functions for each instruction set.
     ; Arguments: variant (rnd|nornd), block size, log2(block size).
     ; With MMX registers an 8-wide row fills one register pair of halves and a
     ; 16-wide row needs two RV40_WCORE calls; with XMM registers an 8-wide
     ; block is processed two rows at a time (see MAIN_LOOP).
 490 INIT_MMX mmx2
 491 RV40_WEIGHT   rnd,    8, 3
 492 RV40_WEIGHT   rnd,   16, 4
 493 RV40_WEIGHT   nornd,  8, 3
 494 RV40_WEIGHT   nornd, 16, 4
 495
 496 INIT_XMM sse2
 497 RV40_WEIGHT   rnd,    8, 3
 498 RV40_WEIGHT   rnd,   16, 4
 499 RV40_WEIGHT   nornd,  8, 3
 500 RV40_WEIGHT   nornd, 16, 4
 501
     ; SSSE3 builds use pmulhrsw for rounding and pmaddubsw for 5-bit weights
 502 INIT_XMM ssse3
 503 RV40_WEIGHT   rnd,    8, 3
 504 RV40_WEIGHT   rnd,   16, 4
 505 RV40_WEIGHT   nornd,  8, 3
 506 RV40_WEIGHT   nornd, 16, 4