git.sesse.net Git - ffmpeg/blob - libavcodec/x86/rv40dsp.asm

   1 ;******************************************************************************
   2 ;* MMX/SSE2-optimized functions for the RV40 decoder
   3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
   4 ;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
   5 ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
   6 ;*
   7 ;* This file is part of FFmpeg.
   8 ;*
   9 ;* FFmpeg is free software; you can redistribute it and/or
  10 ;* modify it under the terms of the GNU Lesser General Public
  11 ;* License as published by the Free Software Foundation; either
  12 ;* version 2.1 of the License, or (at your option) any later version.
  13 ;*
  14 ;* FFmpeg is distributed in the hope that it will be useful,
  15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17 ;* Lesser General Public License for more details.
  18 ;*
  19 ;* You should have received a copy of the GNU Lesser General Public
  20 ;* License along with FFmpeg; if not, write to the Free Software
  21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22 ;******************************************************************************
  23
  24 %include "libavutil/x86/x86util.asm"
  25
  26 SECTION_RODATA
  27
  28 align 16
  29 pw_1024:   times 8 dw 1 << (16 - 6) ; 8 words of 1024: pmulhrsw by 1024 computes (x + 16) >> 5; loaded as the bias register by RV40_WEIGHT on SSSE3
  30 ; Signed byte-pair coefficients for the pmaddubsw (SSSE3) paths: three filter sets, each two 16-byte rows.
  31 sixtap_filter_hb_m:  times 8 db   1, -5 ; row +0: outer tap pair, used for two of the three pixel pairs
  32                      times 8 db  52, 20 ; row +16: centre tap pair
  33                      ; multiplied by 2 to have the same shift (taps of every set then sum to 64, i.e. >> 6)
  34                      times 8 db   2, -10
  35                      times 8 db  40,  40
  36                      ; back to normal
  37                      times 8 db   1, -5
  38                      times 8 db  20, 52 ; same as the first set with the centre taps swapped
  39 ; Word coefficients for the pmullw (MMX/SSE2) paths: three filter sets, each four 16-byte rows read as COEFF05/COEFF14/COEFF2/COEFF3.
  40 sixtap_filter_v_m:   times 8 dw   1 ; +0:  COEFF05, applied to the sum of the two outermost taps
  41                      times 8 dw  -5 ; +16: COEFF14, applied to the sum of the two next-inner taps
  42                      times 8 dw  52 ; +32: COEFF2
  43                      times 8 dw  20 ; +48: COEFF3
  44                      ; multiplied by 2 to have the same shift (taps of every set then sum to 64, i.e. >> 6)
  45                      times 8 dw   2
  46                      times 8 dw -10
  47                      times 8 dw  40
  48                      times 8 dw  40
  49                      ; back to normal
  50                      times 8 dw   1
  51                      times 8 dw  -5
  52                      times 8 dw  20
  53                      times 8 dw  52
  54 ; Second argument handed to LOAD: the table symbol itself, or picregq when PIC builds load the table address into a register first.
  55 %ifdef PIC ; table base lives in picregq (set with lea at function entry); one extra GPR is requested via npicregs
  56 %define sixtap_filter_hw   picregq
  57 %define sixtap_filter_hb   picregq
  58 %define sixtap_filter_v    picregq
  59 %define npicregs 1
  60 %else ; NOTE(review): sixtap_filter_hw is never referenced and no sixtap_filter_hw_m label exists in the visible file
  61 %define sixtap_filter_hw   sixtap_filter_hw_m
  62 %define sixtap_filter_hb   sixtap_filter_hb_m
  63 %define sixtap_filter_v    sixtap_filter_v_m
  64 %define npicregs 0
  65 %endif
  66 ; pshufb masks for the SSSE3 horizontal filter; indices are relative to the 16 bytes loaded from srcq-2, one byte pair per output pixel.
  67 filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5,  5, 6,  6,  7,  7,  8 ; pairs (p[x-2], p[x-1])
  68 filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7,  7, 8,  8,  9,  9, 10 ; pairs (p[x],   p[x+1])
  69 filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11 ; pairs (p[x+3], p[x+2]), reversed so the same coefficient row as shuf1 applies
  70
  71 cextern  pw_32
  72 cextern  pw_16
  73 cextern  pw_512
  74
  75 SECTION .text
  76
  77 ;-----------------------------------------------------------------------------
  78 ; subpel MC functions:
  79 ;
  80 ; void ff_[put|avg]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int deststride,
  81 ;                                          uint8_t *src, int srcstride,
  82 ;                                          int len, int m);
  83 ;----------------------------------------------------------------------
  84 %macro LOAD  2 ; %1 = name of the mx/my argument, %2 = coefficient table (symbol, or picregq under PIC); result: %1q = table base + %1
  85 %if WIN64
  86    movsxd   %1q, %1d ; sign-extend the 32-bit argument before using it as a 64-bit offset (only done on WIN64 here)
  87 %endif
  88 %ifdef PIC
  89    add      %1q, picregq ; table address was loaded into picregq by the caller of this macro
  90 %else
  91    add      %1q, %2
  92 %endif
  93 %endmacro
  94 ; STORE: %1 = register holding filtered words, %2 = scratch register (clobbered for avg), %3 = put or avg. Writes to [dstq].
  95 %macro STORE 3
  96 %ifidn %3, avg
  97     movh      %2, [dstq] ; avg only: fetch the pixels already in dst
  98 %endif
  99     packuswb  %1, %1 ; words -> bytes with unsigned saturation (clip to 0..255)
 100 %ifidn %3, avg
 101     PAVGB     %1, %2 ; avg only: byte average of new and existing pixels (PAVGB comes from the included x86util)
 102 %endif
 103     movh  [dstq], %1 ; movh comes from the included headers; presumably stores half a vector register - confirm in x86inc
 104 %endmacro
 105 ; FILTER_V: vertical 6-tap qpel filter using 16-bit multiplies; %1 = put or avg (forwarded to STORE).
 106 %macro FILTER_V 1
 107 cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg
 108 %ifdef PIC
 109     lea  picregq, [sixtap_filter_v_m]
 110 %endif
 111     pxor      m7, m7 ; m7 = 0, used to zero-extend bytes to words
 112     LOAD      my, sixtap_filter_v ; myq = coefficient table base + my (my is used as a byte offset into the table)
 113 ; m0..m4 are primed with rows -2..+2 relative to the first output row; the loop fetches one new row per iteration into m5
 114     ; read 5 lines
 115     sub     srcq, srcstrideq
 116     sub     srcq, srcstrideq ; srcq -> row -2
 117     movh      m0, [srcq]
 118     movh      m1, [srcq+srcstrideq]
 119     movh      m2, [srcq+srcstrideq*2]
 120     lea     srcq, [srcq+srcstrideq*2]
 121     add     srcq, srcstrideq ; srcq -> row +1
 122     movh      m3, [srcq]
 123     movh      m4, [srcq+srcstrideq]
 124     punpcklbw m0, m7
 125     punpcklbw m1, m7
 126     punpcklbw m2, m7
 127     punpcklbw m3, m7
 128     punpcklbw m4, m7
 129 ; keep the four coefficient rows in registers when m8..m11 exist, otherwise use them as memory operands
 130 %ifdef m8
 131     mova      m8, [myq+ 0]
 132     mova      m9, [myq+16]
 133     mova     m10, [myq+32]
 134     mova     m11, [myq+48]
 135 %define COEFF05  m8
 136 %define COEFF14  m9
 137 %define COEFF2   m10
 138 %define COEFF3   m11
 139 %else
 140 %define COEFF05  [myq+ 0]
 141 %define COEFF14  [myq+16]
 142 %define COEFF2   [myq+32]
 143 %define COEFF3   [myq+48]
 144 %endif
 145 .nextrow: ; m6 accumulates COEFF05*(m0+m5) + COEFF14*(m1+m4) + COEFF2*m2 + COEFF3*m3 + 32
 146     mova      m6, m1
 147     movh      m5, [srcq+2*srcstrideq]      ; read new row
 148     paddw     m6, m4
 149     punpcklbw m5, m7
 150     pmullw    m6, COEFF14
 151     paddw     m0, m5
 152     pmullw    m0, COEFF05
 153     paddw     m6, m0
 154     mova      m0, m1 ; start rotating the row window down by one row (m0 <- m1)
 155     paddw     m6, [pw_32] ; rounding term for the >> 6 below
 156     mova      m1, m2 ; save row before pmullw overwrites m2
 157     pmullw    m2, COEFF2
 158     paddw     m6, m2
 159     mova      m2, m3 ; save row before pmullw overwrites m3
 160     pmullw    m3, COEFF3
 161     paddw     m6, m3
 162
 163     ; round/clip/store
 164     mova      m3, m4
 165     psraw     m6, 6
 166     mova      m4, m5 ; window rotation complete; m5 is now free to serve as STORE scratch
 167     STORE     m6, m5, %1
 168
 169     ; go to next line
 170     add     dstq, dststrideq
 171     add     srcq, srcstrideq
 172     dec  heightd                           ; next row
 173     jg .nextrow
 174     REP_RET
 175 %endmacro
 176 ; FILTER_H: horizontal 6-tap qpel filter using 16-bit multiplies; %1 = put or avg (forwarded to STORE).
 177 %macro FILTER_H  1
 178 cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg
 179 %ifdef PIC
 180     lea  picregq, [sixtap_filter_v_m] ; same word-coefficient table as the vertical filter
 181 %endif
 182     pxor      m7, m7 ; m7 = 0, used to zero-extend bytes to words
 183     LOAD      mx, sixtap_filter_v ; mxq = coefficient table base + mx (mx is used as a byte offset into the table)
 184     mova      m6, [pw_32] ; rounding term for the >> 6 below, kept in a register for the whole loop
 185 %ifdef m8
 186     mova      m8, [mxq+ 0]
 187     mova      m9, [mxq+16]
 188     mova     m10, [mxq+32]
 189     mova     m11, [mxq+48]
 190 %define COEFF05  m8
 191 %define COEFF14  m9
 192 %define COEFF2   m10
 193 %define COEFF3   m11
 194 %else
 195 %define COEFF05  [mxq+ 0]
 196 %define COEFF14  [mxq+16]
 197 %define COEFF2   [mxq+32]
 198 %define COEFF3   [mxq+48]
 199 %endif
 200 .nextrow: ; six loads at byte offsets -2..+3 give the six taps for every output pixel of the row
 201     movq      m0, [srcq-2]
 202     movq      m5, [srcq+3]
 203     movq      m1, [srcq-1]
 204     movq      m4, [srcq+2]
 205     punpcklbw m0, m7
 206     punpcklbw m5, m7
 207     punpcklbw m1, m7
 208     punpcklbw m4, m7
 209     movq      m2, [srcq-0]
 210     movq      m3, [srcq+1]
 211     paddw     m0, m5 ; p[-2] + p[+3], both weighted by COEFF05
 212     paddw     m1, m4 ; p[-1] + p[+2], both weighted by COEFF14
 213     punpcklbw m2, m7
 214     punpcklbw m3, m7
 215     pmullw    m0, COEFF05
 216     pmullw    m1, COEFF14
 217     pmullw    m2, COEFF2
 218     pmullw    m3, COEFF3
 219     paddw     m0, m6
 220     paddw     m1, m2
 221     paddw     m0, m3
 222     paddw     m0, m1
 223     psraw     m0, 6
 224     STORE     m0, m1, %1 ; m1 is dead here and serves as STORE scratch
 225
 226     ; go to next line
 227     add     dstq, dststrideq
 228     add     srcq, srcstrideq
 229     dec  heightd            ; next row
 230     jg .nextrow
 231     REP_RET
 232 %endmacro
 233 ; Instantiate the pmullw-based filters. The 64-bit-register (MMX-family) versions are only built for 32-bit x86.
 234 %if ARCH_X86_32
 235 INIT_MMX  mmx
 236 FILTER_V  put
 237 FILTER_H  put
 238 ; avg is only built for mmxext and 3dnow, not plain mmx - presumably because PAVGB needs one of them
 239 INIT_MMX  mmxext
 240 FILTER_V  avg
 241 FILTER_H  avg
 242
 243 INIT_MMX  3dnow
 244 FILTER_V  avg
 245 FILTER_H  avg
 246 %endif
 247 ; SSE2 versions are built for every target
 248 INIT_XMM  sse2
 249 FILTER_H  put
 250 FILTER_H  avg
 251 FILTER_V  put
 252 FILTER_V  avg
 253 ; FILTER_SSSE3: emits both the vertical and the horizontal 6-tap qpel function using pmaddubsw on byte pairs; %1 = put or avg.
 254 %macro FILTER_SSSE3 1
 255 cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg
 256 %ifdef PIC
 257     lea  picregq, [sixtap_filter_hb_m]
 258 %endif
 259 ; m0..m4 are primed with rows -2..+2 (kept as bytes); srcq ends up pointing at row +3, the first row the loop reads
 260     ; read 5 lines
 261     sub     srcq, srcstrideq
 262     LOAD      my, sixtap_filter_hb ; myq = byte-pair coefficient table base + my
 263     sub     srcq, srcstrideq
 264     movh      m0, [srcq]
 265     movh      m1, [srcq+srcstrideq]
 266     movh      m2, [srcq+srcstrideq*2]
 267     lea     srcq, [srcq+srcstrideq*2]
 268     add     srcq, srcstrideq
 269     mova      m5, [myq] ; outer coefficient pair, used for rows (-2,-1) and (+3,+2)
 270     movh      m3, [srcq]
 271     movh      m4, [srcq+srcstrideq]
 272     lea     srcq, [srcq+2*srcstrideq]
 273
 274 .nextrow:
 275     mova      m6, m2
 276     punpcklbw m0, m1 ; interleave two rows so pmaddubsw multiplies and adds one pixel from each
 277     punpcklbw m6, m3
 278     pmaddubsw m0, m5
 279     pmaddubsw m6, [myq+16] ; centre coefficient pair
 280     movh      m7, [srcq]      ; read new row
 281     paddw     m6, m0
 282     mova      m0, m1 ; rotate the row window down by one row
 283     mova      m1, m2
 284     mova      m2, m3
 285     mova      m3, m4
 286     mova      m4, m7
 287     punpcklbw m7, m3 ; pair (new row, row above it): the new row takes the first coefficient of m5
 288     pmaddubsw m7, m5
 289     paddw     m6, m7
 290     pmulhrsw  m6, [pw_512] ; pw_512 is external; assuming words of 512 this is (x + 32) >> 6 in one instruction
 291     STORE     m6, m7, %1
 292
 293     ; go to next line
 294     add     dstq, dststrideq
 295     add     srcq, srcstrideq
 296     dec       heightd                          ; next row
 297     jg       .nextrow
 298     REP_RET
 299 ; horizontal variant: one unaligned 16-byte load per row, three shuffles build the three tap pairs
 300 cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg
 301 %ifdef PIC
 302     lea  picregq, [sixtap_filter_hb_m]
 303 %endif
 304     mova      m3, [filter_h6_shuf2]
 305     mova      m4, [filter_h6_shuf3]
 306     LOAD      mx, sixtap_filter_hb ; mxq = byte-pair coefficient table base + mx
 307     mova      m5, [mxq] ; set up 6tap filter in bytes
 308     mova      m6, [mxq+16]
 309     mova      m7, [filter_h6_shuf1]
 310
 311 .nextrow:
 312     movu      m0, [srcq-2]
 313     mova      m1, m0
 314     mova      m2, m0
 315     pshufb    m0, m7 ; pairs (p[x-2], p[x-1])
 316     pshufb    m1, m3 ; pairs (p[x],   p[x+1])
 317     pshufb    m2, m4 ; pairs (p[x+3], p[x+2])
 318     pmaddubsw m0, m5
 319     pmaddubsw m1, m6
 320     pmaddubsw m2, m5
 321     paddw     m0, m1
 322     paddw     m0, m2
 323     pmulhrsw  m0, [pw_512] ; pw_512 is external; assuming words of 512 this is (x + 32) >> 6 in one instruction
 324     STORE     m0, m1, %1
 325
 326     ; go to next line
 327     add     dstq, dststrideq
 328     add     srcq, srcstrideq
 329     dec  heightd            ; next row
 330     jg .nextrow
 331     REP_RET
 332 %endmacro
 333 ; Instantiate the SSSE3 vertical+horizontal pair once for put and once for avg.
 334 INIT_XMM ssse3
 335 FILTER_SSSE3  put
 336 FILTER_SSSE3  avg
 337 ; RV40_WCORE: weight and store one vector of pixels. Expects (from RV40_WEIGHT): m0 = 0, m1 = bias, m2/m3 = splatted weights, r5 = stride, r6 = byte offset.
 338 ; %1=5-bit weights?, %2=dst %3=src1 %4=src2 %5=stride if SSE2
 339 %macro RV40_WCORE  4-5
 340     movh       m4, [%3 + r6 + 0]
 341     movh       m5, [%4 + r6 + 0]
 342 %if %0 == 4
 343 %define OFFSET r6 + mmsize / 2
 344 %else
 345     ; 8x8 block and SSE2, stride was provided
 346 %define OFFSET r6
 347     add        r6, r5 ; second half of the vector comes from the next row
 348 %endif
 349     movh       m6, [%3 + OFFSET]
 350     movh       m7, [%4 + OFFSET]
 351
 352 %if %1 == 0
 353     ; 14-bit weights
 354     punpcklbw  m4, m0
 355     punpcklbw  m5, m0
 356     punpcklbw  m6, m0
 357     punpcklbw  m7, m0
 358 ; pre-shift by 7 so that pmulhw (high 16 bits of the product) yields (pixel * weight) >> 9
 359     psllw      m4, 7
 360     psllw      m5, 7
 361     psllw      m6, 7
 362     psllw      m7, 7
 363     pmulhw     m4, m3 ; src1 pixels are scaled by m3 (built from r4d, i.e. w2)
 364     pmulhw     m5, m2 ; src2 pixels are scaled by m2 (built from r3d, i.e. w1)
 365     pmulhw     m6, m3
 366     pmulhw     m7, m2
 367
 368     paddw      m4, m5
 369     paddw      m6, m7
 370 %else
 371     ; 5-bit weights
 372 %if cpuflag(ssse3)
 373     punpcklbw  m4, m5 ; interleave src1/src2 bytes to match the byte-pair weights RV40_WEIGHT packed into m3
 374     punpcklbw  m6, m7
 375
 376     pmaddubsw  m4, m3
 377     pmaddubsw  m6, m3
 378 %else
 379     punpcklbw  m4, m0
 380     punpcklbw  m5, m0
 381     punpcklbw  m6, m0
 382     punpcklbw  m7, m0
 383
 384     pmullw     m4, m3
 385     pmullw     m5, m2
 386     pmullw     m6, m3
 387     pmullw     m7, m2
 388     paddw      m4, m5
 389     paddw      m6, m7
 390 %endif
 391
 392 %endif
 393
 394     ; bias and shift down
 395 %if cpuflag(ssse3)
 396     pmulhrsw   m4, m1 ; m1 = pw_1024 here: rounds and shifts right by 5 in one instruction
 397     pmulhrsw   m6, m1
 398 %else
 399     paddw      m4, m1 ; m1 = pw_16 here (external constant, presumably words of 16)
 400     paddw      m6, m1
 401     psrlw      m4, 5
 402     psrlw      m6, 5
 403 %endif
 404
 405     packuswb   m4, m6 ; low half = first group of pixels, high half = second group
 406 %if %0 == 5
 407     ; Only called for 8x8 blocks and SSE2
 408     sub        r6, r5 ; step back to the first of the two rows
 409     movh       [%2 + r6], m4
 410     add        r6, r5 ; leave r6 on the second row, as it was after the loads
 411     movhps     [%2 + r6], m4
 412 %else
 413     mova       [%2 + r6], m4
 414 %endif
 415 %endmacro
 416
 417 ; MAIN_LOOP: one loop iteration of the weight functions. %1 = block size (8 or 16), %2 = 5-bit-weights flag passed to RV40_WCORE.
 418 %macro MAIN_LOOP   2
 419 %if mmsize == 8
 420     RV40_WCORE %2, r0, r1, r2
 421 %if %1 == 16
 422     RV40_WCORE %2, r0 + 8, r1 + 8, r2 + 8
 423 %endif
 424
 425     ; Prepare for next loop
 426     add        r6, r5 ; flags from this add are consumed by the jnz that follows MAIN_LOOP in RV40_WEIGHT
 427 %else
 428 %ifidn %1, 8
 429     RV40_WCORE %2, r0, r1, r2, r5
 430     ; Prepare 2 next lines
 431     add        r6, r5 ; RV40_WCORE already advanced r6 by one row; flags feed the jnz in RV40_WEIGHT
 432 %else
 433     RV40_WCORE %2, r0, r1, r2
 434     ; Prepare single next line
 435     add        r6, r5 ; flags from this add are consumed by the jnz that follows MAIN_LOOP in RV40_WEIGHT
 436 %endif
 437 %endif
 438
 439 %endmacro
 440
 441 ; void ff_rv40_weight_func_%1_%2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
 442 ; %1=rnd|nornd  %2=block size (8 or 16)  %3=log2(%2), the shift that turns stride into the byte count of the whole block
 443 ; The weights are FP0.14 notation of fractions depending on pts.
 444 ; For timebases without rounding error (i.e. PAL), the fractions
 445 ; can be simplified, and several operations can be avoided.
 446 ; The rnd variant takes the 14-bit-weight path, the nornd variant the 5-bit-weight path.
 447 ; NOTE(review): the "multiples of 2^9" check that allows those simplifications is not performed in this file - presumably the caller picks the variant.
 448 %macro RV40_WEIGHT  3
 449 cglobal rv40_weight_func_%1_%2, 6, 7, 8
 450 %if cpuflag(ssse3)
 451     mova       m1, [pw_1024]
 452 %else
 453     mova       m1, [pw_16]
 454 %endif
 455     pxor       m0, m0 ; m0 = 0 for byte -> word unpacking in RV40_WCORE
 456     ; Set loop counter and increments
 457     mov        r6, r5
 458     shl        r6, %3 ; r6 = stride << log2(size) = bytes spanned by the whole block
 459     add        r0, r6 ; point dst/src1/src2 past the block ...
 460     add        r1, r6
 461     add        r2, r6
 462     neg        r6 ; ... and index them with a negative offset that counts up to zero
 463
 464     movd       m2, r3d ; m2 <- w1
 465     movd       m3, r4d ; m3 <- w2
 466 %ifidn %1,rnd
 467 %define  RND   0
 468     SPLATW     m2, m2
 469 %else
 470 %define  RND   1
 471 %if cpuflag(ssse3)
 472     punpcklbw  m3, m2 ; pack the low bytes of w2 and w1 into one word for pmaddubsw
 473 %else
 474     SPLATW     m2, m2
 475 %endif
 476 %endif
 477     SPLATW     m3, m3
 478 ; RND is handed to RV40_WCORE as its "5-bit weights?" flag: 0 selects the 14-bit path, 1 the 5-bit path
 479 .loop:
 480     MAIN_LOOP  %2, RND
 481     jnz        .loop ; loop until the final add r6, r5 inside MAIN_LOOP brings r6 to zero
 482     REP_RET
 483 %endmacro
 484 ; Instantiate the weight functions; arguments are variant, block size, log2(block size).
 485 INIT_MMX mmxext
 486 RV40_WEIGHT   rnd,    8, 3
 487 RV40_WEIGHT   rnd,   16, 4
 488 RV40_WEIGHT   nornd,  8, 3
 489 RV40_WEIGHT   nornd, 16, 4
 490 ; with XMM registers the size-8 variants process two rows per iteration (see MAIN_LOOP)
 491 INIT_XMM sse2
 492 RV40_WEIGHT   rnd,    8, 3
 493 RV40_WEIGHT   rnd,   16, 4
 494 RV40_WEIGHT   nornd,  8, 3
 495 RV40_WEIGHT   nornd, 16, 4
 496 ; SSSE3 uses pmulhrsw for the bias+shift and pmaddubsw for the 5-bit (nornd) weights
 497 INIT_XMM ssse3
 498 RV40_WEIGHT   rnd,    8, 3
 499 RV40_WEIGHT   rnd,   16, 4
 500 RV40_WEIGHT   nornd,  8, 3
 501 RV40_WEIGHT   nornd, 16, 4