git.sesse.net Git - x264/blob - common/x86/predict-a.asm

   1 ;*****************************************************************************
   2 ;* predict-a.asm: h264 encoder library
   3 ;*****************************************************************************
   4 ;* Copyright (C) 2005-2008 x264 project
   5 ;*
   6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
   7 ;*          Holger Lubitz <holger@lubitz.org>
   8 ;*          Fiona Glaser <fiona@x264.com>
   9 ;*
  10 ;* This program is free software; you can redistribute it and/or modify
  11 ;* it under the terms of the GNU General Public License as published by
  12 ;* the Free Software Foundation; either version 2 of the License, or
  13 ;* (at your option) any later version.
  14 ;*
  15 ;* This program is distributed in the hope that it will be useful,
  16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18 ;* GNU General Public License for more details.
  19 ;*
  20 ;* You should have received a copy of the GNU General Public License
  21 ;* along with this program; if not, write to the Free Software
  22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  23 ;*****************************************************************************
  24
  25 %include "x86inc.asm"
  26 %include "x86util.asm"
  27
  28 %macro STORE8x8 2
     ; STORE8x8 a, b
     ; Writes an 8-byte-wide, 8-row block: operand %1 goes to rows 0-3 and
     ; operand %2 to rows 4-7 (rows are FDEC_STRIDE bytes apart).
     ; On entry r0 must point at row 0.  r0 is advanced by 4*FDEC_STRIDE first,
     ; so the rows are addressed as -4..+3 relative to the new r0 (presumably
     ; to keep the displacements small); r0 is left advanced on exit.
  29     add r0, 4*FDEC_STRIDE
  30     movq        [r0 + -4*FDEC_STRIDE], %1
  31     movq        [r0 + -3*FDEC_STRIDE], %1
  32     movq        [r0 + -2*FDEC_STRIDE], %1
  33     movq        [r0 + -1*FDEC_STRIDE], %1
  34     movq        [r0 +  0*FDEC_STRIDE], %2
  35     movq        [r0 +  1*FDEC_STRIDE], %2
  36     movq        [r0 +  2*FDEC_STRIDE], %2
  37     movq        [r0 +  3*FDEC_STRIDE], %2
  38 %endmacro
  39
  40 %macro STORE16x16 2
     ; STORE16x16 a, b
     ; Fills 16 rows of 16 bytes: %1 is stored to bytes 0-7 and %2 to bytes
     ; 8-15 of every row.  Four rows are written per iteration, four iterations.
     ; Side effects: r1d is used as the loop counter, flags are modified, and
     ; r0 ends up advanced by 16*FDEC_STRIDE.
     ; The macro emits the local label .loop, so the function that invokes it
     ; cannot define its own .loop.
  41     mov         r1d, 4
  42 .loop:
  43     movq        [r0 + 0*FDEC_STRIDE], %1
  44     movq        [r0 + 1*FDEC_STRIDE], %1
  45     movq        [r0 + 2*FDEC_STRIDE], %1
  46     movq        [r0 + 3*FDEC_STRIDE], %1
  47     movq        [r0 + 0*FDEC_STRIDE + 8], %2
  48     movq        [r0 + 1*FDEC_STRIDE + 8], %2
  49     movq        [r0 + 2*FDEC_STRIDE + 8], %2
  50     movq        [r0 + 3*FDEC_STRIDE + 8], %2
  51     add         r0, 4*FDEC_STRIDE
  52     dec         r1d
  53     jg          .loop
  54 %endmacro
  55
  56 %macro STORE16x16_SSE2 1
     ; STORE16x16_SSE2 x
     ; Stores the 16-byte operand %1 to 16 consecutive rows, fully unrolled.
     ; movdqa is an aligned store, so every row address (r0 + k*FDEC_STRIDE)
     ; has to be 16-byte aligned.
     ; r0 is advanced twice (by 4 and then 8 rows) so each half of the block is
     ; addressed as -4..+3; r0 is left at its entry value + 12*FDEC_STRIDE.
  57     add r0, 4*FDEC_STRIDE
  58     movdqa      [r0 + -4*FDEC_STRIDE], %1
  59     movdqa      [r0 + -3*FDEC_STRIDE], %1
  60     movdqa      [r0 + -2*FDEC_STRIDE], %1
  61     movdqa      [r0 + -1*FDEC_STRIDE], %1
  62     movdqa      [r0 +  0*FDEC_STRIDE], %1
  63     movdqa      [r0 +  1*FDEC_STRIDE], %1
  64     movdqa      [r0 +  2*FDEC_STRIDE], %1
  65     movdqa      [r0 +  3*FDEC_STRIDE], %1
  66     add r0, 8*FDEC_STRIDE
  67     movdqa      [r0 + -4*FDEC_STRIDE], %1
  68     movdqa      [r0 + -3*FDEC_STRIDE], %1
  69     movdqa      [r0 + -2*FDEC_STRIDE], %1
  70     movdqa      [r0 + -1*FDEC_STRIDE], %1
  71     movdqa      [r0 +  0*FDEC_STRIDE], %1
  72     movdqa      [r0 +  1*FDEC_STRIDE], %1
  73     movdqa      [r0 +  2*FDEC_STRIDE], %1
  74     movdqa      [r0 +  3*FDEC_STRIDE], %1
  75 %endmacro
  76
  77 SECTION_RODATA
  78
     ; Constant tables.  Naming: pb_ = packed bytes, pw_ = packed 16-bit words.
     ; The table starts 16-byte aligned and the entry sizes below keep pw_8
     ; (offset 48) and pw_ff00 (offset 96) on 16-byte boundaries; pw_ff00 is
     ; loaded with movdqa in predict_8x8_vr_sse2, so inserting or resizing an
     ; entry above it can break that load.
  79 ALIGN 16
     ; pb_1: rounding-correction mask used by PRED8x8_LOWPASS0.
  80 pb_1:       times 16 db 1
     ; pb_3 and pw_2 are not referenced in the part of the file shown here.
  81 pb_3:       times 16 db 3
  82 pw_2:       times 4 dw 2
     ; pw_4 / pw_8: rounding terms for the 8-sample and 16-sample DC averages.
  83 pw_4:       times 4 dw 4
  84 pw_8:       times 8 dw 8
     ; Two labels on the same data: pw_3210 names the first four words
     ; (0..3), pw_76543210 names all eight words (0..7).
  85 pw_76543210:
  86 pw_3210:    dw 0, 1, 2, 3, 4, 5, 6, 7
     ; pb_00s_ff and pb_0s_ff are contiguous: read from pb_00s_ff there are
     ; 15 zero bytes followed by 0xff; read from pb_0s_ff, 7 zero bytes
     ; followed by 0xff.
  87 pb_00s_ff:  times 8 db 0
  88 pb_0s_ff:   times 7 db 0
  89             db 0xff
     ; pw_ff00: mask selecting the high byte of every word.
  90 pw_ff00:    times 8 dw 0xff00
     ; pb_reverse: pshufb control that reverses the order of 8 bytes.
  91 pb_reverse: db 7, 6, 5, 4, 3, 2, 1, 0
  92
  93 SECTION .text
  94
  95 ; PRED8x8_LOWPASS0 dest, left, right, src, tmp, mov-suffix
  96 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
     ; Per-byte 3-tap (1,2,1) filter built from two rounding averages:
     ;   pavgb(left,right) rounds up, so the bit (left^right)&1 is subtracted
     ;   to get floor((left+right)/2); averaging that with src then gives the
     ;   exact formula above.
     ; %2 = t[n-1], %3 = t[n+1], %4 = t[n]; %6 selects the register move
     ; (q for MMX, dqa for XMM).
     ; Clobbers %2, %3 and %5.  %4 is only read (callers also pass memory
     ; operands for it).  %1 must be a different register from %2.
  97 %macro PRED8x8_LOWPASS0 6
  98     mov%6       %5, %2
  99     pavgb       %2, %3
  100     pxor        %3, %5
  101     mov%6       %1, %4
  102     pand        %3, [pb_1 GLOBAL]
  103     psubusb     %2, %3
  104     pavgb       %1, %2
  105 %endmacro
  106 %macro PRED8x8_LOWPASS 5
      ; MMX form of the low-pass filter: dest, left, right, src, tmp
      ; (register copies use movq).
  107     PRED8x8_LOWPASS0 %1, %2, %3, %4, %5, q
  108 %endmacro
  109 %macro PRED8x8_LOWPASS_XMM 5
      ; XMM form of the low-pass filter: same operands, register copies use
      ; movdqa (so a memory operand passed as src must be 16-byte aligned).
  110     PRED8x8_LOWPASS0 %1, %2, %3, %4, %5, dqa
  111 %endmacro
 112
  113 %macro LOAD_PLANE_ARGS 0
      ; Broadcasts the low 16 bits of arguments 1, 2 and 3 into all four words
      ; of mm0, mm2 and mm4 respectively.  For the *_p_core functions below
      ; (src, i00, b, c) that gives mm0 = i00, mm2 = b, mm4 = c.
      ; x86-64 branch: the values are taken from registers r1d..r3d.
      ; Other branch: pshufw reads them directly from r1m..r3m (presumably the
      ; in-memory argument slots defined by x86inc.asm -- not shown here).
  114 %ifdef ARCH_X86_64
  115     movd        mm0, r1d
  116     movd        mm2, r2d
  117     movd        mm4, r3d
  118     pshufw      mm0, mm0, 0
  119     pshufw      mm2, mm2, 0
  120     pshufw      mm4, mm4, 0
  121 %else
  122     pshufw      mm0, r1m, 0
  123     pshufw      mm2, r2m, 0
  124     pshufw      mm4, r3m, 0
  125 %endif
  126 %endmacro
 127
  128 ;-----------------------------------------------------------------------------
  129 ; void predict_4x4_ddl_mmxext( uint8_t *src )
      ; Fills the 4x4 block at src from the 8 bytes t0..t7 of the row above it
      ; (src - FDEC_STRIDE).  Each output byte is the (1,2,1) low-pass of three
      ; consecutive top samples; row Y holds filtered bytes Y+1 .. Y+4.
  130 ;-----------------------------------------------------------------------------
  131 cglobal predict_4x4_ddl_mmxext, 1,1
  132     movq    mm1, [r0-FDEC_STRIDE]
  133     movq    mm2, mm1
  134     movq    mm3, mm1
  135     movq    mm4, mm1
      ; mm1 = samples moved up one byte position (byte n holds t[n-1])
  136     psllq   mm1, 8
      ; The xor/shift/xor sequence builds mm3 with byte n = t[n+1] for n < 7
      ; and byte 7 = t7, i.e. the last sample is duplicated instead of
      ; shifting in a zero.
  137     pxor    mm2, mm1
  138     psrlq   mm2, 8
  139     pxor    mm3, mm2
  140     PRED8x8_LOWPASS mm0, mm1, mm3, mm4, mm5
  141
      ; Shift one byte further before every row, then store 4 bytes.
  142 %assign Y 0
  143 %rep 4
  144     psrlq       mm0, 8
  145     movd        [r0+Y*FDEC_STRIDE], mm0
  146 %assign Y (Y+1)
  147 %endrep
  148
  149     RET
 150
  151 ;-----------------------------------------------------------------------------
  152 ; void predict_4x4_ddr_mmxext( uint8_t *src )
      ; PREDICT_4x4 suffix
      ; Emits three 4x4 predictors named predict_4x4_{ddr,vr,hd}_<suffix>.
      ; All of them read the row above src and the column left of src, and
      ; write four 4-byte rows at src.  The byte-alignment step goes through
      ; the PALIGNR macro, which must be %define'd to the wanted implementation
      ; before the macro is instantiated (see the end of this block).  The last
      ; PALIGNR argument is a register handed over as scratch (semantics are
      ; defined in x86util.asm, not shown here).
  153 ;-----------------------------------------------------------------------------
  154 %macro PREDICT_4x4 1
  155 cglobal predict_4x4_ddr_%1, 1,1
      ; The qword loads at [row-8] put the sample left of that row in the top
      ; byte; the unpack/PALIGNR steps collect left column, top-left and top
      ; row into one vector and two one-byte-shifted copies of it.
  156     movq      mm1, [r0+1*FDEC_STRIDE-8]
  157     movq      mm2, [r0+0*FDEC_STRIDE-8]
  158     punpckhbw mm2, [r0-1*FDEC_STRIDE-8]
  159     movd      mm3, [r0-1*FDEC_STRIDE]
  160     punpckhwd mm1, mm2
  161     PALIGNR   mm3, mm1, 5, mm1
  162     movq      mm1, mm3
  163     PALIGNR   mm3, [r0+2*FDEC_STRIDE-8], 7, mm4
  164     movq      mm2, mm3
  165     PALIGNR   mm3, [r0+3*FDEC_STRIDE-8], 7, mm4
  166     PRED8x8_LOWPASS mm0, mm3, mm1, mm2, mm4
      ; Bottom row first; every row above is the same vector one byte later.
  167 %assign Y 3
  168     movd    [r0+Y*FDEC_STRIDE], mm0
  169 %rep 3
  170 %assign Y (Y-1)
  171     psrlq    mm0, 8
  172     movd    [r0+Y*FDEC_STRIDE], mm0
  173 %endrep
  174     RET
  175
  176 cglobal predict_4x4_vr_%1, 1,1
      ; mm7 = rounding average of adjacent top samples (row 0);
      ; mm3 = low-pass of the combined left/top-left/top vector (row 1).
      ; Rows 2 and 3 are rows 0 and 1 with one extra byte shifted in from the
      ; low-pass result.
  177     movd    mm0, [r0-1*FDEC_STRIDE]              ; ........t3t2t1t0
  178     movq    mm7, mm0
  179     PALIGNR mm0, [r0-1*FDEC_STRIDE-8], 7, mm1    ; ......t3t2t1t0lt
  180     pavgb   mm7, mm0
  181     PALIGNR mm0, [r0+0*FDEC_STRIDE-8], 7, mm1    ; ....t3t2t1t0ltl0
  182     movq    mm1, mm0
  183     PALIGNR mm0, [r0+1*FDEC_STRIDE-8], 7, mm2    ; ..t3t2t1t0ltl0l1
  184     movq    mm2, mm0
  185     PALIGNR mm0, [r0+2*FDEC_STRIDE-8], 7, mm3    ; t3t2t1t0ltl0l1l2
  186     PRED8x8_LOWPASS mm3, mm1, mm0, mm2, mm4
  187     movq    mm1, mm3
  188     psrlq   mm3, 16
  189     psllq   mm1, 48
  190     movd   [r0+0*FDEC_STRIDE], mm7
  191     movd   [r0+1*FDEC_STRIDE], mm3
  192     PALIGNR mm7, mm1, 7, mm2
  193     psllq   mm1, 8
  194     movd   [r0+2*FDEC_STRIDE], mm7
  195     PALIGNR mm3, mm1, 7, mm1
  196     movd   [r0+3*FDEC_STRIDE], mm3
  197     RET
  198
  199 cglobal predict_4x4_hd_%1, 1,1
      ; Builds the vector shown in the comments below, then interleaves the
      ; rounding averages (mm7) with the low-pass values (mm3) byte by byte.
  200     movd      mm0, [r0-1*FDEC_STRIDE-4] ; lt ..
  201     punpckldq mm0, [r0-1*FDEC_STRIDE]   ; t3 t2 t1 t0 lt .. .. ..
  202     psllq     mm0, 8                    ; t2 t1 t0 lt .. .. .. ..
  203     movq      mm1, [r0+3*FDEC_STRIDE-8] ; l3
  204     punpckhbw mm1, [r0+2*FDEC_STRIDE-8] ; l2 l3
  205     movq      mm2, [r0+1*FDEC_STRIDE-8] ; l1
  206     punpckhbw mm2, [r0+0*FDEC_STRIDE-8] ; l0 l1
  207     punpckhwd mm1, mm2                  ; l0 l1 l2 l3
  208     punpckhdq mm1, mm0                  ; t2 t1 t0 lt l0 l1 l2 l3
  209     movq      mm0, mm1
  210     movq      mm2, mm1
  211     movq      mm7, mm1
  212     psrlq     mm0, 16                   ; .. .. t2 t1 t0 lt l0 l1
  213     psrlq     mm2, 8                    ; .. t2 t1 t0 lt l0 l1 l2
  214     pavgb     mm7, mm2
  215     PRED8x8_LOWPASS mm3, mm1, mm0, mm2, mm4
  216     punpcklbw mm7, mm3
  217     psrlq     mm3, 32
  218     PALIGNR   mm3, mm7, 6, mm6
      ; Rows 3, 2, 1 step through the interleaved vector two bytes at a time;
      ; row 0 comes from mm3, which was combined with mm7 by the PALIGNR above.
  219 %assign Y 3
  220     movd     [r0+Y*FDEC_STRIDE], mm7
  221 %rep 2
  222 %assign Y (Y-1)
  223     psrlq     mm7, 16
  224     movd     [r0+Y*FDEC_STRIDE], mm7
  225 %endrep
  226     movd     [r0+0*FDEC_STRIDE], mm3
  227     RET
  228 %endmacro
  229
      ; Instantiate both variants; PALIGNR is re-pointed before each one.
  230 %define PALIGNR PALIGNR_MMX
  231 PREDICT_4x4 mmxext
  232 %define PALIGNR PALIGNR_SSSE3
  233 PREDICT_4x4 ssse3
 234
  235 ;-----------------------------------------------------------------------------
  236 ; void predict_4x4_hu_mmxext( uint8_t *src )
      ; Fills the 4x4 block at src using only the four samples l0..l3 in the
      ; column left of it.
  237 ;-----------------------------------------------------------------------------
  238 cglobal predict_4x4_hu_mmxext, 1,1
      ; Each [row-8] qword has that row's left sample in its top byte; the
      ; unpacks gather them so the upper half of mm0 is l0 l1 l2 l3.
  239     movq      mm0, [r0+0*FDEC_STRIDE-8]
  240     punpckhbw mm0, [r0+1*FDEC_STRIDE-8]
  241     movq      mm1, [r0+2*FDEC_STRIDE-8]
  242     punpckhbw mm1, [r0+3*FDEC_STRIDE-8]
  243     punpckhwd mm0, mm1
      ; mm1 = l3 replicated into all 8 bytes
  244     movq      mm1, mm0
  245     punpckhbw mm1, mm1
  246     pshufw    mm1, mm1, 0xFF
      ; mm0 = l0 l1 l2 l3 l3 l3 l3 l3 (lowest byte first)
  247     punpckhdq mm0, mm1
  248     movq      mm2, mm0
  249     movq      mm3, mm0
  250     movq      mm7, mm0
  251     psrlq     mm2, 16
  252     psrlq     mm3, 8
      ; mm7 = average of neighbouring samples, mm4 = 3-tap low-pass; they are
      ; then interleaved byte by byte.
  253     pavgb     mm7, mm3
  254     PRED8x8_LOWPASS mm4, mm0, mm2, mm3, mm5
  255     punpcklbw mm7, mm4
  256 %assign Y 0
  257     movd    [r0+Y*FDEC_STRIDE], mm7
  258 %rep 2
  259 %assign Y (Y+1)
  260     psrlq    mm7, 16
  261     movd    [r0+Y*FDEC_STRIDE], mm7
  262 %endrep
      ; last row is four copies of l3
  263     movd    [r0+3*FDEC_STRIDE], mm1
  264     RET
 265
  266 ;-----------------------------------------------------------------------------
  267 ; void predict_4x4_vl_mmxext( uint8_t *src )
      ; Fills the 4x4 block at src from the 8 bytes t0..t7 of the row above it.
      ; Rows 0 and 2 use the average of two adjacent top samples, rows 1 and 3
      ; the 3-tap low-pass; rows 2/3 start one sample later than rows 0/1.
  268 ;-----------------------------------------------------------------------------
  269 cglobal predict_4x4_vl_mmxext, 1,1
  270     movq        mm1, [r0-FDEC_STRIDE]
  271     movq        mm3, mm1
  272     movq        mm2, mm1
      ; mm3: byte n = t[n+1], mm2: byte n = t[n+2]
  273     psrlq       mm3, 8
  274     psrlq       mm2, 16
      ; mm4: byte n = (t[n] + t[n+1] + 1) >> 1
  275     movq        mm4, mm3
  276     pavgb       mm4, mm1
  277
      ; mm0: byte n = (t[n] + 2*t[n+1] + t[n+2] + 2) >> 2
  278     PRED8x8_LOWPASS mm0, mm1, mm2, mm3, mm5
  279
  280     movd        [r0+0*FDEC_STRIDE], mm4
  281     movd        [r0+1*FDEC_STRIDE], mm0
  282     psrlq       mm4, 8
  283     psrlq       mm0, 8
  284     movd        [r0+2*FDEC_STRIDE], mm4
  285     movd        [r0+3*FDEC_STRIDE], mm0
  286
  287     RET
 288
  289 ;-----------------------------------------------------------------------------
  290 ; void predict_4x4_dc_mmxext( uint8_t *src )
      ; Fills the 4x4 block at src with one value:
      ;   (sum of the 4 samples above + sum of the 4 samples to the left + 4) >> 3
  291 ;-----------------------------------------------------------------------------
  292
  293 cglobal predict_4x4_dc_mmxext, 1,4
      ; psadbw against zero sums the four top bytes (movd zeroes the rest)
  294     pxor   mm7, mm7
  295     movd   mm0, [r0-FDEC_STRIDE]
  296     psadbw mm0, mm7
  297     movd   r3d, mm0
      ; r1d accumulates the left column, one byte per row
  298     movzx  r1d, byte [r0-1]
  299 %assign n 1
  300 %rep 3
  301     movzx  r2d, byte [r0+FDEC_STRIDE*n-1]
  302     add    r1d, r2d
  303 %assign n n+1
  304 %endrep
      ; add both sums and the rounding term, divide by 8
  305     lea    r1d, [r1+r3+4]
  306     shr    r1d, 3
      ; replicate the byte into all four bytes of the dword
  307     imul   r1d, 0x01010101
  308     mov   [r0+FDEC_STRIDE*0], r1d
  309     mov   [r0+FDEC_STRIDE*1], r1d
  310     mov   [r0+FDEC_STRIDE*2], r1d
  311     mov   [r0+FDEC_STRIDE*3], r1d
  312     RET
 313
  314 %macro PREDICT_FILTER 1
  315 ;-----------------------------------------------------------------------------
  316 ;void predict_8x8_filter( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters )
      ; PREDICT_FILTER suffix  ->  emits predict_8x8_filter_<suffix>
      ; Low-pass filters the samples bordering the 8x8 block at src and writes
      ; them into edge[]:
      ;   i_filters & 0x01 : left column and top-left -> edge[7] and edge[8..15]
      ;   i_filters & 0x02 : top row                  -> edge[16..23]
      ;   i_filters & 0x04 : top-right row            -> edge[24..31] and edge[32]
      ;                      (only reached when bit 0x02 is also set)
      ; i_neighbor bits 0x08 and 0x04 are tested before the .fix_lt_* and
      ; .fix_tr_* paths, which substitute a neighbouring sample for the
      ; top-left / top-right one (presumably "neighbour not available" flags
      ; -- confirm against the caller).
      ; PALIGNR must be %define'd before instantiation.
  317 ;-----------------------------------------------------------------------------
  318
  319 cglobal predict_8x8_filter_%1, 4,5
      ; r0 is biased by 0x58 and "src" undoes the bias inside every address
      ; expression (presumably to get shorter displacement encodings).
  320     add          r0, 0x58
  321 %define src r0-0x58
      ; t1 = edge pointer, t4 = scratch whose byte sub-register is stored.
      ; On non-x86-64 builds the edge pointer is moved to r4 and r1 becomes the
      ; scratch (presumably because r4 has no byte form there -- defined by
      ; x86inc.asm, not shown).
  322 %ifndef ARCH_X86_64
  323     mov          r4, r1
  324 %define t1 r4
  325 %define t4 r1
  326 %else
  327 %define t1 r1
  328 %define t4 r4
  329 %endif
  330     test        r3b, 0x01
  331     je .check_top
      ; Gather the column left of the block: each [row-8] qword has that row's
      ; left sample in its top byte.  After the unpacks
      ;   mm3 = l6 l5 l4 l3 l2 l1 l0 lt   (lowest byte first)
  332     movq        mm0, [src+0*FDEC_STRIDE-8]
  333     punpckhbw   mm0, [src-1*FDEC_STRIDE-8]
  334     movq        mm1, [src+2*FDEC_STRIDE-8]
  335     punpckhbw   mm1, [src+1*FDEC_STRIDE-8]
  336     punpckhwd   mm1, mm0
  337     movq        mm2, [src+4*FDEC_STRIDE-8]
  338     punpckhbw   mm2, [src+3*FDEC_STRIDE-8]
  339     movq        mm3, [src+6*FDEC_STRIDE-8]
  340     punpckhbw   mm3, [src+5*FDEC_STRIDE-8]
  341     punpckhwd   mm3, mm2
  342     punpckhdq   mm3, mm1
  343     movq        mm0, [src+7*FDEC_STRIDE-8]
  344     movq        mm1, [src-1*FDEC_STRIDE]
  345     movq        mm4, mm3
  346     movq        mm2, mm3
      ; mm4 = l7 l6 .. l0        (one step towards l7)
      ; mm1 = l5 .. l0 lt t0     (one step towards the top row)
  347     PALIGNR     mm4, mm0, 7, mm0
  348     PALIGNR     mm1, mm2, 1, mm2
  349     test        r2b, 0x08
  350     je .fix_lt_1
  351 .do_left:
      ; first pass filters l6..lt -> edge[8..15]
  352     movq        mm0, mm4
  353     PRED8x8_LOWPASS mm2, mm1, mm4, mm3, mm5
  354     movq     [t1+8], mm2
      ; second pass is centred one sample lower; only its low byte (l7, with
      ; l7 itself standing in for the missing sample below) is kept -> edge[7]
  355     movq        mm4, mm0
  356     PRED8x8_LOWPASS mm1, mm3, mm0, mm4, mm5
  357     movd         t4, mm1
  358     mov      [t1+7], t4b
  359 .check_top:
  360     test        r3b, 0x02
  361     je .done
      ; mm3 = t0..t7, mm2 = lt t0..t6, mm1 = t1..t7 followed by the first
      ; top-right sample
  362     movq        mm0, [src-1*FDEC_STRIDE-8]
  363     movq        mm3, [src-1*FDEC_STRIDE]
  364     movq        mm1, [src-1*FDEC_STRIDE+8]
  365     movq        mm2, mm3
  366     movq        mm4, mm3
  367     PALIGNR     mm2, mm0, 7, mm0
  368     PALIGNR     mm1, mm4, 1, mm4
  369     test        r2b, 0x08
  370     je .fix_lt_2
  371     test        r2b, 0x04
  372     je .fix_tr_1
  373 .do_top:
  374     PRED8x8_LOWPASS mm4, mm2, mm1, mm3, mm5
  375     movq    [t1+16], mm4
  376     test        r3b, 0x04
  377     je .done
  378     test        r2b, 0x04
  379     je .fix_tr_2
      ; Filter the 8 top-right samples; mm3 still holds t0..t7 (the low-pass
      ; macro only reads its src operand), and the last top-right sample is
      ; duplicated as its own right-hand neighbour.
  380     movq        mm0, [src-1*FDEC_STRIDE+8]
  381     movq        mm5, mm0
  382     movq        mm2, mm0
  383     movq        mm4, mm0
  384     psrlq       mm5, 56
  385     PALIGNR     mm2, mm3, 7, mm3
  386     PALIGNR     mm5, mm4, 1, mm4
  387     PRED8x8_LOWPASS mm1, mm2, mm5, mm0, mm4
  388     jmp .do_topright
  389 .fix_tr_2:
      ; i_neighbor bit 0x04 clear: fill the top-right area with copies of t7
  390     punpckhbw   mm3, mm3
  391     pshufw      mm1, mm3, 0xFF
  392 .do_topright:
      ; edge[24..31] = mm1, and its highest byte is repeated in edge[32]
  393     movq    [t1+24], mm1
  394     psrlq       mm1, 56
  395     movd         t4, mm1
  396     mov     [t1+32], t4b
  397 .done:
  398     REP_RET
  399 .fix_lt_1:
      ; i_neighbor bit 0x08 clear (left pass): XOR-patch byte 6 of mm1 so it
      ; holds l0 instead of the top-left sample
  400     movq        mm5, mm3
  401     pxor        mm5, mm4
  402     psrlq       mm5, 56
  403     psllq       mm5, 48
  404     pxor        mm1, mm5
  405     jmp .do_left
  406 .fix_lt_2:
      ; i_neighbor bit 0x08 clear (top pass): XOR-patch byte 0 of mm2 so it
      ; holds t0 instead of the top-left sample
  407     movq        mm5, mm3
  408     pxor        mm5, mm2
  409     psllq       mm5, 56
  410     psrlq       mm5, 56
  411     pxor        mm2, mm5
  412     test        r2b, 0x04
  413     jne .do_top
      ; falls through into .fix_tr_1 when bit 0x04 is clear as well
  414 .fix_tr_1:
      ; i_neighbor bit 0x04 clear: XOR-patch byte 7 of mm1 so it holds t7
      ; instead of the first top-right sample
  415     movq        mm5, mm3
  416     pxor        mm5, mm1
  417     psrlq       mm5, 56
  418     psllq       mm5, 56
  419     pxor        mm1, mm5
  420     jmp .do_top
  421 %endmacro
  422
      ; Instantiate both variants; PALIGNR is re-pointed before each one.
  423 %define PALIGNR PALIGNR_MMX
  424 PREDICT_FILTER mmxext
  425 %define PALIGNR PALIGNR_SSSE3
  426 PREDICT_FILTER ssse3
 427
  428 ;-----------------------------------------------------------------------------
  429 ; void predict_8x8_v_mmxext( uint8_t *src, uint8_t *edge )
      ; Copies the 8 bytes at edge+16 (the top-row part of edge[], judging by
      ; the layout comments in predict_8x8_hd_mmxext) into all 8 rows of the
      ; block at src.
  430 ;-----------------------------------------------------------------------------
  431 cglobal predict_8x8_v_mmxext, 2,2
  432     movq        mm0, [r1+16]
  433     STORE8x8    mm0, mm0
  434     RET
 435
  436 ;-----------------------------------------------------------------------------
  437 ; void predict_8x8_h_mmxext( uint8_t *src, uint8_t edge[33] )
      ; Fills each of the 8 rows at src with a single edge byte: row n is
      ; eight copies of edge[14-n] (edge[7..14] is stored bottom-up, see the
      ; "l0 .. l7" comment in predict_8x8_hu_mmxext).
  438 ;-----------------------------------------------------------------------------
  439
      ; INIT_MMX (from the include files) makes the generic m0..m7 / mova /
      ; movu names below refer to MMX registers and moves.
  440 INIT_MMX
  441 cglobal predict_8x8_h_mmxext, 2,2
  442     movu   m3, [r1+7]
  443     mova   m7, m3
      ; double every byte so that each 16-bit word holds one sample twice
  444     punpckhbw m3, m3
  445     punpcklbw m7, m7
      ; broadcast one word per register; m0 gets the highest byte (edge[14]),
      ; m7 the lowest (edge[7])
  446     pshufw m0, m3, 0xff
  447     pshufw m1, m3, 0xaa
  448     pshufw m2, m3, 0x55
  449     pshufw m3, m3, 0x00
  450     pshufw m4, m7, 0xff
  451     pshufw m5, m7, 0xaa
  452     pshufw m6, m7, 0x55
  453     pshufw m7, m7, 0x00
      ; row n <- register m<n>
  454 %assign n 0
  455 %rep 8
  456     mova [r0+n*FDEC_STRIDE], m %+ n
  457 %assign n n+1
  458 %endrep
  459     RET
 460
  461 ;-----------------------------------------------------------------------------
  462 ; void predict_8x8_dc_mmxext( uint8_t *src, uint8_t *edge );
      ; Fills the 8x8 block at src with
      ;   (sum of edge[7..14] + sum of edge[16..23] + 8) >> 4
  463 ;-----------------------------------------------------------------------------
  464 cglobal predict_8x8_dc_mmxext, 2,2
      ; psadbw against a zeroed register yields the byte sum in the low word
  465     pxor        mm0, mm0
  466     pxor        mm1, mm1
  467     psadbw      mm0, [r1+7]
  468     psadbw      mm1, [r1+16]
  469     paddw       mm0, [pw_8 GLOBAL]
  470     paddw       mm0, mm1
  471     psrlw       mm0, 4
      ; broadcast the low word to all words, then pack to 8 identical bytes
  472     pshufw      mm0, mm0, 0
  473     packuswb    mm0, mm0
  474     STORE8x8    mm0, mm0
  475     RET
 476
  477 ;-----------------------------------------------------------------------------
  478 ; void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t *edge );
      ; PRED8x8_DC name, offset
      ; Emits a function <name>(src, edge) that fills the 8x8 block at src with
      ;   (sum of the 8 bytes at edge+offset + 4) >> 3
  479 ;-----------------------------------------------------------------------------
  480 %macro PRED8x8_DC 2
  481 cglobal %1, 2,2
  482     pxor        mm0, mm0
  483     psadbw      mm0, [r1+%2]
  484     paddw       mm0, [pw_4 GLOBAL]
  485     psrlw       mm0, 3
  486     pshufw      mm0, mm0, 0
  487     packuswb    mm0, mm0
  488     STORE8x8    mm0, mm0
  489     RET
  490 %endmacro
  491
      ; offset 16 selects the top-row bytes, offset 7 the left-column bytes
  492 PRED8x8_DC predict_8x8_dc_top_mmxext, 16
  493 PRED8x8_DC predict_8x8_dc_left_mmxext, 7
 494
 495 %ifndef ARCH_X86_64
 496 ; sse2 is faster even on amd, so there's no sense in spending exe size on these
 497 ; functions if we know sse2 is available.
 498
  499 ;-----------------------------------------------------------------------------
  500 ; void predict_8x8_ddl_mmxext( uint8_t *src, uint8_t *edge )
      ; Low-pass filters the 16 bytes edge[16..31] (mm0 = first 8 results,
      ; mm1 = last 8) and writes a sliding 8-byte window of them: row Y holds
      ; filtered bytes Y+1 .. Y+8.
  501 ;-----------------------------------------------------------------------------
  502 cglobal predict_8x8_ddl_mmxext, 2,2
  503     movq        mm5, [r1+16]
  504     movq        mm2, [r1+17]
  505     movq        mm3, [r1+23]
  506     movq        mm4, [r1+25]
      ; left neighbours of the first half come from a byte shift, so byte 0
      ; gets a zero neighbour; filtered byte 0 is never stored
  507     movq        mm1, mm5
  508     psllq       mm1, 8
  509     PRED8x8_LOWPASS mm0, mm1, mm2, mm5, mm7
  510     PRED8x8_LOWPASS mm1, mm3, mm4, [r1+24], mm6
  511
      ; Start at the bottom row; each step shifts mm1 up one byte and moves the
      ; top byte of mm0 into its low byte.
  512 %assign Y 7
  513 %rep 6
  514     movq        [r0+Y*FDEC_STRIDE], mm1
  515     movq        mm2, mm0
  516     psllq       mm1, 8
  517     psrlq       mm2, 56
  518     psllq       mm0, 8
  519     por         mm1, mm2
  520 %assign Y (Y-1)
  521 %endrep
      ; last two rows: mm0 is no longer needed afterwards, so it is consumed
      ; in place instead of being copied
  522     movq        [r0+Y*FDEC_STRIDE], mm1
  523     psllq       mm1, 8
  524     psrlq       mm0, 56
  525     por         mm1, mm0
  526 %assign Y (Y-1)
  527     movq        [r0+Y*FDEC_STRIDE], mm1
  528     RET
 529
  530 ;-----------------------------------------------------------------------------
  531 ; void predict_8x8_ddr_mmxext( uint8_t *src, uint8_t *edge )
      ; Low-pass filters the 16 bytes edge[8..23] (mm0 = results for
      ; edge[8..15], mm1 = results for edge[16..23]) and writes a sliding
      ; 8-byte window of them: row 7 starts at the first result and each row
      ; above starts one byte later.
  532 ;-----------------------------------------------------------------------------
  533 cglobal predict_8x8_ddr_mmxext, 2,2
  534     movq        mm1, [r1+7]
  535     movq        mm2, [r1+9]
  536     movq        mm3, [r1+15]
  537     movq        mm4, [r1+17]
  538     PRED8x8_LOWPASS mm0, mm1, mm2, [r1+8], mm7
  539     PRED8x8_LOWPASS mm1, mm3, mm4, [r1+16], mm6
  540
      ; Each step shifts mm0 down one byte and moves the low byte of mm1 into
      ; its top byte.
  541 %assign Y 7
  542 %rep 6
  543     movq        [r0+Y*FDEC_STRIDE], mm0
  544     movq        mm2, mm1
  545     psrlq       mm0, 8
  546     psllq       mm2, 56
  547     psrlq       mm1, 8
  548     por         mm0, mm2
  549 %assign Y (Y-1)
  550 %endrep
      ; last two rows: mm1 is consumed in place
  551     movq        [r0+Y*FDEC_STRIDE], mm0
  552     psrlq       mm0, 8
  553     psllq       mm1, 56
  554     por         mm0, mm1
  555 %assign Y (Y-1)
  556     movq        [r0+Y*FDEC_STRIDE], mm0
  557     RET
 558
  559 ;-----------------------------------------------------------------------------
  560 ; void predict_8x8_hu_mmxext( uint8_t *src, uint8_t *edge )
      ; Fills the 8x8 block at src from the 8 left-column bytes at edge+7.
      ; The register comments below list bytes from the highest byte down to
      ; the lowest.  r0 is advanced by 4 rows so rows are addressed as -4..+3.
  561 ;-----------------------------------------------------------------------------
  562 %define PALIGNR PALIGNR_MMX
  563 cglobal predict_8x8_hu_mmxext, 2,2
  564     movq    mm1, [r1+7]         ; l0 l1 l2 l3 l4 l5 l6 l7
  565     add      r0, 4*FDEC_STRIDE
      ; reverse the byte order: swap the words with pshufw, then swap the two
      ; bytes inside each word with the shift/or pair
  566     pshufw  mm0, mm1, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
  567     psllq   mm1, 56             ; l7 .. .. .. .. .. .. ..
  568     movq    mm2, mm0
  569     psllw   mm0, 8
  570     psrlw   mm2, 8
  571     por     mm2, mm0            ; l7 l6 l5 l4 l3 l2 l1 l0
  572     movq    mm3, mm2
  573     movq    mm4, mm2
  574     movq    mm5, mm2
  575     psrlq   mm2, 8
  576     psrlq   mm3, 16
  577     por     mm2, mm1            ; l7 l7 l6 l5 l4 l3 l2 l1
  578     punpckhbw mm1, mm1
  579     por     mm3, mm1            ; l7 l7 l7 l6 l5 l4 l3 l2
      ; mm4 = averages of adjacent samples, mm1 = 3-tap low-pass values
  580     pavgb   mm4, mm2
  581     PRED8x8_LOWPASS mm1, mm3, mm5, mm2, mm6
      ; interleave averages and low-pass values into a 16-byte sequence
      ; (mm4 = low half, mm5 = high half)
  582     movq    mm5, mm4
  583     punpcklbw mm4, mm1          ; p4 p3 p2 p1
  584     punpckhbw mm5, mm1          ; p8 p7 p6 p5
  585     movq    mm6, mm5
  586     movq    mm7, mm5
  587     movq    mm0, mm5
      ; rows -4..-1: windows into the 16-byte sequence at 0, 2, 4, 6 bytes.
      ; rows 0..+3: the high half with its top word replicated downwards by
      ; the pshufw patterns.
  588     PALIGNR mm5, mm4, 2, mm1
  589     pshufw  mm1, mm6, 11111001b
  590     PALIGNR mm6, mm4, 4, mm2
  591     pshufw  mm2, mm7, 11111110b
  592     PALIGNR mm7, mm4, 6, mm3
  593     pshufw  mm3, mm0, 11111111b
  594     movq   [r0-4*FDEC_STRIDE], mm4
  595     movq   [r0-3*FDEC_STRIDE], mm5
  596     movq   [r0-2*FDEC_STRIDE], mm6
  597     movq   [r0-1*FDEC_STRIDE], mm7
  598     movq   [r0+0*FDEC_STRIDE], mm0
  599     movq   [r0+1*FDEC_STRIDE], mm1
  600     movq   [r0+2*FDEC_STRIDE], mm2
  601     movq   [r0+3*FDEC_STRIDE], mm3
  602     RET
 603
  604 ;-----------------------------------------------------------------------------
  605 ; void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t *edge )
  606 ;-----------------------------------------------------------------------------
  607
  608 ; fills only some pixels:
  609 ; f01234567
  610 ; 0........
  611 ; 1,,,,,,,,
  612 ; 2 .......
  613 ; 3 ,,,,,,,
  614 ; 4  ......
  615 ; 5  ,,,,,,
  616 ; 6   .....
  617 ; 7   ,,,,,
      ; Legend, as implemented below: '.' rows (even) hold the average of
      ; edge[15+x] and edge[16+x]; ',' rows (odd) hold the 3-tap low-pass
      ; centred on edge[15+x].  Every row pair is the previous pair shifted
      ; right by one pixel.  Full 8-byte rows are stored, so the blank
      ; positions are written as zero here and are not valid prediction
      ; values; presumably the caller fills them in afterwards -- not shown.
  618
  619 cglobal predict_8x8_vr_core_mmxext, 2,2
  620     movq        mm2, [r1+16]
  621     movq        mm3, [r1+15]
  622     movq        mm1, [r1+14]
  623     movq        mm4, mm3
  624     pavgb       mm3, mm2
  625     PRED8x8_LOWPASS mm0, mm1, mm2, mm4, mm7
  626
  627 %assign Y 0
  628 %rep 3
  629     movq        [r0+ Y   *FDEC_STRIDE], mm3
  630     movq        [r0+(Y+1)*FDEC_STRIDE], mm0
      ; shift both vectors up one byte (zero enters at pixel 0)
  631     psllq       mm3, 8
  632     psllq       mm0, 8
  633 %assign Y (Y+2)
  634 %endrep
  635     movq        [r0+ Y   *FDEC_STRIDE], mm3
  636     movq        [r0+(Y+1)*FDEC_STRIDE], mm0
  637
  638     RET
 639
  640 ;-----------------------------------------------------------------------------
  641 ; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c )
      ; Writes 8 rows of 8 pixels at src.  Pixel (x, y) is
      ;   clip_to_0_255( (i00 + x*b + y*c) >> 5 )
      ; evaluated with saturating signed 16-bit arithmetic (paddsw); the
      ; clipping comes from packuswb.
  642 ;-----------------------------------------------------------------------------
  643 cglobal predict_8x8c_p_core_mmxext, 1,2
      ; mm0 = i00, mm2 = b, mm4 = c, each broadcast to four words
  644     LOAD_PLANE_ARGS
  645     movq        mm1, mm2
  646     pmullw      mm2, [pw_3210 GLOBAL]
  647     psllw       mm1, 2
  648     paddsw      mm0, mm2        ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b}
  649     paddsw      mm1, mm0        ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b}
  650
      ; r1d counts the 8 rows
  651     mov         r1d, 8
  652 ALIGN 4
  653 .loop:
      ; work on copies so the running sums keep their 5 fractional bits
  654     movq        mm5, mm0
  655     movq        mm6, mm1
  656     psraw       mm5, 5
  657     psraw       mm6, 5
  658     packuswb    mm5, mm6
  659     movq        [r0], mm5
  660
      ; step to the next row: add c to every column sum
  661     paddsw      mm0, mm4
  662     paddsw      mm1, mm4
  663     add         r0, FDEC_STRIDE
  664     dec         r1d
  665     jg          .loop
  666     REP_RET
 667
  668 ;-----------------------------------------------------------------------------
  669 ; void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c )
      ; Writes 16 rows of 16 pixels at src.  Pixel (x, y) is
      ;   clip_to_0_255( (i00 + x*b + y*c) >> 5 )
      ; evaluated with saturating signed 16-bit arithmetic (paddsw); the
      ; clipping comes from packuswb.
  670 ;-----------------------------------------------------------------------------
  671 cglobal predict_16x16_p_core_mmxext, 1,2
      ; mm0 = i00, mm2 = b, mm4 = c, each broadcast to four words
  672     LOAD_PLANE_ARGS
  673     movq        mm5, mm2
  674     movq        mm1, mm2
  675     pmullw      mm5, [pw_3210 GLOBAL]
      ; mm2 = 8*b, mm1 = 4*b: column offsets for the other three 4-pixel groups
  676     psllw       mm2, 3
  677     psllw       mm1, 2
  678     movq        mm3, mm2
  679     paddsw      mm0, mm5        ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b}
  680     paddsw      mm1, mm0        ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
  681     paddsw      mm2, mm0        ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b}
  682     paddsw      mm3, mm1        ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b}
  683
      ; r1d counts the 16 rows
  684     mov         r1d, 16
  685 ALIGN 4
  686 .loop:
      ; left 8 pixels of the row
  687     movq        mm5, mm0
  688     movq        mm6, mm1
  689     psraw       mm5, 5
  690     psraw       mm6, 5
  691     packuswb    mm5, mm6
  692     movq        [r0], mm5
  693
      ; right 8 pixels of the row
  694     movq        mm5, mm2
  695     movq        mm6, mm3
  696     psraw       mm5, 5
  697     psraw       mm6, 5
  698     packuswb    mm5, mm6
  699     movq        [r0+8], mm5
  700
      ; step to the next row: add c to every column sum
  701     paddsw      mm0, mm4
  702     paddsw      mm1, mm4
  703     paddsw      mm2, mm4
  704     paddsw      mm3, mm4
  705     add         r0, FDEC_STRIDE
  706     dec         r1d
  707     jg          .loop
  708     REP_RET
 709
 710 %endif ; !ARCH_X86_64
 711
  712 ;-----------------------------------------------------------------------------
  713 ; void predict_8x8_ddl_sse2( uint8_t *src, uint8_t *edge )
      ; Low-pass filters the 16 bytes edge[16..31] in one XMM register and
      ; writes a sliding window of the result: row Y holds filtered bytes
      ; Y+1 .. Y+8.
      ; The movdqa load requires edge+16 to be 16-byte aligned.
  714 ;-----------------------------------------------------------------------------
  715 cglobal predict_8x8_ddl_sse2, 2,2
      ; xmm3 = centre samples, xmm2 = right neighbours (edge[17..32]),
      ; xmm1 = left neighbours (byte shift, zero enters at byte 0)
  716     movdqa      xmm3, [r1+16]
  717     movdqu      xmm2, [r1+17]
  718     movdqa      xmm1, xmm3
  719     pslldq      xmm1, 1
  720     PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4
  721
      ; the shift happens before the first store, so filtered byte 0 (the one
      ; with the zero neighbour) is never written
  722 %assign Y 0
  723 %rep 8
  724     psrldq      xmm0, 1
  725     movq        [r0+Y*FDEC_STRIDE], xmm0
  726 %assign Y (Y+1)
  727 %endrep
  728     RET
 729
  730 ;-----------------------------------------------------------------------------
  731 ; void predict_8x8_ddr_sse2( uint8_t *src, uint8_t *edge )
      ; Low-pass filters the 16 bytes edge[8..23] and writes a sliding window
      ; of the result: row 7 starts at filtered byte 0 and each row above
      ; starts one byte later.
  732 ;-----------------------------------------------------------------------------
  733 cglobal predict_8x8_ddr_sse2, 2,2
      ; xmm3 = centre samples, xmm1 = edge[7..22], xmm2 = centre shifted down
      ; one byte (edge[9..23], zero in the top byte)
  734     movdqu      xmm3, [r1+8]
  735     movdqu      xmm1, [r1+7]
  736     movdqa      xmm2, xmm3
  737     psrldq      xmm2, 1
  738     PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4
  739
      ; keep two copies one byte apart so that two rows are stored per
      ; 2-byte shift step
  740     movdqa      xmm1, xmm0
  741     psrldq      xmm1, 1
  742 %assign Y 7
  743 %rep 3
  744     movq        [r0+Y*FDEC_STRIDE], xmm0
  745     movq        [r0+(Y-1)*FDEC_STRIDE], xmm1
  746     psrldq      xmm0, 2
  747     psrldq      xmm1, 2
  748 %assign Y (Y-2)
  749 %endrep
  750     movq        [r0+1*FDEC_STRIDE], xmm0
  751     movq        [r0+0*FDEC_STRIDE], xmm1
  752
  753     RET
 754
  755 ;-----------------------------------------------------------------------------
  756 ; void predict_8x8_vl_sse2( uint8_t *src, uint8_t *edge )
      ; Fills the 8x8 block at src from the 16 bytes at edge+16.  Even rows
      ; take the average of two adjacent samples, odd rows the 3-tap low-pass;
      ; every row pair starts one sample later than the pair above it.
      ; The movdqa load requires edge+16 to be 16-byte aligned.
  757 ;-----------------------------------------------------------------------------
  758 cglobal predict_8x8_vl_sse2, 2,2
  759     movdqa      xmm4, [r1+16]
  760     movdqa      xmm2, xmm4
  761     movdqa      xmm1, xmm4
  762     movdqa      xmm3, xmm4
  763     psrldq      xmm2, 1
  764     pslldq      xmm1, 1
  765     pavgb       xmm3, xmm2
  766     PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm4, xmm5
  767 ; xmm0: (t0 + 2*t1 + t2 + 2) >> 2
  768 ; xmm3: (t0 + t1 + 1) >> 1
  769
      ; xmm0 is shifted before its first store (its byte 0 has a zero left
      ; neighbour and is skipped); xmm3 is shifted after each store
  770 %assign Y 0
  771 %rep 3
  772     psrldq      xmm0, 1
  773     movq        [r0+ Y   *FDEC_STRIDE], xmm3
  774     movq        [r0+(Y+1)*FDEC_STRIDE], xmm0
  775     psrldq      xmm3, 1
  776 %assign Y (Y+2)
  777 %endrep
  778     psrldq      xmm0, 1
  779     movq        [r0+ Y   *FDEC_STRIDE], xmm3
  780     movq        [r0+(Y+1)*FDEC_STRIDE], xmm0
  781
  782     RET
 783
  784 ;-----------------------------------------------------------------------------
  785 ; void predict_8x8_vr_sse2( uint8_t *src, uint8_t *edge )
      ; Fills the 8x8 block at src from the 16 bytes at edge+8.  Rows 0 and 1
      ; are the averaged and the low-pass filtered samples; every later row
      ; pair is the pair above shifted by one pixel, with the new left-hand
      ; pixel taken from the low-pass values (split into even and odd bytes
      ; below).  r0 is advanced by 4 rows, so rows are addressed as -4..+3.
      ; The third cglobal number is 7 because xmm0..xmm6 are used (its exact
      ; meaning is defined in x86inc.asm, not shown).
  786 ;-----------------------------------------------------------------------------
  787 cglobal predict_8x8_vr_sse2, 2,2,7
  788     movdqu      xmm0, [r1+8]
  789     movdqa      xmm6, [pw_ff00 GLOBAL]
  790     add         r0, 4*FDEC_STRIDE
  791     movdqa      xmm1, xmm0
  792     movdqa      xmm2, xmm0
  793     movdqa      xmm3, xmm0
      ; xmm0 / xmm1 = samples delayed by one / two byte positions
  794     pslldq      xmm0, 1
  795     pslldq      xmm1, 2
      ; xmm2 = averages, xmm4 = low-pass centred on the one-byte-delayed copy
  796     pavgb       xmm2, xmm0
  797     PRED8x8_LOWPASS_XMM xmm4, xmm3, xmm1, xmm0, xmm5
      ; split the low-pass bytes: xmm6 low half <- even-indexed bytes,
      ; xmm6 high half (copied to xmm4 low half) <- odd-indexed bytes
  798     pandn       xmm6, xmm4
  799     movdqa      xmm5, xmm4
  800     psrlw       xmm4, 8
  801     packuswb    xmm6, xmm4
  802     movhlps     xmm4, xmm6
      ; rows 1 and 0 are the upper 8 bytes of the low-pass and average vectors
  803     movhps [r0-3*FDEC_STRIDE], xmm5
  804     movhps [r0-4*FDEC_STRIDE], xmm2
      ; put four of the split bytes in front of each vector; each psrldq by 1
      ; in the loop then exposes one more of them as the new leftmost pixel
  805     psrldq      xmm5, 4
  806     movss       xmm5, xmm6
  807     psrldq      xmm2, 4
  808     movss       xmm2, xmm4
  809 %assign Y 3
  810 %rep 3
  811     psrldq      xmm5, 1
  812     psrldq      xmm2, 1
  813     movq        [r0+Y*FDEC_STRIDE], xmm5
  814     movq        [r0+(Y-1)*FDEC_STRIDE], xmm2
  815 %assign Y (Y-2)
  816 %endrep
  817     RET
 818
  819 ;-----------------------------------------------------------------------------
  820 ; void predict_8x8_hd_mmxext( uint8_t *src, uint8_t *edge )
      ; Fills the 8x8 block at src from edge[7..23] (left column, top-left and
      ; top row).  The register comments list bytes from the highest byte down
      ; to the lowest.  r0 is advanced by 4 rows, so rows are addressed as
      ; -4..+3.
  821 ;-----------------------------------------------------------------------------
  822 %define PALIGNR PALIGNR_MMX
  823 cglobal predict_8x8_hd_mmxext, 2,2
  824     add     r0, 4*FDEC_STRIDE
  825     movq    mm0, [r1]           ; l7 .. .. .. .. .. .. ..
  826     movq    mm1, [r1+8]         ; lt l0 l1 l2 l3 l4 l5 l6
  827     movq    mm2, [r1+16]        ; t7 t6 t5 t4 t3 t2 t1 t0
  828     movq    mm3, mm1            ; lt l0 l1 l2 l3 l4 l5 l6
  829     movq    mm4, mm2            ; t7 t6 t5 t4 t3 t2 t1 t0
  830     PALIGNR mm2, mm1, 7, mm5    ; t6 t5 t4 t3 t2 t1 t0 lt
  831     PALIGNR mm1, mm0, 7, mm6    ; l0 l1 l2 l3 l4 l5 l6 l7
  832     PALIGNR mm4, mm3, 1, mm7    ; t0 lt l0 l1 l2 l3 l4 l5
      ; mm3 = averages along the left column, mm0 = low-pass along it
  833     movq    mm5, mm3
  834     pavgb   mm3, mm1
  835     PRED8x8_LOWPASS mm0, mm4, mm1, mm5, mm7
      ; mm6 = low-pass along the top row
  836     movq    mm4, mm2
  837     movq    mm1, mm2            ; t6 t5 t4 t3 t2 t1 t0 lt
  838     psrlq   mm4, 16             ; .. .. t6 t5 t4 t3 t2 t1
  839     psrlq   mm1, 8              ; .. t6 t5 t4 t3 t2 t1 t0
  840     PRED8x8_LOWPASS mm6, mm4, mm2, mm1, mm5
  841                                 ; .. p11 p10 p9
      ; interleave averages and low-pass values of the left column
  842     movq    mm7, mm3
  843     punpcklbw mm3, mm0          ; p4 p3 p2 p1
  844     punpckhbw mm7, mm0          ; p8 p7 p6 p5
  845     movq    mm1, mm7
  846     movq    mm0, mm7
  847     movq    mm4, mm7
      ; rows +3..0: windows into mm7:mm3 at byte offsets 0, 2, 4, 6
  848     movq   [r0+3*FDEC_STRIDE], mm3
  849     PALIGNR mm7, mm3, 2, mm5
  850     movq   [r0+2*FDEC_STRIDE], mm7
  851     PALIGNR mm1, mm3, 4, mm5
  852     movq   [r0+1*FDEC_STRIDE], mm1
  853     PALIGNR mm0, mm3, 6, mm3
  854     movq    [r0+0*FDEC_STRIDE], mm0
      ; rows -1..-4: the same walk continued into the top-row low-pass (mm6)
  855     movq    mm2, mm6
  856     movq    mm3, mm6
  857     movq   [r0-1*FDEC_STRIDE], mm4
  858     PALIGNR mm6, mm4, 2, mm5
  859     movq   [r0-2*FDEC_STRIDE], mm6
  860     PALIGNR mm2, mm4, 4, mm5
  861     movq   [r0-3*FDEC_STRIDE], mm2
  862     PALIGNR mm3, mm4, 6, mm4
  863     movq   [r0-4*FDEC_STRIDE], mm3
  864     RET
 865
  866 ;-----------------------------------------------------------------------------
  867 ; void predict_8x8_hd_ssse3( uint8_t *src, uint8_t *edge )
      ; PREDICT_8x8_HD suffix  ->  emits predict_8x8_hd_<suffix>
      ; XMM version of the horizontal-down predictor: one pass over
      ; edge[7..24] yields both the averaged and the low-pass vectors.
      ; Both movdqa loads require edge to be 16-byte aligned.
      ; r0 is advanced by 4 rows, so rows are addressed as -4..+3.
  868 ;-----------------------------------------------------------------------------
  869 %macro PREDICT_8x8_HD 1
  870 cglobal predict_8x8_hd_%1, 2,2
  871     add       r0, 4*FDEC_STRIDE
  872     movdqa  xmm0, [r1]
  873     movdqa  xmm1, [r1+16]
  874     movdqa  xmm2, xmm1
  875     movdqa  xmm3, xmm1
      ; three 16-byte windows into edge[0..31] starting at byte 7, 9 and 8
  876     PALIGNR xmm1, xmm0, 7, xmm4
  877     PALIGNR xmm2, xmm0, 9, xmm5
  878     PALIGNR xmm3, xmm0, 8, xmm0
      ; xmm4 = average of windows 7 and 8; xmm0 = low-pass centred on window 8
  879     movdqa  xmm4, xmm1
  880     pavgb   xmm4, xmm3
  881     PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm5
      ; interleave the low 8 averages with the low 8 low-pass bytes; the upper
      ; half of that result replaces the low half of xmm0, while xmm0's own
      ; upper half keeps the remaining low-pass bytes
  882     punpcklbw xmm4, xmm0
  883     movhlps xmm0, xmm4
  884
      ; two rows per step: xmm4 feeds rows +3..0, xmm0 feeds rows -1..-4
  885 %assign Y 3
  886 %rep 3
  887     movq   [r0+(Y)*FDEC_STRIDE], xmm4
  888     movq   [r0+(Y-4)*FDEC_STRIDE], xmm0
  889     psrldq xmm4, 2
  890     psrldq xmm0, 2
  891 %assign Y (Y-1)
  892 %endrep
  893     movq   [r0+(Y)*FDEC_STRIDE], xmm4
  894     movq   [r0+(Y-4)*FDEC_STRIDE], xmm0
  895     RET
  896 %endmacro
  897
      ; The sse2 instance is assembled with whatever PALIGNR is defined at this
      ; point (PALIGNR_MMX from the block above, in the code shown here), used
      ; under INIT_XMM; the ssse3 instance switches to PALIGNR_SSSE3.
      ; Afterwards the MMX defaults are restored for the code that follows.
  898 INIT_XMM
  899 PREDICT_8x8_HD sse2
  900 %define PALIGNR PALIGNR_SSSE3
  901 PREDICT_8x8_HD ssse3
  902 INIT_MMX
  903 %define PALIGNR PALIGNR_MMX
 904
 905 ;-----------------------------------------------------------------------------
 906 ; void predict_8x8_hu_sse2( uint8_t *src, uint8_t *edge )
     ;
     ; Macro argument %1 is the suffix of the emitted function name; when it is
     ; "ssse3" the byte reversal is done with pshufb, otherwise with
     ; pshufw/shift/or. Reads the 8 bytes at edge+7 and writes eight 8-byte rows.
 907 ;-----------------------------------------------------------------------------
 908 %macro PREDICT_8x8_HU 1
 909 cglobal predict_8x8_hu_%1, 2,2
 910     add        r0, 4*FDEC_STRIDE       ; bias r0 so the rows are addressed as -4..+3
 911 %ifidn %1, ssse3
         ; Build the three shuffled copies (mm5, mm2, mm3) with pshufb, shifting
         ; the shuffle mask right by one byte between uses.
         ; NOTE(review): presumably yields the same mm2..mm5 contents as the
         ; %else path; depends on the pb_reverse constant defined elsewhere.
 912     movq      mm5, [r1+7]
 913     movq      mm6, [pb_reverse GLOBAL]
 914     movq      mm1, mm5
 915     movq      mm2, mm5
 916     movq      mm3, mm5
 917     pshufb    mm5, mm6
 918     psrlq     mm6, 8
 919     pshufb    mm2, mm6
 920     psrlq     mm6, 8
 921     pshufb    mm3, mm6
 922     movq      mm4, mm5
 923 %else
 924     movq      mm1, [r1+7]           ; l0 l1 l2 l3 l4 l5 l6 l7
 925     pshufw    mm0, mm1, 00011011b   ; l6 l7 l4 l5 l2 l3 l0 l1
 926     movq      mm2, mm0
 927     psllw     mm0, 8
 928     psrlw     mm2, 8
 929     por       mm2, mm0              ; l7 l6 l5 l4 l3 l2 l1 l0
 930     psllq     mm1, 56               ; l7 .. .. .. .. .. .. ..
 931     movq      mm3, mm2
 932     movq      mm4, mm2
 933     movq      mm5, mm2
 934     psrlq     mm2, 8
 935     psrlq     mm3, 16
 936     por       mm2, mm1              ; l7 l7 l6 l5 l4 l3 l2 l1
 937     punpckhbw mm1, mm1
 938     por       mm3, mm1              ; l7 l7 l7 l6 l5 l4 l3 l2
 939 %endif
 940     pavgb     mm4, mm2              ; rounded average of the reversed edge and its 1-byte shift
         ; NOTE(review): presumably a 3-tap low-pass of mm3/mm5/mm2 with the
         ; result in mm1 and mm6 as scratch; the macro is defined elsewhere.
 941     PRED8x8_LOWPASS mm1, mm3, mm5, mm2, mm6
 942
 943     movq2dq   xmm0, mm4
 944     movq2dq   xmm1, mm1
 945     punpcklbw xmm0, xmm1            ; interleave all 8 averaged bytes with the 8 bytes of mm1
 946     punpckhbw  mm4, mm1             ; interleave only the upper 4 bytes of each
     ; Rows -4..-1: low 8 bytes of xmm0, shifted right 2 bytes per row.
 947 %assign Y -4
 948 %rep 3
 949     movq     [r0+Y*FDEC_STRIDE], xmm0
 950     psrldq    xmm0, 2
 951 %assign Y (Y+1)
 952 %endrep
         ; Rows 1..3 are mm4 shifted down by 1, 2 and 3 words with the top word
         ; replicated into the vacated positions.
 953     pshufw     mm5, mm4, 11111001b  ; words 1,2,3,3
 954     pshufw     mm6, mm4, 11111110b  ; words 2,3,3,3
 955     pshufw     mm7, mm4, 11111111b  ; words 3,3,3,3
 956     movq     [r0+Y*FDEC_STRIDE], xmm0
 957     movq     [r0+0*FDEC_STRIDE], mm4
 958     movq     [r0+1*FDEC_STRIDE], mm5
 959     movq     [r0+2*FDEC_STRIDE], mm6
 960     movq     [r0+3*FDEC_STRIDE], mm7
 961     RET
 962 %endmacro
 963
 964 PREDICT_8x8_HU sse2
     ; Emits predict_8x8_hu_sse2 (line above) and predict_8x8_hu_ssse3 (below);
     ; the ssse3 suffix selects the pshufb path inside the macro.
 965 PREDICT_8x8_HU ssse3
 966
 967 ;-----------------------------------------------------------------------------
 968 ; void predict_8x8c_v_mmx( uint8_t *src )
     ;
     ; Copies the 8 bytes of the row directly above src into all eight rows of
     ; the 8x8 block.
 969 ;-----------------------------------------------------------------------------
 970 cglobal predict_8x8c_v_mmx, 1,1
 971     movq        mm0, [r0 - FDEC_STRIDE]    ; row above the block
         ; STORE8x8 writes its first operand to rows 0-3 and its second to rows
         ; 4-7; it advances r0 by 4*FDEC_STRIDE as a side effect.
 972     STORE8x8    mm0, mm0
 973     RET
 974
 975 ;-----------------------------------------------------------------------------
 976 ; void predict_8x8c_h_{mmxext,ssse3}( uint8_t *src )
     ;
     ; Macro argument %1 is the suffix of the emitted function name. For each of
     ; the 8 rows, the byte immediately left of the row (src[row*FDEC_STRIDE-1])
     ; is passed to SPLATB and the resulting register is stored over the row.
 977 ;-----------------------------------------------------------------------------
 978
 979 %macro PRED_8x8C_H 1
 980 cglobal predict_8x8c_h_%1, 1,1
 981 %ifidn %1, ssse3
         ; Constant handed to SPLATB as its third operand in the ssse3 build
         ; (presumably a pshufb mask -- SPLATB_SSSE3 is defined elsewhere).
 982     mova   m1, [pb_3 GLOBAL]
 983 %endif
     ; Fully unrolled: one SPLATB + store per row, n = 0..7.
 984 %assign n 0
 985 %rep 8
 986     SPLATB m0, r0+FDEC_STRIDE*n-1, m1
 987     mova [r0+FDEC_STRIDE*n], m0
 988 %assign n n+1
 989 %endrep
 990     RET
 991 %endmacro
 992
 993 INIT_MMX
     ; Both variants are built in MMX mode; only the SPLATB implementation
     ; differs between predict_8x8c_h_mmxext and predict_8x8c_h_ssse3.
 994 %define SPLATB SPLATB_MMX
 995 PRED_8x8C_H mmxext
 996 %define SPLATB SPLATB_SSSE3
 997 PRED_8x8C_H ssse3
 998
 999 ;-----------------------------------------------------------------------------
1000 ; void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 )
     ;
     ; Computes four DC values, one per 4x4 quadrant, from the row above src
     ; (s0 = sum of its bytes 0-3, s1 = sum of bytes 4-7) and the two
     ; caller-supplied sums s2 and s3:
     ;   dc0 = (s0 + s2) >> 3      dc1 = (s1 + 2) >> 2
     ;   dc2 = s3 >> 2             dc3 = (s3 + s1 + 2) >> 3
     ; NOTE(review): no rounding term is added to s2/s3 here; presumably the
     ; caller folds it into the arguments -- confirm against the C wrapper.
 999 ;-----------------------------------------------------------------------------
1002 cglobal predict_8x8c_dc_core_mmxext, 1,1
1003     movq        mm0, [r0 - FDEC_STRIDE]
1004     pxor        mm1, mm1
1005     pxor        mm2, mm2
1006     punpckhbw   mm1, mm0        ; isolate top bytes 4-7 (other bytes zero)
1007     punpcklbw   mm0, mm2        ; isolate top bytes 0-3 (other bytes zero)
1008     psadbw      mm1, mm2        ; s1
1009     psadbw      mm0, mm2        ; s0
1010
     ; On x86-64 the integer arguments are taken from registers r1d/r2d; on
     ; x86-32 they are read through the r1m/r2m argument operands.
1011 %ifdef ARCH_X86_64
1012     movd        mm4, r1d
1013     movd        mm5, r2d
1014     paddw       mm0, mm4
1015     pshufw      mm2, mm5, 0
1016 %else
1017     paddw       mm0, r1m
1018     pshufw      mm2, r2m, 0
1019 %endif
1020     psrlw       mm0, 3
1021     paddw       mm1, [pw_2 GLOBAL]  ; pw_2 is defined elsewhere; presumably words of 2 (rounding)
1022     movq        mm3, mm2
1023     pshufw      mm1, mm1, 0
1024     pshufw      mm0, mm0, 0     ; dc0 (w)
1025     paddw       mm3, mm1
1026     psrlw       mm3, 3          ; dc3 (w)
1027     psrlw       mm2, 2          ; dc2 (w)
1028     psrlw       mm1, 2          ; dc1 (w)
1029
1030     packuswb    mm0, mm1        ; dc0,dc1 (b)
1031     packuswb    mm2, mm3        ; dc2,dc3 (b)
1032
         ; Rows 0-3 get dc0|dc1, rows 4-7 get dc2|dc3.
1033     STORE8x8    mm0, mm2
1034     RET
1035
1036 cglobal predict_8x8c_dc_top_mmxext, 1,1
         ; DC prediction from the top row only: the left 4 columns are filled
         ; with the rounded mean of top bytes 0-3, the right 4 columns with the
         ; rounded mean of top bytes 4-7, for all 8 rows.
         ; Each sum s is turned into ((s >> 1) + 1) >> 1 via psrlw + pavgw-with-0.
1037     pxor        mm2, mm2        ; mm2 = 0 throughout
1038     movq        mm0, [r0 - FDEC_STRIDE]
1039     pxor        mm1, mm1
1040     punpckhbw   mm1, mm0        ; top bytes 4-7 (must read mm0 before it is overwritten)
1041     punpcklbw   mm0, mm2        ; top bytes 0-3
1042     psadbw      mm0, mm2        ; s0
1043     psadbw      mm1, mm2        ; s1
1044     psrlw       mm0, 1
1045     psrlw       mm1, 1
1046     pavgw       mm0, mm2
1047     pavgw       mm1, mm2
1048     pshufw      mm0, mm0, 0     ; dc0 (w)
1049     pshufw      mm1, mm1, 0     ; dc1 (w)
1050     packuswb    mm0, mm1        ; dc0,dc1 (b)
1051     STORE8x8    mm0, mm0
1052     RET
1053
1054 ;-----------------------------------------------------------------------------
1055 ; void predict_8x8c_p_core_sse2( uint8_t *src, int i00, int b, int c )
     ;
     ; Plane prediction for an 8x8 block. Working in 16-bit words with saturating
     ; adds, pixel (x,y) is ( i00 + x*b + y*c ) >> 5, clamped to 0..255 by
     ; packuswb. Four rows are produced per pass through .loop; the body is run
     ; twice (see the call below).
1056 ;-----------------------------------------------------------------------------
1057
1058 cglobal predict_8x8c_p_core_sse2, 1,1
1059     movd        xmm0, r1m
1060     movd        xmm2, r2m
1061     movd        xmm4, r3m
         ; Broadcast the low word of each argument to all 8 word lanes.
1062     pshuflw     xmm0, xmm0, 0
1063     pshuflw     xmm2, xmm2, 0
1064     pshuflw     xmm4, xmm4, 0
1065     punpcklqdq  xmm0, xmm0
1066     punpcklqdq  xmm2, xmm2
1067     punpcklqdq  xmm4, xmm4
1068     pmullw      xmm2, [pw_76543210 GLOBAL]
1069     paddsw      xmm0, xmm2        ; xmm0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b}
1070     movdqa      xmm3, xmm0
1071     paddsw      xmm3, xmm4        ; xmm3 = row 1 accumulator (row 0 + c)
1072     paddsw      xmm4, xmm4        ; xmm4 = 2*c, the step between row pairs
     ; First pass: call the body as a subroutine for rows 0-3. Its RET returns
     ; to the add below, and execution then falls through into .loop for rows
     ; 4-7, where the same RET returns to the caller.
     ; NOTE(review): this relies on RET expanding to a plain ret with no
     ; epilogue for this register declaration -- confirm in x86inc.asm.
1073 call .loop
1074     add           r0, FDEC_STRIDE*4
1075 .loop:
         ; On entry xmm0/xmm3 hold the unshifted accumulators of two
         ; consecutive rows.
1076     movdqa      xmm5, xmm0
1077     movdqa      xmm1, xmm3
1078     psraw       xmm0, 5
1079     psraw       xmm3, 5
1080     packuswb    xmm0, xmm3        ; low half = first row, high half = second row
1081     movq        [r0+FDEC_STRIDE*0], xmm0
1082     movhps      [r0+FDEC_STRIDE*1], xmm0
1083     paddsw      xmm5, xmm4        ; advance both accumulators by two rows
1084     paddsw      xmm1, xmm4
1085     movdqa      xmm0, xmm5        ; keep unshifted copies for the next advance
1086     movdqa      xmm3, xmm1
1087     psraw       xmm5, 5
1088     psraw       xmm1, 5
1089     packuswb    xmm5, xmm1
1090     movq        [r0+FDEC_STRIDE*2], xmm5
1091     movhps      [r0+FDEC_STRIDE*3], xmm5
1092     paddsw      xmm0, xmm4        ; leave xmm0/xmm3 ready for the next 4-row pass
1093     paddsw      xmm3, xmm4
1094     RET
1095
1096 ;-----------------------------------------------------------------------------
1097 ; void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c )
     ;
     ; Plane prediction for a 16x16 block. Working in 16-bit words with
     ; saturating adds, pixel (x,y) is ( i00 + x*b + y*c ) >> 5, clamped to
     ; 0..255 by packuswb. Each loop iteration writes two 16-byte rows with
     ; aligned stores; the loop runs 8 times. Declares 8 xmm registers because
     ; xmm7 is used.
1098 ;-----------------------------------------------------------------------------
1099 cglobal predict_16x16_p_core_sse2, 1,2,8
1100     movd        xmm0, r1m
1101     movd        xmm1, r2m
1102     movd        xmm2, r3m
         ; Broadcast the low word of each argument to all 8 word lanes.
1103     pshuflw     xmm0, xmm0, 0
1104     pshuflw     xmm1, xmm1, 0
1105     pshuflw     xmm2, xmm2, 0
1106     punpcklqdq  xmm0, xmm0
1107     punpcklqdq  xmm1, xmm1
1108     punpcklqdq  xmm2, xmm2
1109     movdqa      xmm3, xmm1
1110     pmullw      xmm3, [pw_76543210 GLOBAL]
1111     psllw       xmm1, 3     ; xmm1 = 8*b in every lane
1112     paddsw      xmm0, xmm3  ; xmm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
1113     paddsw      xmm1, xmm0  ; xmm1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b}
1114     movdqa      xmm7, xmm2
1115     paddsw      xmm7, xmm7  ; xmm7 = 2*c, the per-iteration step (xmm2 stays = c)
1116     mov         r1d, 8      ; 8 iterations x 2 rows
1117 ALIGN 4
1118 .loop:
         ; xmm0/xmm1 = unshifted left/right halves of the even row;
         ; xmm5/xmm6 = the same plus c, i.e. the odd row.
1119     movdqa      xmm3, xmm0
1120     movdqa      xmm4, xmm1
1121     movdqa      xmm5, xmm0
1122     movdqa      xmm6, xmm1
1123     psraw       xmm3, 5
1124     psraw       xmm4, 5
1125     paddsw      xmm5, xmm2
1126     paddsw      xmm6, xmm2
1127     psraw       xmm5, 5
1128     psraw       xmm6, 5
1129     packuswb    xmm3, xmm4
1130     packuswb    xmm5, xmm6
1131     movdqa      [r0+FDEC_STRIDE*0], xmm3
1132     movdqa      [r0+FDEC_STRIDE*1], xmm5
1133     paddsw      xmm0, xmm7
1134     paddsw      xmm1, xmm7
1135     add         r0, FDEC_STRIDE*2
1136     dec         r1d
1137     jg          .loop
1138     REP_RET
1139
1140 ;-----------------------------------------------------------------------------
1141 ; void predict_16x16_v_mmx( uint8_t *src )
     ;
     ; Loads the 16 bytes of the row directly above src as two 8-byte halves and
     ; hands them to STORE16x16. Two registers are declared because STORE16x16
     ; uses r1d as its loop counter.
1142 ;-----------------------------------------------------------------------------
1143 cglobal predict_16x16_v_mmx, 1,2
1144     movq        mm0, [r0 - FDEC_STRIDE]        ; left 8 bytes of the row above
1145     movq        mm1, [r0 - FDEC_STRIDE + 8]    ; right 8 bytes of the row above
1146     STORE16x16  mm0, mm1
1147     REP_RET
1148
1149 ;-----------------------------------------------------------------------------
1150 ; void predict_16x16_v_sse2( uint8_t *src )
     ;
     ; Loads the 16 bytes of the row directly above src with one aligned load
     ; (movdqa, so src - FDEC_STRIDE must be 16-byte aligned) and hands them to
     ; STORE16x16_SSE2, which is defined elsewhere in the file.
1151 ;-----------------------------------------------------------------------------
1152 cglobal predict_16x16_v_sse2, 1,1
1153     movdqa      xmm0, [r0 - FDEC_STRIDE]
1154     STORE16x16_SSE2 xmm0
1155     RET
1156
1157 ;-----------------------------------------------------------------------------
1158 ; void predict_16x16_h_{mmxext,ssse3}( uint8_t *src )
     ;
     ; Macro argument %1 is the suffix of the emitted function name. For each of
     ; the 16 rows, the byte immediately left of the row is passed to SPLATB and
     ; the resulting register is stored across the row (two 8-byte stores when
     ; mmsize is 8, one store otherwise). Rows are processed four at a time,
     ; from the bottom group (rows 12-15) up to the top group (rows 0-3).
1159 ;-----------------------------------------------------------------------------
1160
1161 %macro PRED_16x16_H 1
1162 cglobal predict_16x16_h_%1, 1,2
1163     mov r1, FDEC_STRIDE*12      ; r1 = byte offset of the current 4-row group
1164 %ifidn %1, ssse3
         ; Constant handed to SPLATB as its third operand in the ssse3 build
         ; (presumably a pshufb mask -- SPLATB_SSSE3 is defined elsewhere).
1165     mova   m1, [pb_3 GLOBAL]
1166 %endif
1167 .vloop:
1168 %assign n 0
1169 %rep 4
1170     SPLATB m0, r0+r1+FDEC_STRIDE*n-1, m1
1171     mova [r0+r1+FDEC_STRIDE*n], m0
1172 %if mmsize==8
1173     mova [r0+r1+FDEC_STRIDE*n+8], m0
1174 %endif
1175 %assign n n+1
1176 %endrep
1177     add r1, -FDEC_STRIDE*4      ; step up one group; sign flag set once r1 goes below 0
1178     jge .vloop
1179     REP_RET
1180 %endmacro
1181
1182 ;no SSE2, it's slower than MMX on all systems that don't support SSSE3
     ; mmxext variant: MMX mode (8-byte registers), so each row takes two stores.
1183 INIT_MMX
1184 %define SPLATB SPLATB_MMX
1185 PRED_16x16_H mmxext
     ; ssse3 variant: XMM mode, one store per row.
1186 INIT_XMM
1187 %define SPLATB SPLATB_SSSE3
1188 PRED_16x16_H ssse3
1189
1190 ;-----------------------------------------------------------------------------
1191 ; void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left )
1192 ;-----------------------------------------------------------------------------
1193
     ; PRED16x16_DC addend, shift
     ; Sums the 16 bytes of the row above r0, adds %1 (register or memory
     ; operand), shifts the result right by %2 and hands the resulting byte,
     ; broadcast across mm0, to STORE16x16.
     ; Clobbers mm0, mm1, plus r1d and r0 via STORE16x16.
1194 %macro PRED16x16_DC 2
1195     pxor        mm0, mm0
1196     pxor        mm1, mm1
1197     psadbw      mm0, [r0 - FDEC_STRIDE]       ; SAD against zero = sum of top bytes 0-7
1198     psadbw      mm1, [r0 - FDEC_STRIDE + 8]   ; sum of top bytes 8-15
1199     paddusw     mm0, mm1
1200     paddusw     mm0, %1
1201     psrlw       mm0, %2                       ; dc
1202     pshufw      mm0, mm0, 0
1203     packuswb    mm0, mm0                      ; dc in bytes
1204     STORE16x16  mm0, mm0
1205 %endmacro
1206
1207 cglobal predict_16x16_dc_core_mmxext, 1,2
     ; dc = ( sum(top row) + i_dc_left ) >> 5, stored over the whole block by
     ; PRED16x16_DC. On x86-32 the second argument is used directly as a memory
     ; operand; on x86-64 it is first moved from r1d into mm2.
1208 %ifndef ARCH_X86_64
1209     PRED16x16_DC r1m, 5
1210 %else
1211     movd         mm2, r1d
1212     PRED16x16_DC mm2, 5
1213 %endif
1214     REP_RET
1215
1216 cglobal predict_16x16_dc_top_mmxext, 1,2
     ; DC from the top row only: ( sum(top row) + pw_8 ) >> 4. pw_8 is defined
     ; elsewhere; presumably words of 8, i.e. the rounding term for a /16.
1217     PRED16x16_DC [pw_8 GLOBAL], 4
1218     REP_RET
1219
1220 cglobal predict_16x16_dc_left_core_mmxext, 1,2
     ; Stores an already-computed DC value over the block: the low word of the
     ; second argument (read via r1m) is broadcast, packed to bytes with
     ; unsigned saturation and handed to STORE16x16.
     ; Two registers are declared because STORE16x16 uses r1d as its loop
     ; counter, matching the other STORE16x16 users in this file.
1221     movd       mm0, r1m         ; fetch the argument before STORE16x16 overwrites r1d
1222     pshufw     mm0, mm0, 0
1223     packuswb   mm0, mm0
1224     STORE16x16 mm0, mm0
1225     REP_RET
1226
1227 ;-----------------------------------------------------------------------------
1228 ; void predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left )
1229 ;-----------------------------------------------------------------------------
1230
     ; PRED16x16_DC_SSE2 addend, shift
     ; Sums the 16 bytes of the row above r0 (aligned memory operand), adds %1
     ; (xmm register or memory operand), shifts the result right by %2 and
     ; hands the resulting byte, broadcast across xmm0, to STORE16x16_SSE2.
     ; Clobbers xmm0 and xmm1, plus whatever STORE16x16_SSE2 (defined
     ; elsewhere) touches.
1231 %macro PRED16x16_DC_SSE2 2
1232     pxor        xmm0, xmm0
1233     psadbw      xmm0, [r0 - FDEC_STRIDE]    ; two partial sums, one per 64-bit half
1234     movhlps     xmm1, xmm0                  ; bring the upper partial sum down
1235     paddw       xmm0, xmm1                  ; low word = sum of all 16 top bytes
1236     paddusw     xmm0, %1
1237     psrlw       xmm0, %2                ; dc
1238     pshuflw     xmm0, xmm0, 0
1239     punpcklqdq  xmm0, xmm0
1240     packuswb    xmm0, xmm0              ; dc in bytes
1241     STORE16x16_SSE2 xmm0
1242 %endmacro
1243
1244 cglobal predict_16x16_dc_core_sse2, 1,1
     ; dc = ( sum(top row) + i_dc_left ) >> 5; the second argument is loaded
     ; into xmm2 and used as the macro's addend.
1245     movd xmm2, r1m
1246     PRED16x16_DC_SSE2 xmm2, 5
1247     RET
1248
1249 cglobal predict_16x16_dc_top_sse2, 1,1
     ; DC from the top row only: ( sum(top row) + pw_8 ) >> 4. pw_8 is defined
     ; elsewhere; presumably words of 8, i.e. the rounding term for a /16.
1250     PRED16x16_DC_SSE2 [pw_8 GLOBAL], 4
1251     RET
1252
1253 cglobal predict_16x16_dc_left_core_sse2, 1,1
     ; Stores an already-computed DC value over the block: the low word of the
     ; second argument (read via r1m) is broadcast to all word lanes, packed to
     ; bytes with unsigned saturation and handed to STORE16x16_SSE2.
1254     movd       xmm0, r1m
1255     pshuflw    xmm0, xmm0, 0        ; replicate within the low 64 bits
1256     punpcklqdq xmm0, xmm0           ; ...and into the high 64 bits
1257     packuswb   xmm0, xmm0
1258     STORE16x16_SSE2 xmm0
1259     RET