;*****************************************************************************
;* deblock-a.asm: x86 deblocking
;*****************************************************************************
;* Copyright (C) 2005-2013 x264 project
;*
;* Authors: Loren Merritt
;*          Fiona Glaser
;*          Oskar Arvidsson
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

transpose_shuf: db 0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15

SECTION .text

cextern pb_0
cextern pb_1
cextern pb_3
cextern pb_a1
cextern pw_2
cextern pw_4
cextern pw_00ff
cextern pw_pixel_max

%if HIGH_BIT_DEPTH

; out: %4 = |%1-%2|-%3
; clobbers: %5
%macro ABS_SUB 5
    psubusw %5, %2, %1
    psubusw %4, %1, %2
    por     %4, %5
    psubw   %4, %3
%endmacro

; out: %4 = |%1-%2|<%3
%macro DIFF_LT 5
    psubusw %4, %2, %1
    psubusw %5, %1, %2
    por     %5, %4 ; |%1-%2|
    pxor    %4, %4
    psubw   %5, %3 ; |%1-%2|-%3
    pcmpgtw %4, %5 ; 0 > |%1-%2|-%3
%endmacro

%macro LOAD_AB 4
    movd    %1, %3
    movd    %2, %4
    SPLATW  %1, %1
    SPLATW  %2, %2
%endmacro

; in: %2=tc reg
; out: %1=splatted tc
%macro LOAD_TC 2
%if mmsize == 8
    pshufw      %1, [%2-1], 0
%else
    movd        %1, [%2]
    punpcklbw   %1, %1
    pshuflw     %1, %1, q1100
    pshufd      %1, %1, q1100
%endif
    psraw       %1, 8
%endmacro

; in: %1=p1, %2=p0, %3=q0, %4=q1
;     %5=alpha, %6=beta, %7-%9=tmp
; out: %7=mask
%macro LOAD_MASK 9
    ABS_SUB %2, %3, %5, %8, %7 ; |p0-q0| - alpha
    ABS_SUB %1, %2, %6, %9, %7 ; |p1-p0| - beta
    pand    %8, %9
    ABS_SUB %3, %4, %6, %9, %7 ; |q1-q0| - beta
    pxor    %7, %7
    pand    %8, %9
    pcmpgtw %7, %8
%endmacro

; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
%macro DEBLOCK_P0_Q0 7
    psubw   %3, %4
    pxor    %7, %7
    paddw   %3, [pw_4]
    psubw   %7, %5
    psubw   %6, %2, %1
    psllw   %6, 2
    paddw   %3, %6
    psraw   %3, 3
    mova    %6, [pw_pixel_max]
    CLIPW   %3, %7, %5
    pxor    %7, %7
    paddw   %1, %3
    psubw   %2, %3
    CLIPW   %1, %7, %6
    CLIPW   %2, %7, %6
%endmacro

; in: %1=x2, %2=x1, %3=p0, %4=q0, %5=mask&tc, %6=tmp
%macro LUMA_Q1 6
    pavgw   %6, %3, %4 ; (p0+q0+1)>>1
    paddw   %1, %6
    pxor    %6, %6
    psraw   %1, 1
    psubw   %6, %5
    psubw   %1, %2
    CLIPW   %1, %6, %5
    paddw   %1, %2
%endmacro
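; Scalar reference for the two macros above (illustrative sketch of the
; equations they vectorize; clip1() clamps to [0,pixel_max]):
;
;   delta = clip3( -tc, tc, ((q0-p0)*4 + (p1-q1) + 4) >> 3 )  ; DEBLOCK_P0_Q0
;   p0'   = clip1( p0 + delta )
;   q0'   = clip1( q0 - delta )
;   x1'   = x1 + clip3( -tc, tc,                              ; LUMA_Q1
;                       ((x2 + ((p0+q0+1)>>1)) >> 1) - x1 )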
%macro LUMA_DEBLOCK_ONE 3
    DIFF_LT m5, %1, bm, m4, m6
    pxor    m6, m6
    mova    %3, m4
    pcmpgtw m6, tcm
    pand    m4, tcm
    pandn   m6, m7
    pand    m4, m6
    LUMA_Q1 m5, %2, m1, m2, m4, m6
%endmacro

%macro LUMA_H_STORE 2
%if mmsize == 8
    movq    [r0-4], m0
    movq    [r0+r1-4], m1
    movq    [r0+r1*2-4], m2
    movq    [r0+%2-4], m3
%else
    movq    [r0-4], m0
    movhps  [r0+r1-4], m0
    movq    [r0+r1*2-4], m1
    movhps  [%1-4], m1
    movq    [%1+r1-4], m2
    movhps  [%1+r1*2-4], m2
    movq    [%1+%2-4], m3
    movhps  [%1+r1*4-4], m3
%endif
%endmacro

%macro DEBLOCK_LUMA 0
;-----------------------------------------------------------------------------
; void deblock_v_luma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_luma, 5,5,8
%assign pad 5*mmsize+12-(stack_offset&15)
%define tcm [rsp]
%define ms1 [rsp+mmsize]
%define ms2 [rsp+mmsize*2]
%define am  [rsp+mmsize*3]
%define bm  [rsp+mmsize*4]
    SUB     rsp, pad
    add     r1, r1
    LOAD_AB m4, m5, r2d, r3d
    mov     r3, 32/mmsize
    mov     r2, r0
    sub     r0, r1
    mova    am, m4
    sub     r0, r1
    mova    bm, m5
    sub     r0, r1
.loop:
    mova    m0, [r0+r1]
    mova    m1, [r0+r1*2]
    mova    m2, [r2]
    mova    m3, [r2+r1]
    LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6
    LOAD_TC m6, r4
    mova    tcm, m6
    mova    m5, [r0]
    LUMA_DEBLOCK_ONE m1, m0, ms1
    mova    [r0+r1], m5
    mova    m5, [r2+r1*2]
    LUMA_DEBLOCK_ONE m2, m3, ms2
    mova    [r2+r1], m5
    pxor    m5, m5
    mova    m6, tcm
    pcmpgtw m5, tcm
    psubw   m6, ms1
    pandn   m5, m7
    psubw   m6, ms2
    pand    m5, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
    mova    [r0+r1*2], m1
    mova    [r2], m2
    add     r0, mmsize
    add     r2, mmsize
    add     r4, mmsize/8
    dec     r3
    jg .loop
    ADD     rsp, pad
    RET

cglobal deblock_h_luma, 5,6,8
%assign pad 7*mmsize+12-(stack_offset&15)
%define tcm [rsp]
%define ms1 [rsp+mmsize]
%define ms2 [rsp+mmsize*2]
%define p1m [rsp+mmsize*3]
%define p2m [rsp+mmsize*4]
%define am  [rsp+mmsize*5]
%define bm  [rsp+mmsize*6]
    SUB     rsp, pad
    add     r1, r1
    LOAD_AB m4, m5, r2d, r3d
    mov     r3, r1
    mova    am, m4
    add     r3, r1
    mov     r5, 32/mmsize
    mova    bm, m5
    add     r3, r1
%if mmsize == 16
    mov     r2, r0
    add     r2, r3
%endif
.loop:
%if mmsize == 8
    movq    m2, [r0-8]      ; y q2 q1 q0
    movq    m7, [r0+0]
    movq    m5, [r0+r1-8]
    movq    m3, [r0+r1+0]
    movq    m0, [r0+r1*2-8]
    movq    m6, [r0+r1*2+0]
    movq    m1, [r0+r3-8]
    TRANSPOSE4x4W 2, 5, 0, 1, 4
    SWAP    2, 7
    movq    m7, [r0+r3]
    TRANSPOSE4x4W 2, 3, 6, 7, 4
%else
    movu    m5, [r0-8]      ; y q2 q1 q0 p0 p1 p2 x
    movu    m0, [r0+r1-8]
    movu    m2, [r0+r1*2-8]
    movu    m3, [r2-8]
    TRANSPOSE4x4W 5, 0, 2, 3, 6
    mova    tcm, m3
    movu    m4, [r2+r1-8]
    movu    m1, [r2+r1*2-8]
    movu    m3, [r2+r3-8]
    movu    m7, [r2+r1*4-8]
    TRANSPOSE4x4W 4, 1, 3, 7, 6
    mova    m6, tcm
    punpcklqdq m6, m7
    punpckhqdq m5, m4
    SBUTTERFLY qdq, 0, 1, 7
    SBUTTERFLY qdq, 2, 3, 7
%endif
    mova    p2m, m6
    LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6
    LOAD_TC m6, r4
    mova    tcm, m6
    LUMA_DEBLOCK_ONE m1, m0, ms1
    mova    p1m, m5
    mova    m5, p2m
    LUMA_DEBLOCK_ONE m2, m3, ms2
    mova    p2m, m5
    pxor    m5, m5
    mova    m6, tcm
    pcmpgtw m5, tcm
    psubw   m6, ms1
    pandn   m5, m7
    psubw   m6, ms2
    pand    m5, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
    mova    m0, p1m
    mova    m3, p2m
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    LUMA_H_STORE r2, r3
    add     r4, mmsize/8
    lea     r0, [r0+r1*(mmsize/2)]
    lea     r2, [r2+r1*(mmsize/2)]
    dec     r5
    jg .loop
    ADD     rsp, pad
    RET
%endmacro
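; Scalar sketch of the tc masking in the loops above (illustrative
; reference): tcm is tc0 splatted to words, and ms1/ms2 are the 0/-1 masks
; produced by the two LUMA_DEBLOCK_ONE calls. Subtracting a 0/-1 mask from
; tc adds 0 or 1, and lanes with tc0 < 0 are cleared from the final mask:
;
;   if( tc0 < 0 ) leave this lane unfiltered      ; pcmpgtw tcm / pandn
;   tc = tc0 + filtered_p1 + filtered_q1          ; psubw m6, ms1 / ms2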
%if ARCH_X86_64
; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
;     m12=alpha, m13=beta
; out: m0=p1', m3=q1', m1=p0', m2=q0'
; clobbers: m4, m5, m6, m7, m10, m11, m14
%macro DEBLOCK_LUMA_INTER_SSE2 0
    LOAD_MASK m0, m1, m2, m3, m12, m13, m7, m4, m6
    LOAD_TC m6, r4
    DIFF_LT m8, m1, m13, m10, m4
    DIFF_LT m9, m2, m13, m11, m4
    pand    m6, m7
    mova    m14, m6
    pxor    m4, m4
    pcmpgtw m6, m4
    pand    m6, m14
    mova    m5, m10
    pand    m5, m6
    LUMA_Q1 m8, m0, m1, m2, m5, m4
    mova    m5, m11
    pand    m5, m6
    LUMA_Q1 m9, m3, m1, m2, m5, m4
    pxor    m4, m4
    psubw   m6, m10
    pcmpgtw m4, m14
    pandn   m4, m7
    psubw   m6, m11
    pand    m4, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6
    SWAP    0, 8
    SWAP    3, 9
%endmacro

%macro DEBLOCK_LUMA_64 0
cglobal deblock_v_luma, 5,5,15
%define p2 m8
%define p1 m0
%define p0 m1
%define q0 m2
%define q1 m3
%define q2 m9
%define mask0 m7
%define mask1 m10
%define mask2 m11
    add     r1, r1
    LOAD_AB m12, m13, r2d, r3d
    mov     r2, r0
    sub     r0, r1
    sub     r0, r1
    sub     r0, r1
    mov     r3, 2
.loop:
    mova    p2, [r0]
    mova    p1, [r0+r1]
    mova    p0, [r0+r1*2]
    mova    q0, [r2]
    mova    q1, [r2+r1]
    mova    q2, [r2+r1*2]
    DEBLOCK_LUMA_INTER_SSE2
    mova    [r0+r1], p1
    mova    [r0+r1*2], p0
    mova    [r2], q0
    mova    [r2+r1], q1
    add     r0, mmsize
    add     r2, mmsize
    add     r4, 2
    dec     r3
    jg .loop
    RET

cglobal deblock_h_luma, 5,7,15
    add     r1, r1
    LOAD_AB m12, m13, r2d, r3d
    mov     r2, r1
    add     r2, r1
    add     r2, r1
    mov     r5, r0
    add     r5, r2
    mov     r6, 2
.loop:
    movu    m8, [r0-8]      ; y q2 q1 q0 p0 p1 p2 x
    movu    m0, [r0+r1-8]
    movu    m2, [r0+r1*2-8]
    movu    m9, [r5-8]
    movu    m5, [r5+r1-8]
    movu    m1, [r5+r1*2-8]
    movu    m3, [r5+r2-8]
    movu    m7, [r5+r1*4-8]
    TRANSPOSE4x4W 8, 0, 2, 9, 10
    TRANSPOSE4x4W 5, 1, 3, 7, 10
    punpckhqdq m8, m5
    SBUTTERFLY qdq, 0, 1, 10
    SBUTTERFLY qdq, 2, 3, 10
    punpcklqdq m9, m7
    DEBLOCK_LUMA_INTER_SSE2
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    LUMA_H_STORE r5, r2
    add     r4, 2
    lea     r0, [r0+r1*8]
    lea     r5, [r5+r1*8]
    dec     r6
    jg .loop
    RET
%endmacro

INIT_XMM sse2
DEBLOCK_LUMA_64
INIT_XMM avx
DEBLOCK_LUMA_64
%endif

%macro SWAPMOVA 2
%ifid %1
    SWAP %1, %2
%else
    mova %1, %2
%endif
%endmacro

; in: t0-t2: tmp registers
;     %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0
;     %8=mask1p %9=2 %10=p0' %11=p1' %12=p2'
%macro LUMA_INTRA_P012 12 ; p0..p3 in memory
%if ARCH_X86_64
    paddw   t0, %3, %2
    mova    t2, %4
    paddw   t2, %3
%else
    mova    t0, %3
    mova    t2, %4
    paddw   t0, %2
    paddw   t2, %3
%endif
    paddw   t0, %1
    paddw   t2, t2
    paddw   t0, %5
    paddw   t2, %9
    paddw   t0, %9   ; (p2 + p1 + p0 + q0 + 2)
    paddw   t2, t0   ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4)
    psrlw   t2, 3
    psrlw   t1, t0, 2
    psubw   t2, %3
    psubw   t1, %2
    pand    t2, %8
    pand    t1, %8
    paddw   t2, %3
    paddw   t1, %2
    SWAPMOVA %11, t1
    psubw   t1, t0, %3
    paddw   t0, t0
    psubw   t1, %5
    psubw   t0, %3
    paddw   t1, %6
    paddw   t1, %2
    paddw   t0, %6
    psrlw   t1, 2    ; (2*p1 + p0 + q1 + 2)/4
    psrlw   t0, 3    ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3
    pxor    t0, t1
    pxor    t1, %1
    pand    t0, %8
    pand    t1, %7
    pxor    t0, t1
    pxor    t0, %1
    SWAPMOVA %10, t0
    SWAPMOVA %12, t2
%endmacro
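; Scalar reference for the xor/and blend at the end of LUMA_INTRA_P012
; (illustrative; mask1p already implies mask0 in all callers):
;
;   if( mask1p )
;       p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
;       p1' = (p2 + p1 + p0 + q0 + 2) >> 2
;       p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
;   else if( mask0 )
;       p0' = (2*p1 + p0 + q1 + 2) >> 2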
%macro LUMA_INTRA_INIT 1
%xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15)
%define t0 m4
%define t1 m5
%define t2 m6
%define t3 m7
%assign i 4
%rep %1
    CAT_XDEFINE t, i, [rsp+mmsize*(i-4)]
%assign i i+1
%endrep
    SUB     rsp, pad
    add     r1, r1
%endmacro

; in: %1-%3=tmp, %4=p2, %5=q2
%macro LUMA_INTRA_INTER 5
    LOAD_AB t0, t1, r2d, r3d
    mova    %1, t0
    LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3
%if ARCH_X86_64
    mova    %2, t0        ; mask0
    psrlw   t3, %1, 2
%else
    mova    t3, %1
    mova    %2, t0        ; mask0
    psrlw   t3, 2
%endif
    paddw   t3, [pw_2]    ; alpha/4+2
    DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2
    pand    t2, %2
    mova    t3, %5        ; q2
    mova    %1, t2        ; mask1
    DIFF_LT t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta
    pand    t2, %1
    mova    t3, %4        ; p2
    mova    %3, t2        ; mask1q
    DIFF_LT t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta
    pand    t2, %1
    mova    %1, t2        ; mask1p
%endmacro

%macro LUMA_H_INTRA_LOAD 0
%if mmsize == 8
    movu    t0, [r0-8]
    movu    t1, [r0+r1-8]
    movu    m0, [r0+r1*2-8]
    movu    m1, [r0+r4-8]
    TRANSPOSE4x4W 4, 5, 0, 1, 2
    mova    t4, t0        ; p3
    mova    t5, t1        ; p2
    movu    m2, [r0]
    movu    m3, [r0+r1]
    movu    t0, [r0+r1*2]
    movu    t1, [r0+r4]
    TRANSPOSE4x4W 2, 3, 4, 5, 6
    mova    t6, t0        ; q2
    mova    t7, t1        ; q3
%else
    movu    t0, [r0-8]
    movu    t1, [r0+r1-8]
    movu    m0, [r0+r1*2-8]
    movu    m1, [r0+r5-8]
    movu    m2, [r4-8]
    movu    m3, [r4+r1-8]
    movu    t2, [r4+r1*2-8]
    movu    t3, [r4+r5-8]
    TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5
    mova    t4, t0        ; p3
    mova    t5, t1        ; p2
    mova    t6, t2        ; q2
    mova    t7, t3        ; q3
%endif
%endmacro

; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp
%macro LUMA_H_INTRA_STORE 9
%if mmsize == 8
    TRANSPOSE4x4W %1, %2, %3, %4, %9
    movq    [r0-8], m%1
    movq    [r0+r1-8], m%2
    movq    [r0+r1*2-8], m%3
    movq    [r0+r4-8], m%4
    movq    m%1, %8
    TRANSPOSE4x4W %5, %6, %7, %1, %9
    movq    [r0], m%5
    movq    [r0+r1], m%6
    movq    [r0+r1*2], m%7
    movq    [r0+r4], m%1
%else
    TRANSPOSE2x4x4W %1, %2, %3, %4, %9
    movq    [r0-8], m%1
    movq    [r0+r1-8], m%2
    movq    [r0+r1*2-8], m%3
    movq    [r0+r5-8], m%4
    movhps  [r4-8], m%1
    movhps  [r4+r1-8], m%2
    movhps  [r4+r1*2-8], m%3
    movhps  [r4+r5-8], m%4
%ifnum %8
    SWAP    %1, %8
%else
    mova    m%1, %8
%endif
    TRANSPOSE2x4x4W %5, %6, %7, %1, %9
    movq    [r0], m%5
    movq    [r0+r1], m%6
    movq    [r0+r1*2], m%7
    movq    [r0+r5], m%1
    movhps  [r4], m%5
    movhps  [r4+r1], m%6
    movhps  [r4+r1*2], m%7
    movhps  [r4+r5], m%1
%endif
%endmacro

%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA_INTRA_64 0
cglobal deblock_v_luma_intra, 4,7,16
%define t0 m1
%define t1 m2
%define t2 m4
%define p2 m8
%define p1 m9
%define p0 m10
%define q0 m11
%define q1 m12
%define q2 m13
%define aa m5
%define bb m14
    add     r1, r1
    lea     r4, [r1*4]
    lea     r5, [r1*3]  ; 3*stride
    neg     r4
    add     r4, r0      ; pix-4*stride
    mov     r6, 2
    mova    m0, [pw_2]
    LOAD_AB aa, bb, r2d, r3d
.loop:
    mova    p2, [r4+r1]
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
    mova    q2, [r0+2*r1]
    LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1
    mova    t2, aa
    psrlw   t2, 2
    paddw   t2, m0 ; alpha/4+2
    DIFF_LT p0, q0, t2, m6, t0 ; m6 = |p0-q0| < alpha/4+2
    DIFF_LT p2, p0, bb, t1, t0 ; t1 = |p2-p0| < beta
    DIFF_LT q2, q0, bb, m7, t0 ; m7 = |q2-q0| < beta
    pand    m6, m3
    pand    m7, m6
    pand    m6, t1
    LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1]
    LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1]
    add     r0, mmsize
    add     r4, mmsize
    dec     r6
    jg .loop
    RET

;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra, 4,7,16
%define t0 m15
%define t1 m14
%define t2 m2
%define q3 m5
%define q2 m8
%define q1 m9
%define q0 m10
%define p0 m11
%define p1 m12
%define p2 m13
%define p3 m4
%define spill [rsp]
%assign pad 24-(stack_offset&15)
    SUB     rsp, pad
    add     r1, r1
    lea     r4, [r1*4]
    lea     r5, [r1*3]  ; 3*stride
    add     r4, r0      ; pix+4*stride
    mov     r6, 2
    mova    m0, [pw_2]
.loop:
    movu    q3, [r0-8]
    movu    q2, [r0+r1-8]
    movu    q1, [r0+r1*2-8]
    movu    q0, [r0+r5-8]
    movu    p0, [r4-8]
    movu    p1, [r4+r1-8]
    movu    p2, [r4+r1*2-8]
    movu    p3, [r4+r5-8]
    TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1
    LOAD_AB m1, m2, r2d, r3d
    LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1
    psrlw   m1, 2
    paddw   m1, m0 ; alpha/4+2
    DIFF_LT p0, q0, m1, m6, t0 ; m6 = |p0-q0| < alpha/4+2
    DIFF_LT q2, q0, m2, t1, t0 ; t1 = |q2-q0| < beta
    DIFF_LT p0, p2, m2, m7, t0 ; m7 = |p2-p0| < beta
    pand    m6, m3
    pand    m7, m6
    pand    m6, t1
    mova    spill, q3
    LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2
    LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2
    mova    m7, spill
    LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14
    lea     r0, [r0+r1*8]
    lea     r4, [r4+r1*8]
    dec     r6
    jg .loop
    ADD     rsp, pad
    RET
%endmacro

INIT_XMM sse2
DEBLOCK_LUMA_INTRA_64
INIT_XMM avx
DEBLOCK_LUMA_INTRA_64

%endif
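; Filter decisions shared by the intra luma paths, as a scalar sketch
; (illustrative; this is what LOAD_MASK plus the DIFF_LT chain computes):
;
;   mask0  = |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta
;   mask1p = mask0 && |p0-q0| < alpha/4+2 && |p2-p0| < beta
;   mask1q = mask0 && |p0-q0| < alpha/4+2 && |q2-q0| < beta
;
; mask1p/mask1q select the deep 3-pixel filter per side; mask0 alone
; selects the weak p0/q0 substitute.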
%macro DEBLOCK_LUMA_INTRA 0
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_intra, 4,7,8
    LUMA_INTRA_INIT 3
    lea     r4, [r1*4]
    lea     r5, [r1*3]
    neg     r4
    add     r4, r0
    mov     r6, 32/mmsize
.loop:
    mova    m0, [r4+r1*2] ; p1
    mova    m1, [r4+r5]   ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2]
    LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1]
    mova    t3, [r0+r1*2] ; q2
    LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1]
    add     r0, mmsize
    add     r4, mmsize
    dec     r6
    jg .loop
    ADD     rsp, pad
    RET

;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra, 4,7,8
    LUMA_INTRA_INIT 8
%if mmsize == 8
    lea     r4, [r1*3]
    mov     r5, 32/mmsize
%else
    lea     r4, [r1*4]
    lea     r5, [r1*3]  ; 3*stride
    add     r4, r0      ; pix+4*stride
    mov     r6, 32/mmsize
%endif
.loop:
    LUMA_H_INTRA_LOAD
    LUMA_INTRA_INTER t8, t9, t10, t5, t6
    LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11
    mova    t3, t6 ; q2
    LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5
    mova    m2, t4
    mova    m0, t11
    mova    m1, t5
    mova    m3, t8
    mova    m6, t6
    LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7
    lea     r0, [r0+r1*(mmsize/2)]
%if mmsize == 8
    dec     r5
%else
    lea     r4, [r4+r1*(mmsize/2)]
    dec     r6
%endif
    jg .loop
    ADD     rsp, pad
    RET
%endmacro

%if ARCH_X86_64 == 0
INIT_MMX mmx2
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
INIT_XMM sse2
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
INIT_XMM avx
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
%endif
%endif ; HIGH_BIT_DEPTH

%if HIGH_BIT_DEPTH == 0

; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]

%define PASS8ROWS(base, base3, stride, stride3, offset) \
    PASS8ROWS(base+offset, base3+offset, stride, stride3)
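; e.g. PASS8ROWS(p, p+3*s, s, 3*s) expands to the eight row addresses
; [p], [p+s], [p+2*s], ..., [p+7*s]; the second base pointer is needed
; because x86 addressing can only scale an index register by 1, 2, 4 or 8,
; so rows 4..7 are reached relative to base3 = base+3*stride.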
; in: 8 rows of 4 bytes in %4..%11
; out: 4 rows of 8 bytes in m0..m3
%macro TRANSPOSE4x8_LOAD 11
    movh       m0, %4
    movh       m2, %5
    movh       m1, %6
    movh       m3, %7
    punpckl%1  m0, m2
    punpckl%1  m1, m3
    mova       m2, m0
    punpckl%2  m0, m1
    punpckh%2  m2, m1
    movh       m4, %8
    movh       m6, %9
    movh       m5, %10
    movh       m7, %11
    punpckl%1  m4, m6
    punpckl%1  m5, m7
    mova       m6, m4
    punpckl%2  m4, m5
    punpckh%2  m6, m5
    punpckh%3  m1, m0, m4
    punpckh%3  m3, m2, m6
    punpckl%3  m0, m4
    punpckl%3  m2, m6
%endmacro

; in: 4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4B_STORE 8
    punpckhdq  m4, m0, m0
    punpckhdq  m5, m1, m1
    punpckhdq  m6, m2, m2
    punpcklbw  m0, m1
    punpcklbw  m2, m3
    punpcklwd  m1, m0, m2
    punpckhwd  m0, m2
    movh       %1, m1
    punpckhdq  m1, m1
    movh       %2, m1
    movh       %3, m0
    punpckhdq  m0, m0
    movh       %4, m0
    punpckhdq  m3, m3
    punpcklbw  m4, m5
    punpcklbw  m6, m3
    punpcklwd  m5, m4, m6
    punpckhwd  m4, m6
    movh       %5, m5
    punpckhdq  m5, m5
    movh       %6, m5
    movh       %7, m4
    punpckhdq  m4, m4
    movh       %8, m4
%endmacro

%macro TRANSPOSE4x8B_LOAD 8
    TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8
%endmacro

%macro TRANSPOSE4x8W_LOAD 8
%if mmsize == 16
    TRANSPOSE4x8_LOAD wd, dq, qdq, %1, %2, %3, %4, %5, %6, %7, %8
%else
    SWAP  1, 4, 2, 3
    mova  m0, [t5]
    mova  m1, [t5+r1]
    mova  m2, [t5+r1*2]
    mova  m3, [t5+t6]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
%endif
%endmacro

%macro TRANSPOSE8x2W_STORE 8
    punpckhwd  m0, m1, m2
    punpcklwd  m1, m2
%if mmsize == 8
    movd       %3, m0
    movd       %1, m1
    psrlq      m1, 32
    psrlq      m0, 32
    movd       %2, m1
    movd       %4, m0
%else
    movd       %5, m0
    movd       %1, m1
    psrldq     m1, 4
    psrldq     m0, 4
    movd       %2, m1
    movd       %6, m0
    psrldq     m1, 4
    psrldq     m0, 4
    movd       %3, m1
    movd       %7, m0
    psrldq     m1, 4
    psrldq     m0, 4
    movd       %4, m1
    movd       %8, m0
%endif
%endmacro

%macro SBUTTERFLY3 4
    punpckh%1  %4, %2, %3
    punpckl%1  %2, %3
%endmacro

; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
%macro TRANSPOSE6x8_MEM 9
    RESET_MM_PERMUTATION
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    movq  [%9+0x10], m3
    SBUTTERFLY3 bw, m6, %8, m7
    SBUTTERFLY wd, 0, 2, 3
    SBUTTERFLY wd, 4, 6, 3
    punpckhdq m0, m4
    movq  [%9+0x00], m0
    SBUTTERFLY3 wd, m1, [%9+0x10], m3
    SBUTTERFLY wd, 5, 7, 0
    SBUTTERFLY dq, 1, 5, 0
    SBUTTERFLY dq, 2, 6, 0
    punpckldq m3, m7
    movq  [%9+0x10], m2
    movq  [%9+0x20], m6
    movq  [%9+0x30], m1
    movq  [%9+0x40], m5
    movq  [%9+0x50], m3
    RESET_MM_PERMUTATION
%endmacro

; in: 8 rows of 8 in %1..%8
; out: 8 rows of 8 in %9..%16
%macro TRANSPOSE8x8_MEM 16
    RESET_MM_PERMUTATION
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    SBUTTERFLY3 bw, m6, %8, m7
    movq  %9, m5
    SBUTTERFLY wd, 0, 2, 5
    SBUTTERFLY wd, 4, 6, 5
    SBUTTERFLY wd, 1, 3, 5
    movq  %11, m6
    movq  m6, %9
    SBUTTERFLY wd, 6, 7, 5
    SBUTTERFLY dq, 0, 4, 5
    SBUTTERFLY dq, 1, 6, 5
    movq  %9, m0
    movq  %10, m4
    movq  %13, m1
    movq  %14, m6
    SBUTTERFLY3 dq, m2, %11, m0
    SBUTTERFLY dq, 3, 7, 4
    movq  %11, m2
    movq  %12, m0
    movq  %15, m3
    movq  %16, m7
    RESET_MM_PERMUTATION
%endmacro

; out: %4 = |%1-%2|>%3
; clobbers: %5
%macro DIFF_GT 5
%if avx_enabled == 0
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
%else
    psubusb %5, %2, %1
    psubusb %4, %1, %2
%endif
    por     %4, %5
    psubusb %4, %3
%endmacro

; out: %4 = |%1-%2|>%3
; clobbers: %5
%macro DIFF_GT2 5
%if ARCH_X86_64
    psubusb %5, %2, %1
    psubusb %4, %1, %2
%else
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
%endif
    psubusb %5, %3
    psubusb %4, %3
    pcmpeqb %4, %5
%endmacro

; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
; out: m5=beta-1, m7=mask, %3=alpha-1
; clobbers: m4,m6
%macro LOAD_MASK 2-3
    movd     m4, %1
    movd     m5, %2
    SPLATW   m4, m4
    SPLATW   m5, m5
    packuswb m4, m4  ; 16x alpha-1
    packuswb m5, m5  ; 16x beta-1
%if %0>2
    mova     %3, m4
%endif
    DIFF_GT  m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
    DIFF_GT  m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
    por      m7, m4
    DIFF_GT  m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
    por      m7, m4
    pxor     m6, m6
    pcmpeqb  m7, m6
%endmacro

; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
; out: m1=p0' m2=q0'
; clobbers: m0,3-6
%macro DEBLOCK_P0_Q0 0
    pxor    m5, m1, m2 ; p0^q0
    pand    m5, [pb_1] ; (p0^q0)&1
    pcmpeqb m4, m4
    pxor    m3, m4
    pavgb   m3, m0     ; (p1 - q1 + 256)>>1
    pavgb   m3, [pb_3] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
    pxor    m4, m1
    pavgb   m4, m2     ; (q0 - p0 + 256)>>1
    pavgb   m3, m5
    paddusb m3, m4     ; d+128+33
    mova    m6, [pb_a1]
    psubusb m6, m3
    psubusb m3, [pb_a1]
    pminub  m6, m7
    pminub  m3, m7
    psubusb m1, m6
    psubusb m2, m3
    paddusb m1, m3
    paddusb m2, m6
%endmacro
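; Why the pavgb chain above works (illustrative sketch): pavgb computes
; (a+b+1)>>1 without overflow, so the sequence builds, up to pavgb
; rounding, the usual delta
;   d = ((q0-p0)*4 + (p1-q1) + 4) >> 3
; biased by +0xa1 (pb_a1, the "d+128+33" above). psubusb against pb_a1
; splits that into max(0,d) and max(0,-d), each half is clamped to tc
; with pminub, and the halves are applied to p0/q0 with saturating
; adds/subs, so no lane ever wraps.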
; in: m1=p0 m2=q0
;     %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
%macro LUMA_Q1 6
    pavgb   %6, m1, m2
    pavgb   %2, %6      ; avg(p2,avg(p0,q0))
    pxor    %6, %3
    pand    %6, [pb_1]  ; (p2^avg(p0,q0))&1
    psubusb %2, %6      ; (p2+((p0+q0+1)>>1))>>1
    psubusb %6, %1, %5
    paddusb %5, %1
    pmaxub  %2, %6
    pminub  %2, %5
    mova    %4, %2
%endmacro

%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void deblock_v_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA 0
cglobal deblock_v_luma, 5,5,10
    movd    m8, [r4]  ; tc0
    lea     r4, [r1*3]
    dec     r2d       ; alpha-1
    neg     r4
    dec     r3d       ; beta-1
    add     r4, r0    ; pix-3*stride
    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LOAD_MASK r2d, r3d
    punpcklbw m8, m8
    punpcklbw m8, m8  ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    pcmpeqb m9, m9
    pcmpeqb m9, m8
    pandn   m9, m7
    pand    m8, m9
    movdqa  m3, [r4]  ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m9
    psubb   m7, m8, m6
    pand    m6, m8
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
    movdqa  m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand    m6, m9
    pand    m8, m6
    psubb   m7, m6
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6
    DEBLOCK_P0_Q0
    mova    [r4+2*r1], m1
    mova    [r0], m2
    RET

;-----------------------------------------------------------------------------
; void deblock_h_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX cpuname
cglobal deblock_h_luma, 5,9
    lea     r8, [r1*3]
    lea     r6, [r0-4]
    lea     r5, [r0-4+r8]
%if WIN64
    sub     rsp, 0x98
    %define pix_tmp rsp+0x30
%else
    sub     rsp, 0x68
    %define pix_tmp rsp
%endif

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r1, r8), pix_tmp
    lea     r6, [r6+r1*8]
    lea     r5, [r5+r1*8]
    TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r1, r8), pix_tmp+8

    ; vertical filter
    ; alpha, beta, tc0 are still in r2d, r3d, r4
    ; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them
    mov     r7, r1
    lea     r0, [pix_tmp+0x30]
    mov     r1d, 0x10
%if WIN64
    mov     [rsp+0x20], r4
%endif
    call    deblock_v_luma

    ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
    add     r6, 2
    add     r5, 2
    movq    m0, [pix_tmp+0x18]
    movq    m1, [pix_tmp+0x28]
    movq    m2, [pix_tmp+0x38]
    movq    m3, [pix_tmp+0x48]
    TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)

    shl     r7, 3
    sub     r6, r7
    sub     r5, r7
    shr     r7, 3
    movq    m0, [pix_tmp+0x10]
    movq    m1, [pix_tmp+0x20]
    movq    m2, [pix_tmp+0x30]
    movq    m3, [pix_tmp+0x40]
    TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)

%if WIN64
    add     rsp, 0x98
%else
    add     rsp, 0x68
%endif
    RET
%endmacro

INIT_XMM sse2
DEBLOCK_LUMA
INIT_XMM avx
DEBLOCK_LUMA
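; deblock_h_luma above does not filter horizontally in place: it transposes
; the relevant 16x6 strip into a stack buffer, runs the vertical kernel on
; it with stride 16 (pix_tmp+0x30 = row 3), then transposes only the four
; modified middle rows back. A sketch of the sequence (illustrative):
;
;   transpose 6x16 pixels -> pix_tmp
;   deblock_v_luma( pix_tmp+0x30, 16, alpha, beta, tc0 )
;   transpose the middle 4 rows of pix_tmp back into pix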
%else

%macro DEBLOCK_LUMA 2
;-----------------------------------------------------------------------------
; void deblock_v8_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_%1_luma, 5,5,8,2*%2
    lea     r4, [r1*3]
    dec     r2      ; alpha-1
    neg     r4
    dec     r3      ; beta-1
    add     r4, r0  ; pix-3*stride
    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LOAD_MASK r2d, r3d
    mov     r3, r4mp
    movd    m4, [r3]  ; tc0
    punpcklbw m4, m4
    punpcklbw m4, m4  ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    mova    [esp+%2], m4 ; tc
    pcmpeqb m3, m3
    pcmpgtb m4, m3
    pand    m4, m7
    mova    [esp], m4 ; mask
    mova    m3, [r4] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m4
    pand    m4, [esp+%2] ; tc
    psubb   m7, m4, m6
    pand    m6, m4
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
    mova    m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    mova    m5, [esp] ; mask
    pand    m6, m5
    mova    m5, [esp+%2] ; tc
    pand    m5, m6
    psubb   m7, m6
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6
    DEBLOCK_P0_Q0
    mova    [r4+2*r1], m1
    mova    [r0], m2
    RET

;-----------------------------------------------------------------------------
; void deblock_h_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX cpuname
cglobal deblock_h_luma, 0,5,8,0x60+HAVE_ALIGNED_STACK*12
    mov     r0, r0mp
    mov     r3, r1m
    lea     r4, [r3*3]
    sub     r0, 4
    lea     r1, [r0+r4]
    %define pix_tmp esp+12*HAVE_ALIGNED_STACK

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp
    lea     r0, [r0+r3*8]
    lea     r1, [r1+r3*8]
    TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8

    ; vertical filter
    lea     r0, [pix_tmp+0x30]
    PUSH    dword r4m
    PUSH    dword r3m
    PUSH    dword r2m
    PUSH    dword 16
    PUSH    dword r0
    call    deblock_%1_luma
%ifidn %1, v8
    add     dword [esp   ], 8 ; pix_tmp+0x38
    add     dword [esp+16], 2 ; tc0+2
    call    deblock_%1_luma
%endif
    ADD     esp, 20

    ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
    mov     r0, r0mp
    sub     r0, 2
    lea     r1, [r0+r4]

    movq    m0, [pix_tmp+0x10]
    movq    m1, [pix_tmp+0x20]
    movq    m2, [pix_tmp+0x30]
    movq    m3, [pix_tmp+0x40]
    TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)

    lea     r0, [r0+r3*8]
    lea     r1, [r1+r3*8]
    movq    m0, [pix_tmp+0x18]
    movq    m1, [pix_tmp+0x28]
    movq    m2, [pix_tmp+0x38]
    movq    m3, [pix_tmp+0x48]
    TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)
    RET
%endmacro ; DEBLOCK_LUMA

INIT_MMX mmx2
DEBLOCK_LUMA v8, 8
INIT_XMM sse2
DEBLOCK_LUMA v, 16
INIT_XMM avx
DEBLOCK_LUMA v, 16

%endif ; ARCH

%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
%if ARCH_X86_64
    pavgb   t0, p2, p1
    pavgb   t1, p0, q0
%else
    mova    t0, p2
    mova    t1, p0
    pavgb   t0, p1
    pavgb   t1, q0
%endif
    pavgb   t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
    mova    t5, t1
%if ARCH_X86_64
    paddb   t2, p2, p1
    paddb   t3, p0, q0
%else
    mova    t2, p2
    mova    t3, p0
    paddb   t2, p1
    paddb   t3, q0
%endif
    paddb   t2, t3
    mova    t3, t2
    mova    t4, t2
    psrlw   t2, 1
    pavgb   t2, mpb_0
    pxor    t2, t0
    pand    t2, mpb_1
    psubb   t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;
%if ARCH_X86_64
    pavgb   t1, p2, q1
    psubb   t2, p2, q1
%else
    mova    t1, p2
    mova    t2, p2
    pavgb   t1, q1
    psubb   t2, q1
%endif
    paddb   t3, t3
    psubb   t3, t2 ; p2+2*p1+2*p0+2*q0+q1
    pand    t2, mpb_1
    psubb   t1, t2
    pavgb   t1, p1
    pavgb   t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
    psrlw   t3, 2
    pavgb   t3, mpb_0
    pxor    t3, t1
    pand    t3, mpb_1
    psubb   t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8
    pxor    t3, p0, q1
    pavgb   t2, p0, q1
    pand    t3, mpb_1
    psubb   t2, t3
    pavgb   t2, p1 ; p0'b = (2*p1+p0+q1+2)/4
    pxor    t1, t2
    pxor    t2, p0
    pand    t1, mask1p
    pand    t2, mask0
    pxor    t1, t2
    pxor    t1, p0
    mova    %1, t1 ; store p0
    mova    t1, %4 ; p3
    paddb   t2, t1, p2
    pavgb   t1, p2
    pavgb   t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
    paddb   t2, t2
    paddb   t2, t4 ; 2*p3+3*p2+p1+p0+q0
    psrlw   t2, 2
    pavgb   t2, mpb_0
    pxor    t2, t1
    pand    t2, mpb_1
    psubb   t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8
    pxor    t0, p1
    pxor    t1, p2
    pand    t0, mask1p
    pand    t1, mask1p
    pxor    t0, p1
    pxor    t1, p2
    mova    %2, t0 ; store p1
    mova    %3, t1 ; store p2
%endmacro

%macro LUMA_INTRA_SWAP_PQ 0
    %define q1 m0
    %define q0 m1
    %define p0 m2
    %define p1 m3
    %define p2 q2
    %define mask1p mask1q
%endmacro
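; LUMA_INTRA_SWAP_PQ relabels the registers (p<->q) so that a second
; LUMA_INTRA_P012 invocation filters the q side of the edge with the same
; code; only the %defines change, no data is moved.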
%macro DEBLOCK_LUMA_INTRA 1
%define p1 m0
%define p0 m1
%define q0 m2
%define q1 m3
%define t0 m4
%define t1 m5
%define t2 m6
%define t3 m7
%if ARCH_X86_64
%define p2 m8
%define q2 m9
%define t4 m10
%define t5 m11
%define mask0 m12
%define mask1p m13
%if WIN64
%define mask1q [rsp]
%else
%define mask1q [rsp-24]
%endif
%define mpb_0 m14
%define mpb_1 m15
%else
%define spill(x) [esp+16*x]
%define p2 [r4+r1]
%define q2 [r0+2*r1]
%define t4 spill(0)
%define t5 spill(1)
%define mask0 spill(2)
%define mask1p spill(3)
%define mask1q spill(4)
%define mpb_0 [pb_0]
%define mpb_1 [pb_1]
%endif

;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_%1_luma_intra, 4,6,16,0-(1-ARCH_X86_64)*0x50-WIN64*0x10
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    dec     r2d        ; alpha-1
    jl .end
    neg     r4
    dec     r3d        ; beta-1
    jl .end
    add     r4, r0     ; pix-4*stride
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
%if ARCH_X86_64
    pxor    mpb_0, mpb_0
    mova    mpb_1, [pb_1]
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    SWAP    7, 12 ; m12=mask0
    pavgb   t5, mpb_0
    pavgb   t5, mpb_1 ; alpha/4+1
    movdqa  p2, [r4+r1]
    movdqa  q2, [r0+2*r1]
    DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
    DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
    DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
    pand    t0, mask0
    pand    t4, t0
    pand    t2, t0
    mova    mask1q, t4
    mova    mask1p, t2
%else
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    mova    m4, t5
    mova    mask0, m7
    pavgb   m4, [pb_0]
    pavgb   m4, [pb_1] ; alpha/4+1
    DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
    pand    m6, mask0
    DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
    pand    m4, m6
    mova    mask1p, m4
    DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
    pand    m4, m6
    mova    mask1q, m4
%endif
    LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
    LUMA_INTRA_SWAP_PQ
    LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
    REP_RET

INIT_MMX cpuname
%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra, 4,9
    lea     r8, [r1*3]
    lea     r6, [r0-4]
    lea     r5, [r0-4+r8]
    sub     rsp, 0x88
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r1, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea     r6, [r6+r1*8]
    lea     r5, [r5+r1*8]
    TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r1, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    mov     r7, r1
    lea     r0, [pix_tmp+0x40]
    mov     r1, 0x10
    call    deblock_v_luma_intra

    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    lea     r5, [r6+r8]
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
    shl     r7, 3
    sub     r6, r7
    sub     r5, r7
    shr     r7, 3
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
    add     rsp, 0x88
    RET
%else
cglobal deblock_h_luma_intra, 2,4,8,0x80
    lea     r3, [r1*3]
    sub     r0, 4
    lea     r2, [r0+r3]
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea     r0, [r0+r1*8]
    lea     r2, [r2+r1*8]
    TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea     r0, [pix_tmp+0x40]
    PUSH    dword r3m
    PUSH    dword r2m
    PUSH    dword 16
    PUSH    r0
    call    deblock_%1_luma_intra
%ifidn %1, v8
    add     dword [rsp], 8 ; pix_tmp+8
    call    deblock_%1_luma_intra
%endif
    ADD     esp, 16

    mov     r1, r1m
    mov     r0, r0mp
    lea     r3, [r1*3]
    sub     r0, 4
    lea     r2, [r0+r3]
    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    lea     r0, [r0+r1*8]
    lea     r2, [r2+r1*8]
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    RET
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA

INIT_XMM sse2
DEBLOCK_LUMA_INTRA v
INIT_XMM avx
DEBLOCK_LUMA_INTRA v
%if ARCH_X86_64 == 0
INIT_MMX mmx2
DEBLOCK_LUMA_INTRA v8
%endif
%endif ; !HIGH_BIT_DEPTH

%if HIGH_BIT_DEPTH
; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7
    mova    %6, [pw_2]
    paddw   %6, %3
    paddw   %6, %4
    paddw   %7, %6, %2
    paddw   %6, %1
    paddw   %6, %3
    paddw   %7, %4
    psraw   %6, 2
    psraw   %7, 2
    psubw   %6, %1
    psubw   %7, %2
    pand    %6, %5
    pand    %7, %5
    paddw   %1, %6
    paddw   %2, %7
%endmacro
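; Scalar reference for the macro above (the chroma intra filter equations,
; illustrative):
;
;   p0' = (2*p1 + p0 + q1 + 2) >> 2
;   q0' = (2*q1 + q0 + p1 + 2) >> 2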
; out: m0-m3
; clobbers: m4-m7
%macro CHROMA_H_LOAD 0-1
    movq        m0, [r0-8]   ; p1 p1 p0 p0
    movq        m2, [r0]     ; q0 q0 q1 q1
    movq        m5, [r0+r1-8]
    movq        m7, [r0+r1]
%if mmsize == 8
    mova        m1, m0
    mova        m3, m2
    punpckldq   m0, m5 ; p1
    punpckhdq   m1, m5 ; p0
    punpckldq   m2, m7 ; q0
    punpckhdq   m3, m7 ; q1
%else
    movq        m4, [r0+r1*2-8]
    movq        m6, [r0+r1*2]
    movq        m1, [r0+%1-8]
    movq        m3, [r0+%1]
    punpckldq   m0, m5 ; p1 ... p0 ...
    punpckldq   m2, m7 ; q0 ... q1 ...
    punpckldq   m4, m1
    punpckldq   m6, m3
    punpckhqdq  m1, m0, m4 ; p0
    punpcklqdq  m0, m4     ; p1
    punpckhqdq  m3, m2, m6 ; q1
    punpcklqdq  m2, m6     ; q0
%endif
%endmacro

%macro CHROMA_V_LOAD 1
    mova        m0, [r0]    ; p1
    mova        m1, [r0+r1] ; p0
    mova        m2, [%1]    ; q0
    mova        m3, [%1+r1] ; q1
%endmacro

; clobbers: m1, m2, m3
%macro CHROMA_H_STORE 0-1
    SBUTTERFLY dq, 1, 2, 3
%if mmsize == 8
    movq      [r0-4], m1
    movq      [r0+r1-4], m2
%else
    movq      [r0-4], m1
    movq      [r0+r1*2-4], m2
    movhps    [r0+r1-4], m1
    movhps    [r0+%1-4], m2
%endif
%endmacro

%macro CHROMA_V_STORE 0
    mova [r0+1*r1], m1
    mova [r0+2*r1], m2
%endmacro

%macro DEBLOCK_CHROMA 0
cglobal deblock_inter_body
    LOAD_AB     m4, m5, r2d, r3d
    LOAD_MASK   m0, m1, m2, m3, m4, m5, m7, m6, m4
    pxor        m4, m4
    LOAD_TC     m6, r4
    pmaxsw      m6, m4
    pand        m7, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
    ret

;-----------------------------------------------------------------------------
; void deblock_v_chroma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma, 5,7,8
    FIX_STRIDES r1
    mov         r5, r0
    sub         r0, r1
    sub         r0, r1
    mov         r6, 32/mmsize
.loop:
    CHROMA_V_LOAD r5
    call        deblock_inter_body
    CHROMA_V_STORE
    add         r0, mmsize
    add         r5, mmsize
    add         r4, mmsize/8
    dec         r6
    jg .loop
    RET

;-----------------------------------------------------------------------------
; void deblock_h_chroma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma, 5,7,8
    add         r1, r1
    mov         r5, 32/mmsize
%if mmsize == 16
    lea         r6, [r1*3]
%endif
.loop:
    CHROMA_H_LOAD r6
    call        deblock_inter_body
    CHROMA_H_STORE r6
    lea         r0, [r0+r1*(mmsize/4)]
    add         r4, mmsize/8
    dec         r5
    jg .loop
    RET
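; deblock_inter_body above (and deblock_intra_body below) are bare cglobal
; labels used as local subroutines: the v/h entry points share the
; LOAD_AB/LOAD_MASK/filter core through call/ret instead of duplicating it
; for each edge orientation.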
cglobal deblock_intra_body
    LOAD_AB     m4, m5, r2d, r3d
    LOAD_MASK   m0, m1, m2, m3, m4, m5, m7, m6, m4
    CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
    ret

;-----------------------------------------------------------------------------
; void deblock_v_chroma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_intra, 4,6,8
    add         r1, r1
    mov         r5, 32/mmsize
    movd        m5, r3d
    mov         r4, r0
    sub         r0, r1
    sub         r0, r1
    SPLATW      m5, m5
.loop:
    CHROMA_V_LOAD r4
    call        deblock_intra_body
    CHROMA_V_STORE
    add         r0, mmsize
    add         r4, mmsize
    dec         r5
    jg .loop
    RET

;-----------------------------------------------------------------------------
; void deblock_h_chroma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_intra, 4,6,8
    add         r1, r1
    mov         r4, 32/mmsize
%if mmsize == 16
    lea         r5, [r1*3]
%endif
.loop:
    CHROMA_H_LOAD r5
    call        deblock_intra_body
    CHROMA_H_STORE r5
    lea         r0, [r0+r1*(mmsize/4)]
    dec         r4
    jg .loop
    RET

;-----------------------------------------------------------------------------
; void deblock_h_chroma_intra_mbaff( uint16_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_intra_mbaff, 4,6,8
    add         r1, r1
%if mmsize == 8
    mov         r4, 16/mmsize
.loop:
%else
    lea         r5, [r1*3]
%endif
    CHROMA_H_LOAD r5
    LOAD_AB     m4, m5, r2d, r3d
    LOAD_MASK   m0, m1, m2, m3, m4, m5, m7, m6, m4
    CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
    CHROMA_H_STORE r5
%if mmsize == 8
    lea         r0, [r0+r1*(mmsize/4)]
    dec         r4
    jg .loop
%endif
    RET

;-----------------------------------------------------------------------------
; void deblock_h_chroma_mbaff( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_mbaff, 5,7,8
    add         r1, r1
    lea         r6, [r1*3]
%if mmsize == 8
    mov         r5, 16/mmsize
.loop:
%endif
    CHROMA_H_LOAD r6
    LOAD_AB     m4, m5, r2d, r3d
    LOAD_MASK   m0, m1, m2, m3, m4, m5, m7, m6, m4
    movd        m6, [r4]
    punpcklbw   m6, m6
    psraw       m6, 8
    punpcklwd   m6, m6
    pand        m7, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
    CHROMA_H_STORE r6
%if mmsize == 8
    lea         r0, [r0+r1*(mmsize/4)]
    add         r4, mmsize/4
    dec         r5
    jg .loop
%endif
    RET

;-----------------------------------------------------------------------------
; void deblock_h_chroma_422_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_422_intra, 4,6,8
    add         r1, r1
    mov         r4, 64/mmsize
%if mmsize == 16
    lea         r5, [r1*3]
%endif
.loop:
    CHROMA_H_LOAD r5
    call        deblock_intra_body
    CHROMA_H_STORE r5
    lea         r0, [r0+r1*(mmsize/4)]
    dec         r4
    jg .loop
    RET

;-----------------------------------------------------------------------------
; void deblock_h_chroma_422( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_422, 5,7,8
    add         r1, r1
    mov         r5, 64/mmsize
    lea         r6, [r1*3]
.loop:
    CHROMA_H_LOAD r6
    LOAD_AB     m4, m5, r2m, r3d
    LOAD_MASK   m0, m1, m2, m3, m4, m5, m7, m6, m4
    pxor        m4, m4
    movd        m6, [r4-1]
    psraw       m6, 8
    SPLATW      m6, m6
    pmaxsw      m6, m4
    pand        m7, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
    CHROMA_H_STORE r6
    lea         r0, [r0+r1*(mmsize/4)]
%if mmsize == 16
    inc         r4
%else
    mov         r2, r5
    and         r2, 1
    add         r4, r2 ; increment once every 2 iterations
%endif
    dec         r5
    jg .loop
    RET
%endmacro ; DEBLOCK_CHROMA

%if ARCH_X86_64 == 0
INIT_MMX mmx2
DEBLOCK_CHROMA
%endif
INIT_XMM sse2
DEBLOCK_CHROMA
INIT_XMM avx
DEBLOCK_CHROMA
%endif ; HIGH_BIT_DEPTH

%if HIGH_BIT_DEPTH == 0
%macro CHROMA_V_START 0
    dec         r2d ; alpha-1
    dec         r3d ; beta-1
    mov         t5, r0
    sub         t5, r1
    sub         t5, r1
%if mmsize == 8
    mov         dword r0m, 2
.loop:
%endif
%endmacro

%macro CHROMA_H_START 0
    dec         r2d
    dec         r3d
    sub         r0, 4
    lea         t6, [r1*3]
    mov         t5, r0
    add         r0, t6
%endmacro

%macro CHROMA_V_LOOP 1
%if mmsize == 8
    add         r0, 8
    add         t5, 8
%if %1
    add         r4, 2
%endif
    dec         dword r0m
    jg .loop
%endif
%endmacro

%macro CHROMA_H_LOOP 1
%if mmsize == 8
    lea         r0, [r0+r1*4]
    lea         t5, [t5+r1*4]
%if %1
    add         r4, 2
%endif
    dec         dword r0m
    jg .loop
%endif
%endmacro
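; On the 8-byte-wide mmx2 build these helpers run the filter body twice to
; cover a full 16-byte chroma edge; x86_32 has no spare GPR here, so the
; pass counter lives in the r0m argument slot on the stack (dec dword r0m).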
%define t5 r5
%define t6 r6

%macro DEBLOCK_CHROMA 0
cglobal chroma_inter_body
    LOAD_MASK   r2d, r3d
    movd        m6, [r4] ; tc0
    punpcklbw   m6, m6
    punpcklbw   m6, m6
    pand        m7, m6
    DEBLOCK_P0_Q0
    ret

;-----------------------------------------------------------------------------
; void deblock_v_chroma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma, 5,6,8
    CHROMA_V_START
    mova        m0, [t5]
    mova        m1, [t5+r1]
    mova        m2, [r0]
    mova        m3, [r0+r1]
    call        chroma_inter_body
    mova        [t5+r1], m1
    mova        [r0], m2
    CHROMA_V_LOOP 1
    RET

;-----------------------------------------------------------------------------
; void deblock_h_chroma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma, 5,7,8
    CHROMA_H_START
%if mmsize == 8
    mov         dword r0m, 2
.loop:
%endif
    TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
    call        chroma_inter_body
    TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
    CHROMA_H_LOOP 1
    RET
%endmacro ; DEBLOCK_CHROMA

INIT_XMM sse2
DEBLOCK_CHROMA
INIT_XMM avx
DEBLOCK_CHROMA
%if ARCH_X86_64 == 0
INIT_MMX mmx2
DEBLOCK_CHROMA
%endif

;-----------------------------------------------------------------------------
; void deblock_h_chroma_mbaff( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
%macro DEBLOCK_H_CHROMA_420_MBAFF 0
cglobal deblock_h_chroma_mbaff, 5,7,8
    dec         r2d
    dec         r3d
    sub         r0, 4
    lea         t6, [r1*3]
    mov         t5, r0
    add         r0, t6
    TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
    LOAD_MASK   r2d, r3d
    movd        m6, [r4] ; tc0
    punpcklbw   m6, m6
    pand        m7, m6
    DEBLOCK_P0_Q0
    TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
    RET
%endmacro

INIT_XMM sse2
DEBLOCK_H_CHROMA_420_MBAFF
%if ARCH_X86_64 == 0
INIT_MMX mmx2
DEBLOCK_H_CHROMA_420_MBAFF
%endif

%macro DEBLOCK_H_CHROMA_422 0
cglobal deblock_h_chroma_422, 5,8,8
%if ARCH_X86_64
    %define cntr r7
%else
    %define cntr dword r0m
%endif
    CHROMA_H_START
    mov         cntr, 32/mmsize
.loop:
    TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
    LOAD_MASK   r2d, r3d
    movd        m6, [r4] ; tc0
    punpcklbw   m6, m6
%if mmsize == 16
    punpcklbw   m6, m6
    punpcklbw   m6, m6
%else
    pshufw      m6, m6, q0000
%endif
    pand        m7, m6
    DEBLOCK_P0_Q0
    TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
    lea         r0, [r0+r1*(mmsize/2)]
    lea         t5, [t5+r1*(mmsize/2)]
    add         r4, mmsize/8
    dec         cntr
    jg .loop
    RET
%endmacro

INIT_MMX mmx2
DEBLOCK_H_CHROMA_422
INIT_XMM sse2
DEBLOCK_H_CHROMA_422
INIT_XMM avx
DEBLOCK_H_CHROMA_422

; in: %1=p0 %2=p1 %3=q1
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
%macro CHROMA_INTRA_P0 3
    pxor    m4, %1, %3
    pand    m4, [pb_1] ; m4 = (p0^q1)&1
    pavgb   %1, %3
    psubusb %1, m4
    pavgb   %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
%endmacro
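; Why the macro above is exact (illustrative sketch): pavgb rounds up, so
;   avg(p0,q1) - ((p0^q1)&1) = (p0 + q1) >> 1      ; rounds down instead
;   avg(p1, (p0+q1)>>1)      = (2*p1 + p0 + q1 + 2) >> 2
; i.e. the subtraction converts one rounding-up average into a
; rounding-down one so the composed average matches the formula above.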
%define t5 r4
%define t6 r5

%macro DEBLOCK_CHROMA_INTRA_BODY 0
cglobal chroma_intra_body
    LOAD_MASK   r2d, r3d
    mova        m5, m1
    mova        m6, m2
    CHROMA_INTRA_P0 m1, m0, m3
    CHROMA_INTRA_P0 m2, m3, m0
    psubb       m1, m5
    psubb       m2, m6
    pand        m1, m7
    pand        m2, m7
    paddb       m1, m5
    paddb       m2, m6
    ret
%endmacro

%macro DEBLOCK_CHROMA_INTRA 0
;-----------------------------------------------------------------------------
; void deblock_v_chroma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_intra, 4,5,8
    CHROMA_V_START
    mova        m0, [t5]
    mova        m1, [t5+r1]
    mova        m2, [r0]
    mova        m3, [r0+r1]
    call        chroma_intra_body
    mova        [t5+r1], m1
    mova        [r0], m2
    CHROMA_V_LOOP 0
    RET

;-----------------------------------------------------------------------------
; void deblock_h_chroma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_intra, 4,6,8
    CHROMA_H_START
%if mmsize == 8
    mov         dword r0m, 2
.loop:
%endif
    TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
    call        chroma_intra_body
    TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
    CHROMA_H_LOOP 0
    RET

cglobal deblock_h_chroma_422_intra, 4,7,8
    CHROMA_H_START
    mov         r6d, 32/mmsize
.loop:
    TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
    call        chroma_intra_body
    TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
    lea         r0, [r0+r1*(mmsize/2)]
    lea         t5, [t5+r1*(mmsize/2)]
    dec         r6d
    jg .loop
    RET
%endmacro ; DEBLOCK_CHROMA_INTRA

INIT_XMM sse2
DEBLOCK_CHROMA_INTRA_BODY
DEBLOCK_CHROMA_INTRA
INIT_XMM avx
DEBLOCK_CHROMA_INTRA_BODY
DEBLOCK_CHROMA_INTRA
INIT_MMX mmx2
DEBLOCK_CHROMA_INTRA_BODY
%if ARCH_X86_64 == 0
DEBLOCK_CHROMA_INTRA
%endif

;-----------------------------------------------------------------------------
; void deblock_h_chroma_intra_mbaff( uint8_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
INIT_MMX mmx2
cglobal deblock_h_chroma_intra_mbaff, 4,6,8
    CHROMA_H_START
    TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
    call        chroma_intra_body
    TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
    RET
%endif ; !HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; void deblock_strength( uint8_t nnz[48], int8_t ref[2][40], int16_t mv[2][40][2],
;                        uint8_t bs[2][4][4], int mvy_limit, int bframe )
;-----------------------------------------------------------------------------

%define scan8start (4+1*8)
%define nnz r0+scan8start
%define ref r1+scan8start
%define mv  r2+scan8start*4
%define bs0 r3
%define bs1 r3+32
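; Boundary-strength rules computed below, as a scalar sketch (illustrative;
; both bs arrays start at 0):
;
;   bs = 2  if either side of the edge has nonzero coefficients (nnz)
;   bs = 1  else if the refs differ, or |mvx_diff| >= 4 (quarter-pel
;           units), or |mvy_diff| >= mvy_limit
;   bs = 0  otherwise
;
; The constant built from r4d packs both thresholds into each word as
; ((mvy_limit-1)<<8) | 3, so after packsswb interleaves x/y differences as
; bytes, a single psubusb compares x lanes against 3 and y lanes against
; mvy_limit-1 at once.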
%macro LOAD_BYTES_MMX 1
    movd        m2, [%1+8*0-1]
    movd        m0, [%1+8*0]
    movd        m3, [%1+8*2-1]
    movd        m1, [%1+8*2]
    punpckldq   m2, [%1+8*1-1]
    punpckldq   m0, [%1+8*1]
    punpckldq   m3, [%1+8*3-1]
    punpckldq   m1, [%1+8*3]
%endmacro

%macro DEBLOCK_STRENGTH_REFS_MMX 0
    LOAD_BYTES_MMX ref
    pxor        m2, m0
    pxor        m3, m1
    por         m2, [bs0+0]
    por         m3, [bs0+8]
    movq        [bs0+0], m2
    movq        [bs0+8], m3
    movd        m2, [ref-8*1]
    movd        m3, [ref+8*1]
    punpckldq   m2, m0  ; row -1, row 0
    punpckldq   m3, m1  ; row 1, row 2
    pxor        m0, m2
    pxor        m1, m3
    por         m0, [bs1+0]
    por         m1, [bs1+8]
    movq        [bs1+0], m0
    movq        [bs1+8], m1
%endmacro

%macro DEBLOCK_STRENGTH_MVS_MMX 2
    mova        m0, [mv-%2]
    mova        m1, [mv-%2+8]
    psubw       m0, [mv]
    psubw       m1, [mv+8]
    packsswb    m0, m1
    ABSB        m0, m1
    psubusb     m0, m7
    packsswb    m0, m0
    por         m0, [%1]
    movd        [%1], m0
%endmacro

%macro DEBLOCK_STRENGTH_NNZ_MMX 1
    por         m2, m0
    por         m3, m1
    mova        m4, [%1]
    mova        m5, [%1+8]
    pminub      m2, m6
    pminub      m3, m6
    pminub      m4, m6 ; mv ? 1 : 0
    pminub      m5, m6
    paddb       m2, m2 ; nnz ? 2 : 0
    paddb       m3, m3
    pmaxub      m2, m4
    pmaxub      m3, m5
%endmacro

%macro LOAD_BYTES_XMM 1
    movu        m2, [%1-4] ; FIXME could be aligned if we changed nnz's allocation
    movu        m1, [%1+12]
    pslldq      m0, m2, 1
    shufps      m2, m1, q3131 ; cur nnz, all rows
    pslldq      m1, 1
    shufps      m0, m1, q3131 ; left neighbors
    pslldq      m1, m2, 4
    movd        m3, [%1-8] ; could be palignr if nnz was aligned
    por         m1, m3 ; top neighbors
%endmacro

INIT_MMX mmx2
cglobal deblock_strength, 6,6
    ; Prepare mv comparison register
    shl         r4d, 8
    add         r4d, 3 - (1<<8)
    movd        m7, r4d
    SPLATW      m7, m7
    mova        m6, [pb_1]
    pxor        m0, m0
    mova        [bs0+0], m0
    mova        [bs0+8], m0
    mova        [bs1+0], m0
    mova        [bs1+8], m0
.lists:
    DEBLOCK_STRENGTH_REFS_MMX
    mov         r4d, 4
.mvs:
    DEBLOCK_STRENGTH_MVS_MMX bs0, 4
    DEBLOCK_STRENGTH_MVS_MMX bs1, 4*8
    add         r2, 4*8
    add         r3, 4
    dec         r4d
    jg .mvs
    add         r1, 40
    add         r2, 4*8
    sub         r3, 16
    dec         r5d
    jge .lists

    ; Check nnz
    LOAD_BYTES_MMX nnz
    DEBLOCK_STRENGTH_NNZ_MMX bs0
    ; Transpose column output
    SBUTTERFLY  bw, 2, 3, 4
    SBUTTERFLY  bw, 2, 3, 4
    mova        [bs0+0], m2
    mova        [bs0+8], m3
    movd        m2, [nnz-8*1]
    movd        m3, [nnz+8*1]
    punpckldq   m2, m0 ; row -1, row 0
    punpckldq   m3, m1 ; row 1, row 2
    DEBLOCK_STRENGTH_NNZ_MMX bs1
    mova        [bs1+0], m2
    mova        [bs1+8], m3
    RET

%macro DEBLOCK_STRENGTH_XMM 0
cglobal deblock_strength, 6,6,7
    ; Prepare mv comparison register
    shl         r4d, 8
    add         r4d, 3 - (1<<8)
    movd        m6, r4d
    SPLATW      m6, m6
    pxor        m4, m4 ; bs0
    pxor        m5, m5 ; bs1
.lists:
    ; Check refs
    LOAD_BYTES_XMM ref
    pxor        m0, m2
    pxor        m1, m2
    por         m4, m0
    por         m5, m1

    ; Check mvs
%if cpuflag(ssse3)
    mova        m0, [mv+4*8*0]
    mova        m1, [mv+4*8*1]
    palignr     m3, m0, [mv+4*8*0-16], 12
    palignr     m2, m1, [mv+4*8*1-16], 12
    psubw       m0, m3
    psubw       m1, m2
    packsswb    m0, m1
    mova        m2, [mv+4*8*2]
    mova        m1, [mv+4*8*3]
    palignr     m3, m2, [mv+4*8*2-16], 12
    psubw       m2, m3
    palignr     m3, m1, [mv+4*8*3-16], 12
    psubw       m1, m3
    packsswb    m2, m1
%else
    movu        m0, [mv-4+4*8*0]
    movu        m1, [mv-4+4*8*1]
    movu        m2, [mv-4+4*8*2]
    movu        m3, [mv-4+4*8*3]
    psubw       m0, [mv+4*8*0]
    psubw       m1, [mv+4*8*1]
    psubw       m2, [mv+4*8*2]
    psubw       m3, [mv+4*8*3]
    packsswb    m0, m1
    packsswb    m2, m3
%endif
    ABSB        m0, m1
    ABSB        m2, m3
    psubusb     m0, m6
    psubusb     m2, m6
    packsswb    m0, m2
    por         m4, m0

    mova        m0, [mv+4*8*-1]
    mova        m1, [mv+4*8* 0]
    mova        m2, [mv+4*8* 1]
    mova        m3, [mv+4*8* 2]
    psubw       m0, m1
    psubw       m1, m2
    psubw       m2, m3
    psubw       m3, [mv+4*8* 3]
    packsswb    m0, m1
    packsswb    m2, m3
    ABSB        m0, m1
    ABSB        m2, m3
    psubusb     m0, m6
    psubusb     m2, m6
    packsswb    m0, m2
    por         m5, m0

    add         r1, 40
    add         r2, 4*8*5
    dec         r5d
    jge .lists

    ; Check nnz
    LOAD_BYTES_XMM nnz
    por         m0, m2
    por         m1, m2
    mova        m6, [pb_1]
    pminub      m0, m6
    pminub      m1, m6
    pminub      m4, m6 ; mv ? 1 : 0
    pminub      m5, m6
    paddb       m0, m0 ; nnz ? 2 : 0
    paddb       m1, m1
    pmaxub      m4, m0
    pmaxub      m5, m1
%if cpuflag(ssse3)
    pshufb      m4, [transpose_shuf]
%else
    movhlps     m3, m4
    punpcklbw   m4, m3
    movhlps     m3, m4
    punpcklbw   m4, m3
%endif
    mova        [bs1], m5
    mova        [bs0], m4
    RET
%endmacro

INIT_XMM sse2
DEBLOCK_STRENGTH_XMM
INIT_XMM ssse3
DEBLOCK_STRENGTH_XMM
INIT_XMM avx
DEBLOCK_STRENGTH_XMM