%endmacro
ALIGN 16
-; input in m0 ... m3 and tcs in tc (r2). Output in m1 and m2
+; input in m0 ... m3 and tcs in r2. Output in m1 and m2
; HEVC chroma deblock core.  %1 = bit depth.
; Computes the chroma delta0 and applies it: p0' = p0 + delta0, q0' = q0 - delta0.
; NOTE(review): this hunk elides the lines that accumulate the full delta sum and
; load tc into m6 / -tc into m4 -- confirm against the complete file.
%macro CHROMA_DEBLOCK_BODY 1
psubw m4, m2, m1; q0 - p0
psubw m5, m0, m3; p1 - q1
paddw m5, [pw_4]; rounding: +4
psraw m5, 3; >> 3 -> candidate delta0
+%if %1 > 8
psllw m4, %1-8; scale clip bound by << (BIT_DEPTH - 8); block skipped at 8-bit
psllw m6, %1-8; << (BIT_DEPTH - 8)
+%endif
pmaxsw m5, m4; clamp below -- presumably m4 = -tc here, TODO confirm
pminsw m5, m6; clamp above -- presumably m6 = tc here, TODO confirm
paddw m1, m5; p0' = p0 + delta0
psubw m2, m5; q0' = q0 - delta0
%endmacro
-; input in m0 ... m7, betas in r2 tcs in r3. Output in m1...m6
+; input in m0 ... m7, beta in r2, tcs in r3. Output in m1 ... m6
%macro LUMA_DEBLOCK_BODY 2
psllw m9, m2, 1; *2
psubw m10, m1, m9
ABS1 m11, m13 ; 0dq0, 0dq3 , 1dq0, 1dq3
;beta calculations
- mov r11, [betaq];
- shl r11, %1 - 8
- movd m13, r11d; beta0
- add betaq, 4;
- punpcklwd m13, m13
- mov r12, [betaq];
- shl r12, %1 - 8
- movd m14, r12d; beta1
- punpcklwd m14, m14
- pshufd m13, m14, 0; beta0, beta1
+%if %1 > 8
+ shl betaq, %1 - 8
+%endif
+ movd m13, betad
+ SPLATW m13, m13, 0
;end beta calculations
paddw m9, m10, m11; 0d0, 0d3 , 1d0, 1d3
paddw m14, m9; 0d0+0d3, 1d0+1d3
;compare
- pcmpgtw m15, m13, m14; beta0, beta1
+ pcmpgtw m15, m13, m14
movmskps r13, m15 ;filtering mask 0d0 + 0d3 < beta0 (bit 2 or 3) , 1d0 + 1d3 < beta1 (bit 0 or 1)
- cmp r13, 0
- je .bypassluma
+ test r13, r13
+ je .bypassluma
;weak / strong decision compare to beta_2
psraw m15, m13, 2; beta >> 2
; end calc for weak filter
; filtering mask
- mov r2, r13
- shr r2, 3
- movd m15, r2d
+ mov r11, r13
+ shr r11, 3
+ movd m15, r11d
and r13, 1
movd m11, r13d
shufps m11, m15, 0
- shl r2, 1
- or r13, r2
+ shl r11, 1
+ or r13, r11
pcmpeqd m11, [pd_1]; filtering mask
;decide between strong and weak filtering
;tc25 calculations
- mov r2d, [tcq];
- shl r2, %1 - 8
- movd m8, r2d; tc0
+ mov r11d, [tcq];
+%if %1 > 8
+ shl r11, %1 - 8
+%endif
+ movd m8, r11d; tc0
add tcq, 4;
mov r3d, [tcq];
+%if %1 > 8
shl r3, %1 - 8
+%endif
movd m9, r3d; tc1
- add r2d, r3d; tc0 + tc1
- jz .bypassluma
+ add r11d, r3d; tc0 + tc1
+ jz .bypassluma
punpcklwd m8, m8
punpcklwd m9, m9
shufps m8, m9, 0; tc0, tc1
psraw m13, 3; beta >> 3
pcmpgtw m13, m12;
- movmskps r2, m13;
- and r14, r2; strong mask , beta_2 and beta_3 comparisons
+ movmskps r11, m13;
+ and r14, r11; strong mask , beta_2 and beta_3 comparisons
;----beta_3 comparison end-----
;----tc25 comparison---
psubw m12, m3, m4; p0 - q0
pshuflw m12, m12, 0xf0 ;0b11110000;
pcmpgtw m8, m12; tc25 comparisons
- movmskps r2, m8;
- and r14, r2; strong mask, beta_2, beta_3 and tc25 comparisons
+ movmskps r11, m8;
+ and r14, r11; strong mask, beta_2, beta_3 and tc25 comparisons
;----tc25 comparison end---
- mov r2, r14;
- shr r2, 1;
- and r14, r2; strong mask, bits 2 and 0
+ mov r11, r14;
+ shr r11, 1;
+ and r14, r11; strong mask, bits 2 and 0
pmullw m14, m9, [pw_m2]; -tc * 2
- psllw m9, 1; tc * 2
+ paddw m9, m9
and r14, 5; 0b101
- mov r2, r14; strong mask
+ mov r11, r14; strong mask
shr r14, 2;
movd m12, r14d; store to xmm for mask generation
shl r14, 1
- and r2, 1
- movd m10, r2d; store to xmm for mask generation
- or r14, r2; final strong mask, bits 1 and 0
+ and r11, 1
+ movd m10, r11d; store to xmm for mask generation
+ or r14, r11; final strong mask, bits 1 and 0
jz .weakfilter
shufps m10, m12, 0
paddw m12, m2, m3; p1 + p0
paddw m12, m4; p1 + p0 + q0
mova m10, m12; copy
- psllw m12, 1; 2*p1 + 2*p0 + 2*q0
+ paddw m12, m12; 2*p1 + 2*p0 + 2*q0
paddw m12, m1; p2 + 2*p1 + 2*p0 + 2*q0
paddw m12, m5; p2 + 2*p1 + 2*p0 + 2*q0 + q1
paddw m12, m13; p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4
paddw m15, m2; p1'
paddw m8, m1, m0; p3 + p2
- psllw m8, 1; 2*p3 + 2*p2
+ paddw m8, m8; 2*p3 + 2*p2
paddw m8, m1; 2*p3 + 3*p2
paddw m8, m10; 2*p3 + 3*p2 + p1 + p0 + q0
- psllw m13, 1; 4 in every cell
+ paddw m13, m13
paddw m8, m13; 2*p3 + 3*p2 + p1 + p0 + q0 + 4
psraw m8, 3; (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
psubw m8, m1; ((2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3) - p2
paddw m8, m3, m4; p0 + q0
paddw m8, m5; p0 + q0 + q1
- psllw m8, 1; 2*p0 + 2*q0 + 2*q1
+ paddw m8, m8; 2*p0 + 2*q0 + 2*q1
paddw m8, m2; p1 + 2*p0 + 2*q0 + 2*q1
paddw m8, m6; p1 + 2*p0 + 2*q0 + 2*q1 + q2
paddw m8, m13; p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4
paddw m13, m7; q3 + 2
paddw m13, m6; q3 + q2 + 2
- psllw m13, 1; 2*q3 + 2*q2 + 4
- paddw m13, m6; 2*q3 + 3*q2 + 4
+ paddw m13, m13; 2*q3 + 2*q2 + 4
+ paddw m13, m6; 2*q3 + 3*q2 + 4
paddw m13, m10; 2*q3 + 3*q2 + q1 + q0 + p0 + 4
psraw m13, 3; (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3
psubw m13, m6; ((2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3) - q2
.weakfilter:
not r14; strong mask -> weak mask
and r14, r13; final weak filtering mask, bits 0 and 1
- jz .store
+ jz .store
; weak filtering mask
- mov r2, r14
- shr r2, 1
- movd m12, r2d
+ mov r11, r14
+ shr r11, 1
+ movd m12, r11d
and r14, 1
movd m11, r14d
shufps m11, m12, 0
pcmpeqd m11, [pd_1]; filtering mask
- mov r13, r11; beta0
- shr r13, 1;
- add r11, r13
- shr r11, 3; ((beta0+(beta0>>1))>>3))
-
- mov r13, r12; beta1
+ mov r13, betaq
shr r13, 1;
- add r12, r13
- shr r12, 3; ((beta1+(beta1>>1))>>3))
+ add betaq, r13
+ shr betaq, 3; ((beta + (beta >> 1)) >> 3))
mova m13, [pw_8]
psubw m12, m4, m3 ; q0 - p0
paddw m15, m2; p1'
;beta calculations
- movd m10, r11d; beta0
- punpcklwd m10, m10
- movd m13, r12d; beta1
- punpcklwd m13, m13
- shufps m10, m13, 0; betax0, betax1
+ movd m10, betad
+ SPLATW m10, m10, 0
movd m13, r7d; 1dp0 + 1dp3
movd m8, r8d; 0dp0 + 0dp3
punpcklbw m2, m5
punpcklbw m3, m5
CHROMA_DEBLOCK_BODY 8
- packuswb m1, m1 ; p0' packed in bytes on low quadword
- packuswb m2, m2 ; q0' packed in bytes on low quadword
- movq [pix0q+strideq], m1
- movq [pixq], m2
+ packuswb m1, m2
+ movh[pix0q+strideq], m1
+ movhps [pixq], m1
RET
; 10-bit horizontal chroma deblock: 3 GPR args (pix, stride, tc), 4 GPRs, 7 xmm regs.
; NOTE(review): the function body is elided in this hunk view (only the entry and
; the final RET are shown) -- do not assume this is an empty stub.
cglobal hevc_h_loop_filter_chroma_10, 3, 4, 7, pix, stride, tc, pix0
RET
%if ARCH_X86_64
-INIT_XMM ssse3
+%macro LOOP_FILTER_LUMA 0
;-----------------------------------------------------------------------------
; void ff_hevc_v_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int *_beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
; 8-bit vertical (column) luma edge: samples straddle the edge horizontally, so the
; 8x8 block is transposed into rows, filtered, and transposed back.
cglobal hevc_v_loop_filter_luma_8, 4, 15, 16, pix, stride, beta, tc
sub r0, 4; step back 4 columns (1 byte/sample) so p3..p0 sit left of the edge
- lea r5, [3*r1]
+ lea r5, [3 * r1]; r5 = 3 * stride
mov r6, r0; r6 = top source row
add r0, r5; r0 = 4th source row (second half of the 8x8 tile)
TRANSPOSE8x8B_LOAD PASS8ROWS(r6, r0, r1, r5); p3..q3 -> m0..m7
- LUMA_DEBLOCK_BODY 8, v
+ LUMA_DEBLOCK_BODY 8, v
.store:
TRANSPOSE8x8B_STORE PASS8ROWS(r6, r0, r1, r5); transpose back and write out
.bypassluma:
; 10-bit vertical (column) luma edge: same transpose strategy as the 8-bit version,
; but samples are 16-bit words (W variants of the transpose macros).
cglobal hevc_v_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc
sub pixq, 8; step back 4 samples * 2 bytes so p3..p0 sit left of the edge
- lea r5, [3*strideq]
+ lea r5, [3 * strideq]; r5 = 3 * stride
mov r6, pixq; r6 = top source row
add pixq, r5; pixq = 4th source row
TRANSPOSE8x8W_LOAD PASS8ROWS(r6, pixq, strideq, r5); p3..q3 -> m0..m7
- LUMA_DEBLOCK_BODY 10, v
+ LUMA_DEBLOCK_BODY 10, v
.store:
TRANSPOSE8x8W_STORE PASS8ROWS(r6, r0, r1, r5); transpose back and write out
.bypassluma:
; void ff_hevc_h_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int *_beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
; 8-bit horizontal (row) luma edge: the eight sample rows p3..q3 are contiguous in
; memory, so no transpose is needed; bytes are widened to words for the filter math.
cglobal hevc_h_loop_filter_luma_8, 4, 15, 16, pix, stride, beta, tc, count, pix0, src3stride
- lea src3strideq, [3*strideq]
+ lea src3strideq, [3 * strideq]; src3strideq = 3 * stride
mov pix0q, pixq
sub pix0q, src3strideq
sub pix0q, strideq; pix0q = pixq - 4*stride -> first row of the p side
- movdqu m0, [pix0q]; p3
- movdqu m1, [pix0q+strideq]; p2
- movdqu m2, [pix0q+2*strideq]; p1
- movdqu m3, [pix0q+src3strideq]; p0
- movdqu m4, [pixq]; q0
- movdqu m5, [pixq+strideq]; q1
- movdqu m6, [pixq+2*strideq]; q2
- movdqu m7, [pixq+src3strideq]; q3
+ movdqu m0, [pix0q]; p3
+ movdqu m1, [pix0q + strideq]; p2
+ movdqu m2, [pix0q + 2 * strideq]; p1
+ movdqu m3, [pix0q + src3strideq]; p0
+ movdqu m4, [pixq]; q0
+ movdqu m5, [pixq + strideq]; q1
+ movdqu m6, [pixq + 2 * strideq]; q2
+ movdqu m7, [pixq + src3strideq]; q3
pxor m8, m8; zero register for byte -> word unpack
punpcklbw m0, m8
punpcklbw m1, m8
punpcklbw m5, m8
punpcklbw m6, m8
; NOTE(review): unpacks of m2/m3/m4 are not visible in this hunk -- confirm in full file
punpcklbw m7, m8
- LUMA_DEBLOCK_BODY 8, h
+ LUMA_DEBLOCK_BODY 8, h
.store:
- packuswb m1, m1; p2
- packuswb m2, m2; p1
- packuswb m3, m3; p0
- packuswb m4, m4; q0
- packuswb m5, m5; q1
- packuswb m6, m6; q2
- movq [r5+r1], m1; p2
- movq [r5+2*r1], m2; p1
- movq [r5+r6], m3; p0
- movq [r0], m4; q0
- movq [r0+r1], m5; q1
- movq [r0+2*r1], m6; q2
+ packuswb m1, m2; p2 (low half) and p1 (high half) in one register
+ packuswb m3, m4; p0 / q0
+ packuswb m5, m6; q1 / q2
+ movh [r5 + r1], m1; p2
+ movhps [r5 + 2 * r1], m1; p1
+ movh [r5 + r6], m3; p0
+ movhps [r0 ], m3; q0
+ movh [r0 + r1], m5; q1
+ movhps [r0 + 2 * r1], m5; q2
.bypassluma:
RET
; 10-bit horizontal (row) luma edge: samples are already 16-bit words, so there is
; no unpack step; results are clamped to the valid range before being stored back.
cglobal hevc_h_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc, count, pix0, src3stride
- lea src3strideq, [3*strideq]
- mov pix0q, pixq
- sub pix0q, src3strideq
- sub pix0q, strideq
- movdqu m0, [pix0q]; p3
- movdqu m1, [pix0q+strideq]; p2
- movdqu m2, [pix0q+2*strideq]; p1
- movdqu m3, [pix0q+src3strideq]; p0
- movdqu m4, [pixq]; q0
- movdqu m5, [pixq+strideq]; q1
- movdqu m6, [pixq+2*strideq]; q2
- movdqu m7, [pixq+src3strideq]; q3
- LUMA_DEBLOCK_BODY 10, h
+ lea src3strideq, [3 * strideq]; src3strideq = 3 * stride
+ mov pix0q, pixq
+ sub pix0q, src3strideq
+ sub pix0q, strideq; pix0q = pixq - 4*stride -> first row of the p side
+ movdqu m0, [pix0q]; p3
+ movdqu m1, [pix0q + strideq]; p2
+ movdqu m2, [pix0q + 2 * strideq]; p1
+ movdqu m3, [pix0q + src3strideq]; p0
+ movdqu m4, [pixq]; q0
+ movdqu m5, [pixq + strideq]; q1
+ movdqu m6, [pixq + 2 * strideq]; q2
+ movdqu m7, [pixq + src3strideq]; q3
+ LUMA_DEBLOCK_BODY 10, h
.store:
- pxor m8, m8; zeros reg
- CLIPW m1, m8, [pw_pixel_max]
- CLIPW m2, m8, [pw_pixel_max]
- CLIPW m3, m8, [pw_pixel_max]
- CLIPW m4, m8, [pw_pixel_max]
- CLIPW m5, m8, [pw_pixel_max]
- CLIPW m6, m8, [pw_pixel_max]
- movdqu [pix0q+strideq], m1; p2
- movdqu [pix0q+2*strideq], m2; p1
- movdqu [pix0q+src3strideq], m3; p0
- movdqu [pixq], m4; q0
- movdqu [pixq+strideq], m5; q1
- movdqu [pixq+2*strideq], m6; q2
+ pxor m8, m8; zeros reg (lower clamp bound)
+ CLIPW m1, m8, [pw_pixel_max]; clamp filtered words to [0, pixel_max]
+ CLIPW m2, m8, [pw_pixel_max]
+ CLIPW m3, m8, [pw_pixel_max]
+ CLIPW m4, m8, [pw_pixel_max]
+ CLIPW m5, m8, [pw_pixel_max]
+ CLIPW m6, m8, [pw_pixel_max]
+ movdqu [pix0q + strideq], m1; p2
+ movdqu [pix0q + 2 * strideq], m2; p1
+ movdqu [pix0q + src3strideq], m3; p0
+ movdqu [pixq ], m4; q0
+ movdqu [pixq + strideq], m5; q1
+ movdqu [pixq + 2 * strideq], m6; q2
.bypassluma:
RET
+%endmacro
+
+; instantiate all four luma filter entry points for each SIMD flavour
+INIT_XMM sse2
+LOOP_FILTER_LUMA
+INIT_XMM ssse3
+LOOP_FILTER_LUMA
%endif