1 ;*****************************************************************************
2 ;* MMX/SSE2-optimized H.264 deblocking code
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2008 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Jason Garrett-Glaser <darkshikari@gmail.com>
9 ;* This file is part of Libav.
11 ;* Libav is free software; you can redistribute it and/or
12 ;* modify it under the terms of the GNU Lesser General Public
13 ;* License as published by the Free Software Foundation; either
14 ;* version 2.1 of the License, or (at your option) any later version.
16 ;* Libav is distributed in the hope that it will be useful,
17 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 ;* Lesser General Public License for more details.
21 ;* You should have received a copy of the GNU Lesser General Public
22 ;* License along with Libav; if not, write to the Free Software
23 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 ;******************************************************************************
27 %include "x86util.asm"
38 ; expands to [base],...,[base+7*stride]
39 %define PASS8ROWS(base, base3, stride, stride3) \
40 [base], [base+stride], [base+stride*2], [base3], \
41 [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
43 ; in: 8 rows of 4 bytes in %1..%8
44 ; out: 4 rows of 8 bytes in m0..m3
45 %macro TRANSPOSE4x8_LOAD 8
74 ; in: 4 rows of 8 bytes in m0..m3
75 ; out: 8 rows of 4 bytes in %1..%8
76 %macro TRANSPOSE8x4_STORE 8
116 ; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
117 ; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
118 %macro TRANSPOSE6x8_MEM 9
126 SBUTTERFLY3 bw, m0, m1, m7
127 SBUTTERFLY3 bw, m2, m3, m1
128 SBUTTERFLY3 bw, m4, m5, m3
130 SBUTTERFLY3 bw, m6, %8, m5
131 SBUTTERFLY3 wd, m0, m2, m1
132 SBUTTERFLY3 wd, m4, m6, m2
135 SBUTTERFLY3 wd, m7, [%9+0x10], m6
136 SBUTTERFLY3 wd, m3, m5, m4
137 SBUTTERFLY3 dq, m7, m3, m0
138 SBUTTERFLY3 dq, m1, m2, m5
147 ; in: 8 rows of 8 in %1..%8
148 ; out: 8 rows of 8 in %9..%16
149 %macro TRANSPOSE8x8_MEM 16
157 SBUTTERFLY3 bw, m0, m1, m7
158 SBUTTERFLY3 bw, m2, m3, m1
159 SBUTTERFLY3 bw, m4, m5, m3
160 SBUTTERFLY3 bw, m6, %8, m5
162 SBUTTERFLY3 wd, m0, m2, m3
163 SBUTTERFLY3 wd, m4, m6, m2
164 SBUTTERFLY3 wd, m7, m1, m6
167 SBUTTERFLY3 wd, m2, m5, m1
168 SBUTTERFLY3 dq, m0, m4, m5
169 SBUTTERFLY3 dq, m7, m2, m4
174 SBUTTERFLY3 dq, m3, %11, m0
175 SBUTTERFLY3 dq, m6, m1, m5
182 ; out: %4 = |%1-%2|>%3
193 ; out: %4 = |%1-%2|>%3
214 ; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
215 ; out: m5=beta-1, m7=mask, %3=alpha-1
222 packuswb m4, m4 ; 16x alpha-1
223 packuswb m5, m5 ; 16x beta-1
227 DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
228 DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
230 DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
236 ; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
239 %macro DEBLOCK_P0_Q0 0
242 pand m5, [pb_1] ; (p0^q0)&1
245 pavgb m3, m0 ; (p1 - q1 + 256)>>1
246 pavgb m3, [pb_3] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
248 pavgb m4, m2 ; (q0 - p0 + 256)>>1
250 paddusb m3, m4 ; d+128+33
263 ; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
264 ; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
265 ; clobbers: q2, tmp, tc0
269 pavgb %2, %6 ; avg(p2,avg(p0,q0))
271 pand %6, [pb_1] ; (p2^avg(p0,q0))&1
272 psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
282 ;-----------------------------------------------------------------------------
283 ; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
284 ;-----------------------------------------------------------------------------
286 cglobal x264_deblock_v_luma_sse2, 5,5,10
292 add r4, r0 ; pix-3*stride
294 mova m0, [r4+r1] ; p1
295 mova m1, [r4+2*r1] ; p0
297 mova m3, [r0+r1] ; q1
301 punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
308 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
313 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
315 movdqa m4, [r0+2*r1] ; q2
316 DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
321 LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6
328 ;-----------------------------------------------------------------------------
329 ; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
330 ;-----------------------------------------------------------------------------
332 cglobal x264_deblock_h_luma_sse2, 5,7
339 %define pix_tmp rsp+0x30
345 ; transpose 6x16 -> tmp space
346 TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp
349 TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8
352 ; alpha, beta, tc0 are still in r2d, r3d, r4
353 ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
354 lea r0, [pix_tmp+0x30]
359 call x264_deblock_v_luma_sse2
361 ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
364 movq m0, [pix_tmp+0x18]
365 movq m1, [pix_tmp+0x28]
366 movq m2, [pix_tmp+0x38]
367 movq m3, [pix_tmp+0x48]
368 TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11)
374 movq m0, [pix_tmp+0x10]
375 movq m1, [pix_tmp+0x20]
376 movq m2, [pix_tmp+0x30]
377 movq m3, [pix_tmp+0x40]
378 TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11)
389 %macro DEBLOCK_LUMA 3
390 ;-----------------------------------------------------------------------------
391 ; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
392 ;-----------------------------------------------------------------------------
393 cglobal x264_deblock_%2_luma_%1, 5,5
398 add r4, r0 ; pix-3*stride
399 %assign pad 2*%3+12-(stack_offset&15)
402 mova m0, [r4+r1] ; p1
403 mova m1, [r4+2*r1] ; p0
405 mova m3, [r0+r1] ; q1
411 punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
412 mova [esp+%3], m4 ; tc
416 mova [esp], m4 ; mask
419 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
421 pand m4, [esp+%3] ; tc
425 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
427 mova m4, [r0+2*r1] ; q2
428 DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
429 mova m5, [esp] ; mask
431 mova m5, [esp+%3] ; tc
435 LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6
443 ;-----------------------------------------------------------------------------
444 ; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
445 ;-----------------------------------------------------------------------------
447 cglobal x264_deblock_h_luma_%1, 0,5
453 %assign pad 0x78-(stack_offset&15)
455 %define pix_tmp esp+12
457 ; transpose 6x16 -> tmp space
458 TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp
461 TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8
464 lea r0, [pix_tmp+0x30]
470 call x264_deblock_%2_luma_%1
472 add dword [esp ], 8 ; pix_tmp+0x38
473 add dword [esp+16], 2 ; tc0+2
474 call x264_deblock_%2_luma_%1
478 ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
483 movq m0, [pix_tmp+0x10]
484 movq m1, [pix_tmp+0x20]
485 movq m2, [pix_tmp+0x30]
486 movq m3, [pix_tmp+0x40]
487 TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4)
491 movq m0, [pix_tmp+0x18]
492 movq m1, [pix_tmp+0x28]
493 movq m2, [pix_tmp+0x38]
494 movq m3, [pix_tmp+0x48]
495 TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4)
499 %endmacro ; DEBLOCK_LUMA
502 DEBLOCK_LUMA mmxext, v8, 8
504 DEBLOCK_LUMA sse2, v, 16
510 %macro LUMA_INTRA_P012 4 ; p0..p3 in memory
515 pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
528 psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;
535 psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
539 pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
544 psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8
552 pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4
560 mova %1, t1 ; store p0
566 pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
568 paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
573 psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8
581 mova %2, t0 ; store p1
582 mova %3, t1 ; store p2
585 %macro LUMA_INTRA_SWAP_PQ 0
591 %define mask1p mask1q
594 %macro DEBLOCK_LUMA_INTRA 2
610 %define mask1q [rsp-24]
614 %define spill(x) [esp+16*x+((stack_offset+4)&15)]
619 %define mask0 spill(2)
620 %define mask1p spill(3)
621 %define mask1q spill(4)
626 ;-----------------------------------------------------------------------------
627 ; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
628 ;-----------------------------------------------------------------------------
629 cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
634 lea r5, [r1*3] ; 3*stride
640 add r4, r0 ; pix-4*stride
648 LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
649 SWAP 7, 12 ; m12=mask0
651 pavgb t5, mpb_1 ; alpha/4+1
654 DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
655 DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
656 DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
663 LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
667 pavgb m4, [pb_1] ; alpha/4+1
668 DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
670 DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
673 DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
677 LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
679 LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
688 ;-----------------------------------------------------------------------------
689 ; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
690 ;-----------------------------------------------------------------------------
691 cglobal x264_deblock_h_luma_intra_%1, 4,7
699 ; transpose 8x16 -> tmp space
700 TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
703 TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
705 lea r0, [pix_tmp+0x40]
707 call x264_deblock_v_luma_intra_%1
709 ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
711 TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
716 TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
720 cglobal x264_deblock_h_luma_intra_%1, 2,4
724 %assign pad 0x8c-(stack_offset&15)
728 ; transpose 8x16 -> tmp space
729 TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
732 TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
734 lea r0, [pix_tmp+0x40]
739 call x264_deblock_%2_luma_intra_%1
741 add dword [rsp], 8 ; pix_tmp+8
742 call x264_deblock_%2_luma_intra_%1
751 ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
752 TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
755 TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
759 %endmacro ; DEBLOCK_LUMA_INTRA
762 DEBLOCK_LUMA_INTRA sse2, v
765 DEBLOCK_LUMA_INTRA mmxext, v8
772 %macro CHROMA_V_START 0
780 %macro CHROMA_H_START 0
792 ;-----------------------------------------------------------------------------
793 ; void x264_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
794 ;-----------------------------------------------------------------------------
795 cglobal x264_deblock_v_chroma_mmxext, 5,6
801 call x264_chroma_inter_body_mmxext
806 ;-----------------------------------------------------------------------------
807 ; void x264_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
808 ;-----------------------------------------------------------------------------
809 cglobal x264_deblock_h_chroma_mmxext, 5,7
811 %define buf0 [rsp-24]
812 %define buf1 [rsp-16]
818 TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6)
821 call x264_chroma_inter_body_mmxext
824 TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6)
828 x264_chroma_inter_body_mmxext:
838 ; in: %1=p0 %2=p1 %3=q1
839 ; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
840 %macro CHROMA_INTRA_P0 3
843 pand m4, [pb_1] ; m4 = (p0^q1)&1
846 pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
852 ;-----------------------------------------------------------------------------
853 ; void x264_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
854 ;-----------------------------------------------------------------------------
855 cglobal x264_deblock_v_chroma_intra_mmxext, 4,5
861 call x264_chroma_intra_body_mmxext
866 ;-----------------------------------------------------------------------------
867 ; void x264_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
868 ;-----------------------------------------------------------------------------
869 cglobal x264_deblock_h_chroma_intra_mmxext, 4,6
871 TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6)
872 call x264_chroma_intra_body_mmxext
873 TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6)
877 x264_chroma_intra_body_mmxext:
881 CHROMA_INTRA_P0 m1, m0, m3
882 CHROMA_INTRA_P0 m2, m3, m0