1 ;*****************************************************************************
2 ;* deblock-a.asm: h264 encoder library
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2008 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
8 ;* This program is free software; you can redistribute it and/or modify
9 ;* it under the terms of the GNU General Public License as published by
10 ;* the Free Software Foundation; either version 2 of the License, or
11 ;* (at your option) any later version.
13 ;* This program is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ;* GNU General Public License for more details.
18 ;* You should have received a copy of the GNU General Public License
19 ;* along with this program; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21 ;*****************************************************************************
; Byte constants replicated to a full 16-byte vector for SIMD use.
26 pb_00: times 16 db 0x00 ; zeros: pavgb against this halves a value (rounding up), see its use below
27 pb_01: times 16 db 0x01 ; low-bit mask: isolates (a^b)&1 to correct pavgb's round-up
28 pb_03: times 16 db 0x03 ; +3 bias for pavgb (a net +4 once pavgb adds its own +1)
29 pb_a1: times 16 db 0xa1 ; 0xa1 = 161 = 128+33: bias removed after the "d+128+33" accumulation
33 ; expands to [base],...,[base+7*stride]
; NOTE(review): for the expansion comment above to hold, callers must pass
; base3 = base+3*stride and stride3 = 3*stride (the macro substitutes the
; four arguments textually) -- confirm at the call sites.
34 %define PASS8ROWS(base, base3, stride, stride3) \
35 [base], [base+stride], [base+stride*2], [base3], \
36 [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
38 ; in: 8 rows of 4 bytes in %1..%8
39 ; out: 4 rows of 8 bytes in m0..m3
40 %macro TRANSPOSE4x8_LOAD 8
69 ; in: 4 rows of 8 bytes in m0..m3
70 ; out: 8 rows of 4 bytes in %1..%8
71 %macro TRANSPOSE8x4_STORE 8
111 ; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
112 ; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
; Byte transpose built from interleave passes of increasing granularity
; (bw = bytes->words, wd = words->dwords, dq = dwords->qwords).
; SBUTTERFLY is defined elsewhere in this file; presumably a
; punpckl/punpckh interleave pair -- confirm at its definition.
113 %macro TRANSPOSE6x8_MEM 9
; pass 1: byte-granularity interleave of adjacent row pairs
121 SBUTTERFLY bw, m0, m1, m7
122 SBUTTERFLY bw, m2, m3, m1
123 SBUTTERFLY bw, m4, m5, m3
125 SBUTTERFLY bw, m6, %8, m5
; pass 2: word granularity
126 SBUTTERFLY wd, m0, m2, m1
127 SBUTTERFLY wd, m4, m6, m2
130 SBUTTERFLY wd, m7, [%9+0x10], m6
131 SBUTTERFLY wd, m3, m5, m4
; pass 3: dword granularity
132 SBUTTERFLY dq, m7, m3, m0
133 SBUTTERFLY dq, m1, m2, m5
142 ; in: 8 rows of 8 in %1..%8
143 ; out: 8 rows of 8 in %9..%16
; Full 8x8 byte transpose, same interleave-pass scheme as TRANSPOSE6x8_MEM
; above (bw, then wd, then dq granularity).
144 %macro TRANSPOSE8x8_MEM 16
; pass 1: byte-granularity interleave of adjacent row pairs
152 SBUTTERFLY bw, m0, m1, m7
153 SBUTTERFLY bw, m2, m3, m1
154 SBUTTERFLY bw, m4, m5, m3
155 SBUTTERFLY bw, m6, %8, m5
; pass 2: word granularity
157 SBUTTERFLY wd, m0, m2, m3
158 SBUTTERFLY wd, m4, m6, m2
159 SBUTTERFLY wd, m7, m1, m6
162 SBUTTERFLY wd, m2, m5, m1
; pass 3: dword granularity
163 SBUTTERFLY dq, m0, m4, m5
164 SBUTTERFLY dq, m7, m2, m4
169 SBUTTERFLY dq, m3, %11, m0
170 SBUTTERFLY dq, m6, m1, m5
177 ; out: %4 = |%1-%2|>%3
188 ; out: %4 = |%1-%2|>%3
209 ; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
210 ; out: m5=beta-1, m7=mask, %3=alpha-1
; Broadcast the scalar alpha-1/beta-1 thresholds to all 16 bytes, then
; evaluate the three per-pixel edge-activity conditions whose conjunction
; forms the filter mask.
217 packuswb m4, m4 ; 16x alpha-1
218 packuswb m5, m5 ; 16x beta-1
222 DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
223 DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
225 DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
231 ; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
; Computes the tc-clipped p0/q0 update entirely in unsigned bytes by
; keeping intermediate deltas biased by +128 (pavgb supplies /2 with
; round-up; the pb_* constants supply/remove the biases).
234 %macro DEBLOCK_P0_Q0 0
237 pand m5, [pb_01 GLOBAL] ; (p0^q0)&1
240 pavgb m3, m0 ; (p1 - q1 + 256)>>1
241 pavgb m3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
243 pavgb m4, m2 ; (q0 - p0 + 256)>>1
245 paddusb m3, m4 ; d+128+33
246 mova m6, [pb_a1 GLOBAL]
248 psubusb m3, [pb_a1 GLOBAL] ; strip the +161 (= 128+33) bias, saturating at 0
258 ; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
259 ; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
260 ; clobbers: q2, tmp, tc0
264 pavgb %2, %6 ; avg(p2,avg(p0,q0))
266 pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1
267 psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1 (subtract the parity bit to undo pavgb's round-up)
277 ;-----------------------------------------------------------------------------
278 ; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
279 ;-----------------------------------------------------------------------------
; args per the prototype above (x86inc cglobal: r0=pix, r1=stride, r2=alpha,
; r3=beta, r4=tc0 -- TODO confirm against x86inc's register mapping)
281 cglobal x264_deblock_v_luma_sse2, 5,5,10
287 add r4, r0 ; pix-3*stride
289 mova m0, [r4+r1] ; p1
290 mova m1, [r4+2*r1] ; p0
292 mova m3, [r0+r1] ; q1
296 punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
; p-side: test |p2-p0| and conditionally filter p1
303 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
308 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
; q-side: test |q2-q0| and conditionally filter q1
310 movdqa m4, [r0+2*r1] ; q2
311 DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
316 LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6
323 ;-----------------------------------------------------------------------------
324 ; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
325 ;-----------------------------------------------------------------------------
; Horizontal-edge variant: transpose the pixels into a stack buffer, reuse
; the vertical filter on the buffer, then transpose the changed rows back.
327 cglobal x264_deblock_h_luma_sse2, 5,7
334 %define pix_tmp rsp+0x30
340 ; transpose 6x16 -> tmp space
341 TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp
344 TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8
347 ; alpha, beta, tc0 are still in r2d, r3d, r4
348 ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
349 lea r0, [pix_tmp+0x30]
354 call x264_deblock_v_luma_sse2
356 ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
359 movq m0, [pix_tmp+0x18]
360 movq m1, [pix_tmp+0x28]
361 movq m2, [pix_tmp+0x38]
362 movq m3, [pix_tmp+0x48]
363 TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11)
; second half of the 16-pixel edge
369 movq m0, [pix_tmp+0x10]
370 movq m1, [pix_tmp+0x20]
371 movq m2, [pix_tmp+0x30]
372 movq m3, [pix_tmp+0x40]
373 TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11)
; Parameterized luma deblock: %1 = ISA suffix in the function name,
; %2 = v/v8 infix, %3 = register spill width in bytes (see [esp+%3] below).
384 %macro DEBLOCK_LUMA 3
385 ;-----------------------------------------------------------------------------
386 ; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
387 ;-----------------------------------------------------------------------------
388 cglobal x264_deblock_%2_luma_%1, 5,5
393 add r4, r0 ; pix-3*stride
394 %assign pad 2*%3+12-(stack_offset&15)
397 mova m0, [r4+r1] ; p1
398 mova m1, [r4+2*r1] ; p0
400 mova m3, [r0+r1] ; q1
406 punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
; tc and mask are spilled to the stack (two %3-byte slots) because this
; variant has no spare registers to keep them live
407 mova [esp+%3], m4 ; tc
411 mova [esp], m4 ; mask
; p-side: test |p2-p0| and conditionally filter p1
414 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
416 pand m4, [esp+%3] ; tc
420 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
; q-side: test |q2-q0| and conditionally filter q1
422 mova m4, [r0+2*r1] ; q2
423 DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
424 mova m5, [esp] ; mask
426 mova m5, [esp+%3] ; tc
430 LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6
438 ;-----------------------------------------------------------------------------
439 ; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
440 ;-----------------------------------------------------------------------------
; Horizontal-edge variant: transpose to a stack buffer, call the vertical
; filter twice (8 pixels per call), transpose the changed rows back.
442 cglobal x264_deblock_h_luma_%1, 0,5
448 %assign pad 0x78-(stack_offset&15)
450 %define pix_tmp esp+12
452 ; transpose 6x16 -> tmp space
453 TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp
456 TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8
459 lea r0, [pix_tmp+0x30]
465 call x264_deblock_%2_luma_%1
; second call: advance the on-stack pix and tc0 arguments in place
467 add dword [esp ], 8 ; pix_tmp+0x38
468 add dword [esp+16], 2 ; tc0+2
469 call x264_deblock_%2_luma_%1
473 ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
478 movq m0, [pix_tmp+0x10]
479 movq m1, [pix_tmp+0x20]
480 movq m2, [pix_tmp+0x30]
481 movq m3, [pix_tmp+0x40]
482 TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4)
; second half of the 16-pixel edge
486 movq m0, [pix_tmp+0x18]
487 movq m1, [pix_tmp+0x28]
488 movq m2, [pix_tmp+0x38]
489 movq m3, [pix_tmp+0x48]
490 TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4)
494 %endmacro ; DEBLOCK_LUMA
; instantiate: %1=ISA suffix, %2=name infix, %3=spill-slot width in bytes
497 DEBLOCK_LUMA mmxext, v8, 8
499 DEBLOCK_LUMA sse2, v, 16
; Strong (intra) luma filter for one side of the edge: computes the
; filtered p0', p1', p2' values and stores them to %1..%3 (%1..%4 are the
; p0..p3 memory operands). Arithmetic is done with pavgb halvings plus
; parity corrections to emulate the /4 and /8 roundings in 8 bits.
505 %macro LUMA_INTRA_P012 4 ; p0..p3 in memory
510 pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
523 psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;
530 psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
534 pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
539 psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8
547 pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4
555 mova %1, t1 ; store p0
561 pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
563 paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
568 psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8
576 mova %2, t0 ; store p1
577 mova %3, t1 ; store p2
; Swaps the p/q register aliases so the same LUMA_INTRA_P012 code can
; filter the q side of the edge.
580 %macro LUMA_INTRA_SWAP_PQ 0
586 %define mask1p mask1q
; Parameterized intra (strong) luma deblock: %1 = ISA suffix, %2 = v/v8 infix.
589 %macro DEBLOCK_LUMA_INTRA 2
605 %define mask1q [rsp-24]
; x86-32 path keeps its spills in 16-byte stack slots; the expression
; compensates for the current stack_offset to keep them aligned
609 %define spill(x) [esp+16*x+((stack_offset+4)&15)]
614 %define mask0 spill(2)
615 %define mask1p spill(3)
616 %define mask1q spill(4)
617 %define mpb_00 [pb_00 GLOBAL]
618 %define mpb_01 [pb_01 GLOBAL]
621 ;-----------------------------------------------------------------------------
622 ; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
623 ;-----------------------------------------------------------------------------
624 cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
629 lea r5, [r1*3] ; 3*stride
635 add r4, r0 ; pix-4*stride
; x86-64 path: enough registers to keep all masks live (m12 = mask0)
642 mova mpb_01, [pb_01 GLOBAL]
643 LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
644 SWAP 7, 12 ; m12=mask0
646 pavgb t5, mpb_01 ; alpha/4+1
649 DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
650 DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
651 DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
; x86-32 path: same conditions, masks spilled to the stack
658 LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
661 pavgb m4, [pb_00 GLOBAL]
662 pavgb m4, [pb_01 GLOBAL] ; alpha/4+1
663 DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
665 DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
668 DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
; filter the p side, then (after a p/q swap) the q side
672 LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
674 LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
683 ;-----------------------------------------------------------------------------
684 ; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
685 ;-----------------------------------------------------------------------------
; x86-64 horizontal-edge variant: transpose, filter vertically, transpose back
686 cglobal x264_deblock_h_luma_intra_%1, 4,7
694 ; transpose 8x16 -> tmp space
695 TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
698 TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
700 lea r0, [pix_tmp+0x40]
702 call x264_deblock_v_luma_intra_%1
704 ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
706 TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
711 TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
; x86-32 horizontal-edge variant: two 8-pixel calls to the vertical filter
715 cglobal x264_deblock_h_luma_intra_%1, 2,4
719 %assign pad 0x8c-(stack_offset&15)
723 ; transpose 8x16 -> tmp space
724 TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
727 TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
729 lea r0, [pix_tmp+0x40]
734 call x264_deblock_%2_luma_intra_%1
736 add dword [rsp], 8 ; pix_tmp+8
737 call x264_deblock_%2_luma_intra_%1
746 ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
747 TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
750 TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
754 %endmacro ; DEBLOCK_LUMA_INTRA
; instantiate: x86-64 sse2 (full-width) and mmxext (8-pixel) versions
757 DEBLOCK_LUMA_INTRA sse2, v
760 DEBLOCK_LUMA_INTRA mmxext, v8
; Shared entry sequences for the chroma filters (vertical / horizontal).
767 %macro CHROMA_V_START 0
775 %macro CHROMA_H_START 0
787 ;-----------------------------------------------------------------------------
788 ; void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
789 ;-----------------------------------------------------------------------------
790 cglobal x264_deblock_v_chroma_mmxext, 5,6
796 call chroma_inter_body_mmxext
801 ;-----------------------------------------------------------------------------
802 ; void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
803 ;-----------------------------------------------------------------------------
; Horizontal-edge variant: load via transpose, run the shared body,
; store via transpose.
804 cglobal x264_deblock_h_chroma_mmxext, 5,7
806 %define buf0 [rsp-24]
807 %define buf1 [rsp-16]
813 TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6)
816 call chroma_inter_body_mmxext
819 TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6)
; shared inter-chroma filter body, tail-shared by both entry points above
823 chroma_inter_body_mmxext:
833 ; in: %1=p0 %2=p1 %3=q1
834 ; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
; Two pavgb halvings with a parity correction give an exact /4 rounding.
835 %macro CHROMA_INTRA_P0 3
838 pand m4, [pb_01 GLOBAL] ; m4 = (p0^q1)&1
841 pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
847 ;-----------------------------------------------------------------------------
848 ; void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
849 ;-----------------------------------------------------------------------------
850 cglobal x264_deblock_v_chroma_intra_mmxext, 4,5
856 call chroma_intra_body_mmxext
861 ;-----------------------------------------------------------------------------
862 ; void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
863 ;-----------------------------------------------------------------------------
; Horizontal-edge variant: load via transpose, filter, store via transpose.
864 cglobal x264_deblock_h_chroma_intra_mmxext, 4,6
866 TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6)
867 call chroma_intra_body_mmxext
868 TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6)
; shared intra-chroma filter body: updates p0 and q0 symmetrically
872 chroma_intra_body_mmxext:
876 CHROMA_INTRA_P0 m1, m0, m3
877 CHROMA_INTRA_P0 m2, m3, m0